From 673f77278f5b65576de7fee651ae290345e65282 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 22 Nov 2021 17:26:41 +0000 Subject: rescribe: Add support for reading images directly from PDFs There are several TODO items before this can be considered "good enough", let alone complete. See the comments in the code for details. On a good day, with a fair wind, though, this works. --- cmd/rescribe/main.go | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++-- go.mod | 1 + go.sum | 2 ++ 3 files changed, 98 insertions(+), 3 deletions(-) diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 2afa7f2..cd0b955 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -27,12 +27,12 @@ import ( "time" "rescribe.xyz/bookpipeline" - "rescribe.xyz/utils/pkg/hocr" - "rescribe.xyz/bookpipeline/internal/pipeline" + "rescribe.xyz/pdf" + "rescribe.xyz/utils/pkg/hocr" ) -const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir [savedir] +const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir] Process and OCR a book using the Rescribe pipeline on a local machine. 
@@ -249,10 +249,122 @@ These training files are included in rescribe, and are always available:
 		savedir = flag.Arg(1)
 	}
 
+	ispdf := false
+
+	fi, err := os.Stat(bookdir)
+	if err != nil {
+		log.Fatalln("Error opening book file/dir:", err)
+	}
+
+	// try opening as a PDF, and extracting
+	if !fi.IsDir() {
+		if flag.NArg() < 2 {
+			savedir = strings.TrimSuffix(bookdir, ".pdf")
+		}
+
+		bookdir, err = extractPdfImgs(bookdir)
+		if err != nil {
+			log.Fatalln("Error opening file as PDF:", err)
+		}
+
+		bookname = strings.TrimSuffix(bookname, ".pdf")
+
+		ispdf = true
+	}
+
 	err = startProcess(*verboselog, tessCommand, bookdir, bookname, trainingName, *systess, savedir, tessdir)
 	if err != nil {
 		log.Fatalln(err)
 	}
+
+	if ispdf {
+		os.RemoveAll(filepath.Clean(filepath.Join(bookdir, "..")))
+	}
+}
+
+// extractPdfImgs extracts all images embedded in a PDF to a
+// temporary directory, which is returned on success.
+//
+// Each image stream is saved as <key>-<page>.jpg in a directory named
+// after the PDF (minus any .pdf suffix), created inside a fresh
+// temporary directory; the caller should remove the parent of the
+// returned directory when done with it. On failure everything created
+// so far is removed again and an empty path is returned.
+func extractPdfImgs(path string) (string, error) {
+	p, err := pdf.Open(path)
+	if err != nil {
+		return "", err
+	}
+
+	bookname := strings.TrimSuffix(filepath.Base(path), ".pdf")
+
+	tmproot, err := ioutil.TempDir("", "bookpipeline")
+	if err != nil {
+		return "", fmt.Errorf("Error setting up temporary directory: %v", err)
+	}
+	tempdir := filepath.Join(tmproot, bookname)
+	err = os.Mkdir(tempdir, 0755)
+	if err != nil {
+		os.RemoveAll(tmproot)
+		return "", fmt.Errorf("Error setting up temporary directory: %v", err)
+	}
+
+	for pgnum := 1; pgnum <= p.NumPage(); pgnum++ {
+		page := p.Page(pgnum)
+		if page.V.IsNull() {
+			fmt.Printf("Warning: page %d not found, skipping\n", pgnum)
+			continue
+		}
+		res := page.Resources()
+		if res.Kind() != pdf.Dict {
+			fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum)
+			continue
+		}
+		xobj := res.Key("XObject")
+		if xobj.Kind() != pdf.Dict {
+			fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum)
+			continue
+		}
+		// BUG: for some PDFs this includes images multiple times for each page
+		for _, k := range xobj.Keys() {
+			obj := xobj.Key(k)
+			if obj.Kind() != pdf.Stream {
+				continue
+			}
+
+			fn := fmt.Sprintf("%s-%04d.jpg", k, pgnum)
+			r := obj.Reader()
+			err = writeImgFile(filepath.Join(tempdir, fn), r)
+			r.Close()
+			if err != nil {
+				// don't leave a half-extracted book behind in the temp dir
+				os.RemoveAll(tmproot)
+				return "", fmt.Errorf("Error extracting image %s from PDF: %v", fn, err)
+			}
+
+			// TODO: check that what we've written is actually a JPEG
+		}
+	}
+	// TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case
+
+	return tempdir, nil
+}
+
+// writeImgFile copies everything from r into a newly created file at
+// path, reporting any create, copy or close error.
+func writeImgFile(path string, r io.Reader) error {
+	w, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	_, err = io.Copy(w, r)
+	// a failed Close can mean the data never reached the disk, so it
+	// must not be ignored; the copy error takes precedence though
+	cerr := w.Close()
+	if err != nil {
+		return err
+	}
+	return cerr
 }
 
 func startProcess(logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, systess bool, savedir string, tessdir string) error {
diff --git a/go.mod b/go.mod
index 7171045..f12564b 100644
--- a/go.mod
+++ b/go.mod
@@ -9,6 +9,7 @@ require (
 	github.com/wcharczuk/go-chart/v2 v2.1.0
 	golang.org/x/image v0.0.0-20210628002857-a66eb6448b8d
 	golang.org/x/sys v0.0.0-20211023085530-d6a326fbbf70 // indirect
+	rescribe.xyz/pdf v0.1.3
 	rescribe.xyz/preproc v0.4.2
 	rescribe.xyz/utils v0.1.3
 )
diff --git a/go.sum b/go.sum
index 37128ed..eb92ea0 100644
--- a/go.sum
+++ b/go.sum
@@ -113,6 +113,8 @@ gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
 gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 rescribe.xyz/integral v0.6.0 h1:CLF3sQ6th/OuG+/rp/lLR+AGOT4R7tG3IiUjSLKsriw=
 rescribe.xyz/integral v0.6.0/go.mod h1:gKJq4UaVn17RsMsUasEMcJDkTkwqeb6AzPIJtwcUipg=
+rescribe.xyz/pdf v0.1.3 h1:Fl4HHQPfkIUJs8WIkpjCm8yGu6Wd1TIDLZgXhVy8Pdk=
+rescribe.xyz/pdf v0.1.3/go.mod h1:fIia5YlYagNbBARPP2JXDoXXR5zd14Us5RkaKXUz7Nw=
 rescribe.xyz/preproc v0.4.2 h1:aX6rOf6ha3UNcHM0oHuY1MQi7ZwYj+46OxhTcptAI4E=
 rescribe.xyz/preproc v0.4.2/go.mod h1:LJe+rQ9cAxn/29cVK5l6X1hH1ZWRAI1Bs73yDGjvT4A=
 rescribe.xyz/utils v0.1.3 
h1:2rlHbUjAGXy/xgtmUb6Y7Kbpxl3qkwtWzkFUQ/cOaIA= -- cgit v1.2.1-24-ge1ad