diff options
author | Nick White <git@njw.name> | 2021-11-22 17:26:41 +0000 |
---|---|---|
committer | Nick White <git@njw.name> | 2021-11-22 17:26:41 +0000 |
commit | 673f77278f5b65576de7fee651ae290345e65282 (patch) | |
tree | 90ad3b6040fa7a0b95f188836631f3f1f180886e | |
parent | e9814508f443d10f91c962d2cad147ae0e2579fb (diff) |
rescribe: Add support for reading images directly from PDFs
There are several TODO items before this can be considered "good
enough", let alone complete. See the comments in the code for
details.
On a good day, with a fair wind, though, this works.
-rw-r--r-- | cmd/rescribe/main.go | 98 | ||||
-rw-r--r-- | go.mod | 1 | ||||
-rw-r--r-- | go.sum | 2 |
3 files changed, 98 insertions, 3 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 2afa7f2..cd0b955 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -27,12 +27,12 @@ import ( "time" "rescribe.xyz/bookpipeline" - "rescribe.xyz/utils/pkg/hocr" - "rescribe.xyz/bookpipeline/internal/pipeline" + "rescribe.xyz/pdf" + "rescribe.xyz/utils/pkg/hocr" ) -const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir [savedir] +const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir] Process and OCR a book using the Rescribe pipeline on a local machine. @@ -249,10 +249,102 @@ These training files are included in rescribe, and are always available: savedir = flag.Arg(1) } + ispdf := false + + fi, err := os.Stat(bookdir) + if err != nil { + log.Fatalln("Error opening book file/dir:", err) + } + + // try opening as a PDF, and extracting + if !fi.IsDir() { + if flag.NArg() < 2 { + savedir = strings.TrimSuffix(bookdir, ".pdf") + } + + bookdir, err = extractPdfImgs(bookdir) + if err != nil { + log.Fatalln("Error opening file as PDF:", err) + } + + bookname = strings.TrimSuffix(bookname, ".pdf") + + ispdf = true + } + err = startProcess(*verboselog, tessCommand, bookdir, bookname, trainingName, *systess, savedir, tessdir) if err != nil { log.Fatalln(err) } + + if ispdf { + os.RemoveAll(filepath.Clean(filepath.Join(bookdir, ".."))) + } +} + +// extractPdfImgs extracts all images embedded in a PDF to a +// temporary directory, which is returned on success. +func extractPdfImgs(path string) (string, error) { + p, err := pdf.Open(path) + if err != nil { + return "", err + } + + bookname := strings.TrimSuffix(filepath.Base(path), ".pdf") + + tempdir, err := ioutil.TempDir("", "bookpipeline") + if err != nil { + return "", fmt.Errorf("Error setting up temporary directory: %v", err) + } + tempdir = filepath.Join(tempdir, bookname) + err = os.Mkdir(tempdir, 0755) + if err != nil { + return "", fmt.Errorf("Error setting up temporary directory: %v", err) + } + + for pgnum := 1; pgnum <= p.NumPage(); pgnum++ { + if p.Page(pgnum).V.IsNull() { + fmt.Printf("Warning: page %d not found, skipping\n", pgnum) + continue + } + res := p.Page(pgnum).Resources() + if res.Kind() != pdf.Dict { + fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum) + continue + } + xobj := res.Key("XObject") + if xobj.Kind() != pdf.Dict { + fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum) + continue + } + // BUG: for some PDFs this includes images multiple times for each page + for _, k := range xobj.Keys() { + obj := xobj.Key(k) + if obj.Kind() != pdf.Stream { + continue + } + + fn := fmt.Sprintf("%s-%04d.jpg", k, pgnum) + w, err := os.Create(filepath.Join(tempdir, fn)) + defer w.Close() + if err != nil { + return tempdir, fmt.Errorf("Error creating file to extract PDF image: %v\n", err) + } + r := obj.Reader() + defer r.Close() + _, err = io.Copy(w, r) + if err != nil { + return tempdir, fmt.Errorf("Error writing extracted image %s from PDF: %v\n", fn, err) + } + w.Close() + r.Close() + + // TODO: check that what we've written is actually a JPEG + } + } + // TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case + + return tempdir, nil } func startProcess(logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, systess bool, savedir string, tessdir string) error { @@ -9,6 +9,7 @@ require ( github.com/wcharczuk/go-chart/v2 v2.1.0 golang.org/x/image v0.0.0-20210628002857-a66eb6448b8d golang.org/x/sys v0.0.0-20211023085530-d6a326fbbf70 // indirect + rescribe.xyz/pdf v0.1.3 rescribe.xyz/preproc v0.4.2 rescribe.xyz/utils v0.1.3 ) @@ -113,6 +113,8 @@ gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= rescribe.xyz/integral v0.6.0 h1:CLF3sQ6th/OuG+/rp/lLR+AGOT4R7tG3IiUjSLKsriw= rescribe.xyz/integral v0.6.0/go.mod h1:gKJq4UaVn17RsMsUasEMcJDkTkwqeb6AzPIJtwcUipg= +rescribe.xyz/pdf v0.1.3 h1:Fl4HHQPfkIUJs8WIkpjCm8yGu6Wd1TIDLZgXhVy8Pdk= +rescribe.xyz/pdf v0.1.3/go.mod h1:fIia5YlYagNbBARPP2JXDoXXR5zd14Us5RkaKXUz7Nw= rescribe.xyz/preproc v0.4.2 h1:aX6rOf6ha3UNcHM0oHuY1MQi7ZwYj+46OxhTcptAI4E= rescribe.xyz/preproc v0.4.2/go.mod h1:LJe+rQ9cAxn/29cVK5l6X1hH1ZWRAI1Bs73yDGjvT4A= rescribe.xyz/utils v0.1.3 h1:2rlHbUjAGXy/xgtmUb6Y7Kbpxl3qkwtWzkFUQ/cOaIA= |