From 673f77278f5b65576de7fee651ae290345e65282 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 22 Nov 2021 17:26:41 +0000 Subject: rescribe: Add support for reading images directly from PDFs There are several TODO items before this can be considered "good enough", let alone complete. See the comments in the code for details. On a good day, with a fair wind, though, this works. --- cmd/rescribe/main.go | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 3 deletions(-) (limited to 'cmd') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 2afa7f2..cd0b955 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -27,12 +27,12 @@ import ( "time" "rescribe.xyz/bookpipeline" - "rescribe.xyz/utils/pkg/hocr" - "rescribe.xyz/bookpipeline/internal/pipeline" + "rescribe.xyz/pdf" + "rescribe.xyz/utils/pkg/hocr" ) -const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir [savedir] +const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir] Process and OCR a book using the Rescribe pipeline on a local machine. @@ -249,10 +249,102 @@ These training files are included in rescribe, and are always available: savedir = flag.Arg(1) } + ispdf := false + + fi, err := os.Stat(bookdir) + if err != nil { + log.Fatalln("Error opening book file/dir:", err) + } + + // try opening as a PDF, and extracting + if !fi.IsDir() { + if flag.NArg() < 2 { + savedir = strings.TrimSuffix(bookdir, ".pdf") + } + + bookdir, err = extractPdfImgs(bookdir) + if err != nil { + log.Fatalln("Error opening file as PDF:", err) + } + + bookname = strings.TrimSuffix(bookname, ".pdf") + + ispdf = true + } + err = startProcess(*verboselog, tessCommand, bookdir, bookname, trainingName, *systess, savedir, tessdir) if err != nil { log.Fatalln(err) } + + if ispdf { + os.RemoveAll(filepath.Clean(filepath.Join(bookdir, ".."))) + } +} + +// extractPdfImgs extracts all images embedded in a PDF to a +// temporary directory, which is returned on success. +func extractPdfImgs(path string) (string, error) { + p, err := pdf.Open(path) + if err != nil { + return "", err + } + + bookname := strings.TrimSuffix(filepath.Base(path), ".pdf") + + tempdir, err := ioutil.TempDir("", "bookpipeline") + if err != nil { + return "", fmt.Errorf("Error setting up temporary directory: %v", err) + } + tempdir = filepath.Join(tempdir, bookname) + err = os.Mkdir(tempdir, 0755) + if err != nil { + return "", fmt.Errorf("Error setting up temporary directory: %v", err) + } + + for pgnum := 1; pgnum <= p.NumPage(); pgnum++ { + if p.Page(pgnum).V.IsNull() { + fmt.Printf("Warning: page %d not found, skipping\n", pgnum) + continue + } + res := p.Page(pgnum).Resources() + if res.Kind() != pdf.Dict { + fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum) + continue + } + xobj := res.Key("XObject") + if xobj.Kind() != pdf.Dict { + fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum) + continue + } + // BUG: for some PDFs this includes images multiple times for each page + for _, k := range xobj.Keys() { + obj := xobj.Key(k) + if obj.Kind() != pdf.Stream { + continue + } + + fn := fmt.Sprintf("%s-%04d.jpg", k, pgnum) + w, err := os.Create(filepath.Join(tempdir, fn)) + defer w.Close() + if err != nil { + return tempdir, fmt.Errorf("Error creating file to extract PDF image: %v\n", err) + } + r := obj.Reader() + defer r.Close() + _, err = io.Copy(w, r) + if err != nil { + return tempdir, fmt.Errorf("Error writing extracted image %s from PDF: %v\n", fn, err) + } + w.Close() + r.Close() + + // TODO: check that what we've written is actually a JPEG + } + } + // TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case + + return tempdir, nil } func startProcess(logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, systess bool, savedir string, tessdir string) error { -- cgit v1.2.1-24-ge1ad