diff options
-rw-r--r-- | cmd/rescribe/main.go | 98 | ||||
-rw-r--r-- | go.mod | 1 | ||||
-rw-r--r-- | go.sum | 2 |
3 files changed, 98 insertions, 3 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 2afa7f2..cd0b955 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -27,12 +27,12 @@ import ( "time" "rescribe.xyz/bookpipeline" - "rescribe.xyz/utils/pkg/hocr" - "rescribe.xyz/bookpipeline/internal/pipeline" + "rescribe.xyz/pdf" + "rescribe.xyz/utils/pkg/hocr" ) -const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir [savedir] +const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir] Process and OCR a book using the Rescribe pipeline on a local machine. @@ -249,10 +249,102 @@ These training files are included in rescribe, and are always available: savedir = flag.Arg(1) } + ispdf := false + + fi, err := os.Stat(bookdir) + if err != nil { + log.Fatalln("Error opening book file/dir:", err) + } + + // try opening as a PDF, and extracting + if !fi.IsDir() { + if flag.NArg() < 2 { + savedir = strings.TrimSuffix(bookdir, ".pdf") + } + + bookdir, err = extractPdfImgs(bookdir) + if err != nil { + log.Fatalln("Error opening file as PDF:", err) + } + + bookname = strings.TrimSuffix(bookname, ".pdf") + + ispdf = true + } + err = startProcess(*verboselog, tessCommand, bookdir, bookname, trainingName, *systess, savedir, tessdir) if err != nil { log.Fatalln(err) } + + if ispdf { + os.RemoveAll(filepath.Clean(filepath.Join(bookdir, ".."))) + } +} + +// extractPdfImgs extracts all images embedded in a PDF to a +// temporary directory, which is returned on success. +func extractPdfImgs(path string) (string, error) { + p, err := pdf.Open(path) + if err != nil { + return "", err + } + + bookname := strings.TrimSuffix(filepath.Base(path), ".pdf") + + tempdir, err := ioutil.TempDir("", "bookpipeline") + if err != nil { + return "", fmt.Errorf("Error setting up temporary directory: %v", err) + } + tempdir = filepath.Join(tempdir, bookname) + err = os.Mkdir(tempdir, 0755) + if err != nil { + return "", fmt.Errorf("Error setting up temporary directory: %v", err) + } + + for pgnum := 1; pgnum <= p.NumPage(); pgnum++ { + if p.Page(pgnum).V.IsNull() { + fmt.Printf("Warning: page %d not found, skipping\n", pgnum) + continue + } + res := p.Page(pgnum).Resources() + if res.Kind() != pdf.Dict { + fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum) + continue + } + xobj := res.Key("XObject") + if xobj.Kind() != pdf.Dict { + fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum) + continue + } + // BUG: for some PDFs this includes images multiple times for each page + for _, k := range xobj.Keys() { + obj := xobj.Key(k) + if obj.Kind() != pdf.Stream { + continue + } + + fn := fmt.Sprintf("%s-%04d.jpg", k, pgnum) + w, err := os.Create(filepath.Join(tempdir, fn)) + defer w.Close() + if err != nil { + return tempdir, fmt.Errorf("Error creating file to extract PDF image: %v\n", err) + } + r := obj.Reader() + defer r.Close() + _, err = io.Copy(w, r) + if err != nil { + return tempdir, fmt.Errorf("Error writing extracted image %s from PDF: %v\n", fn, err) + } + w.Close() + r.Close() + + // TODO: check that what we've written is actually a JPEG + } + } + // TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case + + return tempdir, nil } func startProcess(logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, systess bool, savedir string, tessdir string) error { @@ -9,6 +9,7 @@ require ( github.com/wcharczuk/go-chart/v2 v2.1.0 golang.org/x/image v0.0.0-20210628002857-a66eb6448b8d golang.org/x/sys v0.0.0-20211023085530-d6a326fbbf70 // indirect + rescribe.xyz/pdf v0.1.3 rescribe.xyz/preproc v0.4.2 rescribe.xyz/utils v0.1.3 ) @@ -113,6 +113,8 @@ gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= rescribe.xyz/integral v0.6.0 h1:CLF3sQ6th/OuG+/rp/lLR+AGOT4R7tG3IiUjSLKsriw= rescribe.xyz/integral v0.6.0/go.mod h1:gKJq4UaVn17RsMsUasEMcJDkTkwqeb6AzPIJtwcUipg= +rescribe.xyz/pdf v0.1.3 h1:Fl4HHQPfkIUJs8WIkpjCm8yGu6Wd1TIDLZgXhVy8Pdk= +rescribe.xyz/pdf v0.1.3/go.mod h1:fIia5YlYagNbBARPP2JXDoXXR5zd14Us5RkaKXUz7Nw= rescribe.xyz/preproc v0.4.2 h1:aX6rOf6ha3UNcHM0oHuY1MQi7ZwYj+46OxhTcptAI4E= rescribe.xyz/preproc v0.4.2/go.mod h1:LJe+rQ9cAxn/29cVK5l6X1hH1ZWRAI1Bs73yDGjvT4A= rescribe.xyz/utils v0.1.3 h1:2rlHbUjAGXy/xgtmUb6Y7Kbpxl3qkwtWzkFUQ/cOaIA= |