summaryrefslogtreecommitdiff
path: root/cmd/rescribe
diff options
context:
space:
mode:
author: Nick White <git@njw.name> 2021-11-22 17:26:41 +0000
committer: Nick White <git@njw.name> 2021-11-22 17:26:41 +0000
commit: 673f77278f5b65576de7fee651ae290345e65282 (patch)
tree: 90ad3b6040fa7a0b95f188836631f3f1f180886e /cmd/rescribe
parent: e9814508f443d10f91c962d2cad147ae0e2579fb (diff)
rescribe: Add support for reading images directly from PDFs
There are several TODO items before this can be considered "good enough", let alone complete. See the comments in the code for details. On a good day, with a fair wind, though, this works.
Diffstat (limited to 'cmd/rescribe')
-rw-r--r-- cmd/rescribe/main.go 98
1 file changed, 95 insertions, 3 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index 2afa7f2..cd0b955 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -27,12 +27,12 @@ import (
"time"
"rescribe.xyz/bookpipeline"
- "rescribe.xyz/utils/pkg/hocr"
-
"rescribe.xyz/bookpipeline/internal/pipeline"
+ "rescribe.xyz/pdf"
+ "rescribe.xyz/utils/pkg/hocr"
)
-const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir [savedir]
+const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir]
Process and OCR a book using the Rescribe pipeline on a local machine.
@@ -249,10 +249,102 @@ These training files are included in rescribe, and are always available:
savedir = flag.Arg(1)
}
+ ispdf := false
+
+ fi, err := os.Stat(bookdir)
+ if err != nil {
+ log.Fatalln("Error opening book file/dir:", err)
+ }
+
+ // try opening as a PDF, and extracting
+ if !fi.IsDir() {
+ if flag.NArg() < 2 {
+ savedir = strings.TrimSuffix(bookdir, ".pdf")
+ }
+
+ bookdir, err = extractPdfImgs(bookdir)
+ if err != nil {
+ log.Fatalln("Error opening file as PDF:", err)
+ }
+
+ bookname = strings.TrimSuffix(bookname, ".pdf")
+
+ ispdf = true
+ }
+
err = startProcess(*verboselog, tessCommand, bookdir, bookname, trainingName, *systess, savedir, tessdir)
if err != nil {
log.Fatalln(err)
}
+
+ if ispdf {
+ os.RemoveAll(filepath.Clean(filepath.Join(bookdir, "..")))
+ }
+}
+
+// extractPdfImgs extracts all images embedded in a PDF to a
+// temporary directory, which is returned on success.
+//
+// The returned directory is named after the PDF (minus any ".pdf"
+// suffix) and is created inside a fresh temporary parent directory.
+// On any failure after that parent has been created it is removed
+// again, and an empty path is returned alongside the error.
+//
+// Every stream found in a page's XObject resources is written out as
+// "<key>-<page>.jpg"; no check is made that the stream really is a JPEG.
+func extractPdfImgs(path string) (string, error) {
+	p, err := pdf.Open(path)
+	if err != nil {
+		return "", err
+	}
+
+	bookname := strings.TrimSuffix(filepath.Base(path), ".pdf")
+
+	tempparent, err := ioutil.TempDir("", "bookpipeline")
+	if err != nil {
+		return "", fmt.Errorf("Error setting up temporary directory: %v", err)
+	}
+	tempdir := filepath.Join(tempparent, bookname)
+	err = os.Mkdir(tempdir, 0755)
+	if err != nil {
+		// best-effort cleanup; the Mkdir error is the one worth reporting
+		os.RemoveAll(tempparent)
+		return "", fmt.Errorf("Error setting up temporary directory: %v", err)
+	}
+
+	for pgnum := 1; pgnum <= p.NumPage(); pgnum++ {
+		page := p.Page(pgnum)
+		if page.V.IsNull() {
+			fmt.Printf("Warning: page %d not found, skipping\n", pgnum)
+			continue
+		}
+		res := page.Resources()
+		if res.Kind() != pdf.Dict {
+			fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum)
+			continue
+		}
+		xobj := res.Key("XObject")
+		if xobj.Kind() != pdf.Dict {
+			fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum)
+			continue
+		}
+		// BUG: for some PDFs this includes images multiple times for each page
+		for _, k := range xobj.Keys() {
+			obj := xobj.Key(k)
+			if obj.Kind() != pdf.Stream {
+				continue
+			}
+
+			// k comes straight from the PDF, so drop any directory
+			// components to keep the output inside tempdir.
+			fn := filepath.Base(fmt.Sprintf("%s-%04d.jpg", k, pgnum))
+			err = savePdfStream(tempdir, fn, obj.Reader())
+			if err != nil {
+				// best-effort cleanup; don't leave a half-extracted book behind
+				os.RemoveAll(tempparent)
+				return "", err
+			}
+
+			// TODO: check that what we've written is actually a JPEG
+		}
+	}
+	// TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case
+
+	return tempdir, nil
+}
+
+// savePdfStream copies the contents of r into a newly created file
+// named fn inside dir. r is always closed before returning, and an
+// error from closing the output file is reported as a write error,
+// since it can mean the data never reached the disk.
+func savePdfStream(dir string, fn string, r io.ReadCloser) error {
+	defer r.Close()
+
+	w, err := os.Create(filepath.Join(dir, fn))
+	if err != nil {
+		return fmt.Errorf("Error creating file to extract PDF image: %v", err)
+	}
+
+	_, err = io.Copy(w, r)
+	// Close exactly once, on every path, and keep the first error seen.
+	cerr := w.Close()
+	if err == nil {
+		err = cerr
+	}
+	if err != nil {
+		return fmt.Errorf("Error writing extracted image %s from PDF: %v", fn, err)
+	}
+	return nil
}
func startProcess(logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, systess bool, savedir string, tessdir string) error {