summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2021-11-22 17:26:41 +0000
committer Nick White <git@njw.name> 2021-11-22 17:26:41 +0000
commit 673f77278f5b65576de7fee651ae290345e65282 (patch)
tree 90ad3b6040fa7a0b95f188836631f3f1f180886e
parent e9814508f443d10f91c962d2cad147ae0e2579fb (diff)
rescribe: Add support for reading images directly from PDFs
There are several TODO items before this can be considered "good enough", let alone complete. See the comments in the code for details. On a good day, with a fair wind, though, this works.
-rw-r--r-- cmd/rescribe/main.go 98
-rw-r--r-- go.mod 1
-rw-r--r-- go.sum 2
3 files changed, 98 insertions, 3 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index 2afa7f2..cd0b955 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -27,12 +27,12 @@ import (
"time"
"rescribe.xyz/bookpipeline"
- "rescribe.xyz/utils/pkg/hocr"
-
"rescribe.xyz/bookpipeline/internal/pipeline"
+ "rescribe.xyz/pdf"
+ "rescribe.xyz/utils/pkg/hocr"
)
-const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir [savedir]
+const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir]
Process and OCR a book using the Rescribe pipeline on a local machine.
@@ -249,10 +249,102 @@ These training files are included in rescribe, and are always available:
savedir = flag.Arg(1)
}
+ ispdf := false
+
+ fi, err := os.Stat(bookdir)
+ if err != nil {
+ log.Fatalln("Error opening book file/dir:", err)
+ }
+
+ // try opening as a PDF, and extracting
+ if !fi.IsDir() {
+ if flag.NArg() < 2 {
+ savedir = strings.TrimSuffix(bookdir, ".pdf")
+ }
+
+ bookdir, err = extractPdfImgs(bookdir)
+ if err != nil {
+ log.Fatalln("Error opening file as PDF:", err)
+ }
+
+ bookname = strings.TrimSuffix(bookname, ".pdf")
+
+ ispdf = true
+ }
+
err = startProcess(*verboselog, tessCommand, bookdir, bookname, trainingName, *systess, savedir, tessdir)
if err != nil {
log.Fatalln(err)
}
+
+ if ispdf {
+ os.RemoveAll(filepath.Clean(filepath.Join(bookdir, "..")))
+ }
+}
+
+// extractPdfImgs extracts all images embedded in a PDF to a
+// temporary directory, which is returned on success. Every stream
+// found in a page's XObject resources is written out as
+// <name>-<page>.jpg, whether or not it is really a JPEG. On failure
+// the temporary directory is removed again, and an empty path is
+// returned along with the error.
+func extractPdfImgs(path string) (string, error) {
+	p, err := pdf.Open(path)
+	if err != nil {
+		return "", err
+	}
+
+	bookname := strings.TrimSuffix(filepath.Base(path), ".pdf")
+
+	// The images go into a bookname subdirectory of a fresh temporary
+	// directory, so the returned path ends in the name of the book.
+	temproot, err := ioutil.TempDir("", "bookpipeline")
+	if err != nil {
+		return "", fmt.Errorf("Error setting up temporary directory: %w", err)
+	}
+	tempdir := filepath.Join(temproot, bookname)
+	err = os.Mkdir(tempdir, 0755)
+	if err != nil {
+		os.RemoveAll(temproot)
+		return "", fmt.Errorf("Error setting up temporary directory: %w", err)
+	}
+
+	// save copies one image stream to the file fn in tempdir. It is a
+	// closure so that its defers run once per image, rather than only
+	// when every page has been processed.
+	save := func(fn string, r io.ReadCloser) (err error) {
+		defer r.Close()
+		w, err := os.Create(filepath.Join(tempdir, fn))
+		if err != nil {
+			return fmt.Errorf("Error creating file to extract PDF image: %w", err)
+		}
+		defer func() {
+			// a failed close can mean the image was not fully written
+			if cerr := w.Close(); cerr != nil && err == nil {
+				err = fmt.Errorf("Error writing extracted image %s from PDF: %w", fn, cerr)
+			}
+		}()
+		if _, err = io.Copy(w, r); err != nil {
+			return fmt.Errorf("Error writing extracted image %s from PDF: %w", fn, err)
+		}
+		return nil
+	}
+
+	for pgnum := 1; pgnum <= p.NumPage(); pgnum++ {
+		page := p.Page(pgnum)
+		if page.V.IsNull() {
+			fmt.Printf("Warning: page %d not found, skipping\n", pgnum)
+			continue
+		}
+		res := page.Resources()
+		if res.Kind() != pdf.Dict {
+			fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum)
+			continue
+		}
+		xobj := res.Key("XObject")
+		if xobj.Kind() != pdf.Dict {
+			fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum)
+			continue
+		}
+		// BUG: for some PDFs this includes images multiple times for each page
+		for _, k := range xobj.Keys() {
+			obj := xobj.Key(k)
+			if obj.Kind() != pdf.Stream {
+				continue
+			}
+
+			// k comes from the PDF itself, so keep only the last path
+			// element to make sure the file is created inside tempdir
+			fn := filepath.Base(fmt.Sprintf("%s-%04d.jpg", k, pgnum))
+			err = save(fn, obj.Reader())
+			if err != nil {
+				os.RemoveAll(temproot)
+				return "", err
+			}
+
+			// TODO: check that what we've written is actually a JPEG
+		}
+	}
+	// TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case
+
+	return tempdir, nil
}
func startProcess(logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, systess bool, savedir string, tessdir string) error {
diff --git a/go.mod b/go.mod
index 7171045..f12564b 100644
--- a/go.mod
+++ b/go.mod
@@ -9,6 +9,7 @@ require (
github.com/wcharczuk/go-chart/v2 v2.1.0
golang.org/x/image v0.0.0-20210628002857-a66eb6448b8d
golang.org/x/sys v0.0.0-20211023085530-d6a326fbbf70 // indirect
+ rescribe.xyz/pdf v0.1.3
rescribe.xyz/preproc v0.4.2
rescribe.xyz/utils v0.1.3
)
diff --git a/go.sum b/go.sum
index 37128ed..eb92ea0 100644
--- a/go.sum
+++ b/go.sum
@@ -113,6 +113,8 @@ gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
rescribe.xyz/integral v0.6.0 h1:CLF3sQ6th/OuG+/rp/lLR+AGOT4R7tG3IiUjSLKsriw=
rescribe.xyz/integral v0.6.0/go.mod h1:gKJq4UaVn17RsMsUasEMcJDkTkwqeb6AzPIJtwcUipg=
+rescribe.xyz/pdf v0.1.3 h1:Fl4HHQPfkIUJs8WIkpjCm8yGu6Wd1TIDLZgXhVy8Pdk=
+rescribe.xyz/pdf v0.1.3/go.mod h1:fIia5YlYagNbBARPP2JXDoXXR5zd14Us5RkaKXUz7Nw=
rescribe.xyz/preproc v0.4.2 h1:aX6rOf6ha3UNcHM0oHuY1MQi7ZwYj+46OxhTcptAI4E=
rescribe.xyz/preproc v0.4.2/go.mod h1:LJe+rQ9cAxn/29cVK5l6X1hH1ZWRAI1Bs73yDGjvT4A=
rescribe.xyz/utils v0.1.3 h1:2rlHbUjAGXy/xgtmUb6Y7Kbpxl3qkwtWzkFUQ/cOaIA=