summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- cmd/rescribe/main.go 98
-rw-r--r-- go.mod 1
-rw-r--r-- go.sum 2
3 files changed, 98 insertions, 3 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index 2afa7f2..cd0b955 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -27,12 +27,12 @@ import (
"time"
"rescribe.xyz/bookpipeline"
- "rescribe.xyz/utils/pkg/hocr"
-
"rescribe.xyz/bookpipeline/internal/pipeline"
+ "rescribe.xyz/pdf"
+ "rescribe.xyz/utils/pkg/hocr"
)
-const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir [savedir]
+const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir]
Process and OCR a book using the Rescribe pipeline on a local machine.
@@ -249,10 +249,102 @@ These training files are included in rescribe, and are always available:
savedir = flag.Arg(1)
}
+ ispdf := false
+
+ fi, err := os.Stat(bookdir)
+ if err != nil {
+ log.Fatalln("Error opening book file/dir:", err)
+ }
+
+ // try opening as a PDF, and extracting
+ if !fi.IsDir() {
+ if flag.NArg() < 2 {
+ savedir = strings.TrimSuffix(bookdir, ".pdf")
+ }
+
+ bookdir, err = extractPdfImgs(bookdir)
+ if err != nil {
+ log.Fatalln("Error opening file as PDF:", err)
+ }
+
+ bookname = strings.TrimSuffix(bookname, ".pdf")
+
+ ispdf = true
+ }
+
err = startProcess(*verboselog, tessCommand, bookdir, bookname, trainingName, *systess, savedir, tessdir)
if err != nil {
log.Fatalln(err)
}
+
+ if ispdf {
+ os.RemoveAll(filepath.Clean(filepath.Join(bookdir, "..")))
+ }
+}
+
+// extractPdfImgs extracts all images embedded in a PDF to a
+// temporary directory, which is returned on success.
+//
+// The images are written to a directory named after the PDF (minus
+// any ".pdf" suffix) created inside a fresh temporary directory, so
+// a caller that wants to clean up should remove the parent of the
+// returned path. Every stream found in a page's XObject resources is
+// saved as "<key>-<page number>.jpg"; pages with no usable resources
+// are skipped with a warning on stdout.
+//
+// On failure everything created so far is removed and an empty path
+// is returned alongside the error.
+func extractPdfImgs(path string) (string, error) {
+	p, err := pdf.Open(path)
+	if err != nil {
+		return "", err
+	}
+
+	bookname := strings.TrimSuffix(filepath.Base(path), ".pdf")
+
+	tmproot, err := ioutil.TempDir("", "bookpipeline")
+	if err != nil {
+		return "", fmt.Errorf("Error setting up temporary directory: %w", err)
+	}
+	tempdir := filepath.Join(tmproot, bookname)
+	if err := os.Mkdir(tempdir, 0755); err != nil {
+		// Best-effort cleanup; the Mkdir error is the one worth reporting.
+		os.RemoveAll(tmproot)
+		return "", fmt.Errorf("Error setting up temporary directory: %w", err)
+	}
+
+	for pgnum := 1; pgnum <= p.NumPage(); pgnum++ {
+		pg := p.Page(pgnum)
+		if pg.V.IsNull() {
+			fmt.Printf("Warning: page %d not found, skipping\n", pgnum)
+			continue
+		}
+		res := pg.Resources()
+		if res.Kind() != pdf.Dict {
+			fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum)
+			continue
+		}
+		xobj := res.Key("XObject")
+		if xobj.Kind() != pdf.Dict {
+			fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum)
+			continue
+		}
+		// BUG: for some PDFs this includes images multiple times for each page
+		for _, k := range xobj.Keys() {
+			obj := xobj.Key(k)
+			if obj.Kind() != pdf.Stream {
+				continue
+			}
+
+			fn := fmt.Sprintf("%s-%04d.jpg", k, pgnum)
+			w, err := os.Create(filepath.Join(tempdir, fn))
+			if err != nil {
+				os.RemoveAll(tmproot) // best-effort cleanup
+				return "", fmt.Errorf("Error creating file to extract PDF image: %w", err)
+			}
+
+			// Close both ends here rather than with defer: a defer inside
+			// the loop would keep every file open until the function
+			// returns.
+			r := obj.Reader()
+			_, err = io.Copy(w, r)
+			r.Close()
+			// A failed Close on the written file can mean lost data, so
+			// report it unless the copy itself already failed.
+			if cerr := w.Close(); err == nil {
+				err = cerr
+			}
+			if err != nil {
+				os.RemoveAll(tmproot) // best-effort cleanup
+				return "", fmt.Errorf("Error writing extracted image %s from PDF: %w", fn, err)
+			}
+
+			// TODO: check that what we've written is actually a JPEG
+		}
+	}
+	// TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case
+
+	return tempdir, nil
}
func startProcess(logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, systess bool, savedir string, tessdir string) error {
diff --git a/go.mod b/go.mod
index 7171045..f12564b 100644
--- a/go.mod
+++ b/go.mod
@@ -9,6 +9,7 @@ require (
github.com/wcharczuk/go-chart/v2 v2.1.0
golang.org/x/image v0.0.0-20210628002857-a66eb6448b8d
golang.org/x/sys v0.0.0-20211023085530-d6a326fbbf70 // indirect
+ rescribe.xyz/pdf v0.1.3
rescribe.xyz/preproc v0.4.2
rescribe.xyz/utils v0.1.3
)
diff --git a/go.sum b/go.sum
index 37128ed..eb92ea0 100644
--- a/go.sum
+++ b/go.sum
@@ -113,6 +113,8 @@ gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
rescribe.xyz/integral v0.6.0 h1:CLF3sQ6th/OuG+/rp/lLR+AGOT4R7tG3IiUjSLKsriw=
rescribe.xyz/integral v0.6.0/go.mod h1:gKJq4UaVn17RsMsUasEMcJDkTkwqeb6AzPIJtwcUipg=
+rescribe.xyz/pdf v0.1.3 h1:Fl4HHQPfkIUJs8WIkpjCm8yGu6Wd1TIDLZgXhVy8Pdk=
+rescribe.xyz/pdf v0.1.3/go.mod h1:fIia5YlYagNbBARPP2JXDoXXR5zd14Us5RkaKXUz7Nw=
rescribe.xyz/preproc v0.4.2 h1:aX6rOf6ha3UNcHM0oHuY1MQi7ZwYj+46OxhTcptAI4E=
rescribe.xyz/preproc v0.4.2/go.mod h1:LJe+rQ9cAxn/29cVK5l6X1hH1ZWRAI1Bs73yDGjvT4A=
rescribe.xyz/utils v0.1.3 h1:2rlHbUjAGXy/xgtmUb6Y7Kbpxl3qkwtWzkFUQ/cOaIA=