summaryrefslogtreecommitdiff
path: root/cmd/rescribe/main.go
diff options
context:
space:
mode:
Diffstat (limited to 'cmd/rescribe/main.go')
-rw-r--r--cmd/rescribe/main.go98
1 files changed, 95 insertions, 3 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index 2afa7f2..cd0b955 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -27,12 +27,12 @@ import (
"time"
"rescribe.xyz/bookpipeline"
- "rescribe.xyz/utils/pkg/hocr"
-
"rescribe.xyz/bookpipeline/internal/pipeline"
+ "rescribe.xyz/pdf"
+ "rescribe.xyz/utils/pkg/hocr"
)
-const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir [savedir]
+const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir]
Process and OCR a book using the Rescribe pipeline on a local machine.
@@ -249,10 +249,102 @@ These training files are included in rescribe, and are always available:
savedir = flag.Arg(1)
}
+ ispdf := false
+
+ fi, err := os.Stat(bookdir)
+ if err != nil {
+ log.Fatalln("Error opening book file/dir:", err)
+ }
+
+ // try opening as a PDF, and extracting
+ if !fi.IsDir() {
+ if flag.NArg() < 2 {
+ savedir = strings.TrimSuffix(bookdir, ".pdf")
+ }
+
+ bookdir, err = extractPdfImgs(bookdir)
+ if err != nil {
+ log.Fatalln("Error opening file as PDF:", err)
+ }
+
+ bookname = strings.TrimSuffix(bookname, ".pdf")
+
+ ispdf = true
+ }
+
err = startProcess(*verboselog, tessCommand, bookdir, bookname, trainingName, *systess, savedir, tessdir)
if err != nil {
log.Fatalln(err)
}
+
+ if ispdf {
+ os.RemoveAll(filepath.Clean(filepath.Join(bookdir, "..")))
+ }
+}
+
+// extractPdfImgs extracts all images embedded in a PDF to a
+// temporary directory, which is returned on success.
+func extractPdfImgs(path string) (string, error) {
+ p, err := pdf.Open(path)
+ if err != nil {
+ return "", err
+ }
+
+ bookname := strings.TrimSuffix(filepath.Base(path), ".pdf")
+
+ tempdir, err := ioutil.TempDir("", "bookpipeline")
+ if err != nil {
+ return "", fmt.Errorf("Error setting up temporary directory: %v", err)
+ }
+ tempdir = filepath.Join(tempdir, bookname)
+ err = os.Mkdir(tempdir, 0755)
+ if err != nil {
+ return "", fmt.Errorf("Error setting up temporary directory: %v", err)
+ }
+
+ for pgnum := 1; pgnum <= p.NumPage(); pgnum++ {
+ if p.Page(pgnum).V.IsNull() {
+ fmt.Printf("Warning: page %d not found, skipping\n", pgnum)
+ continue
+ }
+ res := p.Page(pgnum).Resources()
+ if res.Kind() != pdf.Dict {
+ fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum)
+ continue
+ }
+ xobj := res.Key("XObject")
+ if xobj.Kind() != pdf.Dict {
+ fmt.Printf("Warning: no resources found on page %d, skipping\n", pgnum)
+ continue
+ }
+ // BUG: for some PDFs this includes images multiple times for each page
+ for _, k := range xobj.Keys() {
+ obj := xobj.Key(k)
+ if obj.Kind() != pdf.Stream {
+ continue
+ }
+
+ fn := fmt.Sprintf("%s-%04d.jpg", k, pgnum)
+ w, err := os.Create(filepath.Join(tempdir, fn))
+ defer w.Close()
+ if err != nil {
+ return tempdir, fmt.Errorf("Error creating file to extract PDF image: %v\n", err)
+ }
+ r := obj.Reader()
+ defer r.Close()
+ _, err = io.Copy(w, r)
+ if err != nil {
+ return tempdir, fmt.Errorf("Error writing extracted image %s from PDF: %v\n", fn, err)
+ }
+ w.Close()
+ r.Close()
+
+ // TODO: check that what we've written is actually a JPEG
+ }
+ }
+ // TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case
+
+ return tempdir, nil
}
func startProcess(logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, systess bool, savedir string, tessdir string) error {