summaryrefslogtreecommitdiff
path: root/cmd/rescribe/main.go
diff options
context:
space:
mode:
Diffstat (limited to 'cmd/rescribe/main.go')
-rw-r--r--cmd/rescribe/main.go70
1 files changed, 42 insertions, 28 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index fd5b33b..16ca024 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -13,7 +13,6 @@ import (
"archive/zip"
"bytes"
"context"
- _ "embed"
"flag"
"fmt"
"image/jpeg"
@@ -35,7 +34,7 @@ import (
"rescribe.xyz/utils/pkg/hocr"
)
-const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir]
+const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd cmd] [-gbookcmd cmd] [-t training] bookdir/book.pdf [savedir]
Process and OCR a book using the Rescribe pipeline on a local machine.
@@ -43,9 +42,6 @@ OCR results are saved into the bookdir directory unless savedir is
specified.
`
-//go:embed tessdata.20220322.zip
-var tessdatazip []byte
-
const QueueTimeoutSecs = 2 * 60
const PauseBetweenChecks = 1 * time.Second
const LogSaveTime = 1 * time.Minute
@@ -95,7 +91,7 @@ func resetTimer(t *time.Timer, d time.Duration) {
}
}
-// unpackTessZip unpacks a byte array of a zip file into a directory
+// unpackZip unpacks a byte array of a zip file into a directory
func unpackZip(b []byte, dir string) error {
br := bytes.NewReader(b)
zr, err := zip.NewReader(br, br.Size())
@@ -140,8 +136,10 @@ func unpackZip(b []byte, dir string) error {
func main() {
deftesscmd := "tesseract"
+ defgbookcmd := "getgbook"
if runtime.GOOS == "windows" {
deftesscmd = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
+ defgbookcmd = "getgbook.exe"
}
verbose := flag.Bool("v", false, "verbose")
@@ -153,6 +151,7 @@ These training files are included in rescribe, and are always available:
- lat.traineddata (Latin, modern print)
- rescribev9_fast.traineddata (Latin/English/French, printed ca 1500-1800)
`)
+ gbookcmd := flag.String("gbookcmd", defgbookcmd, "The getgbook executable to run. You may need to set this to the full path of getgbook.exe if you're on Windows.")
tesscmd := flag.String("tesscmd", deftesscmd, "The Tesseract executable to run. You may need to set this to the full path of Tesseract.exe if you're on Windows.")
wipe := flag.Bool("wipe", false, "Use wiper tool to remove noise like gutters from page before processing.")
fullpdf := flag.Bool("fullpdf", false, "Use highest image quality for searchable PDF (requires lots of RAM).")
@@ -187,7 +186,7 @@ These training files are included in rescribe, and are always available:
log.Fatalln("Error setting up tesseract directory:", err)
}
- if !*systess {
+ if !*systess && len(tesszip) > 0 {
err = unpackZip(tesszip, tessdir)
if err != nil {
log.Fatalln("Error unpacking embedded Tesseract zip:", err)
@@ -202,18 +201,31 @@ These training files are included in rescribe, and are always available:
}
}
- err = unpackZip(gbookzip, tessdir)
+ _, err = exec.LookPath(tessCommand)
if err != nil {
- log.Fatalln("Error unpacking embedded getgbook zip:", err)
+ log.Fatalf("No tesseract executable found [tried %s], either set -tesscmd and -systess on the command line or use the official build which includes an embedded copy of Tesseract.", tessCommand)
+ }
+
+ gbookCommand := *gbookcmd
+ if len(gbookzip) > 0 {
+ err = unpackZip(gbookzip, tessdir)
+ if err != nil {
+ log.Fatalln("Error unpacking embedded getgbook zip:", err)
+ }
+ switch runtime.GOOS {
+ case "darwin":
+ gbookCommand = filepath.Join(tessdir, "getgbook")
+ case "linux":
+ gbookCommand = filepath.Join(tessdir, "getgbook")
+ case "windows":
+ gbookCommand = filepath.Join(tessdir, "getgbook.exe")
+ }
}
- var gbookCommand string
- switch runtime.GOOS {
- case "darwin":
- gbookCommand = filepath.Join(tessdir, "getgbook")
- case "linux":
- gbookCommand = filepath.Join(tessdir, "getgbook")
- case "windows":
- gbookCommand = filepath.Join(tessdir, "getgbook.exe")
+
+ _, err = exec.LookPath(gbookCommand)
+ if err != nil {
+ log.Printf("No getgbook found [tried %s], google book downloading will be disabled, either set -gbookcmd on the command line or use the official build which includes an embedded getgbook.", gbookCommand)
+ gbookCommand = ""
}
tessdatadir := filepath.Join(tessdir, "tessdata")
@@ -221,9 +233,11 @@ These training files are included in rescribe, and are always available:
if err != nil {
log.Fatalln("Error setting up tessdata directory:", err)
}
- err = unpackZip(tessdatazip, tessdatadir)
- if err != nil {
- log.Fatalln("Error unpacking embedded tessdata zip:", err)
+ if len(tessdatazip) > 0 {
+ err = unpackZip(tessdatazip, tessdatadir)
+ if err != nil {
+ log.Fatalln("Error unpacking embedded tessdata zip:", err)
+ }
}
// if trainingPath doesn't exist, set it to the embedded training instead
@@ -233,14 +247,6 @@ These training files are included in rescribe, and are always available:
trainingPath = filepath.Join(tessdatadir, trainingPath)
}
- f, err := os.Open(trainingPath)
- if err != nil {
- fmt.Fprintf(os.Stderr, "Error: Training files %s or %s could not be opened.\n", *training, trainingPath)
- fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n")
- os.Exit(1)
- }
- f.Close()
-
abstraining, err := filepath.Abs(trainingPath)
if err != nil {
log.Fatalf("Error getting absolute path of training %s: %v", trainingPath, err)
@@ -265,6 +271,14 @@ These training files are included in rescribe, and are always available:
return
}
+ f, err := os.Open(trainingPath)
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "Error: Training files %s or %s could not be opened.\n", *training, trainingPath)
+ fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n")
+ os.Exit(1)
+ }
+ f.Close()
+
bookdir := flag.Arg(0)
bookname := strings.ReplaceAll(filepath.Base(bookdir), " ", "_")
savedir := bookdir