From 811601e4b446b1c598af965b74155f7f76ffb7e9 Mon Sep 17 00:00:00 2001 From: Nick White Date: Thu, 27 Oct 2022 16:27:55 +0100 Subject: Allow completely non-embedded builds This enables installs straight from 'go install' or 'fyne install'. It also means warning if a system getgbook isn't found, and erroring if tesseract isn't found (as was done already). The location of getgbook can therefore now be specified on the command line. Embedded builds are enabled with the -tags embed flag, which the makefile sets for all builds. --- cmd/rescribe/embed_darwin.go | 2 ++ cmd/rescribe/embed_darwin_amd64.go | 2 ++ cmd/rescribe/embed_darwin_arm64.go | 2 ++ cmd/rescribe/embed_linux.go | 2 ++ cmd/rescribe/embed_other.go | 7 ++-- cmd/rescribe/embed_tessdata.go | 12 +++++++ cmd/rescribe/embed_windows.go | 2 ++ cmd/rescribe/gui.go | 11 ++++++ cmd/rescribe/main.go | 70 +++++++++++++++++++++++--------------- cmd/rescribe/makefile | 8 ++--- 10 files changed, 82 insertions(+), 36 deletions(-) create mode 100644 cmd/rescribe/embed_tessdata.go (limited to 'cmd/rescribe') diff --git a/cmd/rescribe/embed_darwin.go b/cmd/rescribe/embed_darwin.go index da781c3..4f22b84 100644 --- a/cmd/rescribe/embed_darwin.go +++ b/cmd/rescribe/embed_darwin.go @@ -2,6 +2,8 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +//go:build embed + package main import _ "embed" diff --git a/cmd/rescribe/embed_darwin_amd64.go b/cmd/rescribe/embed_darwin_amd64.go index 719c9cc..1f7f8c2 100644 --- a/cmd/rescribe/embed_darwin_amd64.go +++ b/cmd/rescribe/embed_darwin_amd64.go @@ -2,6 +2,8 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +//go:build embed + package main import _ "embed" diff --git a/cmd/rescribe/embed_darwin_arm64.go b/cmd/rescribe/embed_darwin_arm64.go index a1ca9b8..4c154be 100644 --- a/cmd/rescribe/embed_darwin_arm64.go +++ b/cmd/rescribe/embed_darwin_arm64.go @@ -2,6 +2,8 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +//go:build embed + package main import _ "embed" diff --git a/cmd/rescribe/embed_linux.go b/cmd/rescribe/embed_linux.go index eb09dce..3cfd18b 100644 --- a/cmd/rescribe/embed_linux.go +++ b/cmd/rescribe/embed_linux.go @@ -2,6 +2,8 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +//go:build embed + package main import _ "embed" diff --git a/cmd/rescribe/embed_other.go b/cmd/rescribe/embed_other.go index 86848d2..ac9ce3a 100644 --- a/cmd/rescribe/embed_other.go +++ b/cmd/rescribe/embed_other.go @@ -2,13 +2,12 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. -// +build !darwin -// +build !linux -// +build !windows +//go:build (!darwin && !linux && !windows) || !embed package main // if not one of the above platforms, we won't embed anything, so -// just create an empty byte slice +// just create empty byte slices var tesszip []byte var gbookzip []byte +var tessdatazip []byte diff --git a/cmd/rescribe/embed_tessdata.go b/cmd/rescribe/embed_tessdata.go new file mode 100644 index 0000000..ea9ce8f --- /dev/null +++ b/cmd/rescribe/embed_tessdata.go @@ -0,0 +1,12 @@ +// Copyright 2022 Nick White. +// Use of this source code is governed by the GPLv3 +// license that can be found in the LICENSE file. + +//go:build embed + +package main + +import _ "embed" + +//go:embed tessdata.20220322.zip +var tessdatazip []byte diff --git a/cmd/rescribe/embed_windows.go b/cmd/rescribe/embed_windows.go index 3e49161..f3fe193 100644 --- a/cmd/rescribe/embed_windows.go +++ b/cmd/rescribe/embed_windows.go @@ -2,6 +2,8 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +//go:build embed + package main import _ "embed" diff --git a/cmd/rescribe/gui.go b/cmd/rescribe/gui.go index f4a622d..5031f0d 100644 --- a/cmd/rescribe/gui.go +++ b/cmd/rescribe/gui.go @@ -438,6 +438,17 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess progressBar.SetValue(0.1) if strings.HasPrefix(dir.Text, "Google Book: ") { + if gbookcmd == "" { + msg := fmt.Sprintf("No getgbook found, can't download Google Book. Either set -gbookcmd on the command line, or use the official build which includes an embedded copy of getgbook.\n") + dialog.ShowError(errors.New(msg), myWindow) + fmt.Fprintf(os.Stderr, msg) + progressBar.SetValue(0.0) + for _, v := range disableWidgets { + v.Enable() + } + abortbtn.Disable() + return + } progressBar.SetValue(0.11) start := len("Google Book: ") bookname = dir.Text[start : start+12] diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index fd5b33b..16ca024 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -13,7 +13,6 @@ import ( "archive/zip" "bytes" "context" - _ "embed" "flag" "fmt" "image/jpeg" @@ -35,7 +34,7 @@ import ( "rescribe.xyz/utils/pkg/hocr" ) -const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir] +const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd cmd] [-gbookcmd cmd] [-t training] bookdir/book.pdf [savedir] Process and OCR a book using the Rescribe pipeline on a local machine. @@ -43,9 +42,6 @@ OCR results are saved into the bookdir directory unless savedir is specified. ` -//go:embed tessdata.20220322.zip -var tessdatazip []byte - const QueueTimeoutSecs = 2 * 60 const PauseBetweenChecks = 1 * time.Second const LogSaveTime = 1 * time.Minute @@ -95,7 +91,7 @@ func resetTimer(t *time.Timer, d time.Duration) { } } -// unpackTessZip unpacks a byte array of a zip file into a directory +// unpackZip unpacks a byte array of a zip file into a directory func unpackZip(b []byte, dir string) error { br := bytes.NewReader(b) zr, err := zip.NewReader(br, br.Size()) @@ -140,8 +136,10 @@ func unpackZip(b []byte, dir string) error { func main() { deftesscmd := "tesseract" + defgbookcmd := "getgbook" if runtime.GOOS == "windows" { deftesscmd = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe" + defgbookcmd = "getgbook.exe" } verbose := flag.Bool("v", false, "verbose") @@ -153,6 +151,7 @@ These training files are included in rescribe, and are always available: - lat.traineddata (Latin, modern print) - rescribev9_fast.traineddata (Latin/English/French, printed ca 1500-1800) `) + gbookcmd := flag.String("gbookcmd", defgbookcmd, "The getgbook executable to run. You may need to set this to the full path of getgbook.exe if you're on Windows.") tesscmd := flag.String("tesscmd", deftesscmd, "The Tesseract executable to run. You may need to set this to the full path of Tesseract.exe if you're on Windows.") wipe := flag.Bool("wipe", false, "Use wiper tool to remove noise like gutters from page before processing.") fullpdf := flag.Bool("fullpdf", false, "Use highest image quality for searchable PDF (requires lots of RAM).") @@ -187,7 +186,7 @@ These training files are included in rescribe, and are always available: log.Fatalln("Error setting up tesseract directory:", err) } - if !*systess { + if !*systess && len(tesszip) > 0 { err = unpackZip(tesszip, tessdir) if err != nil { log.Fatalln("Error unpacking embedded Tesseract zip:", err) @@ -202,18 +201,31 @@ These training files are included in rescribe, and are always available: } } - err = unpackZip(gbookzip, tessdir) + _, err = exec.LookPath(tessCommand) if err != nil { - log.Fatalln("Error unpacking embedded getgbook zip:", err) + log.Fatalf("No tesseract executable found [tried %s], either set -tesscmd and -systess on the command line or use the official build which includes an embedded copy of Tesseract.", tessCommand) + } + + gbookCommand := *gbookcmd + if len(gbookzip) > 0 { + err = unpackZip(gbookzip, tessdir) + if err != nil { + log.Fatalln("Error unpacking embedded getgbook zip:", err) + } + switch runtime.GOOS { + case "darwin": + gbookCommand = filepath.Join(tessdir, "getgbook") + case "linux": + gbookCommand = filepath.Join(tessdir, "getgbook") + case "windows": + gbookCommand = filepath.Join(tessdir, "getgbook.exe") + } } - var gbookCommand string - switch runtime.GOOS { - case "darwin": - gbookCommand = filepath.Join(tessdir, "getgbook") - case "linux": - gbookCommand = filepath.Join(tessdir, "getgbook") - case "windows": - gbookCommand = filepath.Join(tessdir, "getgbook.exe") + + _, err = exec.LookPath(gbookCommand) + if err != nil { + log.Printf("No getgbook found [tried %s], google book downloading will be disabled, either set -gbookcmd on the command line or use the official build which includes an embedded getgbook.", gbookCommand) + gbookCommand = "" } tessdatadir := filepath.Join(tessdir, "tessdata") @@ -221,9 +233,11 @@ These training files are included in rescribe, and are always available: if err != nil { log.Fatalln("Error setting up tessdata directory:", err) } - err = unpackZip(tessdatazip, tessdatadir) - if err != nil { - log.Fatalln("Error unpacking embedded tessdata zip:", err) + if len(tessdatazip) > 0 { + err = unpackZip(tessdatazip, tessdatadir) + if err != nil { + log.Fatalln("Error unpacking embedded tessdata zip:", err) + } } // if trainingPath doesn't exist, set it to the embedded training instead @@ -233,14 +247,6 @@ These training files are included in rescribe, and are always available: trainingPath = filepath.Join(tessdatadir, trainingPath) } - f, err := os.Open(trainingPath) - if err != nil { - fmt.Fprintf(os.Stderr, "Error: Training files %s or %s could not be opened.\n", *training, trainingPath) - fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n") - os.Exit(1) - } - f.Close() - abstraining, err := filepath.Abs(trainingPath) if err != nil { log.Fatalf("Error getting absolute path of training %s: %v", trainingPath, err) @@ -265,6 +271,14 @@ These training files are included in rescribe, and are always available: return } + f, err := os.Open(trainingPath) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: Training files %s or %s could not be opened.\n", *training, trainingPath) + fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n") + os.Exit(1) + } + f.Close() + bookdir := flag.Arg(0) bookname := strings.ReplaceAll(filepath.Base(bookdir), " ", "_") savedir := bookdir diff --git a/cmd/rescribe/makefile b/cmd/rescribe/makefile index ae92dda..23f17fb 100644 --- a/cmd/rescribe/makefile +++ b/cmd/rescribe/makefile @@ -26,17 +26,17 @@ all: dist/linux/rescribe dist/darwin/rescribe.zip dist/windows/rescribe.exe dist/linux/rescribe: $(GODEPS) go generate mkdir -p dist/linux - GOOS=linux GOARCH=amd64 go build -o $@ . + GOOS=linux GOARCH=amd64 go build -tags embed -o $@ . build/darwin_amd64/rescribe: $(GODEPS) go generate mkdir -p build/darwin_amd64 - PATH="$(PATH):$(OSXCROSSBIN)" CC="o64-clang" CGO_ENABLED=1 GOOS=darwin GOARCH=amd64 go build -o $@ . + PATH="$(PATH):$(OSXCROSSBIN)" CC="o64-clang" CGO_ENABLED=1 GOOS=darwin GOARCH=amd64 go build -tags embed -o $@ . build/darwin_arm64/rescribe: $(GODEPS) go generate mkdir -p build/darwin_arm64 - PATH="$(PATH):$(OSXCROSSBIN)" CC="oa64-clang" CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 go build -o $@ . + PATH="$(PATH):$(OSXCROSSBIN)" CC="oa64-clang" CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 go build -tags embed -o $@ . build/darwin/rescribe: build/darwin_amd64/rescribe build/darwin_arm64/rescribe mkdir -p build/darwin @@ -54,7 +54,7 @@ dist/darwin/rescribe.zip: build/darwin/Rescribe.app build/windows/rescribe-bin.exe: $(GODEPS) go generate mkdir -p build/windows - CC="x86_64-w64-mingw32-gcc" CGO_ENABLED=1 GOOS=windows GOARCH=amd64 go build -o $@ . + CC="x86_64-w64-mingw32-gcc" CGO_ENABLED=1 GOOS=windows GOARCH=amd64 go build -tags embed -o $@ . dist/windows/rescribe.exe: build/windows/rescribe-bin.exe mkdir -p dist/windows -- cgit v1.2.1-24-ge1ad