summaryrefslogtreecommitdiff
path: root/cmd/rescribe/main.go
diff options
context:
space:
mode:
Diffstat (limited to 'cmd/rescribe/main.go')
-rw-r--r--cmd/rescribe/main.go443
1 files changed, 379 insertions, 64 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index 59d8166..96f2853 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -1,4 +1,4 @@
-// Copyright 2021 Nick White.
+// Copyright 2021-2022 Nick White.
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
@@ -12,9 +12,11 @@ package main
import (
"archive/zip"
"bytes"
- _ "embed"
+ "context"
"flag"
"fmt"
+ "image"
+ "image/draw"
"image/jpeg"
"image/png"
"io"
@@ -28,13 +30,14 @@ import (
"strings"
"time"
+ "golang.org/x/image/tiff"
"rescribe.xyz/bookpipeline"
"rescribe.xyz/bookpipeline/internal/pipeline"
"rescribe.xyz/pdf"
"rescribe.xyz/utils/pkg/hocr"
)
-const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir]
+const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd cmd] [-gbookcmd cmd] [-t training] bookdir/book.pdf [savedir]
Process and OCR a book using the Rescribe pipeline on a local machine.
@@ -42,9 +45,6 @@ OCR results are saved into the bookdir directory unless savedir is
specified.
`
-//go:embed tessdata.20211001.zip
-var tessdatazip []byte
-
const QueueTimeoutSecs = 2 * 60
const PauseBetweenChecks = 1 * time.Second
const LogSaveTime = 1 * time.Minute
@@ -73,6 +73,7 @@ type Clouder interface {
type Pipeliner interface {
Clouder
PreQueueId() string
+ PreNoWipeQueueId() string
WipeQueueId() string
OCRPageQueueId() string
AnalyseQueueId() string
@@ -93,7 +94,7 @@ func resetTimer(t *time.Timer, d time.Duration) {
}
}
-// unpackTessZip unpacks a byte array of a zip file into a directory
+// unpackZip unpacks a byte array of a zip file into a directory
func unpackZip(b []byte, dir string) error {
br := bytes.NewReader(b)
zr, err := zip.NewReader(br, br.Size())
@@ -138,22 +139,25 @@ func unpackZip(b []byte, dir string) error {
func main() {
deftesscmd := "tesseract"
+ defgbookcmd := "getgbook"
if runtime.GOOS == "windows" {
deftesscmd = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
+ defgbookcmd = "getgbook.exe"
}
verbose := flag.Bool("v", false, "verbose")
usegui := flag.Bool("gui", false, "Use graphical user interface")
systess := flag.Bool("systess", false, "Use the system installed Tesseract, rather than the copy embedded in rescribe.")
- training := flag.String("t", "rescribev8_fast.traineddata", `Path to the tesseract training file to use.
+ training := flag.String("t", "rescribev9_fast.traineddata", `Path to the tesseract training file to use.
These training files are included in rescribe, and are always available:
-- carolinemsv1_fast.traineddata (Caroline Miniscule)
-- eng.traineddata (Modern English)
-- lat.traineddata (Latin modern printing)
-- rescribefrav2_fast.traineddata (French historic printing)
-- rescribev8_fast.traineddata (Latin historic printing)
+- eng.traineddata (English, modern print)
+- lat.traineddata (Latin, modern print)
+- rescribev9_fast.traineddata (Latin/English/French, printed ca 1500-1800)
`)
+ gbookcmd := flag.String("gbookcmd", defgbookcmd, "The getgbook executable to run. You may need to set this to the full path of getgbook.exe if you're on Windows.")
tesscmd := flag.String("tesscmd", deftesscmd, "The Tesseract executable to run. You may need to set this to the full path of Tesseract.exe if you're on Windows.")
+ wipe := flag.Bool("wipe", false, "Use wiper tool to remove noise like gutters from page before processing.")
+ fullpdf := flag.Bool("fullpdf", false, "Use highest image quality for searchable PDF (requires lots of RAM).")
flag.Usage = func() {
fmt.Fprintf(flag.CommandLine.Output(), usage)
@@ -185,7 +189,7 @@ These training files are included in rescribe, and are always available:
log.Fatalln("Error setting up tesseract directory:", err)
}
- if !*systess {
+ if !*systess && len(tesszip) > 0 {
err = unpackZip(tesszip, tessdir)
if err != nil {
log.Fatalln("Error unpacking embedded Tesseract zip:", err)
@@ -200,30 +204,73 @@ These training files are included in rescribe, and are always available:
}
}
+ _, err = exec.LookPath(tessCommand)
+ if err != nil {
+ log.Fatalf("No tesseract executable found [tried %s], either set -tesscmd and -systess on the command line or use the official build which includes an embedded copy of Tesseract.", tessCommand)
+ }
+
+ gbookCommand := *gbookcmd
+ if len(gbookzip) > 0 {
+ err = unpackZip(gbookzip, tessdir)
+ if err != nil {
+ log.Fatalln("Error unpacking embedded getgbook zip:", err)
+ }
+ switch runtime.GOOS {
+ case "darwin":
+ gbookCommand = filepath.Join(tessdir, "getgbook")
+ case "linux":
+ gbookCommand = filepath.Join(tessdir, "getgbook")
+ case "windows":
+ gbookCommand = filepath.Join(tessdir, "getgbook.exe")
+ }
+ }
+
+ _, err = exec.LookPath(gbookCommand)
+ if err != nil {
+ log.Printf("No getgbook found [tried %s], google book downloading will be disabled, either set -gbookcmd on the command line or use the official build which includes an embedded getgbook.", gbookCommand)
+ gbookCommand = ""
+ }
+
tessdatadir := filepath.Join(tessdir, "tessdata")
err = os.MkdirAll(tessdatadir, 0755)
if err != nil {
log.Fatalln("Error setting up tessdata directory:", err)
}
- err = unpackZip(tessdatazip, tessdatadir)
- if err != nil {
- log.Fatalln("Error unpacking embedded tessdata zip:", err)
+ if len(tessdatazip) > 0 {
+ err = unpackZip(tessdatazip, tessdatadir)
+ if err != nil {
+ log.Fatalln("Error unpacking embedded tessdata zip:", err)
+ }
}
- // if trainingPath doesn't exist, set it to the embedded training instead
- _, err = os.Stat(trainingPath)
- if err != nil && !os.IsExist(err) {
- trainingPath = filepath.Base(trainingPath)
- trainingPath = filepath.Join(tessdatadir, trainingPath)
+ // copy training path to the tessdir directory, so that we can keep that a
+ // writeable space, which is needed opening other trainings in sandboxes
+ // like flatpak
+ in, err := os.Open(trainingPath)
+ trainingPath = filepath.Join(tessdatadir, filepath.Base(trainingPath))
+ if err != nil {
+ in, err = os.Open(trainingPath)
+ if err != nil {
+ log.Fatalf("Error opening training file %s: %v", trainingPath, err)
+ }
}
-
- f, err := os.Open(trainingPath)
+ defer in.Close()
+ newPath := trainingPath + ".new"
+ out, err := os.Create(newPath)
if err != nil {
- fmt.Fprintf(os.Stderr, "Error: Training files %s or %s could not be opened.\n", *training, trainingPath)
- fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n")
- os.Exit(1)
+ log.Fatalf("Error creating training file %s: %v", newPath, err)
+ }
+ defer out.Close()
+ _, err = io.Copy(out, in)
+ if err != nil {
+ log.Fatalf("Error copying training file to %s: %v", newPath, err)
+ }
+ in.Close()
+ out.Close()
+ err = os.Rename(newPath, trainingPath)
+ if err != nil {
+ log.Fatalf("Error moving new training file to %s: %v", trainingPath, err)
}
- f.Close()
abstraining, err := filepath.Abs(trainingPath)
if err != nil {
@@ -237,13 +284,26 @@ These training files are included in rescribe, and are always available:
}
if flag.NArg() < 1 || *usegui {
- err := startGui(*verboselog, tessCommand, trainingName, *systess, tessdir)
+ err := startGui(verboselog, tessCommand, gbookCommand, trainingName, tessdir)
+ err = os.RemoveAll(tessdir)
+ if err != nil {
+ log.Printf("Error removing tesseract directory %s: %v", tessdir, err)
+ }
+
if err != nil {
log.Fatalln("Error in gui:", err)
}
return
}
+ f, err := os.Open(trainingPath)
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "Error: Training files %s or %s could not be opened.\n", *training, trainingPath)
+ fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n")
+ os.Exit(1)
+ }
+ f.Close()
+
bookdir := flag.Arg(0)
bookname := strings.ReplaceAll(filepath.Base(bookdir), " ", "_")
savedir := bookdir
@@ -258,27 +318,44 @@ These training files are included in rescribe, and are always available:
log.Fatalln("Error opening book file/dir:", err)
}
+ var ctx context.Context
+ ctx = context.Background()
+
+ // TODO: support google book downloading, as done with the GUI
+
// try opening as a PDF, and extracting
if !fi.IsDir() {
if flag.NArg() < 2 {
savedir = strings.TrimSuffix(bookdir, ".pdf")
}
- bookdir, err = extractPdfImgs(bookdir)
+ bookdir, err = extractPdfImgs(ctx, bookdir)
if err != nil {
log.Fatalln("Error opening file as PDF:", err)
}
+ // if this occurs then extractPdfImgs() will have recovered from
+ // a panic in the pdf package
+ if bookdir == "" {
+ log.Fatalln("Error opening file as PDF: image type not supported, you will need to extract images manually.")
+ }
bookname = strings.TrimSuffix(bookname, ".pdf")
ispdf = true
}
- err = startProcess(*verboselog, tessCommand, bookdir, bookname, trainingName, *systess, savedir, tessdir)
+ err = startProcess(ctx, verboselog, tessCommand, bookdir, bookname, trainingName, savedir, tessdir, !*wipe, *fullpdf)
if err != nil {
log.Fatalln(err)
}
+ if !*systess {
+ err = os.RemoveAll(tessdir)
+ if err != nil {
+ log.Printf("Error removing tesseract directory %s: %v", tessdir, err)
+ }
+ }
+
if ispdf {
os.RemoveAll(filepath.Clean(filepath.Join(bookdir, "..")))
}
@@ -286,7 +363,16 @@ These training files are included in rescribe, and are always available:
// extractPdfImgs extracts all images embedded in a PDF to a
// temporary directory, which is returned on success.
-func extractPdfImgs(path string) (string, error) {
+func extractPdfImgs(ctx context.Context, path string) (string, error) {
+ defer func() {
+ // unfortunately the pdf library will panic if it sees an encoding
+ // it can't decode, so recover from that and give a warning
+ r := recover()
+ if r != nil {
+ fmt.Fprintf(os.Stderr, "Warning: Error extracting from PDF: %v\n", r)
+ }
+ }()
+
p, err := pdf.Open(path)
if err != nil {
return "", err
@@ -305,9 +391,20 @@ func extractPdfImgs(path string) (string, error) {
}
for pgnum := 1; pgnum <= p.NumPage(); pgnum++ {
+ select {
+ case <-ctx.Done():
+ return "", ctx.Err()
+ default:
+ }
if p.Page(pgnum).V.IsNull() {
continue
}
+ var rotate int64
+ for v := p.Page(pgnum).V; !v.IsNull(); v = v.Key("Parent") {
+ if r := v.Key("Rotate"); !r.IsNull() {
+ rotate = r.Int64()
+ }
+ }
res := p.Page(pgnum).Resources()
if res.Kind() != pdf.Dict {
continue
@@ -323,7 +420,7 @@ func extractPdfImgs(path string) (string, error) {
continue
}
- fn := fmt.Sprintf("%s-%04d.jpg", k, pgnum)
+ fn := fmt.Sprintf("%04d-%s.jpg", pgnum, k)
path := filepath.Join(tempdir, fn)
w, err := os.Create(path)
defer w.Close()
@@ -343,16 +440,31 @@ func extractPdfImgs(path string) (string, error) {
if err != nil {
return tempdir, fmt.Errorf("Error removing extracted image %s from PDF: %v\n", fn, err)
}
+
+ if rotate != 0 {
+ err = rotateImage(path, rotate)
+ if err != nil {
+ return tempdir, fmt.Errorf("Error rotating extracted image %s from PDF: %v\n", fn, err)
+ }
+ }
}
}
// TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case
+ select {
+ case <-ctx.Done():
+ return "", ctx.Err()
+ default:
+ }
+
return tempdir, nil
}
// rmIfNotImage attempts to decode a given file as an image. If it is
// decode-able as PNG, then rename file extension from .jpg to .png,
-// if it fails to be read as PNG or JPEG it will be deleted.
+// if it is decode-able as TIFF then convert to PNG and rename file
+// extension appropriately, if it fails to be read as PNG, TIFF or
+// JPEG it will just be deleted.
func rmIfNotImage(f string) error {
r, err := os.Open(f)
defer r.Close()
@@ -363,9 +475,9 @@ func rmIfNotImage(f string) error {
r.Close()
if err == nil {
b := strings.TrimSuffix(f, ".jpg")
- err = os.Rename(f, b + ".png")
+ err = os.Rename(f, b+".png")
if err != nil {
- return fmt.Errorf("Error renaming %s to %s: %v", f, b + ".png", err)
+ return fmt.Errorf("Error renaming %s to %s: %v", f, b+".png", err)
}
return nil
}
@@ -376,19 +488,134 @@ func rmIfNotImage(f string) error {
return fmt.Errorf("Failed to open image %s: %v\n", f, err)
}
_, err = jpeg.Decode(r)
+ r.Close()
+ if err == nil {
+ return nil
+ }
+
+ r, err = os.Open(f)
+ defer r.Close()
if err != nil {
+ return fmt.Errorf("Failed to open image %s: %v\n", f, err)
+ }
+ t, err := tiff.Decode(r)
+ if err == nil {
+ b := strings.TrimSuffix(f, ".jpg")
+ n, err := os.Create(b + ".png")
+ defer n.Close()
+ if err != nil {
+ return fmt.Errorf("Failed to create file to store new png %s from tiff %s: %v\n", b+".png", f, err)
+ }
+ err = png.Encode(n, t)
+ if err != nil {
+ return fmt.Errorf("Failed to encode tiff as png for %s: %v\n", f, err)
+ }
r.Close()
err = os.Remove(f)
if err != nil {
- return fmt.Errorf("Failed to remove invalid image %s: %v", f, err)
+ return fmt.Errorf("Failed to remove original tiff %s: %v\n", f, err)
+ }
+ return nil
+ }
+
+ r.Close()
+ err = os.Remove(f)
+ if err != nil {
+ return fmt.Errorf("Failed to remove invalid image %s: %v", f, err)
+ }
+
+ return nil
+}
+
+// rotateImage rotates an image at the given path by the given angle
+func rotateImage(path string, angle int64) error {
+ switch angle {
+ case 90:
+ // proceed with the rest of the function
+ case 180, 270:
+ // rotate the image again first, as many times as necessary.
+ // this is inefficient but easy.
+ err := rotateImage(path, angle-90)
+ if err != nil {
+ return fmt.Errorf("error with a rotation run: %w", err)
}
+ default:
+ return fmt.Errorf("Rotation angle of %d is not supported", angle)
+ }
+
+ r, err := os.Open(path)
+ defer r.Close()
+ if err != nil {
+ return fmt.Errorf("Failed to open image: %w", err)
+ }
+ img, err := png.Decode(r)
+ if err != nil {
+ r.Close()
+ r, err = os.Open(path)
+ defer r.Close()
+ if err != nil {
+ return fmt.Errorf("Failed to open image: %w", err)
+ }
+ img, err = jpeg.Decode(r)
+ }
+ if err != nil {
+ r.Close()
+ r, err = os.Open(path)
+ defer r.Close()
+ if err != nil {
+ return fmt.Errorf("Failed to open image: %w", err)
+ }
+ img, err = tiff.Decode(r)
+ }
+ if err != nil {
+ return fmt.Errorf("Failed to decode image as png, jpeg or tiff: %w", err)
+ }
+
+ b := img.Bounds()
+
+ orig := image.NewRGBA(b)
+ draw.Draw(orig, b, img, b.Min, draw.Src)
+
+ newb := image.Rectangle{
+ Min: image.Point{X: 0, Y: 0},
+ Max: image.Point{X: b.Dy(), Y: b.Dx()},
+ }
+ new := image.NewRGBA(newb)
+
+ for x := b.Min.X; x < b.Max.X; x++ {
+ desty := newb.Min.Y + x
+ for y := b.Max.Y; y > b.Min.Y; y-- {
+ destx := b.Dy() - y + newb.Min.X
+ new.SetRGBA(destx, desty, orig.RGBAAt(x, y))
+ }
+ }
+
+ err = r.Close()
+ if err != nil {
+ return fmt.Errorf("Failed to close image: %w", err)
+ }
+ w, err := os.Create(path)
+ if err != nil {
+ return fmt.Errorf("Failed to create rotated image: %w", err)
+ }
+ defer w.Close()
+
+ if !strings.HasSuffix(path, ".jpg") {
+ err = jpeg.Encode(w, new, nil)
+ } else {
+ err = png.Encode(w, new)
+ }
+ if err != nil {
+ return fmt.Errorf("Failed to encode rotated image: %w", err)
}
return nil
}
-func startProcess(logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, systess bool, savedir string, tessdir string) error {
- _, err := exec.Command(tessCommand, "--help").Output()
+func startProcess(ctx context.Context, logger *log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, savedir string, tessdir string, nowipe bool, fullpdf bool) error {
+ cmd := exec.Command(tessCommand, "--help")
+ pipeline.HideCmd(cmd)
+ _, err := cmd.Output()
if err != nil {
errmsg := "Error, Can't run Tesseract\n"
errmsg += "Ensure that Tesseract is installed and available, or don't use the -systess flag.\n"
@@ -404,7 +631,7 @@ func startProcess(logger log.Logger, tessCommand string, bookdir string, booknam
}
var conn Pipeliner
- conn = &bookpipeline.LocalConn{Logger: &logger, TempDir: tempdir}
+ conn = &bookpipeline.LocalConn{Logger: logger, TempDir: tempdir}
conn.Log("Setting up session")
err = conn.Init()
@@ -415,14 +642,14 @@ func startProcess(logger log.Logger, tessCommand string, bookdir string, booknam
fmt.Printf("Copying book to pipeline\n")
- err = uploadbook(bookdir, bookname, conn)
+ err = uploadbook(ctx, bookdir, bookname, conn, nowipe)
if err != nil {
_ = os.RemoveAll(tempdir)
return fmt.Errorf("Error uploading book: %v", err)
}
fmt.Printf("Processing book\n")
- err = processbook(trainingName, tessCommand, conn)
+ err = processbook(ctx, trainingName, tessCommand, conn, fullpdf)
if err != nil {
_ = os.RemoveAll(tempdir)
return fmt.Errorf("Error processing book: %v", err)
@@ -444,18 +671,16 @@ func startProcess(logger log.Logger, tessCommand string, bookdir string, booknam
return fmt.Errorf("Error removing temporary directory %s: %v", tempdir, err)
}
- if !systess {
- err = os.RemoveAll(tessdir)
- if err != nil {
- return fmt.Errorf("Error removing tesseract directory %s: %v", tessdir, err)
- }
- }
-
hocrs, err := filepath.Glob(fmt.Sprintf("%s%s*.hocr", savedir, string(filepath.Separator)))
if err != nil {
return fmt.Errorf("Error looking for .hocr files: %v", err)
}
+ err = addFullTxt(hocrs, bookname)
+ if err != nil {
+ log.Fatalf("Error creating full txt version: %v", err)
+ }
+
for _, v := range hocrs {
err = addTxtVersion(v)
if err != nil {
@@ -471,11 +696,46 @@ func startProcess(logger log.Logger, tessCommand string, bookdir string, booknam
if err != nil {
log.Fatalf("Error moving hocr %s to hocr directory: %v", v, err)
}
+
+ pngname := strings.Replace(v, ".hocr", ".png", 1)
+ err = os.MkdirAll(filepath.Join(savedir, "png"), 0755)
+ if err != nil {
+ log.Fatalf("Error creating png directory: %v", err)
+ }
+
+ err = os.Rename(pngname, filepath.Join(savedir, "png", filepath.Base(pngname)))
+ if err != nil {
+ log.Fatalf("Error moving png %s to png directory: %v", pngname, err)
+ }
+
}
// For simplicity, remove .binarised.pdf and rename .colour.pdf to .pdf
- _ = os.Remove(filepath.Join(savedir, bookname+".binarised.pdf"))
- _ = os.Rename(filepath.Join(savedir, bookname+".colour.pdf"), filepath.Join(savedir, bookname+".pdf"))
+ // providing they both exist, otherwise just rename whichever exists
+ // to .pdf.
+ binpath := filepath.Join(savedir, bookname+".binarised.pdf")
+ colourpath := filepath.Join(savedir, bookname+".colour.pdf")
+ fullsizepath := filepath.Join(savedir, bookname+".original.pdf")
+ pdfpath := filepath.Join(savedir, bookname+" searchable.pdf")
+
+ // If full size pdf is requested, replace colour.pdf with it
+ if fullpdf {
+ _ = os.Rename(fullsizepath, colourpath)
+ }
+
+ _, err = os.Stat(binpath)
+ binexists := err == nil || os.IsExist(err)
+ _, err = os.Stat(colourpath)
+ colourexists := err == nil || os.IsExist(err)
+
+ if binexists && colourexists {
+ _ = os.Remove(binpath)
+ _ = os.Rename(colourpath, pdfpath)
+ } else if binexists {
+ _ = os.Rename(binpath, pdfpath)
+ } else if colourexists {
+ _ = os.Rename(colourpath, pdfpath)
+ }
return nil
}
@@ -506,21 +766,48 @@ func addTxtVersion(hocrfn string) error {
return nil
}
-func uploadbook(dir string, name string, conn Pipeliner) error {
+func addFullTxt(hocrs []string, bookname string) error {
+ if len(hocrs) == 0 {
+ return nil
+ }
+ var full string
+ for i, v := range hocrs {
+ t, err := hocr.GetText(v)
+ if err != nil {
+ return fmt.Errorf("Error getting text from hocr file %s: %v", v, err)
+ }
+ if i > 0 {
+ full += "\n"
+ }
+ full += t
+ }
+
+ dir := filepath.Dir(hocrs[0])
+ fn := filepath.Join(dir, bookname+".txt")
+ err := ioutil.WriteFile(fn, []byte(full), 0644)
+ if err != nil {
+ return fmt.Errorf("Error creating text file %s: %v", fn, err)
+ }
+
+ return nil
+}
+
+func uploadbook(ctx context.Context, dir string, name string, conn Pipeliner, nowipe bool) error {
_, err := os.Stat(dir)
if err != nil && !os.IsExist(err) {
return fmt.Errorf("Error: directory %s not found", dir)
}
- err = pipeline.CheckImages(dir)
+ err = pipeline.CheckImages(ctx, dir)
if err != nil {
return fmt.Errorf("Error with images in %s: %v", dir, err)
}
- err = pipeline.UploadImages(dir, name, conn)
+ err = pipeline.UploadImages(ctx, dir, name, conn)
if err != nil {
return fmt.Errorf("Error saving images to process from %s: %v", dir, err)
}
- qid := pipeline.DetectQueueType(dir, conn)
+ qid := pipeline.DetectQueueType(dir, conn, nowipe)
+ fmt.Printf("Uploading to queue %s\n", qid)
err = conn.AddToQueue(qid, name)
if err != nil {
@@ -531,9 +818,14 @@ func uploadbook(dir string, name string, conn Pipeliner) error {
}
func downloadbook(dir string, name string, conn Pipeliner) error {
- err := pipeline.DownloadBestPages(dir, name, conn, false)
+ err := pipeline.DownloadBestPages(dir, name, conn)
if err != nil {
- return fmt.Errorf("Error downloading best pages: %v", err)
+ return fmt.Errorf("No images found")
+ }
+
+ err = pipeline.DownloadBestPngs(dir, name, conn)
+ if err != nil {
+ return fmt.Errorf("No images found")
}
err = pipeline.DownloadPdfs(dir, name, conn)
@@ -549,17 +841,19 @@ func downloadbook(dir string, name string, conn Pipeliner) error {
return nil
}
-func processbook(training string, tesscmd string, conn Pipeliner) error {
+func processbook(ctx context.Context, training string, tesscmd string, conn Pipeliner, fullpdf bool) error {
origPattern := regexp.MustCompile(`[0-9]{4}.(jpg|png)$`)
wipePattern := regexp.MustCompile(`[0-9]{4,6}(.bin)?.(jpg|png)$`)
ocredPattern := regexp.MustCompile(`.hocr$`)
var checkPreQueue <-chan time.Time
+ var checkPreNoWipeQueue <-chan time.Time
var checkWipeQueue <-chan time.Time
var checkOCRPageQueue <-chan time.Time
var checkAnalyseQueue <-chan time.Time
var stopIfQuiet *time.Timer
checkPreQueue = time.After(0)
+ checkPreNoWipeQueue = time.After(0)
checkWipeQueue = time.After(0)
checkOCRPageQueue = time.After(0)
checkAnalyseQueue = time.After(0)
@@ -571,6 +865,27 @@ func processbook(training string, tesscmd string, conn Pipeliner) error {
for {
select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case <-checkPreNoWipeQueue:
+ msg, err := conn.CheckQueue(conn.PreNoWipeQueueId(), QueueTimeoutSecs)
+ checkPreNoWipeQueue = time.After(PauseBetweenChecks)
+ if err != nil {
+ return fmt.Errorf("Error checking preprocess no wipe queue: %v", err)
+ }
+ if msg.Handle == "" {
+ conn.Log("No message received on preprocess no wipe queue, sleeping")
+ continue
+ }
+ stopTimer(stopIfQuiet)
+ conn.Log("Message received on preprocess no wipe queue, processing", msg.Body)
+ fmt.Printf(" Preprocessing book (binarising only, no wiping)\n")
+ err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess(thresholds, true), origPattern, conn.PreNoWipeQueueId(), conn.OCRPageQueueId())
+ resetTimer(stopIfQuiet, quietTime)
+ if err != nil {
+ return fmt.Errorf("Error during preprocess (no wipe): %v", err)
+ }
+ fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output
case <-checkPreQueue:
msg, err := conn.CheckQueue(conn.PreQueueId(), QueueTimeoutSecs)
checkPreQueue = time.After(PauseBetweenChecks)
@@ -584,12 +899,12 @@ func processbook(training string, tesscmd string, conn Pipeliner) error {
stopTimer(stopIfQuiet)
conn.Log("Message received on preprocess queue, processing", msg.Body)
fmt.Printf(" Preprocessing book (binarising and wiping)\n")
- err = pipeline.ProcessBook(msg, conn, pipeline.Preprocess(thresholds), origPattern, conn.PreQueueId(), conn.OCRPageQueueId())
- fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output
+ err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess(thresholds, false), origPattern, conn.PreQueueId(), conn.OCRPageQueueId())
resetTimer(stopIfQuiet, quietTime)
if err != nil {
return fmt.Errorf("Error during preprocess: %v", err)
}
+ fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output
case <-checkWipeQueue:
msg, err := conn.CheckQueue(conn.WipeQueueId(), QueueTimeoutSecs)
checkWipeQueue = time.After(PauseBetweenChecks)
@@ -603,12 +918,12 @@ func processbook(training string, tesscmd string, conn Pipeliner) error {
stopTimer(stopIfQuiet)
conn.Log("Message received on wipeonly queue, processing", msg.Body)
fmt.Printf(" Preprocessing book (wiping only)\n")
- err = pipeline.ProcessBook(msg, conn, pipeline.Wipe, wipePattern, conn.WipeQueueId(), conn.OCRPageQueueId())
- fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output
+ err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Wipe, wipePattern, conn.WipeQueueId(), conn.OCRPageQueueId())
resetTimer(stopIfQuiet, quietTime)
if err != nil {
return fmt.Errorf("Error during wipe: %v", err)
}
+ fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output
case <-checkOCRPageQueue:
msg, err := conn.CheckQueue(conn.OCRPageQueueId(), QueueTimeoutSecs)
checkOCRPageQueue = time.After(PauseBetweenChecks)
@@ -624,7 +939,7 @@ func processbook(training string, tesscmd string, conn Pipeliner) error {
stopTimer(stopIfQuiet)
conn.Log("Message received on OCR Page queue, processing", msg.Body)
fmt.Printf(".")
- err = pipeline.OcrPage(msg, conn, pipeline.Ocr(training, tesscmd), conn.OCRPageQueueId(), conn.AnalyseQueueId())
+ err = pipeline.OcrPage(ctx, msg, conn, pipeline.Ocr(training, tesscmd), conn.OCRPageQueueId(), conn.AnalyseQueueId())
resetTimer(stopIfQuiet, quietTime)
if err != nil {
return fmt.Errorf("\nError during OCR Page process: %v", err)
@@ -642,7 +957,7 @@ func processbook(training string, tesscmd string, conn Pipeliner) error {
stopTimer(stopIfQuiet)
conn.Log("Message received on analyse queue, processing", msg.Body)
fmt.Printf("\n Analysing OCR and compiling PDFs\n")
- err = pipeline.ProcessBook(msg, conn, pipeline.Analyse(conn), ocredPattern, conn.AnalyseQueueId(), "")
+ err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Analyse(conn, fullpdf), ocredPattern, conn.AnalyseQueueId(), "")
resetTimer(stopIfQuiet, quietTime)
if err != nil {
return fmt.Errorf("Error during analysis: %v", err)