diff options
Diffstat (limited to 'cmd/rescribe/main.go')
-rw-r--r-- | cmd/rescribe/main.go | 443 |
1 files changed, 379 insertions, 64 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 59d8166..96f2853 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -1,4 +1,4 @@ -// Copyright 2021 Nick White. +// Copyright 2021-2022 Nick White. // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. @@ -12,9 +12,11 @@ package main import ( "archive/zip" "bytes" - _ "embed" + "context" "flag" "fmt" + "image" + "image/draw" "image/jpeg" "image/png" "io" @@ -28,13 +30,14 @@ import ( "strings" "time" + "golang.org/x/image/tiff" "rescribe.xyz/bookpipeline" "rescribe.xyz/bookpipeline/internal/pipeline" "rescribe.xyz/pdf" "rescribe.xyz/utils/pkg/hocr" ) -const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir] +const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd cmd] [-gbookcmd cmd] [-t training] bookdir/book.pdf [savedir] Process and OCR a book using the Rescribe pipeline on a local machine. @@ -42,9 +45,6 @@ OCR results are saved into the bookdir directory unless savedir is specified. ` -//go:embed tessdata.20211001.zip -var tessdatazip []byte - const QueueTimeoutSecs = 2 * 60 const PauseBetweenChecks = 1 * time.Second const LogSaveTime = 1 * time.Minute @@ -73,6 +73,7 @@ type Clouder interface { type Pipeliner interface { Clouder PreQueueId() string + PreNoWipeQueueId() string WipeQueueId() string OCRPageQueueId() string AnalyseQueueId() string @@ -93,7 +94,7 @@ func resetTimer(t *time.Timer, d time.Duration) { } } -// unpackTessZip unpacks a byte array of a zip file into a directory +// unpackZip unpacks a byte array of a zip file into a directory func unpackZip(b []byte, dir string) error { br := bytes.NewReader(b) zr, err := zip.NewReader(br, br.Size()) @@ -138,22 +139,25 @@ func unpackZip(b []byte, dir string) error { func main() { deftesscmd := "tesseract" + defgbookcmd := "getgbook" if runtime.GOOS == "windows" { deftesscmd = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe" + defgbookcmd = "getgbook.exe" } verbose := flag.Bool("v", false, "verbose") usegui := flag.Bool("gui", false, "Use graphical user interface") systess := flag.Bool("systess", false, "Use the system installed Tesseract, rather than the copy embedded in rescribe.") - training := flag.String("t", "rescribev8_fast.traineddata", `Path to the tesseract training file to use. + training := flag.String("t", "rescribev9_fast.traineddata", `Path to the tesseract training file to use. These training files are included in rescribe, and are always available: -- carolinemsv1_fast.traineddata (Caroline Miniscule) -- eng.traineddata (Modern English) -- lat.traineddata (Latin modern printing) -- rescribefrav2_fast.traineddata (French historic printing) -- rescribev8_fast.traineddata (Latin historic printing) +- eng.traineddata (English, modern print) +- lat.traineddata (Latin, modern print) +- rescribev9_fast.traineddata (Latin/English/French, printed ca 1500-1800) `) + gbookcmd := flag.String("gbookcmd", defgbookcmd, "The getgbook executable to run. You may need to set this to the full path of getgbook.exe if you're on Windows.") tesscmd := flag.String("tesscmd", deftesscmd, "The Tesseract executable to run. You may need to set this to the full path of Tesseract.exe if you're on Windows.") + wipe := flag.Bool("wipe", false, "Use wiper tool to remove noise like gutters from page before processing.") + fullpdf := flag.Bool("fullpdf", false, "Use highest image quality for searchable PDF (requires lots of RAM).") flag.Usage = func() { fmt.Fprintf(flag.CommandLine.Output(), usage) @@ -185,7 +189,7 @@ These training files are included in rescribe, and are always available: log.Fatalln("Error setting up tesseract directory:", err) } - if !*systess { + if !*systess && len(tesszip) > 0 { err = unpackZip(tesszip, tessdir) if err != nil { log.Fatalln("Error unpacking embedded Tesseract zip:", err) @@ -200,30 +204,73 @@ These training files are included in rescribe, and are always available: } } + _, err = exec.LookPath(tessCommand) + if err != nil { + log.Fatalf("No tesseract executable found [tried %s], either set -tesscmd and -systess on the command line or use the official build which includes an embedded copy of Tesseract.", tessCommand) + } + + gbookCommand := *gbookcmd + if len(gbookzip) > 0 { + err = unpackZip(gbookzip, tessdir) + if err != nil { + log.Fatalln("Error unpacking embedded getgbook zip:", err) + } + switch runtime.GOOS { + case "darwin": + gbookCommand = filepath.Join(tessdir, "getgbook") + case "linux": + gbookCommand = filepath.Join(tessdir, "getgbook") + case "windows": + gbookCommand = filepath.Join(tessdir, "getgbook.exe") + } + } + + _, err = exec.LookPath(gbookCommand) + if err != nil { + log.Printf("No getgbook found [tried %s], google book downloading will be disabled, either set -gbookcmd on the command line or use the official build which includes an embedded getgbook.", gbookCommand) + gbookCommand = "" + } + tessdatadir := filepath.Join(tessdir, "tessdata") err = os.MkdirAll(tessdatadir, 0755) if err != nil { log.Fatalln("Error setting up tessdata directory:", err) } - err = unpackZip(tessdatazip, tessdatadir) - if err != nil { - log.Fatalln("Error unpacking embedded tessdata zip:", err) + if len(tessdatazip) > 0 { + err = unpackZip(tessdatazip, tessdatadir) + if err != nil { + log.Fatalln("Error unpacking embedded tessdata zip:", err) + } } - // if trainingPath doesn't exist, set it to the embedded training instead - _, err = os.Stat(trainingPath) - if err != nil && !os.IsExist(err) { - trainingPath = filepath.Base(trainingPath) - trainingPath = filepath.Join(tessdatadir, trainingPath) + // copy training path to the tessdir directory, so that we can keep that a + // writeable space, which is needed opening other trainings in sandboxes + // like flatpak + in, err := os.Open(trainingPath) + trainingPath = filepath.Join(tessdatadir, filepath.Base(trainingPath)) + if err != nil { + in, err = os.Open(trainingPath) + if err != nil { + log.Fatalf("Error opening training file %s: %v", trainingPath, err) + } } - - f, err := os.Open(trainingPath) + defer in.Close() + newPath := trainingPath + ".new" + out, err := os.Create(newPath) if err != nil { - fmt.Fprintf(os.Stderr, "Error: Training files %s or %s could not be opened.\n", *training, trainingPath) - fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n") - os.Exit(1) + log.Fatalf("Error creating training file %s: %v", newPath, err) + } + defer out.Close() + _, err = io.Copy(out, in) + if err != nil { + log.Fatalf("Error copying training file to %s: %v", newPath, err) + } + in.Close() + out.Close() + err = os.Rename(newPath, trainingPath) + if err != nil { + log.Fatalf("Error moving new training file to %s: %v", trainingPath, err) } - f.Close() abstraining, err := filepath.Abs(trainingPath) if err != nil { @@ -237,13 +284,26 @@ These training files are included in rescribe, and are always available: } if flag.NArg() < 1 || *usegui { - err := startGui(*verboselog, tessCommand, trainingName, *systess, tessdir) + err := startGui(verboselog, tessCommand, gbookCommand, trainingName, tessdir) + err = os.RemoveAll(tessdir) + if err != nil { + log.Printf("Error removing tesseract directory %s: %v", tessdir, err) + } + if err != nil { log.Fatalln("Error in gui:", err) } return } + f, err := os.Open(trainingPath) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: Training files %s or %s could not be opened.\n", *training, trainingPath) + fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n") + os.Exit(1) + } + f.Close() + bookdir := flag.Arg(0) bookname := strings.ReplaceAll(filepath.Base(bookdir), " ", "_") savedir := bookdir @@ -258,27 +318,44 @@ These training files are included in rescribe, and are always available: log.Fatalln("Error opening book file/dir:", err) } + var ctx context.Context + ctx = context.Background() + + // TODO: support google book downloading, as done with the GUI + // try opening as a PDF, and extracting if !fi.IsDir() { if flag.NArg() < 2 { savedir = strings.TrimSuffix(bookdir, ".pdf") } - bookdir, err = extractPdfImgs(bookdir) + bookdir, err = extractPdfImgs(ctx, bookdir) if err != nil { log.Fatalln("Error opening file as PDF:", err) } + // if this occurs then extractPdfImgs() will have recovered from + // a panic in the pdf package + if bookdir == "" { + log.Fatalln("Error opening file as PDF: image type not supported, you will need to extract images manually.") + } bookname = strings.TrimSuffix(bookname, ".pdf") ispdf = true } - err = startProcess(*verboselog, tessCommand, bookdir, bookname, trainingName, *systess, savedir, tessdir) + err = startProcess(ctx, verboselog, tessCommand, bookdir, bookname, trainingName, savedir, tessdir, !*wipe, *fullpdf) if err != nil { log.Fatalln(err) } + if !*systess { + err = os.RemoveAll(tessdir) + if err != nil { + log.Printf("Error removing tesseract directory %s: %v", tessdir, err) + } + } + if ispdf { os.RemoveAll(filepath.Clean(filepath.Join(bookdir, ".."))) } @@ -286,7 +363,16 @@ These training files are included in rescribe, and are always available: // extractPdfImgs extracts all images embedded in a PDF to a // temporary directory, which is returned on success. -func extractPdfImgs(path string) (string, error) { +func extractPdfImgs(ctx context.Context, path string) (string, error) { + defer func() { + // unfortunately the pdf library will panic if it sees an encoding + // it can't decode, so recover from that and give a warning + r := recover() + if r != nil { + fmt.Fprintf(os.Stderr, "Warning: Error extracting from PDF: %v\n", r) + } + }() + p, err := pdf.Open(path) if err != nil { return "", err @@ -305,9 +391,20 @@ func extractPdfImgs(path string) (string, error) { } for pgnum := 1; pgnum <= p.NumPage(); pgnum++ { + select { + case <-ctx.Done(): + return "", ctx.Err() + default: + } if p.Page(pgnum).V.IsNull() { continue } + var rotate int64 + for v := p.Page(pgnum).V; !v.IsNull(); v = v.Key("Parent") { + if r := v.Key("Rotate"); !r.IsNull() { + rotate = r.Int64() + } + } res := p.Page(pgnum).Resources() if res.Kind() != pdf.Dict { continue @@ -323,7 +420,7 @@ func extractPdfImgs(path string) (string, error) { continue } - fn := fmt.Sprintf("%s-%04d.jpg", k, pgnum) + fn := fmt.Sprintf("%04d-%s.jpg", pgnum, k) path := filepath.Join(tempdir, fn) w, err := os.Create(path) defer w.Close() @@ -343,16 +440,31 @@ func extractPdfImgs(path string) (string, error) { if err != nil { return tempdir, fmt.Errorf("Error removing extracted image %s from PDF: %v\n", fn, err) } + + if rotate != 0 { + err = rotateImage(path, rotate) + if err != nil { + return tempdir, fmt.Errorf("Error rotating extracted image %s from PDF: %v\n", fn, err) + } + } } } // TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case + select { + case <-ctx.Done(): + return "", ctx.Err() + default: + } + return tempdir, nil } // rmIfNotImage attempts to decode a given file as an image. If it is // decode-able as PNG, then rename file extension from .jpg to .png, -// if it fails to be read as PNG or JPEG it will be deleted. +// if it is decode-able as TIFF then convert to PNG and rename file +// extension appropriately, if it fails to be read as PNG, TIFF or +// JPEG it will just be deleted. func rmIfNotImage(f string) error { r, err := os.Open(f) defer r.Close() @@ -363,9 +475,9 @@ func rmIfNotImage(f string) error { r.Close() if err == nil { b := strings.TrimSuffix(f, ".jpg") - err = os.Rename(f, b + ".png") + err = os.Rename(f, b+".png") if err != nil { - return fmt.Errorf("Error renaming %s to %s: %v", f, b + ".png", err) + return fmt.Errorf("Error renaming %s to %s: %v", f, b+".png", err) } return nil } @@ -376,19 +488,134 @@ func rmIfNotImage(f string) error { return fmt.Errorf("Failed to open image %s: %v\n", f, err) } _, err = jpeg.Decode(r) + r.Close() + if err == nil { + return nil + } + + r, err = os.Open(f) + defer r.Close() if err != nil { + return fmt.Errorf("Failed to open image %s: %v\n", f, err) + } + t, err := tiff.Decode(r) + if err == nil { + b := strings.TrimSuffix(f, ".jpg") + n, err := os.Create(b + ".png") + defer n.Close() + if err != nil { + return fmt.Errorf("Failed to create file to store new png %s from tiff %s: %v\n", b+".png", f, err) + } + err = png.Encode(n, t) + if err != nil { + return fmt.Errorf("Failed to encode tiff as png for %s: %v\n", f, err) + } r.Close() err = os.Remove(f) if err != nil { - return fmt.Errorf("Failed to remove invalid image %s: %v", f, err) + return fmt.Errorf("Failed to remove original tiff %s: %v\n", f, err) + } + return nil + } + + r.Close() + err = os.Remove(f) + if err != nil { + return fmt.Errorf("Failed to remove invalid image %s: %v", f, err) + } + + return nil +} + +// rotateImage rotates an image at the given path by the given angle +func rotateImage(path string, angle int64) error { + switch angle { + case 90: + // proceed with the rest of the function + case 180, 270: + // rotate the image again first, as many times as necessary. + // this is inefficient but easy. + err := rotateImage(path, angle-90) + if err != nil { + return fmt.Errorf("error with a rotation run: %w", err) } + default: + return fmt.Errorf("Rotation angle of %d is not supported", angle) + } + + r, err := os.Open(path) + defer r.Close() + if err != nil { + return fmt.Errorf("Failed to open image: %w", err) + } + img, err := png.Decode(r) + if err != nil { + r.Close() + r, err = os.Open(path) + defer r.Close() + if err != nil { + return fmt.Errorf("Failed to open image: %w", err) + } + img, err = jpeg.Decode(r) + } + if err != nil { + r.Close() + r, err = os.Open(path) + defer r.Close() + if err != nil { + return fmt.Errorf("Failed to open image: %w", err) + } + img, err = tiff.Decode(r) + } + if err != nil { + return fmt.Errorf("Failed to decode image as png, jpeg or tiff: %w", err) + } + + b := img.Bounds() + + orig := image.NewRGBA(b) + draw.Draw(orig, b, img, b.Min, draw.Src) + + newb := image.Rectangle{ + Min: image.Point{X: 0, Y: 0}, + Max: image.Point{X: b.Dy(), Y: b.Dx()}, + } + new := image.NewRGBA(newb) + + for x := b.Min.X; x < b.Max.X; x++ { + desty := newb.Min.Y + x + for y := b.Max.Y; y > b.Min.Y; y-- { + destx := b.Dy() - y + newb.Min.X + new.SetRGBA(destx, desty, orig.RGBAAt(x, y)) + } + } + + err = r.Close() + if err != nil { + return fmt.Errorf("Failed to close image: %w", err) + } + w, err := os.Create(path) + if err != nil { + return fmt.Errorf("Failed to create rotated image: %w", err) + } + defer w.Close() + + if !strings.HasSuffix(path, ".jpg") { + err = jpeg.Encode(w, new, nil) + } else { + err = png.Encode(w, new) + } + if err != nil { + return fmt.Errorf("Failed to encode rotated image: %w", err) } return nil } -func startProcess(logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, systess bool, savedir string, tessdir string) error { - _, err := exec.Command(tessCommand, "--help").Output() +func startProcess(ctx context.Context, logger *log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, savedir string, tessdir string, nowipe bool, fullpdf bool) error { + cmd := exec.Command(tessCommand, "--help") + pipeline.HideCmd(cmd) + _, err := cmd.Output() if err != nil { errmsg := "Error, Can't run Tesseract\n" errmsg += "Ensure that Tesseract is installed and available, or don't use the -systess flag.\n" @@ -404,7 +631,7 @@ func startProcess(logger log.Logger, tessCommand string, bookdir string, booknam } var conn Pipeliner - conn = &bookpipeline.LocalConn{Logger: &logger, TempDir: tempdir} + conn = &bookpipeline.LocalConn{Logger: logger, TempDir: tempdir} conn.Log("Setting up session") err = conn.Init() @@ -415,14 +642,14 @@ func startProcess(logger log.Logger, tessCommand string, bookdir string, booknam fmt.Printf("Copying book to pipeline\n") - err = uploadbook(bookdir, bookname, conn) + err = uploadbook(ctx, bookdir, bookname, conn, nowipe) if err != nil { _ = os.RemoveAll(tempdir) return fmt.Errorf("Error uploading book: %v", err) } fmt.Printf("Processing book\n") - err = processbook(trainingName, tessCommand, conn) + err = processbook(ctx, trainingName, tessCommand, conn, fullpdf) if err != nil { _ = os.RemoveAll(tempdir) return fmt.Errorf("Error processing book: %v", err) @@ -444,18 +671,16 @@ func startProcess(logger log.Logger, tessCommand string, bookdir string, booknam return fmt.Errorf("Error removing temporary directory %s: %v", tempdir, err) } - if !systess { - err = os.RemoveAll(tessdir) - if err != nil { - return fmt.Errorf("Error removing tesseract directory %s: %v", tessdir, err) - } - } - hocrs, err := filepath.Glob(fmt.Sprintf("%s%s*.hocr", savedir, string(filepath.Separator))) if err != nil { return fmt.Errorf("Error looking for .hocr files: %v", err) } + err = addFullTxt(hocrs, bookname) + if err != nil { + log.Fatalf("Error creating full txt version: %v", err) + } + for _, v := range hocrs { err = addTxtVersion(v) if err != nil { @@ -471,11 +696,46 @@ func startProcess(logger log.Logger, tessCommand string, bookdir string, booknam if err != nil { log.Fatalf("Error moving hocr %s to hocr directory: %v", v, err) } + + pngname := strings.Replace(v, ".hocr", ".png", 1) + err = os.MkdirAll(filepath.Join(savedir, "png"), 0755) + if err != nil { + log.Fatalf("Error creating png directory: %v", err) + } + + err = os.Rename(pngname, filepath.Join(savedir, "png", filepath.Base(pngname))) + if err != nil { + log.Fatalf("Error moving png %s to png directory: %v", pngname, err) + } + } // For simplicity, remove .binarised.pdf and rename .colour.pdf to .pdf - _ = os.Remove(filepath.Join(savedir, bookname+".binarised.pdf")) - _ = os.Rename(filepath.Join(savedir, bookname+".colour.pdf"), filepath.Join(savedir, bookname+".pdf")) + // providing they both exist, otherwise just rename whichever exists + // to .pdf. + binpath := filepath.Join(savedir, bookname+".binarised.pdf") + colourpath := filepath.Join(savedir, bookname+".colour.pdf") + fullsizepath := filepath.Join(savedir, bookname+".original.pdf") + pdfpath := filepath.Join(savedir, bookname+" searchable.pdf") + + // If full size pdf is requested, replace colour.pdf with it + if fullpdf { + _ = os.Rename(fullsizepath, colourpath) + } + + _, err = os.Stat(binpath) + binexists := err == nil || os.IsExist(err) + _, err = os.Stat(colourpath) + colourexists := err == nil || os.IsExist(err) + + if binexists && colourexists { + _ = os.Remove(binpath) + _ = os.Rename(colourpath, pdfpath) + } else if binexists { + _ = os.Rename(binpath, pdfpath) + } else if colourexists { + _ = os.Rename(colourpath, pdfpath) + } return nil } @@ -506,21 +766,48 @@ func addTxtVersion(hocrfn string) error { return nil } -func uploadbook(dir string, name string, conn Pipeliner) error { +func addFullTxt(hocrs []string, bookname string) error { + if len(hocrs) == 0 { + return nil + } + var full string + for i, v := range hocrs { + t, err := hocr.GetText(v) + if err != nil { + return fmt.Errorf("Error getting text from hocr file %s: %v", v, err) + } + if i > 0 { + full += "\n" + } + full += t + } + + dir := filepath.Dir(hocrs[0]) + fn := filepath.Join(dir, bookname+".txt") + err := ioutil.WriteFile(fn, []byte(full), 0644) + if err != nil { + return fmt.Errorf("Error creating text file %s: %v", fn, err) + } + + return nil +} + +func uploadbook(ctx context.Context, dir string, name string, conn Pipeliner, nowipe bool) error { _, err := os.Stat(dir) if err != nil && !os.IsExist(err) { return fmt.Errorf("Error: directory %s not found", dir) } - err = pipeline.CheckImages(dir) + err = pipeline.CheckImages(ctx, dir) if err != nil { return fmt.Errorf("Error with images in %s: %v", dir, err) } - err = pipeline.UploadImages(dir, name, conn) + err = pipeline.UploadImages(ctx, dir, name, conn) if err != nil { return fmt.Errorf("Error saving images to process from %s: %v", dir, err) } - qid := pipeline.DetectQueueType(dir, conn) + qid := pipeline.DetectQueueType(dir, conn, nowipe) + fmt.Printf("Uploading to queue %s\n", qid) err = conn.AddToQueue(qid, name) if err != nil { @@ -531,9 +818,14 @@ func uploadbook(dir string, name string, conn Pipeliner) error { } func downloadbook(dir string, name string, conn Pipeliner) error { - err := pipeline.DownloadBestPages(dir, name, conn, false) + err := pipeline.DownloadBestPages(dir, name, conn) if err != nil { - return fmt.Errorf("Error downloading best pages: %v", err) + return fmt.Errorf("No images found") + } + + err = pipeline.DownloadBestPngs(dir, name, conn) + if err != nil { + return fmt.Errorf("No images found") } err = pipeline.DownloadPdfs(dir, name, conn) @@ -549,17 +841,19 @@ func downloadbook(dir string, name string, conn Pipeliner) error { return nil } -func processbook(training string, tesscmd string, conn Pipeliner) error { +func processbook(ctx context.Context, training string, tesscmd string, conn Pipeliner, fullpdf bool) error { origPattern := regexp.MustCompile(`[0-9]{4}.(jpg|png)$`) wipePattern := regexp.MustCompile(`[0-9]{4,6}(.bin)?.(jpg|png)$`) ocredPattern := regexp.MustCompile(`.hocr$`) var checkPreQueue <-chan time.Time + var checkPreNoWipeQueue <-chan time.Time var checkWipeQueue <-chan time.Time var checkOCRPageQueue <-chan time.Time var checkAnalyseQueue <-chan time.Time var stopIfQuiet *time.Timer checkPreQueue = time.After(0) + checkPreNoWipeQueue = time.After(0) checkWipeQueue = time.After(0) checkOCRPageQueue = time.After(0) checkAnalyseQueue = time.After(0) @@ -571,6 +865,27 @@ func processbook(training string, tesscmd string, conn Pipeliner) error { for { select { + case <-ctx.Done(): + return ctx.Err() + case <-checkPreNoWipeQueue: + msg, err := conn.CheckQueue(conn.PreNoWipeQueueId(), QueueTimeoutSecs) + checkPreNoWipeQueue = time.After(PauseBetweenChecks) + if err != nil { + return fmt.Errorf("Error checking preprocess no wipe queue: %v", err) + } + if msg.Handle == "" { + conn.Log("No message received on preprocess no wipe queue, sleeping") + continue + } + stopTimer(stopIfQuiet) + conn.Log("Message received on preprocess no wipe queue, processing", msg.Body) + fmt.Printf(" Preprocessing book (binarising only, no wiping)\n") + err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess(thresholds, true), origPattern, conn.PreNoWipeQueueId(), conn.OCRPageQueueId()) + resetTimer(stopIfQuiet, quietTime) + if err != nil { + return fmt.Errorf("Error during preprocess (no wipe): %v", err) + } + fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output case <-checkPreQueue: msg, err := conn.CheckQueue(conn.PreQueueId(), QueueTimeoutSecs) checkPreQueue = time.After(PauseBetweenChecks) @@ -584,12 +899,12 @@ func processbook(training string, tesscmd string, conn Pipeliner) error { stopTimer(stopIfQuiet) conn.Log("Message received on preprocess queue, processing", msg.Body) fmt.Printf(" Preprocessing book (binarising and wiping)\n") - err = pipeline.ProcessBook(msg, conn, pipeline.Preprocess(thresholds), origPattern, conn.PreQueueId(), conn.OCRPageQueueId()) - fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output + err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess(thresholds, false), origPattern, conn.PreQueueId(), conn.OCRPageQueueId()) resetTimer(stopIfQuiet, quietTime) if err != nil { return fmt.Errorf("Error during preprocess: %v", err) } + fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output case <-checkWipeQueue: msg, err := conn.CheckQueue(conn.WipeQueueId(), QueueTimeoutSecs) checkWipeQueue = time.After(PauseBetweenChecks) @@ -603,12 +918,12 @@ func processbook(training string, tesscmd string, conn Pipeliner) error { stopTimer(stopIfQuiet) conn.Log("Message received on wipeonly queue, processing", msg.Body) fmt.Printf(" Preprocessing book (wiping only)\n") - err = pipeline.ProcessBook(msg, conn, pipeline.Wipe, wipePattern, conn.WipeQueueId(), conn.OCRPageQueueId()) - fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output + err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Wipe, wipePattern, conn.WipeQueueId(), conn.OCRPageQueueId()) resetTimer(stopIfQuiet, quietTime) if err != nil { return fmt.Errorf("Error during wipe: %v", err) } + fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output case <-checkOCRPageQueue: msg, err := conn.CheckQueue(conn.OCRPageQueueId(), QueueTimeoutSecs) checkOCRPageQueue = time.After(PauseBetweenChecks) @@ -624,7 +939,7 @@ func processbook(training string, tesscmd string, conn Pipeliner) error { stopTimer(stopIfQuiet) conn.Log("Message received on OCR Page queue, processing", msg.Body) fmt.Printf(".") - err = pipeline.OcrPage(msg, conn, pipeline.Ocr(training, tesscmd), conn.OCRPageQueueId(), conn.AnalyseQueueId()) + err = pipeline.OcrPage(ctx, msg, conn, pipeline.Ocr(training, tesscmd), conn.OCRPageQueueId(), conn.AnalyseQueueId()) resetTimer(stopIfQuiet, quietTime) if err != nil { return fmt.Errorf("\nError during OCR Page process: %v", err) @@ -642,7 +957,7 @@ func processbook(training string, tesscmd string, conn Pipeliner) error { stopTimer(stopIfQuiet) conn.Log("Message received on analyse queue, processing", msg.Body) fmt.Printf("\n Analysing OCR and compiling PDFs\n") - err = pipeline.ProcessBook(msg, conn, pipeline.Analyse(conn), ocredPattern, conn.AnalyseQueueId(), "") + err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Analyse(conn, fullpdf), ocredPattern, conn.AnalyseQueueId(), "") resetTimer(stopIfQuiet, quietTime) if err != nil { return fmt.Errorf("Error during analysis: %v", err) |