From fc6becf5ed98e9c0815532fd76639c15eb481ed1 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 9 Nov 2020 17:33:52 +0000 Subject: [rescribe] work in progress at a self-contained local pipeline processor, called rescribe --- cmd/rescribe/main.go | 247 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 cmd/rescribe/main.go (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go new file mode 100644 index 0000000..e3781cb --- /dev/null +++ b/cmd/rescribe/main.go @@ -0,0 +1,247 @@ +// Copyright 2019 Nick White. +// Use of this source code is governed by the GPLv3 +// license that can be found in the LICENSE file. + +// rescribe is a modification of bookpipeline designed for local-only +// operation, which rolls uploading, processing, and downloading of +// a single book by the pipeline into one command. +package main + +import ( + "flag" + "fmt" + "log" + "os" + "path/filepath" + "regexp" + "time" + + "rescribe.xyz/bookpipeline" + + "rescribe.xyz/bookpipeline/internal/pipeline" +) + +const usage = `Usage: rescribe [-v] [-t training] bookdir + +Process and OCR a book using the Rescribe pipeline on a local machine. +` + +const QueueTimeoutSecs = 2 * 60 +const PauseBetweenChecks = 1 * time.Second +const LogSaveTime = 1 * time.Minute + +// null writer to enable non-verbose logging to be discarded +type NullWriter bool + +func (w NullWriter) Write(p []byte) (n int, err error) { + return len(p), nil +} + +type Clouder interface { + Init() error + ListObjects(bucket string, prefix string) ([]string, error) + Download(bucket string, key string, fn string) error + Upload(bucket string, key string, path string) error + CheckQueue(url string, timeout int64) (bookpipeline.Qmsg, error) + AddToQueue(url string, msg string) error + DelFromQueue(url string, handle string) error + QueueHeartbeat(msg bookpipeline.Qmsg, qurl string, duration int64) (bookpipeline.Qmsg, error) +} + +type Pipeliner interface { + Clouder + PreQueueId() string + WipeQueueId() string + OCRPageQueueId() string + AnalyseQueueId() string + WIPStorageId() string + GetLogger() *log.Logger + Log(v ...interface{}) +} + +func stopTimer(t *time.Timer) { + if !t.Stop() { + <-t.C + } +} + +func resetTimer(t *time.Timer, d time.Duration) { + if d > 0 { + t.Reset(d) + } +} + +func main() { + verbose := flag.Bool("v", false, "verbose") + training := flag.String("t", "rescribealphav5", "default tesseract training file to use (without the .traineddata part)") + + flag.Usage = func() { + fmt.Fprintf(flag.CommandLine.Output(), usage) + flag.PrintDefaults() + } + flag.Parse() + + if flag.NArg() < 1 || flag.NArg() > 3 { + flag.Usage() + return + } + + bookdir := flag.Arg(0) + var bookname string + if flag.NArg() > 2 { + bookname = flag.Arg(1) + } else { + bookname = filepath.Base(bookdir) + } + + var verboselog *log.Logger + if *verbose { + verboselog = log.New(os.Stdout, "", 0) + } else { + var n NullWriter + verboselog = log.New(n, "", 0) + } + + var conn Pipeliner + // TODO: set tmpdir to a specific random thing for this run only + conn = &bookpipeline.LocalConn{Logger: verboselog} + + conn.Log("Setting up session") + err := conn.Init() + if err != nil { + log.Fatalln("Error setting up connection:", err) + } + conn.Log("Finished setting up session") + + uploadbook(bookdir, bookname, *training, conn) + + processbook(*training, conn) + + // TODO: save book +} + +func uploadbook(dir string, name string, training string, conn Pipeliner) error { + err := pipeline.CheckImages(dir) + if err != nil { + return fmt.Errorf("Error with images in %s: %v", dir, err) + } + err = pipeline.UploadImages(dir, name, conn) + if err != nil { + return fmt.Errorf("Error saving images to process from %s: %v", dir, err) + } + + qid := pipeline.DetectQueueType(dir, conn) + if training != "" { + name = name + " " + training + } + err = conn.AddToQueue(qid, name) + if err != nil { + return fmt.Errorf("Error adding book job to queue %s: %v", qid, err) + } + + return nil +} + + +func processbook(training string, conn Pipeliner) { + origPattern := regexp.MustCompile(`[0-9]{4}.jpg$`) + wipePattern := regexp.MustCompile(`[0-9]{4,6}(.bin)?.png$`) + ocredPattern := regexp.MustCompile(`.hocr$`) + + var checkPreQueue <-chan time.Time + var checkWipeQueue <-chan time.Time + var checkOCRPageQueue <-chan time.Time + var checkAnalyseQueue <-chan time.Time + var stopIfQuiet *time.Timer + checkPreQueue = time.After(0) + checkWipeQueue = time.After(0) + checkOCRPageQueue = time.After(0) + checkAnalyseQueue = time.After(0) + var quietTime = 1 * time.Second + stopIfQuiet = time.NewTimer(quietTime) + if quietTime == 0 { + stopIfQuiet.Stop() + } + + for { + select { + case <-checkPreQueue: + msg, err := conn.CheckQueue(conn.PreQueueId(), QueueTimeoutSecs) + checkPreQueue = time.After(PauseBetweenChecks) + if err != nil { + conn.Log("Error checking preprocess queue", err) + continue + } + if msg.Handle == "" { + conn.Log("No message received on preprocess queue, sleeping") + continue + } + conn.Log("Message received on preprocess queue, processing", msg.Body) + stopTimer(stopIfQuiet) + err = pipeline.ProcessBook(msg, conn, pipeline.Preprocess, origPattern, conn.PreQueueId(), conn.OCRPageQueueId()) + resetTimer(stopIfQuiet, quietTime) + if err != nil { + conn.Log("Error during preprocess", err) + } + case <-checkWipeQueue: + msg, err := conn.CheckQueue(conn.WipeQueueId(), QueueTimeoutSecs) + checkWipeQueue = time.After(PauseBetweenChecks) + if err != nil { + conn.Log("Error checking wipeonly queue", err) + continue + } + if msg.Handle == "" { + conn.Log("No message received on wipeonly queue, sleeping") + continue + } + stopTimer(stopIfQuiet) + conn.Log("Message received on wipeonly queue, processing", msg.Body) + err = pipeline.ProcessBook(msg, conn, pipeline.Wipe, wipePattern, conn.WipeQueueId(), conn.OCRPageQueueId()) + resetTimer(stopIfQuiet, quietTime) + if err != nil { + conn.Log("Error during wipe", err) + } + case <-checkOCRPageQueue: + msg, err := conn.CheckQueue(conn.OCRPageQueueId(), QueueTimeoutSecs) + checkOCRPageQueue = time.After(PauseBetweenChecks) + if err != nil { + conn.Log("Error checking OCR Page queue", err) + continue + } + if msg.Handle == "" { + continue + } + // Have OCRPageQueue checked immediately after completion, as chances are high that + // there will be more pages that should be done without delay + checkOCRPageQueue = time.After(0) + stopTimer(stopIfQuiet) + conn.Log("Message received on OCR Page queue, processing", msg.Body) + err = pipeline.OcrPage(msg, conn, pipeline.Ocr(training), conn.OCRPageQueueId(), conn.AnalyseQueueId()) + resetTimer(stopIfQuiet, quietTime) + if err != nil { + conn.Log("Error during OCR Page process", err) + } + case <-checkAnalyseQueue: + msg, err := conn.CheckQueue(conn.AnalyseQueueId(), QueueTimeoutSecs) + checkAnalyseQueue = time.After(PauseBetweenChecks) + if err != nil { + conn.Log("Error checking analyse queue", err) + continue + } + if msg.Handle == "" { + conn.Log("No message received on analyse queue, sleeping") + continue + } + stopTimer(stopIfQuiet) + conn.Log("Message received on analyse queue, processing", msg.Body) + err = pipeline.ProcessBook(msg, conn, pipeline.Analyse(conn), ocredPattern, conn.AnalyseQueueId(), "") + resetTimer(stopIfQuiet, quietTime) + if err != nil { + conn.Log("Error during analysis", err) + } + case <-stopIfQuiet.C: + conn.Log("Processing finished") + return + } + } +} -- cgit v1.2.1-24-ge1ad From a1de8862a091f9584220db40671a0d43346c4519 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 9 Nov 2020 18:29:56 +0000 Subject: [rescribe] Local only combo tool basically now working. Testing is still minimal. --- cmd/rescribe/main.go | 55 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 6 deletions(-) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index e3781cb..c309367 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -10,6 +10,7 @@ package main import ( "flag" "fmt" + "io/ioutil" "log" "os" "path/filepath" @@ -88,7 +89,7 @@ func main() { bookdir := flag.Arg(0) var bookname string - if flag.NArg() > 2 { + if flag.NArg() > 1 { bookname = flag.Arg(1) } else { bookname = filepath.Base(bookdir) @@ -102,22 +103,41 @@ func main() { verboselog = log.New(n, "", 0) } + tempdir, err := ioutil.TempDir("", "bookpipeline") + if err != nil { + log.Fatalln("Error setting up temporary directory:", err) + } + var conn Pipeliner - // TODO: set tmpdir to a specific random thing for this run only - conn = &bookpipeline.LocalConn{Logger: verboselog} + conn = &bookpipeline.LocalConn{Logger: verboselog, TempDir: tempdir} conn.Log("Setting up session") - err := conn.Init() + err = conn.Init() if err != nil { log.Fatalln("Error setting up connection:", err) } conn.Log("Finished setting up session") - uploadbook(bookdir, bookname, *training, conn) + fmt.Printf("Copying book to pipeline\n") + err = uploadbook(bookdir, bookname, *training, conn) + if err != nil { + log.Fatalln(err) + } + + fmt.Printf("Processing book (this may take some time)\n") processbook(*training, conn) - // TODO: save book + fmt.Printf("Saving finished book to %s\n", bookname) + err = downloadbook(bookname, conn) + if err != nil { + log.Fatalln(err) + } + + err = os.RemoveAll(tempdir) + if err != nil { + log.Fatalf("Error removing temporary directory %s: %v", tempdir, err) + } } func uploadbook(dir string, name string, training string, conn Pipeliner) error { @@ -142,6 +162,29 @@ func uploadbook(dir string, name string, training string, conn Pipeliner) error return nil } +func downloadbook(name string, conn Pipeliner) error { + err := os.MkdirAll(name, 0755) + if err != nil { + log.Fatalln("Failed to create directory", name, err) + } + + err = pipeline.DownloadBestPages(name, conn) + if err != nil { + return fmt.Errorf("Error downloading best pages: %v", err) + } + + err = pipeline.DownloadPdfs(name, conn) + if err != nil { + return fmt.Errorf("Error downloading PDFs: %v", err) + } + + err = pipeline.DownloadAnalyses(name, conn) + if err != nil { + return fmt.Errorf("Error downloading analyses: %v", err) + } + + return nil +} func processbook(training string, conn Pipeliner) { origPattern := regexp.MustCompile(`[0-9]{4}.jpg$`) -- cgit v1.2.1-24-ge1ad From f19df9e8c1213a49c426caefd2fadc711f5faf11 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 9 Nov 2020 18:55:36 +0000 Subject: Switch Preprocess() to take the thresholds to use, and have rescribe tool only use 0.1,0.2,0.3 --- cmd/rescribe/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index c309367..1a3dcff 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -221,7 +221,7 @@ func processbook(training string, conn Pipeliner) { } conn.Log("Message received on preprocess queue, processing", msg.Body) stopTimer(stopIfQuiet) - err = pipeline.ProcessBook(msg, conn, pipeline.Preprocess, origPattern, conn.PreQueueId(), conn.OCRPageQueueId()) + err = pipeline.ProcessBook(msg, conn, pipeline.Preprocess([]float64{0.1, 0.2, 0.3}), origPattern, conn.PreQueueId(), conn.OCRPageQueueId()) resetTimer(stopIfQuiet, quietTime) if err != nil { conn.Log("Error during preprocess", err) -- cgit v1.2.1-24-ge1ad From 198f8215f8dd0460608abcd03fa49451462c9d11 Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 10 Nov 2020 10:41:15 +0000 Subject: [getpipelinebook] Rewrite to use internal package functions --- cmd/rescribe/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 1a3dcff..8e2fe69 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -168,7 +168,7 @@ func downloadbook(name string, conn Pipeliner) error { log.Fatalln("Failed to create directory", name, err) } - err = pipeline.DownloadBestPages(name, conn) + err = pipeline.DownloadBestPages(name, conn, false) if err != nil { return fmt.Errorf("Error downloading best pages: %v", err) } -- cgit v1.2.1-24-ge1ad From 7921b5ca6d6667dda09ae67dcc1ee987aef62ebb Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 10 Nov 2020 11:22:36 +0000 Subject: [rescribe] Handle errors in processbook correctly, and improve console output --- cmd/rescribe/main.go | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 8e2fe69..3b69b21 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -122,15 +122,21 @@ func main() { err = uploadbook(bookdir, bookname, *training, conn) if err != nil { + _ = os.RemoveAll(tempdir) log.Fatalln(err) } - fmt.Printf("Processing book (this may take some time)\n") - processbook(*training, conn) + fmt.Printf("Processing book\n") + err = processbook(*training, conn) + if err != nil { + _ = os.RemoveAll(tempdir) + log.Fatalln(err) + } fmt.Printf("Saving finished book to %s\n", bookname) err = downloadbook(bookname, conn) if err != nil { + _ = os.RemoveAll(tempdir) log.Fatalln(err) } @@ -186,7 +192,7 @@ func downloadbook(name string, conn Pipeliner) error { return nil } -func processbook(training string, conn Pipeliner) { +func processbook(training string, conn Pipeliner) error { origPattern := regexp.MustCompile(`[0-9]{4}.jpg$`) wipePattern := regexp.MustCompile(`[0-9]{4,6}(.bin)?.png$`) ocredPattern := regexp.MustCompile(`.hocr$`) @@ -212,26 +218,26 @@ func processbook(training string, conn Pipeliner) { msg, err := conn.CheckQueue(conn.PreQueueId(), QueueTimeoutSecs) checkPreQueue = time.After(PauseBetweenChecks) if err != nil { - conn.Log("Error checking preprocess queue", err) - continue + return fmt.Errorf("Error checking preprocess queue", err) } if msg.Handle == "" { conn.Log("No message received on preprocess queue, sleeping") continue } - conn.Log("Message received on preprocess queue, processing", msg.Body) stopTimer(stopIfQuiet) + conn.Log("Message received on preprocess queue, processing", msg.Body) + fmt.Printf(" Preprocessing book (binarising and wiping)\n") err = pipeline.ProcessBook(msg, conn, pipeline.Preprocess([]float64{0.1, 0.2, 0.3}), origPattern, conn.PreQueueId(), conn.OCRPageQueueId()) + fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output resetTimer(stopIfQuiet, quietTime) if err != nil { - conn.Log("Error during preprocess", err) + return fmt.Errorf("Error during preprocess", err) } case <-checkWipeQueue: msg, err := conn.CheckQueue(conn.WipeQueueId(), QueueTimeoutSecs) checkWipeQueue = time.After(PauseBetweenChecks) if err != nil { - conn.Log("Error checking wipeonly queue", err) - continue + return fmt.Errorf("Error checking wipeonly queue", err) } if msg.Handle == "" { conn.Log("No message received on wipeonly queue, sleeping") @@ -239,17 +245,18 @@ func processbook(training string, conn Pipeliner) { } stopTimer(stopIfQuiet) conn.Log("Message received on wipeonly queue, processing", msg.Body) + fmt.Printf(" Preprocessing book (wiping only)\n") err = pipeline.ProcessBook(msg, conn, pipeline.Wipe, wipePattern, conn.WipeQueueId(), conn.OCRPageQueueId()) + fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output resetTimer(stopIfQuiet, quietTime) if err != nil { - conn.Log("Error during wipe", err) + return fmt.Errorf("Error during wipe", err) } case <-checkOCRPageQueue: msg, err := conn.CheckQueue(conn.OCRPageQueueId(), QueueTimeoutSecs) checkOCRPageQueue = time.After(PauseBetweenChecks) if err != nil { - conn.Log("Error checking OCR Page queue", err) - continue + return fmt.Errorf("Error checking OCR Page queue", err) } if msg.Handle == "" { continue @@ -259,17 +266,17 @@ func processbook(training string, conn Pipeliner) { checkOCRPageQueue = time.After(0) stopTimer(stopIfQuiet) conn.Log("Message received on OCR Page queue, processing", msg.Body) + fmt.Printf(".") err = pipeline.OcrPage(msg, conn, pipeline.Ocr(training), conn.OCRPageQueueId(), conn.AnalyseQueueId()) resetTimer(stopIfQuiet, quietTime) if err != nil { - conn.Log("Error during OCR Page process", err) + return fmt.Errorf("\nError during OCR Page process", err) } case <-checkAnalyseQueue: msg, err := conn.CheckQueue(conn.AnalyseQueueId(), QueueTimeoutSecs) checkAnalyseQueue = time.After(PauseBetweenChecks) if err != nil { - conn.Log("Error checking analyse queue", err) - continue + return fmt.Errorf("Error checking analyse queue", err) } if msg.Handle == "" { conn.Log("No message received on analyse queue, sleeping") @@ -277,14 +284,17 @@ func processbook(training string, conn Pipeliner) { } stopTimer(stopIfQuiet) conn.Log("Message received on analyse queue, processing", msg.Body) + fmt.Printf("\n Analysing OCR and compiling PDFs\n") err = pipeline.ProcessBook(msg, conn, pipeline.Analyse(conn), ocredPattern, conn.AnalyseQueueId(), "") resetTimer(stopIfQuiet, quietTime) if err != nil { - conn.Log("Error during analysis", err) + return fmt.Errorf("Error during analysis", err) } case <-stopIfQuiet.C: conn.Log("Processing finished") - return + return nil } } + + return fmt.Errorf("Ended unexpectedly") // should never be reached } -- cgit v1.2.1-24-ge1ad From dac2f1ad471cd9896c16569fe02c69ff9b9855ba Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 10 Nov 2020 11:59:14 +0000 Subject: [rescribe] Change -t to the path of the traineddata file, and set TESSDATA_PREFIX accordingly --- cmd/rescribe/main.go | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 3b69b21..8d7c07b 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -13,8 +13,10 @@ import ( "io/ioutil" "log" "os" + "os/exec" "path/filepath" "regexp" + "strings" "time" "rescribe.xyz/bookpipeline" @@ -74,7 +76,7 @@ func resetTimer(t *time.Timer, d time.Duration) { func main() { verbose := flag.Bool("v", false, "verbose") - training := flag.String("t", "rescribealphav5", "default tesseract training file to use (without the .traineddata part)") + training := flag.String("t", "training/rescribev7_fast.traineddata", "path to the tesseract training file to use") flag.Usage = func() { fmt.Fprintf(flag.CommandLine.Output(), usage) @@ -103,6 +105,33 @@ func main() { verboselog = log.New(n, "", 0) } + f, err := os.Open(*training) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: Training file %s could not be opened.\n", *training) + fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n") + os.Exit(1) + } + f.Close() + + abstraining, err := filepath.Abs(*training) + if err != nil { + log.Fatalf("Error getting absolute path of training %s: %v", err) + } + tessPrefix, trainingName := filepath.Split(abstraining) + trainingName = strings.TrimSuffix(trainingName, ".traineddata") + err = os.Setenv("TESSDATA_PREFIX", tessPrefix) + if err != nil { + log.Fatalln("Error setting TESSDATA_PREFIX:", err) + } + + // TODO: would be good to be able to set custom path to tesseract + _, err = exec.Command("tesseract", "--help").Output() + if err != nil { + fmt.Fprintf(os.Stderr, "Error: Can't run Tesseract.\n") + fmt.Fprintf(os.Stderr, "Ensure that Tesseract is installed and available.\n") + os.Exit(1) + } + tempdir, err := ioutil.TempDir("", "bookpipeline") if err != nil { log.Fatalln("Error setting up temporary directory:", err) @@ -120,14 +149,14 @@ func main() { fmt.Printf("Copying book to pipeline\n") - err = uploadbook(bookdir, bookname, *training, conn) + err = uploadbook(bookdir, bookname, trainingName, conn) if err != nil { _ = os.RemoveAll(tempdir) log.Fatalln(err) } fmt.Printf("Processing book\n") - err = processbook(*training, conn) + err = processbook(trainingName, conn) if err != nil { _ = os.RemoveAll(tempdir) log.Fatalln(err) -- cgit v1.2.1-24-ge1ad From ad7aaf490e78e969bb5495dfda06a33d2a176aec Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 10 Nov 2020 12:28:50 +0000 Subject: [rescribe] Enable custom paths to tesseract command to be set (also improve some error output) --- cmd/rescribe/main.go | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 8d7c07b..6a2fb9f 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -77,6 +77,7 @@ func resetTimer(t *time.Timer, d time.Duration) { func main() { verbose := flag.Bool("v", false, "verbose") training := flag.String("t", "training/rescribev7_fast.traineddata", "path to the tesseract training file to use") + tesscmd := flag.String("tesscmd", "tesseract", "The Tesseract executable to run. You may need to set this to the full path of Tesseract.exe if you're on Windows.") flag.Usage = func() { fmt.Fprintf(flag.CommandLine.Output(), usage) @@ -124,11 +125,12 @@ func main() { log.Fatalln("Error setting TESSDATA_PREFIX:", err) } - // TODO: would be good to be able to set custom path to tesseract - _, err = exec.Command("tesseract", "--help").Output() + _, err = exec.Command(*tesscmd, "--help").Output() if err != nil { fmt.Fprintf(os.Stderr, "Error: Can't run Tesseract.\n") fmt.Fprintf(os.Stderr, "Ensure that Tesseract is installed and available.\n") + fmt.Fprintf(os.Stderr, "You may need to -tesscmd to the full path of Tesseract.exe if you're on Windows, like this:\n") + fmt.Fprintf(os.Stderr, " rescribe -tesscmd 'C:\\Program Files\\Tesseract OCR\\tesseract.exe' ...\n") os.Exit(1) } @@ -149,14 +151,14 @@ func main() { fmt.Printf("Copying book to pipeline\n") - err = uploadbook(bookdir, bookname, trainingName, conn) + err = uploadbook(bookdir, bookname, conn) if err != nil { _ = os.RemoveAll(tempdir) log.Fatalln(err) } fmt.Printf("Processing book\n") - err = processbook(trainingName, conn) + err = processbook(trainingName, *tesscmd, conn) if err != nil { _ = os.RemoveAll(tempdir) log.Fatalln(err) @@ -175,7 +177,7 @@ func main() { } } -func uploadbook(dir string, name string, training string, conn Pipeliner) error { +func uploadbook(dir string, name string, conn Pipeliner) error { err := pipeline.CheckImages(dir) if err != nil { return fmt.Errorf("Error with images in %s: %v", dir, err) @@ -186,9 +188,7 @@ func uploadbook(dir string, name string, training string, conn Pipeliner) error } qid := pipeline.DetectQueueType(dir, conn) - if training != "" { - name = name + " " + training - } + err = conn.AddToQueue(qid, name) if err != nil { return fmt.Errorf("Error adding book job to queue %s: %v", qid, err) @@ -221,7 +221,7 @@ func downloadbook(name string, conn Pipeliner) error { return nil } -func processbook(training string, conn Pipeliner) error { +func processbook(training string, tesscmd string, conn Pipeliner) error { origPattern := regexp.MustCompile(`[0-9]{4}.jpg$`) wipePattern := regexp.MustCompile(`[0-9]{4,6}(.bin)?.png$`) ocredPattern := regexp.MustCompile(`.hocr$`) @@ -247,7 +247,7 @@ func processbook(training string, conn Pipeliner) error { msg, err := conn.CheckQueue(conn.PreQueueId(), QueueTimeoutSecs) checkPreQueue = time.After(PauseBetweenChecks) if err != nil { - return fmt.Errorf("Error checking preprocess queue", err) + return fmt.Errorf("Error checking preprocess queue: %v", err) } if msg.Handle == "" { conn.Log("No message received on preprocess queue, sleeping") @@ -260,13 +260,13 @@ func processbook(training string, conn Pipeliner) error { fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output resetTimer(stopIfQuiet, quietTime) if err != nil { - return fmt.Errorf("Error during preprocess", err) + return fmt.Errorf("Error during preprocess: %v", err) } case <-checkWipeQueue: msg, err := conn.CheckQueue(conn.WipeQueueId(), QueueTimeoutSecs) checkWipeQueue = time.After(PauseBetweenChecks) if err != nil { - return fmt.Errorf("Error checking wipeonly queue", err) + return fmt.Errorf("Error checking wipeonly queue, %v", err) } if msg.Handle == "" { conn.Log("No message received on wipeonly queue, sleeping") @@ -279,13 +279,13 @@ func processbook(training string, conn Pipeliner) error { fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output resetTimer(stopIfQuiet, quietTime) if err != nil { - return fmt.Errorf("Error during wipe", err) + return fmt.Errorf("Error during wipe: %v", err) } case <-checkOCRPageQueue: msg, err := conn.CheckQueue(conn.OCRPageQueueId(), QueueTimeoutSecs) checkOCRPageQueue = time.After(PauseBetweenChecks) if err != nil { - return fmt.Errorf("Error checking OCR Page queue", err) + return fmt.Errorf("Error checking OCR Page queue: %v", err) } if msg.Handle == "" { continue @@ -296,16 +296,16 @@ func processbook(training string, conn Pipeliner) error { stopTimer(stopIfQuiet) conn.Log("Message received on OCR Page queue, processing", msg.Body) fmt.Printf(".") - err = pipeline.OcrPage(msg, conn, pipeline.Ocr(training), conn.OCRPageQueueId(), conn.AnalyseQueueId()) + err = pipeline.OcrPage(msg, conn, pipeline.Ocr(training, tesscmd), conn.OCRPageQueueId(), conn.AnalyseQueueId()) resetTimer(stopIfQuiet, quietTime) if err != nil { - return fmt.Errorf("\nError during OCR Page process", err) + return fmt.Errorf("\nError during OCR Page process: %v", err) } case <-checkAnalyseQueue: msg, err := conn.CheckQueue(conn.AnalyseQueueId(), QueueTimeoutSecs) checkAnalyseQueue = time.After(PauseBetweenChecks) if err != nil { - return fmt.Errorf("Error checking analyse queue", err) + return fmt.Errorf("Error checking analyse queue: %v", err) } if msg.Handle == "" { conn.Log("No message received on analyse queue, sleeping") @@ -317,7 +317,7 @@ func processbook(training string, conn Pipeliner) error { err = pipeline.ProcessBook(msg, conn, pipeline.Analyse(conn), ocredPattern, conn.AnalyseQueueId(), "") resetTimer(stopIfQuiet, quietTime) if err != nil { - return fmt.Errorf("Error during analysis", err) + return fmt.Errorf("Error during analysis: %v", err) } case <-stopIfQuiet.C: conn.Log("Processing finished") -- cgit v1.2.1-24-ge1ad From 6b5145f0b75c8d5719bf44d5f654b9a2d1e3b2cd Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 16 Nov 2020 16:43:53 +0000 Subject: [rescribe] Mention in usage that things can be saved in a different directory --- cmd/rescribe/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 6a2fb9f..8ca4189 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -24,7 +24,7 @@ import ( "rescribe.xyz/bookpipeline/internal/pipeline" ) -const usage = `Usage: rescribe [-v] [-t training] bookdir +const usage = `Usage: rescribe [-v] [-t training] bookdir [savedir] Process and OCR a book using the Rescribe pipeline on a local machine. ` -- cgit v1.2.1-24-ge1ad From 56c1cf041aec9cb2352a3bd4a4b46e65a3cc04c0 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 16 Nov 2020 16:44:42 +0000 Subject: [rescribe] Add txt output, only keep colour pdf, and reorganise files so they're more user-friendly --- cmd/rescribe/main.go | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 8ca4189..fe36aea 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -20,6 +20,7 @@ import ( "time" "rescribe.xyz/bookpipeline" + "rescribe.xyz/utils/pkg/hocr" "rescribe.xyz/bookpipeline/internal/pipeline" ) @@ -175,6 +176,55 @@ func main() { if err != nil { log.Fatalf("Error removing temporary directory %s: %v", tempdir, err) } + + hocrs, err := filepath.Glob(fmt.Sprintf("%s/*hocr", bookname)) + if err != nil { + log.Fatalf("Error looking for .hocr files: %v", err) + } + + for _, v := range hocrs { + err = addTxtVersion(v) + if err != nil { + log.Fatalf("Error creating txt version of %s: %v", v, err) + } + + err = os.MkdirAll(filepath.Join(bookname, "hocr"), 0755) + if err != nil { + log.Fatalf("Error creating hocr directory: %v", err) + } + + err = os.Rename(v, filepath.Join(bookname, "hocr", filepath.Base(v))) + if err != nil { + log.Fatalf("Error moving hocr %s to hocr directory: %v", v, err) + } + } + + // For simplicity, remove .binarised.pdf and rename .colour.pdf to .pdf + _ = os.Remove(filepath.Join(bookname, bookname + ".binarised.pdf")) + _ = os.Rename(filepath.Join(bookname, bookname + ".colour.pdf"), filepath.Join(bookname, bookname + ".pdf")) +} + +func addTxtVersion(hocrfn string) error { + dir := filepath.Dir(hocrfn) + err := os.MkdirAll(filepath.Join(dir, "text"), 0755) + if err != nil { + log.Fatalf("Error creating text directory: %v", err) + } + + t, err := hocr.GetText(hocrfn) + if err != nil { + return fmt.Errorf("Error getting text from hocr file %s: %v", hocrfn, err) + } + + basefn := strings.TrimSuffix(filepath.Base(hocrfn), ".hocr") + ".txt" + fn := filepath.Join(dir, "text", basefn) + + err = ioutil.WriteFile(fn, []byte(t), 0644) + if err != nil { + return fmt.Errorf("Error creating text file %s: %v", fn, err) + } + + return nil } func uploadbook(dir string, name string, conn Pipeliner) error { -- cgit v1.2.1-24-ge1ad From eefa8f50d7ab915ce426c837cf504d26b7d4ccee Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 16 Nov 2020 17:43:13 +0000 Subject: [rescribe] Default to an appropriate tesscmd for Windows --- cmd/rescribe/main.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index fe36aea..2320a2c 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -16,6 +16,7 @@ import ( "os/exec" "path/filepath" "regexp" + "runtime" "strings" "time" @@ -76,9 +77,14 @@ func resetTimer(t *time.Timer, d time.Duration) { } func main() { + deftesscmd := "tesseract" + if runtime.GOOS == "windows" { + deftesscmd = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe" + } + verbose := flag.Bool("v", false, "verbose") training := flag.String("t", "training/rescribev7_fast.traineddata", "path to the tesseract training file to use") - tesscmd := flag.String("tesscmd", "tesseract", "The Tesseract executable to run. You may need to set this to the full path of Tesseract.exe if you're on Windows.") + tesscmd := flag.String("tesscmd", deftesscmd, "The Tesseract executable to run. You may need to set this to the full path of Tesseract.exe if you're on Windows.") flag.Usage = func() { fmt.Fprintf(flag.CommandLine.Output(), usage) @@ -131,7 +137,7 @@ func main() { fmt.Fprintf(os.Stderr, "Error: Can't run Tesseract.\n") fmt.Fprintf(os.Stderr, "Ensure that Tesseract is installed and available.\n") fmt.Fprintf(os.Stderr, "You may need to -tesscmd to the full path of Tesseract.exe if you're on Windows, like this:\n") - fmt.Fprintf(os.Stderr, " rescribe -tesscmd 'C:\\Program Files\\Tesseract OCR\\tesseract.exe' ...\n") + fmt.Fprintf(os.Stderr, " rescribe -tesscmd 'C:\\Program Files\\Tesseract OCR (x86)\\tesseract.exe' ...\n") os.Exit(1) } -- cgit v1.2.1-24-ge1ad From f71fd636f151e5cb7eafb2ae6c21c1c188d43fdd Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 17 Nov 2020 12:24:42 +0000 Subject: Remove _bin0.x from txt filenames --- cmd/rescribe/main.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 2320a2c..f4489d8 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -34,6 +34,7 @@ Process and OCR a book using the Rescribe pipeline on a local machine. const QueueTimeoutSecs = 2 * 60 const PauseBetweenChecks = 1 * time.Second const LogSaveTime = 1 * time.Minute +var thresholds = []float64{0.1, 0.2, 0.3} // null writer to enable non-verbose logging to be discarded type NullWriter bool @@ -222,8 +223,11 @@ func addTxtVersion(hocrfn string) error { return fmt.Errorf("Error getting text from hocr file %s: %v", hocrfn, err) } - basefn := strings.TrimSuffix(filepath.Base(hocrfn), ".hocr") + ".txt" - fn := filepath.Join(dir, "text", basefn) + basefn := filepath.Base(hocrfn) + for _, v := range thresholds { + basefn = strings.TrimSuffix(basefn, fmt.Sprintf("_bin%.1f.hocr", v)) + } + fn := filepath.Join(dir, "text", basefn + ".txt") err = ioutil.WriteFile(fn, []byte(t), 0644) if err != nil { @@ -312,7 +316,7 @@ func processbook(training string, tesscmd string, conn Pipeliner) error { stopTimer(stopIfQuiet) conn.Log("Message received on preprocess queue, processing", msg.Body) fmt.Printf(" Preprocessing book (binarising and wiping)\n") - err = pipeline.ProcessBook(msg, conn, pipeline.Preprocess([]float64{0.1, 0.2, 0.3}), origPattern, conn.PreQueueId(), conn.OCRPageQueueId()) + err = pipeline.ProcessBook(msg, conn, pipeline.Preprocess(thresholds), origPattern, conn.PreQueueId(), conn.OCRPageQueueId()) fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output resetTimer(stopIfQuiet, quietTime) if err != nil { -- cgit v1.2.1-24-ge1ad From cbe02a57377787cd34172453a477f68f200448e8 Mon Sep 17 00:00:00 2001 From: Nick White Date: Thu, 3 Dec 2020 15:20:15 +0000 Subject: [rescribe] Fix portability issue where hocrs may not be correctly moved and txt-ified on windows --- cmd/rescribe/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index f4489d8..880bbc2 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -184,7 +184,7 @@ func main() { log.Fatalf("Error removing temporary directory %s: %v", tempdir, err) } - hocrs, err := filepath.Glob(fmt.Sprintf("%s/*hocr", bookname)) + hocrs, err := filepath.Glob(fmt.Sprintf("%s%s*hocr", bookname, string(filepath.Separator))) if err != nil { log.Fatalf("Error looking for .hocr files: %v", err) } -- cgit v1.2.1-24-ge1ad From 068ad0b666705a49ab22d7b48cd6a7d67b37f234 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 7 Dec 2020 16:53:58 +0000 Subject: [rescribe] Allow saving of results to somewhere other than a directory named after the book being processed --- cmd/rescribe/main.go | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 880bbc2..8414c53 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -29,6 +29,9 @@ import ( const usage = `Usage: rescribe [-v] [-t training] bookdir [savedir] Process and OCR a book using the Rescribe pipeline on a local machine. + +OCR results are saved into the bookdir directory unless savedir is +specified. ` const QueueTimeoutSecs = 2 * 60 @@ -93,17 +96,16 @@ func main() { } flag.Parse() - if flag.NArg() < 1 || flag.NArg() > 3 { + if flag.NArg() < 1 || flag.NArg() > 2 { flag.Usage() return } bookdir := flag.Arg(0) - var bookname string + bookname := filepath.Base(bookdir) + savedir := bookdir if flag.NArg() > 1 { - bookname = flag.Arg(1) - } else { - bookname = filepath.Base(bookdir) + savedir = flag.Arg(1) } var verboselog *log.Logger @@ -172,8 +174,12 @@ func main() { log.Fatalln(err) } - fmt.Printf("Saving finished book to %s\n", bookname) - err = downloadbook(bookname, conn) + fmt.Printf("Saving finished book to %s\n", savedir) + err = os.MkdirAll(savedir, 0755) + if err != nil { + log.Fatalf("Error creating save directory %s: %v", savedir, err) + } + err = downloadbook(savedir, bookname, conn) if err != nil { _ = os.RemoveAll(tempdir) log.Fatalln(err) @@ -184,7 +190,7 @@ func main() { log.Fatalf("Error removing temporary directory %s: %v", tempdir, err) } - hocrs, err := filepath.Glob(fmt.Sprintf("%s%s*hocr", bookname, string(filepath.Separator))) + hocrs, err := filepath.Glob(fmt.Sprintf("%s%s*hocr", savedir, string(filepath.Separator))) if err != nil { log.Fatalf("Error looking for .hocr files: %v", err) } @@ -195,20 +201,20 @@ func main() { log.Fatalf("Error creating txt version of %s: %v", v, err) } - err = os.MkdirAll(filepath.Join(bookname, "hocr"), 0755) + err = os.MkdirAll(filepath.Join(savedir, "hocr"), 0755) if err != nil { log.Fatalf("Error creating hocr directory: %v", err) } - err = os.Rename(v, filepath.Join(bookname, "hocr", filepath.Base(v))) + err = os.Rename(v, filepath.Join(savedir, "hocr", filepath.Base(v))) if err != nil { log.Fatalf("Error moving hocr %s to hocr directory: %v", v, err) } } // For simplicity, remove .binarised.pdf and rename .colour.pdf to .pdf - _ = os.Remove(filepath.Join(bookname, bookname + ".binarised.pdf")) - _ = os.Rename(filepath.Join(bookname, bookname + ".colour.pdf"), filepath.Join(bookname, bookname + ".pdf")) + _ = os.Remove(filepath.Join(savedir, bookname + ".binarised.pdf")) + _ = os.Rename(filepath.Join(savedir, bookname + ".colour.pdf"), filepath.Join(savedir, bookname + ".pdf")) } func addTxtVersion(hocrfn string) error { @@ -257,23 +263,23 @@ func uploadbook(dir string, name string, conn Pipeliner) error { return nil } -func downloadbook(name string, conn Pipeliner) error { +func downloadbook(dir string, name string, conn Pipeliner) error { err := os.MkdirAll(name, 0755) if err != nil { log.Fatalln("Failed to create directory", name, err) } - err = pipeline.DownloadBestPages(name, conn, false) + err = pipeline.DownloadBestPages(dir, name, conn, false) if err != nil { return fmt.Errorf("Error downloading best pages: %v", err) } - err = pipeline.DownloadPdfs(name, conn) + err = pipeline.DownloadPdfs(dir, name, conn) if err != nil { return fmt.Errorf("Error downloading PDFs: %v", err) } - err = pipeline.DownloadAnalyses(name, conn) + err = pipeline.DownloadAnalyses(dir, name, conn) if err != nil { return fmt.Errorf("Error downloading analyses: %v", err) } -- cgit v1.2.1-24-ge1ad From 17b2d91d5f323fd985ca012e50d36908cbceba87 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 7 Dec 2020 17:04:12 +0000 Subject: [rescribe] Fix up *.hocr glob, which ensures that using a savedir that already has a hocr directory in it will work --- cmd/rescribe/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'cmd/rescribe/main.go') diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 8414c53..07eeaf0 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -190,7 +190,7 @@ func main() { log.Fatalf("Error removing temporary directory %s: %v", tempdir, err) } - hocrs, err := filepath.Glob(fmt.Sprintf("%s%s*hocr", savedir, string(filepath.Separator))) + hocrs, err := filepath.Glob(fmt.Sprintf("%s%s*.hocr", savedir, string(filepath.Separator))) if err != nil { log.Fatalf("Error looking for .hocr files: %v", err) } -- cgit v1.2.1-24-ge1ad