summary | refs | log | tree | commit | diff
path: root/cmd
diff options
context:
space:
mode:
author: Nick White <git@njw.name>, 2022-02-28 16:17:35 +0000
committer: Nick White <git@njw.name>, 2022-02-28 16:17:35 +0000
commit: 21d49b546a27de6c53d8fe7d1a68d5a3b5506c93 (patch)
tree: 729663de5863c125fad24d37df5147c2569771d1 /cmd
parent: 54aca8994863f5c58bce0224bc943b81c60f5d04 (diff)
Add PreNoWipe queue, that just does binarisation but no wiping
Diffstat (limited to 'cmd')
-rw-r--r-- cmd/bookpipeline/main.go 23
-rw-r--r-- cmd/booktopipeline/main.go 12
-rw-r--r-- cmd/rescribe/gui.go 8
-rw-r--r-- cmd/rescribe/main.go 39
4 files changed, 67 insertions, 15 deletions
diff --git a/cmd/bookpipeline/main.go b/cmd/bookpipeline/main.go
index 11c5a41..2a9f54b 100644
--- a/cmd/bookpipeline/main.go
+++ b/cmd/bookpipeline/main.go
@@ -69,6 +69,7 @@ type Clouder interface {
type Pipeliner interface {
Clouder
PreQueueId() string
+ PreNoWipeQueueId() string
WipeQueueId() string
OCRPageQueueId() string
AnalyseQueueId() string
@@ -151,6 +152,7 @@ func main() {
hostname, err := os.Hostname()
var checkPreQueue <-chan time.Time
+ var checkPreNoWipeQueue <-chan time.Time
var checkWipeQueue <-chan time.Time
var checkOCRPageQueue <-chan time.Time
var checkAnalyseQueue <-chan time.Time
@@ -168,6 +170,7 @@ func main() {
if !*noanalyse {
checkAnalyseQueue = time.After(0)
}
+ checkPreNoWipeQueue = time.After(0)
var quietTime = time.Duration(*autostop) * time.Second
stopIfQuiet = time.NewTimer(quietTime)
if quietTime == 0 {
@@ -194,11 +197,29 @@ func main() {
}
conn.Log("Message received on preprocess queue, processing", msg.Body)
stopTimer(stopIfQuiet)
- err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess([]float64{0.1, 0.2, 0.4, 0.5}), origPattern, conn.PreQueueId(), conn.OCRPageQueueId())
+ err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess([]float64{0.1, 0.2, 0.4, 0.5}, false), origPattern, conn.PreQueueId(), conn.OCRPageQueueId())
resetTimer(stopIfQuiet, quietTime)
if err != nil {
conn.Log("Error during preprocess", err)
}
+ case <-checkPreNoWipeQueue:
+ msg, err := conn.CheckQueue(conn.PreNoWipeQueueId(), QueueTimeoutSecs)
+ checkPreNoWipeQueue = time.After(PauseBetweenChecks)
+ if err != nil {
+ conn.Log("Error checking preprocess (no wipe) queue", err)
+ continue
+ }
+ if msg.Handle == "" {
+ conn.Log("No message received on preprocess (no wipe) queue, sleeping")
+ continue
+ }
+ conn.Log("Message received on preprocess (no wipe) queue, processing", msg.Body)
+ stopTimer(stopIfQuiet)
+ err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess([]float64{0.1, 0.2, 0.4, 0.5}, true), origPattern, conn.PreQueueId(), conn.OCRPageQueueId())
+ resetTimer(stopIfQuiet, quietTime)
+ if err != nil {
+ conn.Log("Error during preprocess (no wipe)", err)
+ }
case <-checkWipeQueue:
msg, err := conn.CheckQueue(conn.WipeQueueId(), QueueTimeoutSecs)
checkWipeQueue = time.After(PauseBetweenChecks)
diff --git a/cmd/booktopipeline/main.go b/cmd/booktopipeline/main.go
index bf088a0..ee2ef47 100644
--- a/cmd/booktopipeline/main.go
+++ b/cmd/booktopipeline/main.go
@@ -19,7 +19,7 @@ import (
"rescribe.xyz/bookpipeline/internal/pipeline"
)
-const usage = `Usage: booktopipeline [-c conn] [-t training] [-prebinarised] [-notbinarised] [-v] bookdir [bookname]
+const usage = `Usage: booktopipeline [-c conn] [-t training] [-prebinarised] [-notbinarised] [-nowipe] [-v] bookdir [bookname]
Uploads the book in bookdir to the S3 'inprogress' bucket and adds it
to the 'preprocess' or 'wipeonly' SQS queue. The queue to send to is
@@ -46,6 +46,7 @@ func main() {
conntype := flag.String("c", "aws", "connection type ('aws' or 'local')")
wipeonly := flag.Bool("prebinarised", false, "Prebinarised: only preprocessing will be to wipe")
dobinarise := flag.Bool("notbinarised", false, "Not binarised: all preprocessing will be done including binarisation")
+ nowipe := flag.Bool("nowipe", false, "No wipe: Disable wiping as part of preprocessing")
training := flag.String("t", "", "Training to use (training filename without the .traineddata part)")
flag.Usage = func() {
@@ -89,7 +90,7 @@ func main() {
log.Fatalln("Failed to set up cloud connection:", err)
}
- qid := pipeline.DetectQueueType(bookdir, conn)
+ qid := pipeline.DetectQueueType(bookdir, conn, false)
// Flags set override the queue selection
if *wipeonly {
@@ -98,6 +99,9 @@ func main() {
if *dobinarise {
qid = conn.PreQueueId()
}
+ if *nowipe {
+ qid = conn.PreNoWipeQueueId()
+ }
verboselog.Println("Checking that all images are valid in", bookdir)
err = pipeline.CheckImages(ctx, bookdir)
@@ -131,8 +135,10 @@ func main() {
var qname string
if qid == conn.PreQueueId() {
qname = "preprocess"
- } else {
+ } else if qid == conn.WipeQueueId() {
qname = "wipeonly"
+ } else {
+ qname = "nowipe"
}
fmt.Println("Uploaded book to queue", qname)
diff --git a/cmd/rescribe/gui.go b/cmd/rescribe/gui.go
index 8603e08..c67d15a 100644
--- a/cmd/rescribe/gui.go
+++ b/cmd/rescribe/gui.go
@@ -338,6 +338,8 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess
d.Show()
})
+ wipe := widget.NewCheck("Automatically clean image sides", func(bool) {})
+
trainingLabel := widget.NewLabel("Training")
trainingOpts := mkTrainingSelect([]string{training}, myWindow)
@@ -527,7 +529,7 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess
training = training[start:end]
}
- err = startProcess(ctx, log, cmd, bookdir, bookname, training, savedir, tessdir)
+ err = startProcess(ctx, log, cmd, bookdir, bookname, training, savedir, tessdir, !wipe.Checked)
if err != nil && strings.HasSuffix(err.Error(), "context canceled") {
progressBar.SetValue(0.0)
return
@@ -561,8 +563,8 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess
trainingBits := container.New(layout.NewBorderLayout(nil, nil, trainingLabel, nil), trainingLabel, trainingOpts)
- fullContent = container.NewVBox(choices, chosen, trainingBits, gobtn, abortbtn, progressBar, detail)
- startContent := container.NewVBox(choices, trainingBits, gobtn, abortbtn, progressBar, detail)
+ fullContent = container.NewVBox(choices, chosen, trainingBits, wipe, gobtn, abortbtn, progressBar, detail)
+ startContent := container.NewVBox(choices, trainingBits, wipe, gobtn, abortbtn, progressBar, detail)
myWindow.SetContent(startContent)
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index ec37f05..54623b1 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -74,6 +74,7 @@ type Clouder interface {
type Pipeliner interface {
Clouder
PreQueueId() string
+ PreNoWipeQueueId() string
WipeQueueId() string
OCRPageQueueId() string
AnalyseQueueId() string
@@ -155,6 +156,7 @@ These training files are included in rescribe, and are always available:
- rescribev8_fast.traineddata (Latin historic printing)
`)
tesscmd := flag.String("tesscmd", deftesscmd, "The Tesseract executable to run. You may need to set this to the full path of Tesseract.exe if you're on Windows.")
+ wipe := flag.Bool("wipe", false, "Use wiper tool to remove noise like gutters from page before processing.")
flag.Usage = func() {
fmt.Fprintf(flag.CommandLine.Output(), usage)
@@ -279,6 +281,7 @@ These training files are included in rescribe, and are always available:
}
var ctx context.Context
+ ctx = context.Background()
// TODO: support google book downloading, as done with the GUI
@@ -288,8 +291,6 @@ These training files are included in rescribe, and are always available:
savedir = strings.TrimSuffix(bookdir, ".pdf")
}
- // BUG: this seems to fail from command line, yet works from GUI
- // (used to work)
bookdir, err = extractPdfImgs(ctx, bookdir)
if err != nil {
log.Fatalln("Error opening file as PDF:", err)
@@ -305,7 +306,7 @@ These training files are included in rescribe, and are always available:
ispdf = true
}
- err = startProcess(ctx, *verboselog, tessCommand, bookdir, bookname, trainingName, savedir, tessdir)
+ err = startProcess(ctx, *verboselog, tessCommand, bookdir, bookname, trainingName, savedir, tessdir, !*wipe)
if err != nil {
log.Fatalln(err)
}
@@ -445,7 +446,7 @@ func rmIfNotImage(f string) error {
return nil
}
-func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, savedir string, tessdir string) error {
+func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, savedir string, tessdir string, nowipe bool) error {
cmd := exec.Command(tessCommand, "--help")
pipeline.HideCmd(cmd)
_, err := cmd.Output()
@@ -475,7 +476,7 @@ func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bo
fmt.Printf("Copying book to pipeline\n")
- err = uploadbook(ctx, bookdir, bookname, conn)
+ err = uploadbook(ctx, bookdir, bookname, conn, nowipe)
if err != nil {
_ = os.RemoveAll(tempdir)
return fmt.Errorf("Error uploading book: %v", err)
@@ -588,7 +589,7 @@ func addTxtVersion(hocrfn string) error {
return nil
}
-func uploadbook(ctx context.Context, dir string, name string, conn Pipeliner) error {
+func uploadbook(ctx context.Context, dir string, name string, conn Pipeliner, nowipe bool) error {
_, err := os.Stat(dir)
if err != nil && !os.IsExist(err) {
return fmt.Errorf("Error: directory %s not found", dir)
@@ -602,7 +603,8 @@ func uploadbook(ctx context.Context, dir string, name string, conn Pipeliner) er
return fmt.Errorf("Error saving images to process from %s: %v", dir, err)
}
- qid := pipeline.DetectQueueType(dir, conn)
+ qid := pipeline.DetectQueueType(dir, conn, nowipe)
+ fmt.Printf("Uploading to queue %s\n", qid)
err = conn.AddToQueue(qid, name)
if err != nil {
@@ -642,11 +644,13 @@ func processbook(ctx context.Context, training string, tesscmd string, conn Pipe
ocredPattern := regexp.MustCompile(`.hocr$`)
var checkPreQueue <-chan time.Time
+ var checkPreNoWipeQueue <-chan time.Time
var checkWipeQueue <-chan time.Time
var checkOCRPageQueue <-chan time.Time
var checkAnalyseQueue <-chan time.Time
var stopIfQuiet *time.Timer
checkPreQueue = time.After(0)
+ checkPreNoWipeQueue = time.After(0)
checkWipeQueue = time.After(0)
checkOCRPageQueue = time.After(0)
checkAnalyseQueue = time.After(0)
@@ -660,6 +664,25 @@ func processbook(ctx context.Context, training string, tesscmd string, conn Pipe
select {
case <-ctx.Done():
return ctx.Err()
+ case <-checkPreNoWipeQueue:
+ msg, err := conn.CheckQueue(conn.PreNoWipeQueueId(), QueueTimeoutSecs)
+ checkPreNoWipeQueue = time.After(PauseBetweenChecks)
+ if err != nil {
+ return fmt.Errorf("Error checking preprocess no wipe queue: %v", err)
+ }
+ if msg.Handle == "" {
+ conn.Log("No message received on preprocess no wipe queue, sleeping")
+ continue
+ }
+ stopTimer(stopIfQuiet)
+ conn.Log("Message received on preprocess no wipe queue, processing", msg.Body)
+ fmt.Printf(" Preprocessing book (binarising only, no wiping)\n")
+ err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess(thresholds, true), origPattern, conn.PreNoWipeQueueId(), conn.OCRPageQueueId())
+ resetTimer(stopIfQuiet, quietTime)
+ if err != nil {
+ return fmt.Errorf("Error during preprocess (no wipe): %v", err)
+ }
+ fmt.Printf(" OCRing pages ") // this is expected to be added to with dots by OCRPage output
case <-checkPreQueue:
msg, err := conn.CheckQueue(conn.PreQueueId(), QueueTimeoutSecs)
checkPreQueue = time.After(PauseBetweenChecks)
@@ -673,7 +696,7 @@ func processbook(ctx context.Context, training string, tesscmd string, conn Pipe
stopTimer(stopIfQuiet)
conn.Log("Message received on preprocess queue, processing", msg.Body)
fmt.Printf(" Preprocessing book (binarising and wiping)\n")
- err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess(thresholds), origPattern, conn.PreQueueId(), conn.OCRPageQueueId())
+ err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess(thresholds, false), origPattern, conn.PreQueueId(), conn.OCRPageQueueId())
resetTimer(stopIfQuiet, quietTime)
if err != nil {
return fmt.Errorf("Error during preprocess: %v", err)