diff options
| author | Nick White <git@njw.name> | 2022-02-28 16:17:35 +0000 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2022-02-28 16:17:35 +0000 | 
| commit | 21d49b546a27de6c53d8fe7d1a68d5a3b5506c93 (patch) | |
| tree | 729663de5863c125fad24d37df5147c2569771d1 /cmd | |
| parent | 54aca8994863f5c58bce0224bc943b81c60f5d04 (diff) | |
Add PreNoWipe queue, which only does binarisation and no wiping
Diffstat (limited to 'cmd')
| -rw-r--r-- | cmd/bookpipeline/main.go | 23 | ||||
| -rw-r--r-- | cmd/booktopipeline/main.go | 12 | ||||
| -rw-r--r-- | cmd/rescribe/gui.go | 8 | ||||
| -rw-r--r-- | cmd/rescribe/main.go | 39 | 
4 files changed, 67 insertions, 15 deletions
| diff --git a/cmd/bookpipeline/main.go b/cmd/bookpipeline/main.go index 11c5a41..2a9f54b 100644 --- a/cmd/bookpipeline/main.go +++ b/cmd/bookpipeline/main.go @@ -69,6 +69,7 @@ type Clouder interface {  type Pipeliner interface {  	Clouder  	PreQueueId() string +	PreNoWipeQueueId() string  	WipeQueueId() string  	OCRPageQueueId() string  	AnalyseQueueId() string @@ -151,6 +152,7 @@ func main() {  	hostname, err := os.Hostname()  	var checkPreQueue <-chan time.Time +	var checkPreNoWipeQueue <-chan time.Time  	var checkWipeQueue <-chan time.Time  	var checkOCRPageQueue <-chan time.Time  	var checkAnalyseQueue <-chan time.Time @@ -168,6 +170,7 @@ func main() {  	if !*noanalyse {  		checkAnalyseQueue = time.After(0)  	} +	checkPreNoWipeQueue = time.After(0)  	var quietTime = time.Duration(*autostop) * time.Second  	stopIfQuiet = time.NewTimer(quietTime)  	if quietTime == 0 { @@ -194,11 +197,29 @@ func main() {  			}  			conn.Log("Message received on preprocess queue, processing", msg.Body)  			stopTimer(stopIfQuiet) -			err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess([]float64{0.1, 0.2, 0.4, 0.5}), origPattern, conn.PreQueueId(), conn.OCRPageQueueId()) +			err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess([]float64{0.1, 0.2, 0.4, 0.5}, false), origPattern, conn.PreQueueId(), conn.OCRPageQueueId())  			resetTimer(stopIfQuiet, quietTime)  			if err != nil {  				conn.Log("Error during preprocess", err)  			} +		case <-checkPreNoWipeQueue: +			msg, err := conn.CheckQueue(conn.PreNoWipeQueueId(), QueueTimeoutSecs) +			checkPreNoWipeQueue = time.After(PauseBetweenChecks) +			if err != nil { +				conn.Log("Error checking preprocess (no wipe) queue", err) +				continue +			} +			if msg.Handle == "" { +				conn.Log("No message received on preprocess (no wipe) queue, sleeping") +				continue +			} +			conn.Log("Message received on preprocess (no wipe) queue, processing", msg.Body) +			stopTimer(stopIfQuiet) +			err = pipeline.ProcessBook(ctx, msg, 
conn, pipeline.Preprocess([]float64{0.1, 0.2, 0.4, 0.5}, true), origPattern, conn.PreQueueId(), conn.OCRPageQueueId()) +			resetTimer(stopIfQuiet, quietTime) +			if err != nil { +				conn.Log("Error during preprocess (no wipe)", err) +			}  		case <-checkWipeQueue:  			msg, err := conn.CheckQueue(conn.WipeQueueId(), QueueTimeoutSecs)  			checkWipeQueue = time.After(PauseBetweenChecks) diff --git a/cmd/booktopipeline/main.go b/cmd/booktopipeline/main.go index bf088a0..ee2ef47 100644 --- a/cmd/booktopipeline/main.go +++ b/cmd/booktopipeline/main.go @@ -19,7 +19,7 @@ import (  	"rescribe.xyz/bookpipeline/internal/pipeline"  ) -const usage = `Usage: booktopipeline [-c conn] [-t training] [-prebinarised] [-notbinarised] [-v] bookdir [bookname] +const usage = `Usage: booktopipeline [-c conn] [-t training] [-prebinarised] [-notbinarised] [-nowipe] [-v] bookdir [bookname]  Uploads the book in bookdir to the S3 'inprogress' bucket and adds it  to the 'preprocess' or 'wipeonly' SQS queue. The queue to send to is @@ -46,6 +46,7 @@ func main() {  	conntype := flag.String("c", "aws", "connection type ('aws' or 'local')")  	wipeonly := flag.Bool("prebinarised", false, "Prebinarised: only preprocessing will be to wipe")  	dobinarise := flag.Bool("notbinarised", false, "Not binarised: all preprocessing will be done including binarisation") +	nowipe := flag.Bool("nowipe", false, "No wipe: Disable wiping as part of preprocessing")  	training := flag.String("t", "", "Training to use (training filename without the .traineddata part)")  	flag.Usage = func() { @@ -89,7 +90,7 @@ func main() {  		log.Fatalln("Failed to set up cloud connection:", err)  	} -	qid := pipeline.DetectQueueType(bookdir, conn) +	qid := pipeline.DetectQueueType(bookdir, conn, false)  	// Flags set override the queue selection  	if *wipeonly { @@ -98,6 +99,9 @@ func main() {  	if *dobinarise {  		qid = conn.PreQueueId()  	} +	if *nowipe { +		qid = conn.PreNoWipeQueueId() +	}  	verboselog.Println("Checking that all 
images are valid in", bookdir)  	err = pipeline.CheckImages(ctx, bookdir) @@ -131,8 +135,10 @@ func main() {  	var qname string  	if qid == conn.PreQueueId() {  		qname = "preprocess" -	} else { +	} else if qid == conn.WipeQueueId() {  		qname = "wipeonly" +	} else { +		qname = "nowipe"  	}  	fmt.Println("Uploaded book to queue", qname) diff --git a/cmd/rescribe/gui.go b/cmd/rescribe/gui.go index 8603e08..c67d15a 100644 --- a/cmd/rescribe/gui.go +++ b/cmd/rescribe/gui.go @@ -338,6 +338,8 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess  		d.Show()  	}) +	wipe := widget.NewCheck("Automatically clean image sides", func(bool) {}) +  	trainingLabel := widget.NewLabel("Training")  	trainingOpts := mkTrainingSelect([]string{training}, myWindow) @@ -527,7 +529,7 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess  				training = training[start:end]  			} -			err = startProcess(ctx, log, cmd, bookdir, bookname, training, savedir, tessdir) +			err = startProcess(ctx, log, cmd, bookdir, bookname, training, savedir, tessdir, !wipe.Checked)  			if err != nil && strings.HasSuffix(err.Error(), "context canceled") {  				progressBar.SetValue(0.0)  				return @@ -561,8 +563,8 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess  	trainingBits := container.New(layout.NewBorderLayout(nil, nil, trainingLabel, nil), trainingLabel, trainingOpts) -	fullContent = container.NewVBox(choices, chosen, trainingBits, gobtn, abortbtn, progressBar, detail) -	startContent := container.NewVBox(choices, trainingBits, gobtn, abortbtn, progressBar, detail) +	fullContent = container.NewVBox(choices, chosen, trainingBits, wipe, gobtn, abortbtn, progressBar, detail) +	startContent := container.NewVBox(choices, trainingBits, wipe, gobtn, abortbtn, progressBar, detail)  	myWindow.SetContent(startContent) diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index ec37f05..54623b1 100644 --- 
a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -74,6 +74,7 @@ type Clouder interface {  type Pipeliner interface {  	Clouder  	PreQueueId() string +	PreNoWipeQueueId() string  	WipeQueueId() string  	OCRPageQueueId() string  	AnalyseQueueId() string @@ -155,6 +156,7 @@ These training files are included in rescribe, and are always available:  - rescribev8_fast.traineddata (Latin historic printing)  	`)  	tesscmd := flag.String("tesscmd", deftesscmd, "The Tesseract executable to run. You may need to set this to the full path of Tesseract.exe if you're on Windows.") +	wipe := flag.Bool("wipe", false, "Use wiper tool to remove noise like gutters from page before processing.")  	flag.Usage = func() {  		fmt.Fprintf(flag.CommandLine.Output(), usage) @@ -279,6 +281,7 @@ These training files are included in rescribe, and are always available:  	}  	var ctx context.Context +	ctx = context.Background()  	// TODO: support google book downloading, as done with the GUI @@ -288,8 +291,6 @@ These training files are included in rescribe, and are always available:  			savedir = strings.TrimSuffix(bookdir, ".pdf")  		} -		// BUG: this seems to fail from command line, yet works from GUI -		// (used to work)  		bookdir, err = extractPdfImgs(ctx, bookdir)  		if err != nil {  			log.Fatalln("Error opening file as PDF:", err) @@ -305,7 +306,7 @@ These training files are included in rescribe, and are always available:  		ispdf = true  	} -	err = startProcess(ctx, *verboselog, tessCommand, bookdir, bookname, trainingName, savedir, tessdir) +	err = startProcess(ctx, *verboselog, tessCommand, bookdir, bookname, trainingName, savedir, tessdir, !*wipe)  	if err != nil {  		log.Fatalln(err)  	} @@ -445,7 +446,7 @@ func rmIfNotImage(f string) error {  	return nil  } -func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, savedir string, tessdir string) error { +func startProcess(ctx context.Context, logger 
log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, savedir string, tessdir string, nowipe bool) error {  	cmd := exec.Command(tessCommand, "--help")  	pipeline.HideCmd(cmd)  	_, err := cmd.Output() @@ -475,7 +476,7 @@ func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bo  	fmt.Printf("Copying book to pipeline\n") -	err = uploadbook(ctx, bookdir, bookname, conn) +	err = uploadbook(ctx, bookdir, bookname, conn, nowipe)  	if err != nil {  		_ = os.RemoveAll(tempdir)  		return fmt.Errorf("Error uploading book: %v", err) @@ -588,7 +589,7 @@ func addTxtVersion(hocrfn string) error {  	return nil  } -func uploadbook(ctx context.Context, dir string, name string, conn Pipeliner) error { +func uploadbook(ctx context.Context, dir string, name string, conn Pipeliner, nowipe bool) error {  	_, err := os.Stat(dir)  	if err != nil && !os.IsExist(err) {  		return fmt.Errorf("Error: directory %s not found", dir) @@ -602,7 +603,8 @@ func uploadbook(ctx context.Context, dir string, name string, conn Pipeliner) er  		return fmt.Errorf("Error saving images to process from %s: %v", dir, err)  	} -	qid := pipeline.DetectQueueType(dir, conn) +	qid := pipeline.DetectQueueType(dir, conn, nowipe) +	fmt.Printf("Uploading to queue %s\n", qid)  	err = conn.AddToQueue(qid, name)  	if err != nil { @@ -642,11 +644,13 @@ func processbook(ctx context.Context, training string, tesscmd string, conn Pipe  	ocredPattern := regexp.MustCompile(`.hocr$`)  	var checkPreQueue <-chan time.Time +	var checkPreNoWipeQueue <-chan time.Time  	var checkWipeQueue <-chan time.Time  	var checkOCRPageQueue <-chan time.Time  	var checkAnalyseQueue <-chan time.Time  	var stopIfQuiet *time.Timer  	checkPreQueue = time.After(0) +	checkPreNoWipeQueue = time.After(0)  	checkWipeQueue = time.After(0)  	checkOCRPageQueue = time.After(0)  	checkAnalyseQueue = time.After(0) @@ -660,6 +664,25 @@ func processbook(ctx context.Context, training string, tesscmd string, 
conn Pipe  		select {  		case <-ctx.Done():  			return ctx.Err() +		case <-checkPreNoWipeQueue: +			msg, err := conn.CheckQueue(conn.PreNoWipeQueueId(), QueueTimeoutSecs) +			checkPreNoWipeQueue = time.After(PauseBetweenChecks) +			if err != nil { +				return fmt.Errorf("Error checking preprocess no wipe queue: %v", err) +			} +			if msg.Handle == "" { +				conn.Log("No message received on preprocess no wipe queue, sleeping") +				continue +			} +			stopTimer(stopIfQuiet) +			conn.Log("Message received on preprocess no wipe queue, processing", msg.Body) +			fmt.Printf("  Preprocessing book (binarising only, no wiping)\n") +			err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess(thresholds, true), origPattern, conn.PreNoWipeQueueId(), conn.OCRPageQueueId()) +			resetTimer(stopIfQuiet, quietTime) +			if err != nil { +				return fmt.Errorf("Error during preprocess (no wipe): %v", err) +			} +			fmt.Printf("  OCRing pages ") // this is expected to be added to with dots by OCRPage output  		case <-checkPreQueue:  			msg, err := conn.CheckQueue(conn.PreQueueId(), QueueTimeoutSecs)  			checkPreQueue = time.After(PauseBetweenChecks) @@ -673,7 +696,7 @@ func processbook(ctx context.Context, training string, tesscmd string, conn Pipe  			stopTimer(stopIfQuiet)  			conn.Log("Message received on preprocess queue, processing", msg.Body)  			fmt.Printf("  Preprocessing book (binarising and wiping)\n") -			err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess(thresholds), origPattern, conn.PreQueueId(), conn.OCRPageQueueId()) +			err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Preprocess(thresholds, false), origPattern, conn.PreQueueId(), conn.OCRPageQueueId())  			resetTimer(stopIfQuiet, quietTime)  			if err != nil {  				return fmt.Errorf("Error during preprocess: %v", err) | 
