diff options
| author | Nick White <git@njw.name> | 2022-03-21 13:51:51 +0000 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2022-03-21 13:51:51 +0000 | 
| commit | 76d91ea8f65c6ad52efb24ac2c94b22c2908bc5c (patch) | |
| tree | 35bc0e895befc1c686a64565f8bdff470a8db1c2 | |
| parent | af8650c074bc111200b132b0918d44cacd423b6e (diff) | |
Only generate full-size PDF if requested
Building the full-size PDF requires a lot of RAM, so generating it
unconditionally risks running out of memory. When it has not been
requested, it is also a waste of disk space and processing time.
| -rw-r--r-- | cmd/bookpipeline/main.go | 2 | ||||
| -rw-r--r-- | cmd/rescribe/gui.go | 10 | ||||
| -rw-r--r-- | cmd/rescribe/main.go | 6 | ||||
| -rw-r--r-- | internal/pipeline/pipeline.go | 90 | 
4 files changed, 55 insertions, 53 deletions
diff --git a/cmd/bookpipeline/main.go b/cmd/bookpipeline/main.go index 2a9f54b..076df32 100644 --- a/cmd/bookpipeline/main.go +++ b/cmd/bookpipeline/main.go @@ -271,7 +271,7 @@ func main() {  			}  			stopTimer(stopIfQuiet)  			conn.Log("Message received on analyse queue, processing", msg.Body) -			err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Analyse(conn), ocredPattern, conn.AnalyseQueueId(), "") +			err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Analyse(conn, false), ocredPattern, conn.AnalyseQueueId(), "")  			resetTimer(stopIfQuiet, quietTime)  			if err != nil {  				conn.Log("Error during analysis", err) diff --git a/cmd/rescribe/gui.go b/cmd/rescribe/gui.go index 36b4f16..7c5a727 100644 --- a/cmd/rescribe/gui.go +++ b/cmd/rescribe/gui.go @@ -340,8 +340,8 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess  	wipe := widget.NewCheck("Automatically clean image sides", func(bool) {}) -	smallpdf := widget.NewCheck("Reduce size of searchable PDF", func(bool) {}) -	smallpdf.Checked = true +	bigpdf := widget.NewCheck("Use highest image quality for searchable PDF (requires lots of RAM)", func(bool) {}) +	bigpdf.Checked = false  	trainingLabel := widget.NewLabel("Language / Script") @@ -532,7 +532,7 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess  				training = training[start:end]  			} -			err = startProcess(ctx, log, cmd, bookdir, bookname, training, savedir, tessdir, !wipe.Checked, !smallpdf.Checked) +			err = startProcess(ctx, log, cmd, bookdir, bookname, training, savedir, tessdir, !wipe.Checked, bigpdf.Checked)  			if err != nil && strings.HasSuffix(err.Error(), "context canceled") {  				progressBar.SetValue(0.0)  				return @@ -569,8 +569,8 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess  	trainingBits := container.New(layout.NewBorderLayout(nil, nil, trainingLabel, nil), trainingLabel, trainingOpts) -	fullContent = container.NewVBox(choices, 
chosen, trainingBits, wipe, smallpdf, gobtn, abortbtn, progressBar, detail) -	startContent := container.NewVBox(choices, trainingBits, wipe, smallpdf, gobtn, abortbtn, progressBar, detail) +	fullContent = container.NewVBox(choices, chosen, trainingBits, wipe, bigpdf, gobtn, abortbtn, progressBar, detail) +	startContent := container.NewVBox(choices, trainingBits, wipe, bigpdf, gobtn, abortbtn, progressBar, detail)  	myWindow.SetContent(startContent) diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 96f6162..eba8e84 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -484,7 +484,7 @@ func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bo  	}  	fmt.Printf("Processing book\n") -	err = processbook(ctx, trainingName, tessCommand, conn) +	err = processbook(ctx, trainingName, tessCommand, conn, fullpdf)  	if err != nil {  		_ = os.RemoveAll(tempdir)  		return fmt.Errorf("Error processing book: %v", err) @@ -648,7 +648,7 @@ func downloadbook(dir string, name string, conn Pipeliner) error {  	return nil  } -func processbook(ctx context.Context, training string, tesscmd string, conn Pipeliner) error { +func processbook(ctx context.Context, training string, tesscmd string, conn Pipeliner, fullpdf bool) error {  	origPattern := regexp.MustCompile(`[0-9]{4}.(jpg|png)$`)  	wipePattern := regexp.MustCompile(`[0-9]{4,6}(.bin)?.(jpg|png)$`)  	ocredPattern := regexp.MustCompile(`.hocr$`) @@ -764,7 +764,7 @@ func processbook(ctx context.Context, training string, tesscmd string, conn Pipe  			stopTimer(stopIfQuiet)  			conn.Log("Message received on analyse queue, processing", msg.Body)  			fmt.Printf("\n  Analysing OCR and compiling PDFs\n") -			err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Analyse(conn), ocredPattern, conn.AnalyseQueueId(), "") +			err = pipeline.ProcessBook(ctx, msg, conn, pipeline.Analyse(conn, fullpdf), ocredPattern, conn.AnalyseQueueId(), "")  			resetTimer(stopIfQuiet, quietTime)  			if err != nil {  				
return fmt.Errorf("Error during analysis: %v", err) diff --git a/internal/pipeline/pipeline.go b/internal/pipeline/pipeline.go index a09a414..d8beeb9 100644 --- a/internal/pipeline/pipeline.go +++ b/internal/pipeline/pipeline.go @@ -330,7 +330,7 @@ func Ocr(training string, tesscmd string) func(context.Context, chan string, cha  	}  } -func Analyse(conn Downloader) func(context.Context, chan string, chan string, chan error, *log.Logger) { +func Analyse(conn Downloader, mkfullpdf bool) func(context.Context, chan string, chan string, chan error, *log.Logger) {  	return func(ctx context.Context, toanalyse chan string, up chan string, errc chan error, logger *log.Logger) {  		confs := make(map[string][]*bookpipeline.Conf)  		bestconfs := make(map[string]*bookpipeline.Conf) @@ -455,12 +455,6 @@ func Analyse(conn Downloader) func(context.Context, chan string, chan string, ch  			errc <- fmt.Errorf("Failed to set up PDF: %s", err)  			return  		} -		fullsizepdf := new(bookpipeline.Fpdf) -		err = fullsizepdf.Setup() -		if err != nil { -			errc <- fmt.Errorf("Failed to set up PDF: %s", err) -			return -		}  		binhascontent, colourhascontent := false, false  		select { @@ -583,54 +577,62 @@ func Analyse(conn Downloader) func(context.Context, chan string, chan string, ch  			up <- fn  		} -		for _, pg := range colourimgs { -			select { -			case <-ctx.Done(): -				errc <- ctx.Err() +		if mkfullpdf { +			fullsizepdf := new(bookpipeline.Fpdf) +			err = fullsizepdf.Setup() +			if err != nil { +				errc <- fmt.Errorf("Failed to set up PDF: %s", err)  				return -			default:  			} +			for _, pg := range colourimgs { +				select { +				case <-ctx.Done(): +					errc <- ctx.Err() +					return +				default: +				} -			logger.Println("Downloading colour page to add to PDF", pg.img) -			colourfn := pg.img -			err = conn.Download(conn.WIPStorageId(), bookname+"/"+colourfn, filepath.Join(savedir, colourfn)) -			if err != nil { -				colourfn = strings.Replace(pg.img, ".jpg", ".png", 1) -				
logger.Println("Download failed; trying", colourfn) +				logger.Println("Downloading colour page to add to PDF", pg.img) +				colourfn := pg.img  				err = conn.Download(conn.WIPStorageId(), bookname+"/"+colourfn, filepath.Join(savedir, colourfn))  				if err != nil { -					logger.Println("Download failed; skipping page", pg.img) +					colourfn = strings.Replace(pg.img, ".jpg", ".png", 1) +					logger.Println("Download failed; trying", colourfn) +					err = conn.Download(conn.WIPStorageId(), bookname+"/"+colourfn, filepath.Join(savedir, colourfn)) +					if err != nil { +						logger.Println("Download failed; skipping page", pg.img) +					}  				} -			} -			if err == nil { -				err = fullsizepdf.AddPage(filepath.Join(savedir, colourfn), filepath.Join(savedir, pg.hocr), false) -				if err != nil { -					errc <- fmt.Errorf("Failed to add page %s to PDF: %s", pg.img, err) -					return -				} -				err = os.Remove(filepath.Join(savedir, colourfn)) -				if err != nil { -					errc <- err -					return +				if err == nil { +					err = fullsizepdf.AddPage(filepath.Join(savedir, colourfn), filepath.Join(savedir, pg.hocr), false) +					if err != nil { +						errc <- fmt.Errorf("Failed to add page %s to PDF: %s", pg.img, err) +						return +					} +					err = os.Remove(filepath.Join(savedir, colourfn)) +					if err != nil { +						errc <- err +						return +					}  				}  			} -		} - -		select { -		case <-ctx.Done(): -			errc <- ctx.Err() -			return -		default: -		} -		if colourhascontent { -			fn = filepath.Join(savedir, bookname+".original.pdf") -			err = fullsizepdf.Save(fn) -			if err != nil { -				errc <- fmt.Errorf("Failed to save full size pdf: %s", err) +			select { +			case <-ctx.Done(): +				errc <- ctx.Err()  				return +			default: +			} + +			if colourhascontent { +				fn = filepath.Join(savedir, bookname+".original.pdf") +				err = fullsizepdf.Save(fn) +				if err != nil { +					errc <- fmt.Errorf("Failed to save full size pdf: %s", err) +					return +				} +				up 
<- fn  			} -			up <- fn  		}  		select {  | 
