summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick White <git@njw.name>2022-03-11 13:36:59 +0000
committerNick White <git@njw.name>2022-03-11 13:36:59 +0000
commit9d1382b69700129a66541d786ba3b784eda56e36 (patch)
treea1e607d55dbd08b586f528eea07ea15079ed1684
parentd6e1cb61da7a9155023ce9dece96da79c7246790 (diff)
Add initial support for full-size PDF generation
Some issues: 1) The PDF generation library stores every page in memory while it constructs the document. That means that there's a higher chance of failure due to running out of memory with full-size PDFs. There's no getting around this except by improving the PDF generation library, which is not easy. 2) Currently I've just changed the pipeline to always generate these full-size PDFs, and then the rescribe tool will just delete them if they weren't requested. This is bad in particular because of point 1, and would probably cause failures in the server pipeline as a result. Therefore the plan is to add a tag to queue messages so that full-size generation can be selectively enabled. Also, it should be split from the loop with colour PDF generation, as holding them both in RAM at the same time is unnecessary.
-rw-r--r--cmd/rescribe/gui.go9
-rw-r--r--cmd/rescribe/main.go14
-rw-r--r--internal/pipeline/get.go2
-rw-r--r--internal/pipeline/pipeline.go19
4 files changed, 38 insertions, 6 deletions
diff --git a/cmd/rescribe/gui.go b/cmd/rescribe/gui.go
index 1c55462..36b4f16 100644
--- a/cmd/rescribe/gui.go
+++ b/cmd/rescribe/gui.go
@@ -340,6 +340,9 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess
wipe := widget.NewCheck("Automatically clean image sides", func(bool) {})
+ smallpdf := widget.NewCheck("Reduce size of searchable PDF", func(bool) {})
+ smallpdf.Checked = true
+
trainingLabel := widget.NewLabel("Language / Script")
trainingOpts := mkTrainingSelect([]string{training}, myWindow)
@@ -529,7 +532,7 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess
training = training[start:end]
}
- err = startProcess(ctx, log, cmd, bookdir, bookname, training, savedir, tessdir, !wipe.Checked)
+ err = startProcess(ctx, log, cmd, bookdir, bookname, training, savedir, tessdir, !wipe.Checked, !smallpdf.Checked)
if err != nil && strings.HasSuffix(err.Error(), "context canceled") {
progressBar.SetValue(0.0)
return
@@ -566,8 +569,8 @@ func startGui(log log.Logger, cmd string, gbookcmd string, training string, tess
trainingBits := container.New(layout.NewBorderLayout(nil, nil, trainingLabel, nil), trainingLabel, trainingOpts)
- fullContent = container.NewVBox(choices, chosen, trainingBits, wipe, gobtn, abortbtn, progressBar, detail)
- startContent := container.NewVBox(choices, trainingBits, wipe, gobtn, abortbtn, progressBar, detail)
+ fullContent = container.NewVBox(choices, chosen, trainingBits, wipe, smallpdf, gobtn, abortbtn, progressBar, detail)
+ startContent := container.NewVBox(choices, trainingBits, wipe, smallpdf, gobtn, abortbtn, progressBar, detail)
myWindow.SetContent(startContent)
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index 83153c6..96f6162 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -157,6 +157,7 @@ These training files are included in rescribe, and are always available:
`)
tesscmd := flag.String("tesscmd", deftesscmd, "The Tesseract executable to run. You may need to set this to the full path of Tesseract.exe if you're on Windows.")
wipe := flag.Bool("wipe", false, "Use wiper tool to remove noise like gutters from page before processing.")
+ fullpdf := flag.Bool("fullpdf", false, "Create a full-size searchable PDF (rather than a reduced size one).")
flag.Usage = func() {
fmt.Fprintf(flag.CommandLine.Output(), usage)
@@ -306,7 +307,7 @@ These training files are included in rescribe, and are always available:
ispdf = true
}
- err = startProcess(ctx, *verboselog, tessCommand, bookdir, bookname, trainingName, savedir, tessdir, !*wipe)
+ err = startProcess(ctx, *verboselog, tessCommand, bookdir, bookname, trainingName, savedir, tessdir, !*wipe, *fullpdf)
if err != nil {
log.Fatalln(err)
}
@@ -446,7 +447,7 @@ func rmIfNotImage(f string) error {
return nil
}
-func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, savedir string, tessdir string, nowipe bool) error {
+func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, savedir string, tessdir string, nowipe bool, fullpdf bool) error {
cmd := exec.Command(tessCommand, "--help")
pipeline.HideCmd(cmd)
_, err := cmd.Output()
@@ -544,8 +545,17 @@ func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bo
// to .pdf.
binpath := filepath.Join(savedir, bookname+".binarised.pdf")
colourpath := filepath.Join(savedir, bookname+".colour.pdf")
+ fullsizepath := filepath.Join(savedir, bookname+".original.pdf")
pdfpath := filepath.Join(savedir, bookname+" searchable.pdf")
+ // If full size pdf is requested, replace colour.pdf with it,
+ // otherwise just remove it
+ if fullpdf {
+ _ = os.Rename(fullsizepath, colourpath)
+ } else {
+ _ = os.Remove(fullsizepath)
+ }
+
_, err = os.Stat(binpath)
binexists := err == nil || os.IsExist(err)
_, err = os.Stat(colourpath)
diff --git a/internal/pipeline/get.go b/internal/pipeline/get.go
index de4ac3b..8fac060 100644
--- a/internal/pipeline/get.go
+++ b/internal/pipeline/get.go
@@ -68,7 +68,7 @@ func DownloadBestPngs(dir string, name string, conn Downloader) error {
func DownloadPdfs(dir string, name string, conn Downloader) error {
anydone := false
errmsg := ""
- for _, suffix := range []string{".colour.pdf", ".binarised.pdf"} {
+ for _, suffix := range []string{".colour.pdf", ".binarised.pdf", ".original.pdf"} {
key := filepath.Join(name, name+suffix)
fn := filepath.Join(dir, name+suffix)
err := conn.Download(conn.WIPStorageId(), key, fn)
diff --git a/internal/pipeline/pipeline.go b/internal/pipeline/pipeline.go
index 40ed02c..cbd179b 100644
--- a/internal/pipeline/pipeline.go
+++ b/internal/pipeline/pipeline.go
@@ -455,6 +455,12 @@ func Analyse(conn Downloader) func(context.Context, chan string, chan string, ch
errc <- fmt.Errorf("Failed to set up PDF: %s", err)
return
}
+ fullsizepdf := new(bookpipeline.Fpdf)
+ err = fullsizepdf.Setup()
+ if err != nil {
+ errc <- fmt.Errorf("Failed to set up PDF: %s", err)
+ return
+ }
binhascontent, colourhascontent := false, false
select {
@@ -551,6 +557,11 @@ func Analyse(conn Downloader) func(context.Context, chan string, chan string, ch
errc <- fmt.Errorf("Failed to add page %s to PDF: %s", pg.img, err)
return
}
+ err = fullsizepdf.AddPage(filepath.Join(savedir, colourfn), filepath.Join(savedir, pg.hocr), false)
+ if err != nil {
+ errc <- fmt.Errorf("Failed to add page %s to PDF: %s", pg.img, err)
+ return
+ }
colourhascontent = true
err = os.Remove(filepath.Join(savedir, colourfn))
if err != nil {
@@ -575,6 +586,14 @@ func Analyse(conn Downloader) func(context.Context, chan string, chan string, ch
return
}
up <- fn
+
+ fn = filepath.Join(savedir, bookname+".original.pdf")
+ err = fullsizepdf.Save(fn)
+ if err != nil {
+ errc <- fmt.Errorf("Failed to save full size pdf: %s", err)
+ return
+ }
+ up <- fn
}
select {