summary | refs | log | tree | commit | diff
path: root/cmd/rescribe
diff options
context:
space:
mode:
Diffstat (limited to 'cmd/rescribe')
-rw-r--r-- cmd/rescribe/gbook.go 12
-rw-r--r-- cmd/rescribe/gbook_test.go 17
-rw-r--r-- cmd/rescribe/gui.go 424
-rw-r--r-- cmd/rescribe/gui_test.go 77
-rw-r--r-- cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/174f82f558636f2a 2
-rw-r--r-- cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/60892155cf2f7963 2
6 files changed, 323 insertions, 211 deletions
diff --git a/cmd/rescribe/gbook.go b/cmd/rescribe/gbook.go
index 320f574..a011181 100644
--- a/cmd/rescribe/gbook.go
+++ b/cmd/rescribe/gbook.go
@@ -41,7 +41,9 @@ func formatAuthors(authors []string) string {
s = strings.ToUpper(s)
if len(s) > maxPartLength {
- s = s[:maxPartLength]
+ // truncate to maxPartLength
+ m := fmt.Sprintf("%%.%ds", maxPartLength)
+ s = fmt.Sprintf(m, s)
}
s = strings.Map(stripNonLetters, s)
@@ -63,7 +65,9 @@ func stripNonLetters(r rune) rune {
func formatTitle(title string) string {
s := strings.Map(stripNonLetters, title)
if len(s) > maxPartLength {
- s = s[:maxPartLength]
+ // truncate to maxPartLength
+ m := fmt.Sprintf("%%.%ds", maxPartLength)
+ s = fmt.Sprintf(m, s)
}
return s
}
@@ -232,7 +236,7 @@ func getBookIdFromUrl(url string) (string, error) {
if start >= 0 {
start += 4
- if len(url[start:]) < 12 {
+ if len(url) - start < 12 {
return "", fmt.Errorf("Could not find book ID in URL")
}
return url[start : start+12], nil
@@ -245,7 +249,7 @@ func getBookIdFromUrl(url string) (string, error) {
if start >= 0 {
start += 10
- if len(url[start:]) < 12 {
+ if len(url) - start < 12 {
return "", fmt.Errorf("Could not find book ID in URL")
}
return url[start : start+12], nil
diff --git a/cmd/rescribe/gbook_test.go b/cmd/rescribe/gbook_test.go
index 56b4b40..f7df595 100644
--- a/cmd/rescribe/gbook_test.go
+++ b/cmd/rescribe/gbook_test.go
@@ -8,7 +8,7 @@ import (
"testing"
)
-func Test_getBookIdFromUrl(t *testing.T) {
+func TestGetBookIdFromUrl(t *testing.T) {
cases := []struct {
url string
id string
@@ -29,3 +29,18 @@ func Test_getBookIdFromUrl(t *testing.T) {
})
}
}
+
+// FuzzGetBookIdFromUrl feeds arbitrary strings to getBookIdFromUrl,
+// starting from one example of each URL form used as a seed.
+func FuzzGetBookIdFromUrl(f *testing.F) {
+	// Seed corpus.
+	f.Add("https://books.google.it/books?id=QjQepCuN8JYC")
+	f.Add("https://www.google.it/books/edition/_/VJbr-Oe2au0C")
+
+	f.Fuzz(func(t *testing.T, url string) {
+		// The results are deliberately discarded; the fuzz target
+		// only needs the call to return without panicking.
+		_, _ = getBookIdFromUrl(url)
+	})
+}
diff --git a/cmd/rescribe/gui.go b/cmd/rescribe/gui.go
index f14f288..16e6bd8 100644
--- a/cmd/rescribe/gui.go
+++ b/cmd/rescribe/gui.go
@@ -78,7 +78,7 @@ func copyStdoutToChan() (chan rune, error) {
// copyStderrToChan creates a pipe to copy anything written
// to the file also to a rune channel.
// TODO: would be nice to merge this with copyStdoutToChan,
-// but a naive version using *os.File didn't work.
+// but a naive version using *os.File didn't work.
func copyStderrToChan() (chan rune, error) {
c := make(chan rune)
@@ -198,9 +198,9 @@ func mkTrainingSelect(extras []string, parent fyne.Window) *widget.Select {
return s
}
-// formatProgressBarText uses the progressPoints map to set the text for the progress bar
+// formatProgressBar uses the progressPoints map to set the text for the progress bar
// appropriately
-func formatProgressBarText(bar *widget.ProgressBar) func() string {
+func formatProgressBar(bar *widget.ProgressBar) func() string {
return func() string {
for i, v := range progressPoints {
if bar.Value == i {
@@ -218,6 +218,218 @@ func formatProgressBarText(bar *widget.ProgressBar) func() string {
}
}
+// updateProgress parses the last line of a log and updates a progress
+// bar appropriately.
+//
+// The last line is whatever follows the final newline in log (the whole
+// string if there is none). If it starts with a space followed by one of
+// the progressPoints labels, progressBar is set to that label's value.
+// The bar is left untouched when no label matches.
+func updateProgress(log string, progressBar *widget.ProgressBar) {
+	// Slice out the last line rather than splitting the whole log, which
+	// would allocate a slice of every line on each call. LastIndex
+	// returns -1 when there is no newline, so the slice then starts at 0.
+	lastline := log[strings.LastIndex(log, "\n")+1:]
+
+	for i, v := range progressPoints {
+		if !strings.HasPrefix(lastline, " "+v) {
+			continue
+		}
+		if v != "OCRing" {
+			progressBar.SetValue(i)
+			continue
+		}
+
+		// OCRing has a number of dots after it showing how many pages have been processed,
+		// which we can use to update progress bar more often
+		// TODO: calculate number of pages we expect, so this can be set accurately
+		numdots := strings.Count(lastline, ".")
+		// newval is never below 0.5, so there is no need to bump the bar
+		// to 0.5 first; it is capped just under 0.9.
+		newval := float64(0.5) + (float64(numdots) * float64(0.01))
+		if newval >= 0.9 {
+			newval = 0.89
+		}
+		progressBar.SetValue(newval)
+		break
+	}
+}
+
+// start sets up the gui to start the core process, and if all is well
+// it starts it.
+//
+// It returns immediately when dir is empty. Otherwise it obtains rune
+// channels for stdout and stderr, starts one goroutine per channel that
+// appends each rune to logarea (the stdout one also drives progressBar
+// via updateProgress), and finally runs letsGo in its own goroutine.
+// If either channel cannot be created an error dialog is shown on win,
+// the message is written to stderr, and nothing is started.
+func start(ctx context.Context, log *log.Logger, cmd string, tessdir string, gbookcmd string, dir string, training string, win fyne.Window, logarea *widget.Entry, progressBar *widget.ProgressBar, abortbtn *widget.Button, wipe bool, bigpdf bool, disableWidgets []fyne.Disableable) {
+	if dir == "" {
+		return
+	}
+
+	stdout, err := copyStdoutToChan()
+	if err != nil {
+		msg := fmt.Sprintf("Internal error\n\nError copying stdout to chan: %v\n", err)
+		dialog.ShowError(errors.New(msg), win)
+		// msg embeds err, so print it verbatim instead of using it as a
+		// format string.
+		fmt.Fprint(os.Stderr, msg)
+		return
+	}
+
+	// update log area with stdout in a concurrent goroutine, and parse it to update the progress bar
+	go func() {
+		for r := range stdout {
+			logarea.SetText(logarea.Text + string(r))
+			logarea.CursorRow = strings.Count(logarea.Text, "\n")
+			updateProgress(logarea.Text, progressBar)
+		}
+	}()
+
+	stderr, err := copyStderrToChan()
+	if err != nil {
+		msg := fmt.Sprintf("Internal error\n\nError copying stderr to chan: %v\n", err)
+		dialog.ShowError(errors.New(msg), win)
+		fmt.Fprint(os.Stderr, msg)
+		return
+	}
+
+	// update log area with stderr in a concurrent goroutine
+	go func() {
+		for r := range stderr {
+			logarea.SetText(logarea.Text + string(r))
+			logarea.CursorRow = strings.Count(logarea.Text, "\n")
+		}
+	}()
+
+	// Do this in a goroutine so the GUI remains responsive
+	go letsGo(ctx, log, cmd, tessdir, gbookcmd, dir, training, win, logarea, progressBar, abortbtn, wipe, bigpdf, disableWidgets)
+}
+
+// letsGo starts the core process.
+//
+// dir is either a path to a folder of images, a path to a PDF, or a
+// string of the form "Google Book: <12 character id> Save to: <dir>".
+// Depending on which, the book is downloaded with getGoogleBook or its
+// images are extracted with extractPdfImgs before startProcess is run.
+// If training contains a bracketed part, only the text between "[" and
+// "]" is passed on.
+//
+// While running, every widget in disableWidgets is disabled and
+// abortbtn is enabled; on success or failure that is reversed.
+// Failures are shown in a dialog on win and written to stderr, except
+// those whose error text indicates the run was cancelled.
+func letsGo(ctx context.Context, log *log.Logger, cmd string, tessdir string, gbookcmd string, dir string, training string, win fyne.Window, logarea *widget.Entry, progressBar *widget.ProgressBar, abortbtn *widget.Button, wipe bool, bigpdf bool, disableWidgets []fyne.Disableable) {
+	// reset puts the GUI back into its idle state after a failure.
+	reset := func() {
+		progressBar.SetValue(0.0)
+		for _, v := range disableWidgets {
+			v.Enable()
+		}
+		abortbtn.Disable()
+	}
+	// report shows msg in an error dialog and copies it to stderr. The
+	// message is printed verbatim, as it may embed arbitrary error text.
+	report := func(msg string) {
+		dialog.ShowError(errors.New(msg), win)
+		fmt.Fprint(os.Stderr, msg)
+	}
+
+	bookdir := dir
+	savedir := dir
+	bookname := strings.ReplaceAll(filepath.Base(dir), " ", "_")
+	isGbook := strings.HasPrefix(dir, "Google Book: ")
+
+	f, err := os.Stat(bookdir)
+	if err != nil && !isGbook {
+		report(fmt.Sprintf("Error opening %s: %v", bookdir, err))
+		reset()
+		return
+	}
+	// f is nil when os.Stat failed, which is tolerated above for Google
+	// Book targets, so guard against it before asking whether it is a
+	// directory.
+	isPdf := f != nil && !f.IsDir() && strings.HasSuffix(dir, ".pdf")
+
+	for _, v := range disableWidgets {
+		v.Disable()
+	}
+
+	abortbtn.Enable()
+
+	progressBar.SetValue(0.1)
+
+	if isGbook {
+		if gbookcmd == "" {
+			report("No getgbook found, can't download Google Book. Either set -gbookcmd on the command line, or use the official build which includes an embedded copy of getgbook.\n")
+			reset()
+			return
+		}
+		progressBar.SetValue(0.11)
+		// NOTE(review): assumes dir always carries a 12 character ID
+		// followed by " Save to: "; a shorter string would panic here.
+		// Confirm against the code that builds this string.
+		start := len("Google Book: ")
+		bookname = dir[start : start+12]
+
+		start = start + 12 + len(" Save to: ")
+		bookdir = dir[start:]
+		savedir = bookdir
+
+		fmt.Printf("Downloading Google Book\n")
+		d, err := getGoogleBook(ctx, gbookcmd, bookname, bookdir)
+		if err != nil {
+			// a killed download is not reported as an error
+			if !strings.HasSuffix(err.Error(), "signal: killed") {
+				report(fmt.Sprintf("Error downloading Google Book %s\n", bookname))
+			}
+			reset()
+			return
+		}
+		bookdir = d
+		savedir = d
+		bookname = filepath.Base(d)
+	}
+
+	if isPdf {
+		progressBar.SetValue(0.12)
+		bookdir, err = extractPdfImgs(ctx, bookdir)
+		if err != nil {
+			// a cancelled extraction is not reported as an error
+			if !strings.HasSuffix(err.Error(), "context canceled") {
+				report(fmt.Sprintf("Error opening PDF %s: %v\n", bookdir, err))
+			}
+			reset()
+			return
+		}
+
+		// happens if extractPdfImgs recovers from a PDF panic,
+		// which will occur if we encounter an image we can't decode
+		if bookdir == "" {
+			report("Error opening PDF\nThe format of this PDF is not supported, extract the images to .jpg manually into a\nfolder first, using a tool like the PDF image extractor at https://pdfcandy.com/extract-images.html.\n")
+			reset()
+			return
+		}
+
+		savedir = strings.TrimSuffix(savedir, ".pdf")
+		bookname = strings.TrimSuffix(bookname, ".pdf")
+	}
+
+	// Reduce "description [name]" to "name". The closing bracket is
+	// searched for after the opening one, so a stray or missing "]"
+	// leaves training unchanged instead of causing a slice panic.
+	if open := strings.Index(training, "["); open >= 0 {
+		if n := strings.Index(training[open+1:], "]"); n >= 0 {
+			training = training[open+1 : open+1+n]
+		}
+	}
+
+	err = startProcess(ctx, log, cmd, bookdir, bookname, training, savedir, tessdir, wipe, bigpdf)
+	if err != nil && strings.HasSuffix(err.Error(), "context canceled") {
+		// on cancellation only the progress bar is reset here
+		progressBar.SetValue(0.0)
+		return
+	}
+	if err != nil {
+		msg := fmt.Sprintf("Error during processing: %v\n", err)
+		if strings.HasSuffix(err.Error(), "No images found") && isPdf {
+			msg = "Error opening PDF\nNo images found in the PDF. Most likely the format of this PDF is not supported,\nextract the images to .jpg manually into a folder first, using a tool like\nthe PDF image extractor at https://pdfcandy.com/extract-images.html.\n"
+		}
+		report(msg)
+		reset()
+		return
+	}
+
+	progressBar.SetValue(1.0)
+
+	for _, v := range disableWidgets {
+		v.Enable()
+	}
+	abortbtn.Disable()
+
+	msg := fmt.Sprintf("OCR process finished successfully.\n\nYour completed files have been saved in:\n%s", savedir)
+	dialog.ShowInformation("OCR Complete", msg, win)
+}
+
// startGui starts the gui process
func startGui(log *log.Logger, cmd string, gbookcmd string, training string, tessdir string) error {
myApp := app.New()
@@ -322,7 +534,7 @@ func startGui(log *log.Logger, cmd string, gbookcmd string, training string, tes
trainingOpts := mkTrainingSelect([]string{training}, myWindow)
progressBar := widget.NewProgressBar()
- progressBar.TextFormatter = formatProgressBarText(progressBar)
+ progressBar.TextFormatter = formatProgressBar(progressBar)
logarea := widget.NewMultiLineEntry()
logarea.Disable()
@@ -350,209 +562,9 @@ func startGui(log *log.Logger, cmd string, gbookcmd string, training string, tes
abortbtn.Disable()
gobtn.OnTapped = func() {
- if dir.Text == "" {
- return
- }
-
- stdout, err := copyStdoutToChan()
- if err != nil {
- msg := fmt.Sprintf("Internal error\n\nError copying stdout to chan: %v\n", err)
- dialog.ShowError(errors.New(msg), myWindow)
- fmt.Fprintf(os.Stderr, msg)
- return
- }
-
- // update log area with stdout in a concurrent goroutine, and parse it to update the progress bar
- go func() {
- for r := range stdout {
- logarea.SetText(logarea.Text + string(r))
- logarea.CursorRow = strings.Count(logarea.Text, "\n")
-
- lines := strings.Split(logarea.Text, "\n")
- lastline := lines[len(lines)-1]
- for i, v := range progressPoints {
- if strings.HasPrefix(lastline, " "+v) {
- // OCRing has a number of dots after it showing how many pages have been processed,
- // which we can use to update progress bar more often
- // TODO: calculate number of pages we expect, so this can be set accurately
- if v == "OCRing" {
- if progressBar.Value < 0.5 {
- progressBar.SetValue(0.5)
- }
- numdots := strings.Count(lastline, ".")
- newval := float64(0.5) + (float64(numdots) * float64(0.01))
- if newval >= 0.9 {
- newval = 0.89
- }
- progressBar.SetValue(newval)
- break
- }
- progressBar.SetValue(i)
- }
- }
- }
- }()
-
- stderr, err := copyStderrToChan()
- if err != nil {
- msg := fmt.Sprintf("Internal error\n\nError copying stdout to chan: %v\n", err)
- dialog.ShowError(errors.New(msg), myWindow)
- fmt.Fprintf(os.Stderr, msg)
- return
- }
-
- // update log area with stderr in a concurrent goroutine
- go func() {
- for r := range stderr {
- logarea.SetText(logarea.Text + string(r))
- logarea.CursorRow = strings.Count(logarea.Text, "\n")
- }
- }()
-
- bookdir := dir.Text
- savedir := dir.Text
- bookname := strings.ReplaceAll(filepath.Base(dir.Text), " ", "_")
-
- f, err := os.Stat(bookdir)
- if err != nil && !strings.HasPrefix(bookdir, "Google Book: ") {
- msg := fmt.Sprintf("Error opening %s: %v", bookdir, err)
- dialog.ShowError(errors.New(msg), myWindow)
- fmt.Fprintf(os.Stderr, msg)
-
- progressBar.SetValue(0.0)
- for _, v := range disableWidgets {
- v.Enable()
- }
- abortbtn.Disable()
- return
- }
-
- // Do this in a goroutine so the GUI remains responsive
- go func() {
- for _, v := range disableWidgets {
- v.Disable()
- }
-
- abortbtn.Enable()
-
- progressBar.SetValue(0.1)
-
- if strings.HasPrefix(dir.Text, "Google Book: ") {
- if gbookcmd == "" {
- msg := fmt.Sprintf("No getgbook found, can't download Google Book. Either set -gbookcmd on the command line, or use the official build which includes an embedded copy of getgbook.\n")
- dialog.ShowError(errors.New(msg), myWindow)
- fmt.Fprintf(os.Stderr, msg)
- progressBar.SetValue(0.0)
- for _, v := range disableWidgets {
- v.Enable()
- }
- abortbtn.Disable()
- return
- }
- progressBar.SetValue(0.11)
- start := len("Google Book: ")
- bookname = dir.Text[start : start+12]
-
- start = start + 12 + len(" Save to: ")
- bookdir = dir.Text[start:]
- savedir = bookdir
-
- fmt.Printf("Downloading Google Book\n")
- d, err := getGoogleBook(ctx, gbookcmd, bookname, bookdir)
- if err != nil {
- if !strings.HasSuffix(err.Error(), "signal: killed") {
- msg := fmt.Sprintf("Error downloading Google Book %s\n", bookname)
- dialog.ShowError(errors.New(msg), myWindow)
- fmt.Fprintf(os.Stderr, msg)
- }
- progressBar.SetValue(0.0)
- for _, v := range disableWidgets {
- v.Enable()
- }
- abortbtn.Disable()
- return
- }
- bookdir = d
- savedir = d
- bookname = filepath.Base(d)
- }
-
- if strings.HasSuffix(dir.Text, ".pdf") && !f.IsDir() {
- progressBar.SetValue(0.12)
- bookdir, err = extractPdfImgs(ctx, bookdir)
- if err != nil {
- if !strings.HasSuffix(err.Error(), "context canceled") {
- msg := fmt.Sprintf("Error opening PDF %s: %v\n", bookdir, err)
- dialog.ShowError(errors.New(msg), myWindow)
- fmt.Fprintf(os.Stderr, msg)
- }
-
- progressBar.SetValue(0.0)
- for _, v := range disableWidgets {
- v.Enable()
- }
- abortbtn.Disable()
- return
- }
-
- // happens if extractPdfImgs recovers from a PDF panic,
- // which will occur if we encounter an image we can't decode
- if bookdir == "" {
- msg := fmt.Sprintf("Error opening PDF\nThe format of this PDF is not supported, extract the images to .jpg manually into a\nfolder first, using a tool like the PDF image extractor at https://pdfcandy.com/extract-images.html.\n")
- dialog.ShowError(errors.New(msg), myWindow)
- fmt.Fprintf(os.Stderr, msg)
-
- progressBar.SetValue(0.0)
- for _, v := range disableWidgets {
- v.Enable()
- }
- abortbtn.Disable()
- return
- }
-
- savedir = strings.TrimSuffix(savedir, ".pdf")
- bookname = strings.TrimSuffix(bookname, ".pdf")
- }
-
- training := trainingOpts.Selected
- if strings.Contains(training, "[") {
- start := strings.Index(training, "[") + 1
- end := strings.Index(training, "]")
- training = training[start:end]
- }
-
- err = startProcess(ctx, log, cmd, bookdir, bookname, training, savedir, tessdir, !wipe.Checked, bigpdf.Checked)
- if err != nil && strings.HasSuffix(err.Error(), "context canceled") {
- progressBar.SetValue(0.0)
- return
- }
- if err != nil {
- msg := fmt.Sprintf("Error during processing: %v\n", err)
- if strings.HasSuffix(err.Error(), "No images found") && strings.HasSuffix(dir.Text, ".pdf") && !f.IsDir() {
- msg = fmt.Sprintf("Error opening PDF\nNo images found in the PDF. Most likely the format of this PDF is not supported,\nextract the images to .jpg manually into a folder first, using a tool like\nthe PDF image extractor at https://pdfcandy.com/extract-images.html.\n")
- }
- dialog.ShowError(errors.New(msg), myWindow)
- fmt.Fprintf(os.Stderr, msg)
-
- progressBar.SetValue(0.0)
- for _, v := range disableWidgets {
- v.Enable()
- }
- abortbtn.Disable()
- return
- }
-
- progressBar.SetValue(1.0)
-
- for _, v := range disableWidgets {
- v.Enable()
- }
- abortbtn.Disable()
-
- msg := fmt.Sprintf("OCR process finished successfully.\n\nYour completed files have been saved in:\n%s", savedir)
- dialog.ShowInformation("OCR Complete", msg, myWindow)
- }()
+ start(ctx, log, cmd, tessdir, gbookcmd, dir.Text, trainingOpts.Selected, myWindow, logarea, progressBar, abortbtn, !wipe.Checked, bigpdf.Checked, disableWidgets)
}
+
gobtn.Disable()
choices := container.New(layout.NewGridLayout(3), folderBtn, pdfBtn, gbookBtn)
diff --git a/cmd/rescribe/gui_test.go b/cmd/rescribe/gui_test.go
new file mode 100644
index 0000000..99a924f
--- /dev/null
+++ b/cmd/rescribe/gui_test.go
@@ -0,0 +1,77 @@
+// Copyright 2022 Nick White.
+// Use of this source code is governed by the GPLv3
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "fmt"
+ "strings"
+ "testing"
+
+ "fyne.io/fyne/v2/app"
+ "fyne.io/fyne/v2/widget"
+)
+
+// TestFormatProgressBar checks the text produced by the formatter
+// returned from formatProgressBar for a range of progress bar values.
+func TestFormatProgressBar(t *testing.T) {
+	type testCase struct {
+		val float64
+		str string
+	}
+	tests := []testCase{
+		{val: 0.0, str: ""},
+		{val: 0.01, str: "Processing"},
+		{val: 0.11, str: "Downloading"},
+		{val: 0.12, str: "Processing PDF"},
+		{val: 0.2, str: "Preprocessing"},
+		{val: 0.5, str: "OCRing"},
+		{val: 0.55, str: "OCRing"},
+		{val: 0.89, str: "OCRing"},
+		{val: 0.9, str: "Analysing"},
+		{val: 1.0, str: "Done"},
+		{val: 1.1, str: "Processing"},
+	}
+
+	_ = app.New() // shouldn't be needed for test but we get a panic without it
+	bar := widget.NewProgressBar()
+	// The formatter reads bar.Value when it is called, so a single
+	// instance serves every case.
+	format := formatProgressBar(bar)
+
+	for _, tc := range tests {
+		name := fmt.Sprintf("%s_%.1f", tc.str, tc.val)
+		t.Run(name, func(t *testing.T) {
+			bar.Value = tc.val
+			if got := format(); got != tc.str {
+				t.Fatalf("Expected %s, got %s", tc.str, got)
+			}
+		})
+	}
+}
+
+// TestUpdateProgress checks the value updateProgress leaves on a
+// progress bar, starting from zero, for a range of log contents.
+func TestUpdateProgress(t *testing.T) {
+	type testCase struct {
+		log string
+		val float64
+	}
+	tests := []testCase{
+		{log: "Downloading", val: 0.11},
+		{log: "Preprocessing", val: 0.2},
+		{log: "Preprocessing\nOCRing", val: 0.5},
+		{log: "Preprocessing\nOCRing...", val: 0.53},
+		{log: "OCRing........................................", val: 0.89},
+		{log: "OCRing..\nAnalysing", val: 0.9},
+		{log: "Done", val: 1.0},
+		{log: "Weirdness", val: 0.0},
+	}
+
+	_ = app.New() // shouldn't be needed for test but we get a panic without it
+	bar := widget.NewProgressBar()
+
+	for _, tc := range tests {
+		t.Run(tc.log, func(t *testing.T) {
+			// updateProgress matches on a leading space, so put one
+			// at the start of every line
+			indented := strings.ReplaceAll(" "+tc.log, "\n", "\n ")
+			bar.Value = 0.0
+			updateProgress(indented, bar)
+			if bar.Value != tc.val {
+				t.Fatalf("Expected %f, got %f", tc.val, bar.Value)
+			}
+		})
+	}
+}
diff --git a/cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/174f82f558636f2a b/cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/174f82f558636f2a
new file mode 100644
index 0000000..1a7ed9c
--- /dev/null
+++ b/cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/174f82f558636f2a
@@ -0,0 +1,2 @@
+go test fuzz v1
+string("https://www0google\xf7/books/edition/_/")
diff --git a/cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/60892155cf2f7963 b/cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/60892155cf2f7963
new file mode 100644
index 0000000..b637539
--- /dev/null
+++ b/cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/60892155cf2f7963
@@ -0,0 +1,2 @@
+go test fuzz v1
+string("https://Books.google\xc1&id=")