diff options
Diffstat (limited to 'cmd/rescribe')
| -rw-r--r-- | cmd/rescribe/gbook.go | 12 | ||||
| -rw-r--r-- | cmd/rescribe/gbook_test.go | 17 | ||||
| -rw-r--r-- | cmd/rescribe/gui.go | 424 | ||||
| -rw-r--r-- | cmd/rescribe/gui_test.go | 77 | ||||
| -rw-r--r-- | cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/174f82f558636f2a | 2 | ||||
| -rw-r--r-- | cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/60892155cf2f7963 | 2 | 
6 files changed, 323 insertions, 211 deletions
| diff --git a/cmd/rescribe/gbook.go b/cmd/rescribe/gbook.go index 320f574..a011181 100644 --- a/cmd/rescribe/gbook.go +++ b/cmd/rescribe/gbook.go @@ -41,7 +41,9 @@ func formatAuthors(authors []string) string {  	s = strings.ToUpper(s)  	if len(s) > maxPartLength { -		s = s[:maxPartLength] +		// truncate to maxPartLength +		m := fmt.Sprintf("%%.%ds", maxPartLength) +		s = fmt.Sprintf(m, s)  	}  	s = strings.Map(stripNonLetters, s) @@ -63,7 +65,9 @@ func stripNonLetters(r rune) rune {  func formatTitle(title string) string {  	s := strings.Map(stripNonLetters, title)  	if len(s) > maxPartLength { -		s = s[:maxPartLength] +		// truncate to maxPartLength +		m := fmt.Sprintf("%%.%ds", maxPartLength) +		s = fmt.Sprintf(m, s)  	}  	return s  } @@ -232,7 +236,7 @@ func getBookIdFromUrl(url string) (string, error) {  		if start >= 0 {  			start += 4 -			if len(url[start:]) < 12 { +			if len(url) - start < 12 {  				return "", fmt.Errorf("Could not find book ID in URL")  			}  			return url[start : start+12], nil @@ -245,7 +249,7 @@ func getBookIdFromUrl(url string) (string, error) {  		if start >= 0 {  			start += 10 -			if len(url[start:]) < 12 { +			if len(url) - start < 12 {  				return "", fmt.Errorf("Could not find book ID in URL")  			}  			return url[start : start+12], nil diff --git a/cmd/rescribe/gbook_test.go b/cmd/rescribe/gbook_test.go index 56b4b40..f7df595 100644 --- a/cmd/rescribe/gbook_test.go +++ b/cmd/rescribe/gbook_test.go @@ -8,7 +8,7 @@ import (  	"testing"  ) -func Test_getBookIdFromUrl(t *testing.T) { +func TestGetBookIdFromUrl(t *testing.T) {  	cases := []struct {  		url string  		id  string @@ -29,3 +29,18 @@ func Test_getBookIdFromUrl(t *testing.T) {  		})  	}  } + +func FuzzGetBookIdFromUrl(f *testing.F) { +	cases := []string { +		"https://books.google.it/books?id=QjQepCuN8JYC", +		"https://www.google.it/books/edition/_/VJbr-Oe2au0C", +	} + +	for _, c := range cases { +		f.Add(c) +	} + +	f.Fuzz(func(t *testing.T, url string) { +		getBookIdFromUrl(url) +	}) +} diff --git a/cmd/rescribe/gui.go b/cmd/rescribe/gui.go index f14f288..16e6bd8 100644 --- a/cmd/rescribe/gui.go +++ b/cmd/rescribe/gui.go @@ -78,7 +78,7 @@ func copyStdoutToChan() (chan rune, error) {  // copyStderrToChan creates a pipe to copy anything written  // to the file also to a rune channel.  // TODO: would be nice to merge this with copyStdoutToChan, -//       but a naive version using *os.File didn't work. +// but a naive version using *os.File didn't work.  func copyStderrToChan() (chan rune, error) {  	c := make(chan rune) @@ -198,9 +198,9 @@ func mkTrainingSelect(extras []string, parent fyne.Window) *widget.Select {  	return s  } -// formatProgressBarText uses the progressPoints map to set the text for the progress bar +// formatProgressBar uses the progressPoints map to set the text for the progress bar  // appropriately -func formatProgressBarText(bar *widget.ProgressBar) func() string { +func formatProgressBar(bar *widget.ProgressBar) func() string {  	return func() string {  		for i, v := range progressPoints {  			if bar.Value == i { @@ -218,6 +218,218 @@ func formatProgressBarText(bar *widget.ProgressBar) func() string {  	}  } +// updateProgress parses the last line of a log and updates a progress +// bar appropriately. +func updateProgress(log string, progressBar *widget.ProgressBar) { +	lines := strings.Split(log, "\n") +	lastline := lines[len(lines)-1] +	for i, v := range progressPoints { +		if strings.HasPrefix(lastline, "  "+v) { +			// OCRing has a number of dots after it showing how many pages have been processed, +			// which we can use to update progress bar more often +			// TODO: calculate number of pages we expect, so this can be set accurately +			if v == "OCRing" { +				if progressBar.Value < 0.5 { +					progressBar.SetValue(0.5) +				} +				numdots := strings.Count(lastline, ".") +				newval := float64(0.5) + (float64(numdots) * float64(0.01)) +				if newval >= 0.9 { +					newval = 0.89 +				} +				progressBar.SetValue(newval) +				break +			} +			progressBar.SetValue(i) +		} +	} +} + +// start sets up the gui to start the core process, and if all is well +// it starts it +func start(ctx context.Context, log *log.Logger, cmd string, tessdir string, gbookcmd string, dir string, training string, win fyne.Window, logarea *widget.Entry, progressBar *widget.ProgressBar, abortbtn *widget.Button, wipe bool, bigpdf bool, disableWidgets []fyne.Disableable) { +	if dir == "" { +		return +	} + +	stdout, err := copyStdoutToChan() +	if err != nil { +		msg := fmt.Sprintf("Internal error\n\nError copying stdout to chan: %v\n", err) +		dialog.ShowError(errors.New(msg), win) +		fmt.Fprintf(os.Stderr, msg) +		return +	} +	go func() { +		for r := range stdout { +			logarea.SetText(logarea.Text + string(r)) +			logarea.CursorRow = strings.Count(logarea.Text, "\n") +			updateProgress(logarea.Text, progressBar) +		} +	}() + +	stderr, err := copyStderrToChan() +	if err != nil { +		msg := fmt.Sprintf("Internal error\n\nError copying stdout to chan: %v\n", err) +		dialog.ShowError(errors.New(msg), win) +		fmt.Fprintf(os.Stderr, msg) +		return +	} +	go func() { +		for r := range stderr { +			logarea.SetText(logarea.Text + string(r)) +			logarea.CursorRow = strings.Count(logarea.Text, "\n") +		} +	}() + +	// Do this in a goroutine so the GUI remains responsive +	go func() { +		letsGo(ctx, log, cmd, tessdir, gbookcmd, dir, training, win, logarea, progressBar, abortbtn, wipe, bigpdf, disableWidgets) +	}() +} + +// letsGo starts the core process +func letsGo(ctx context.Context, log *log.Logger, cmd string, tessdir string, gbookcmd string, dir string, training string, win fyne.Window, logarea *widget.Entry, progressBar *widget.ProgressBar, abortbtn *widget.Button, wipe bool, bigpdf bool, disableWidgets []fyne.Disableable) { +	bookdir := dir +	savedir := dir +	bookname := strings.ReplaceAll(filepath.Base(dir), " ", "_") + +	f, err := os.Stat(bookdir) +	if err != nil && !strings.HasPrefix(bookdir, "Google Book: ") { +		msg := fmt.Sprintf("Error opening %s: %v", bookdir, err) +		dialog.ShowError(errors.New(msg), win) +		fmt.Fprintf(os.Stderr, msg) + +		progressBar.SetValue(0.0) +		for _, v := range disableWidgets { +			v.Enable() +		} +		abortbtn.Disable() +		return +	} + +	for _, v := range disableWidgets { +		v.Disable() +	} + +	abortbtn.Enable() + +	progressBar.SetValue(0.1) + +	if strings.HasPrefix(dir, "Google Book: ") { +		if gbookcmd == "" { +			msg := fmt.Sprintf("No getgbook found, can't download Google Book. Either set -gbookcmd on the command line, or use the official build which includes an embedded copy of getgbook.\n") +			dialog.ShowError(errors.New(msg), win) +			fmt.Fprintf(os.Stderr, msg) +			progressBar.SetValue(0.0) +			for _, v := range disableWidgets { +				v.Enable() +			} +			abortbtn.Disable() +			return +		} +		progressBar.SetValue(0.11) +		start := len("Google Book: ") +		bookname = dir[start : start+12] + +		start = start + 12 + len(" Save to: ") +		bookdir = dir[start:] +		savedir = bookdir + +		fmt.Printf("Downloading Google Book\n") +		d, err := getGoogleBook(ctx, gbookcmd, bookname, bookdir) +		if err != nil { +			if !strings.HasSuffix(err.Error(), "signal: killed") { +				msg := fmt.Sprintf("Error downloading Google Book %s\n", bookname) +				dialog.ShowError(errors.New(msg), win) +				fmt.Fprintf(os.Stderr, msg) +			} +			progressBar.SetValue(0.0) +			for _, v := range disableWidgets { +				v.Enable() +			} +			abortbtn.Disable() +			return +		} +		bookdir = d +		savedir = d +		bookname = filepath.Base(d) +	} + +	if strings.HasSuffix(dir, ".pdf") && !f.IsDir() { +		progressBar.SetValue(0.12) +		bookdir, err = extractPdfImgs(ctx, bookdir) +		if err != nil { +			if !strings.HasSuffix(err.Error(), "context canceled") { +				msg := fmt.Sprintf("Error opening PDF %s: %v\n", bookdir, err) +				dialog.ShowError(errors.New(msg), win) +				fmt.Fprintf(os.Stderr, msg) +			} + +			progressBar.SetValue(0.0) +			for _, v := range disableWidgets { +				v.Enable() +			} +			abortbtn.Disable() +			return +		} + +		// happens if extractPdfImgs recovers from a PDF panic, +		// which will occur if we encounter an image we can't decode +		if bookdir == "" { +			msg := fmt.Sprintf("Error opening PDF\nThe format of this PDF is not supported, extract the images to .jpg manually into a\nfolder first, using a tool like the PDF image extractor at https://pdfcandy.com/extract-images.html.\n") +			dialog.ShowError(errors.New(msg), win) +			fmt.Fprintf(os.Stderr, msg) + +			progressBar.SetValue(0.0) +			for _, v := range disableWidgets { +				v.Enable() +			} +			abortbtn.Disable() +			return +		} + +		savedir = strings.TrimSuffix(savedir, ".pdf") +		bookname = strings.TrimSuffix(bookname, ".pdf") +	} + +	if strings.Contains(training, "[") { +		start := strings.Index(training, "[") + 1 +		end := strings.Index(training, "]") +		training = training[start:end] +	} + +	err = startProcess(ctx, log, cmd, bookdir, bookname, training, savedir, tessdir, wipe, bigpdf) +	if err != nil && strings.HasSuffix(err.Error(), "context canceled") { +		progressBar.SetValue(0.0) +		return +	} +	if err != nil { +		msg := fmt.Sprintf("Error during processing: %v\n", err) +		if strings.HasSuffix(err.Error(), "No images found") && strings.HasSuffix(dir.Text, ".pdf") && !f.IsDir() { +			msg = fmt.Sprintf("Error opening PDF\nNo images found in the PDF. Most likely the format of this PDF is not supported,\nextract the images to .jpg manually into a folder first, using a tool like\nthe PDF image extractor at https://pdfcandy.com/extract-images.html.\n") +		} +		dialog.ShowError(errors.New(msg), win) +		fmt.Fprintf(os.Stderr, msg) + +		progressBar.SetValue(0.0) +		for _, v := range disableWidgets { +			v.Enable() +		} +		abortbtn.Disable() +		return +	} + +	progressBar.SetValue(1.0) + +	for _, v := range disableWidgets { +		v.Enable() +	} +	abortbtn.Disable() + +	msg := fmt.Sprintf("OCR process finished successfully.\n\nYour completed files have been saved in:\n%s", savedir) +	dialog.ShowInformation("OCR Complete", msg, win) +} +  // startGui starts the gui process  func startGui(log *log.Logger, cmd string, gbookcmd string, training string, tessdir string) error {  	myApp := app.New() @@ -322,7 +534,7 @@ func startGui(log *log.Logger, cmd string, gbookcmd string, training string, tes  	trainingOpts := mkTrainingSelect([]string{training}, myWindow)  	progressBar := widget.NewProgressBar() -	progressBar.TextFormatter = formatProgressBarText(progressBar) +	progressBar.TextFormatter = formatProgressBar(progressBar)  	logarea := widget.NewMultiLineEntry()  	logarea.Disable() @@ -350,209 +562,9 @@ func startGui(log *log.Logger, cmd string, gbookcmd string, training string, tes  	abortbtn.Disable()  	gobtn.OnTapped = func() { -		if dir.Text == "" { -			return -		} - -		stdout, err := copyStdoutToChan() -		if err != nil { -			msg := fmt.Sprintf("Internal error\n\nError copying stdout to chan: %v\n", err) -			dialog.ShowError(errors.New(msg), myWindow) -			fmt.Fprintf(os.Stderr, msg) -			return -		} - -		// update log area with stdout in a concurrent goroutine, and parse it to update the progress bar -		go func() { -			for r := range stdout { -				logarea.SetText(logarea.Text + string(r)) -				logarea.CursorRow = strings.Count(logarea.Text, "\n") - -				lines := strings.Split(logarea.Text, "\n") -				lastline := lines[len(lines)-1] -				for i, v := range progressPoints { -					if strings.HasPrefix(lastline, "  "+v) { -						// OCRing has a number of dots after it showing how many pages have been processed, -						// which we can use to update progress bar more often -						// TODO: calculate number of pages we expect, so this can be set accurately -						if v == "OCRing" { -							if progressBar.Value < 0.5 { -								progressBar.SetValue(0.5) -							} -							numdots := strings.Count(lastline, ".") -							newval := float64(0.5) + (float64(numdots) * float64(0.01)) -							if newval >= 0.9 { -								newval = 0.89 -							} -							progressBar.SetValue(newval) -							break -						} -						progressBar.SetValue(i) -					} -				} -			} -		}() - -		stderr, err := copyStderrToChan() -		if err != nil { -			msg := fmt.Sprintf("Internal error\n\nError copying stdout to chan: %v\n", err) -			dialog.ShowError(errors.New(msg), myWindow) -			fmt.Fprintf(os.Stderr, msg) -			return -		} - -		// update log area with stderr in a concurrent goroutine -		go func() { -			for r := range stderr { -				logarea.SetText(logarea.Text + string(r)) -				logarea.CursorRow = strings.Count(logarea.Text, "\n") -			} -		}() - -		bookdir := dir.Text -		savedir := dir.Text -		bookname := strings.ReplaceAll(filepath.Base(dir.Text), " ", "_") - -		f, err := os.Stat(bookdir) -		if err != nil && !strings.HasPrefix(bookdir, "Google Book: ") { -			msg := fmt.Sprintf("Error opening %s: %v", bookdir, err) -			dialog.ShowError(errors.New(msg), myWindow) -			fmt.Fprintf(os.Stderr, msg) - -			progressBar.SetValue(0.0) -			for _, v := range disableWidgets { -				v.Enable() -			} -			abortbtn.Disable() -			return -		} - -		// Do this in a goroutine so the GUI remains responsive -		go func() { -			for _, v := range disableWidgets { -				v.Disable() -			} - -			abortbtn.Enable() - -			progressBar.SetValue(0.1) - -			if strings.HasPrefix(dir.Text, "Google Book: ") { -				if gbookcmd == "" { -					msg := fmt.Sprintf("No getgbook found, can't download Google Book. Either set -gbookcmd on the command line, or use the official build which includes an embedded copy of getgbook.\n") -					dialog.ShowError(errors.New(msg), myWindow) -					fmt.Fprintf(os.Stderr, msg) -					progressBar.SetValue(0.0) -					for _, v := range disableWidgets { -						v.Enable() -					} -					abortbtn.Disable() -					return -				} -				progressBar.SetValue(0.11) -				start := len("Google Book: ") -				bookname = dir.Text[start : start+12] - -				start = start + 12 + len(" Save to: ") -				bookdir = dir.Text[start:] -				savedir = bookdir - -				fmt.Printf("Downloading Google Book\n") -				d, err := getGoogleBook(ctx, gbookcmd, bookname, bookdir) -				if err != nil { -					if !strings.HasSuffix(err.Error(), "signal: killed") { -						msg := fmt.Sprintf("Error downloading Google Book %s\n", bookname) -						dialog.ShowError(errors.New(msg), myWindow) -						fmt.Fprintf(os.Stderr, msg) -					} -					progressBar.SetValue(0.0) -					for _, v := range disableWidgets { -						v.Enable() -					} -					abortbtn.Disable() -					return -				} -				bookdir = d -				savedir = d -				bookname = filepath.Base(d) -			} - -			if strings.HasSuffix(dir.Text, ".pdf") && !f.IsDir() { -				progressBar.SetValue(0.12) -				bookdir, err = extractPdfImgs(ctx, bookdir) -				if err != nil { -					if !strings.HasSuffix(err.Error(), "context canceled") { -						msg := fmt.Sprintf("Error opening PDF %s: %v\n", bookdir, err) -						dialog.ShowError(errors.New(msg), myWindow) -						fmt.Fprintf(os.Stderr, msg) -					} - -					progressBar.SetValue(0.0) -					for _, v := range disableWidgets { -						v.Enable() -					} -					abortbtn.Disable() -					return -				} - -				// happens if extractPdfImgs recovers from a PDF panic, -				// which will occur if we encounter an image we can't decode -				if bookdir == "" { -					msg := fmt.Sprintf("Error opening PDF\nThe format of this PDF is not supported, extract the images to .jpg manually into a\nfolder first, using a tool like the PDF image extractor at https://pdfcandy.com/extract-images.html.\n") -					dialog.ShowError(errors.New(msg), myWindow) -					fmt.Fprintf(os.Stderr, msg) - -					progressBar.SetValue(0.0) -					for _, v := range disableWidgets { -						v.Enable() -					} -					abortbtn.Disable() -					return -				} - -				savedir = strings.TrimSuffix(savedir, ".pdf") -				bookname = strings.TrimSuffix(bookname, ".pdf") -			} - -			training := trainingOpts.Selected -			if strings.Contains(training, "[") { -				start := strings.Index(training, "[") + 1 -				end := strings.Index(training, "]") -				training = training[start:end] -			} - -			err = startProcess(ctx, log, cmd, bookdir, bookname, training, savedir, tessdir, !wipe.Checked, bigpdf.Checked) -			if err != nil && strings.HasSuffix(err.Error(), "context canceled") { -				progressBar.SetValue(0.0) -				return -			} -			if err != nil { -				msg := fmt.Sprintf("Error during processing: %v\n", err) -				if strings.HasSuffix(err.Error(), "No images found") && strings.HasSuffix(dir.Text, ".pdf") && !f.IsDir() { -					msg = fmt.Sprintf("Error opening PDF\nNo images found in the PDF. Most likely the format of this PDF is not supported,\nextract the images to .jpg manually into a folder first, using a tool like\nthe PDF image extractor at https://pdfcandy.com/extract-images.html.\n") -				} -				dialog.ShowError(errors.New(msg), myWindow) -				fmt.Fprintf(os.Stderr, msg) - -				progressBar.SetValue(0.0) -				for _, v := range disableWidgets { -					v.Enable() -				} -				abortbtn.Disable() -				return -			} - -			progressBar.SetValue(1.0) - -			for _, v := range disableWidgets { -				v.Enable() -			} -			abortbtn.Disable() - -			msg := fmt.Sprintf("OCR process finished successfully.\n\nYour completed files have been saved in:\n%s", savedir) -			dialog.ShowInformation("OCR Complete", msg, myWindow) -		}() +		start(ctx, log, cmd, tessdir, gbookcmd, dir.Text, trainingOpts.Selected, myWindow, logarea, progressBar, abortbtn, !wipe.Checked, bigpdf.Checked, disableWidgets)  	} +  	gobtn.Disable()  	choices := container.New(layout.NewGridLayout(3), folderBtn, pdfBtn, gbookBtn) diff --git a/cmd/rescribe/gui_test.go b/cmd/rescribe/gui_test.go new file mode 100644 index 0000000..99a924f --- /dev/null +++ b/cmd/rescribe/gui_test.go @@ -0,0 +1,77 @@ +// Copyright 2022 Nick White. +// Use of this source code is governed by the GPLv3 +// license that can be found in the LICENSE file. + +package main + +import ( +	"fmt" +	"strings" +	"testing" + +	"fyne.io/fyne/v2/app" +	"fyne.io/fyne/v2/widget" +) + +func TestFormatProgressBar(t *testing.T) { +	cases := []struct { +		val float64 +		str string +	}{ +		{0.0, ""}, +		{0.01, "Processing"}, +		{0.11, "Downloading"}, +		{0.12, "Processing PDF"}, +		{0.2, "Preprocessing"}, +		{0.5, "OCRing"}, +		{0.55, "OCRing"}, +		{0.89, "OCRing"}, +		{0.9, "Analysing"}, +		{1.0, "Done"}, +		{1.1, "Processing"}, +	} + +	_ = app.New() // shouldn't be needed for test but we get a panic without it +	bar := widget.NewProgressBar() + +	for _, c := range cases { +		t.Run(fmt.Sprintf("%s_%.1f", c.str, c.val), func(t *testing.T) { +			bar.Value = c.val +			got := formatProgressBar(bar)() +			if got != c.str { +				t.Fatalf("Expected %s, got %s", c.str, got) +			} +		}) +	} +} + +func TestUpdateProgress(t *testing.T) { +	cases := []struct { +		log string +		val float64 +	}{ +		{"Downloading", 0.11}, +		{"Preprocessing", 0.2}, +		{"Preprocessing\nOCRing", 0.5}, +		{"Preprocessing\nOCRing...", 0.53}, +		{"OCRing........................................", 0.89}, +		{"OCRing..\nAnalysing", 0.9}, +		{"Done", 1.0}, +		{"Weirdness", 0.0}, +	} + +	_ = app.New() // shouldn't be needed for test but we get a panic without it +	bar := widget.NewProgressBar() + +	for _, c := range cases { +		t.Run(c.log, func(t *testing.T) { +			l := strings.ReplaceAll("  "+c.log, "\n", "\n  ") +			bar.Value = 0.0 +			updateProgress(l, bar) +			got := bar.Value +			if got != c.val { +				t.Fatalf("Expected %f, got %f", c.val, got) +			} +		}) +	} +} diff --git a/cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/174f82f558636f2a b/cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/174f82f558636f2a new file mode 100644 index 0000000..1a7ed9c --- /dev/null +++ b/cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/174f82f558636f2a @@ -0,0 +1,2 @@ +go test fuzz v1 +string("https://www0google\xf7/books/edition/_/") diff --git a/cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/60892155cf2f7963 b/cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/60892155cf2f7963 new file mode 100644 index 0000000..b637539 --- /dev/null +++ b/cmd/rescribe/testdata/fuzz/FuzzGetBookIdFromUrl/60892155cf2f7963 @@ -0,0 +1,2 @@ +go test fuzz v1 +string("https://Books.google\xc1&id=") | 
