diff options
Diffstat (limited to 'cmd/rescribe/main.go')
-rw-r--r-- | cmd/rescribe/main.go | 276 |
1 files changed, 237 insertions, 39 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index fd5b33b..96f2853 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -13,9 +13,10 @@ import ( "archive/zip" "bytes" "context" - _ "embed" "flag" "fmt" + "image" + "image/draw" "image/jpeg" "image/png" "io" @@ -29,13 +30,14 @@ import ( "strings" "time" + "golang.org/x/image/tiff" "rescribe.xyz/bookpipeline" "rescribe.xyz/bookpipeline/internal/pipeline" "rescribe.xyz/pdf" "rescribe.xyz/utils/pkg/hocr" ) -const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir] +const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd cmd] [-gbookcmd cmd] [-t training] bookdir/book.pdf [savedir] Process and OCR a book using the Rescribe pipeline on a local machine. @@ -43,9 +45,6 @@ OCR results are saved into the bookdir directory unless savedir is specified. ` -//go:embed tessdata.20220322.zip -var tessdatazip []byte - const QueueTimeoutSecs = 2 * 60 const PauseBetweenChecks = 1 * time.Second const LogSaveTime = 1 * time.Minute @@ -95,7 +94,7 @@ func resetTimer(t *time.Timer, d time.Duration) { } } -// unpackTessZip unpacks a byte array of a zip file into a directory +// unpackZip unpacks a byte array of a zip file into a directory func unpackZip(b []byte, dir string) error { br := bytes.NewReader(b) zr, err := zip.NewReader(br, br.Size()) @@ -140,8 +139,10 @@ func unpackZip(b []byte, dir string) error { func main() { deftesscmd := "tesseract" + defgbookcmd := "getgbook" if runtime.GOOS == "windows" { deftesscmd = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe" + defgbookcmd = "getgbook.exe" } verbose := flag.Bool("v", false, "verbose") @@ -153,6 +154,7 @@ These training files are included in rescribe, and are always available: - lat.traineddata (Latin, modern print) - rescribev9_fast.traineddata (Latin/English/French, printed ca 1500-1800) `) + gbookcmd := flag.String("gbookcmd", defgbookcmd, "The getgbook executable to run. You may need to set this to the full path of getgbook.exe if you're on Windows.") tesscmd := flag.String("tesscmd", deftesscmd, "The Tesseract executable to run. You may need to set this to the full path of Tesseract.exe if you're on Windows.") wipe := flag.Bool("wipe", false, "Use wiper tool to remove noise like gutters from page before processing.") fullpdf := flag.Bool("fullpdf", false, "Use highest image quality for searchable PDF (requires lots of RAM).") @@ -187,7 +189,7 @@ These training files are included in rescribe, and are always available: log.Fatalln("Error setting up tesseract directory:", err) } - if !*systess { + if !*systess && len(tesszip) > 0 { err = unpackZip(tesszip, tessdir) if err != nil { log.Fatalln("Error unpacking embedded Tesseract zip:", err) @@ -202,18 +204,31 @@ These training files are included in rescribe, and are always available: } } - err = unpackZip(gbookzip, tessdir) + _, err = exec.LookPath(tessCommand) if err != nil { - log.Fatalln("Error unpacking embedded getgbook zip:", err) + log.Fatalf("No tesseract executable found [tried %s], either set -tesscmd and -systess on the command line or use the official build which includes an embedded copy of Tesseract.", tessCommand) + } + + gbookCommand := *gbookcmd + if len(gbookzip) > 0 { + err = unpackZip(gbookzip, tessdir) + if err != nil { + log.Fatalln("Error unpacking embedded getgbook zip:", err) + } + switch runtime.GOOS { + case "darwin": + gbookCommand = filepath.Join(tessdir, "getgbook") + case "linux": + gbookCommand = filepath.Join(tessdir, "getgbook") + case "windows": + gbookCommand = filepath.Join(tessdir, "getgbook.exe") + } } - var gbookCommand string - switch runtime.GOOS { - case "darwin": - gbookCommand = filepath.Join(tessdir, "getgbook") - case "linux": - gbookCommand = filepath.Join(tessdir, "getgbook") - case "windows": - gbookCommand = filepath.Join(tessdir, "getgbook.exe") + + _, err = exec.LookPath(gbookCommand) + if err != nil { + log.Printf("No getgbook found [tried %s], google book downloading will be disabled, either set -gbookcmd on the command line or use the official build which includes an embedded getgbook.", gbookCommand) + gbookCommand = "" } tessdatadir := filepath.Join(tessdir, "tessdata") @@ -221,25 +236,41 @@ These training files are included in rescribe, and are always available: if err != nil { log.Fatalln("Error setting up tessdata directory:", err) } - err = unpackZip(tessdatazip, tessdatadir) - if err != nil { - log.Fatalln("Error unpacking embedded tessdata zip:", err) + if len(tessdatazip) > 0 { + err = unpackZip(tessdatazip, tessdatadir) + if err != nil { + log.Fatalln("Error unpacking embedded tessdata zip:", err) + } } - // if trainingPath doesn't exist, set it to the embedded training instead - _, err = os.Stat(trainingPath) - if err != nil && !os.IsExist(err) { - trainingPath = filepath.Base(trainingPath) - trainingPath = filepath.Join(tessdatadir, trainingPath) + // copy training path to the tessdir directory, so that we can keep that a + // writeable space, which is needed opening other trainings in sandboxes + // like flatpak + in, err := os.Open(trainingPath) + trainingPath = filepath.Join(tessdatadir, filepath.Base(trainingPath)) + if err != nil { + in, err = os.Open(trainingPath) + if err != nil { + log.Fatalf("Error opening training file %s: %v", trainingPath, err) + } } - - f, err := os.Open(trainingPath) + defer in.Close() + newPath := trainingPath + ".new" + out, err := os.Create(newPath) if err != nil { - fmt.Fprintf(os.Stderr, "Error: Training files %s or %s could not be opened.\n", *training, trainingPath) - fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n") - os.Exit(1) + log.Fatalf("Error creating training file %s: %v", newPath, err) + } + defer out.Close() + _, err = io.Copy(out, in) + if err != nil { + log.Fatalf("Error copying training file to %s: %v", newPath, err) + } + in.Close() + out.Close() + err = os.Rename(newPath, trainingPath) + if err != nil { + log.Fatalf("Error moving new training file to %s: %v", trainingPath, err) } - f.Close() abstraining, err := filepath.Abs(trainingPath) if err != nil { @@ -253,7 +284,7 @@ These training files are included in rescribe, and are always available: } if flag.NArg() < 1 || *usegui { - err := startGui(*verboselog, tessCommand, gbookCommand, trainingName, tessdir) + err := startGui(verboselog, tessCommand, gbookCommand, trainingName, tessdir) err = os.RemoveAll(tessdir) if err != nil { log.Printf("Error removing tesseract directory %s: %v", tessdir, err) @@ -265,6 +296,14 @@ These training files are included in rescribe, and are always available: return } + f, err := os.Open(trainingPath) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: Training files %s or %s could not be opened.\n", *training, trainingPath) + fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n") + os.Exit(1) + } + f.Close() + bookdir := flag.Arg(0) bookname := strings.ReplaceAll(filepath.Base(bookdir), " ", "_") savedir := bookdir @@ -305,7 +344,7 @@ These training files are included in rescribe, and are always available: ispdf = true } - err = startProcess(ctx, *verboselog, tessCommand, bookdir, bookname, trainingName, savedir, tessdir, !*wipe, *fullpdf) + err = startProcess(ctx, verboselog, tessCommand, bookdir, bookname, trainingName, savedir, tessdir, !*wipe, *fullpdf) if err != nil { log.Fatalln(err) } @@ -360,6 +399,12 @@ func extractPdfImgs(ctx context.Context, path string) (string, error) { if p.Page(pgnum).V.IsNull() { continue } + var rotate int64 + for v := p.Page(pgnum).V; !v.IsNull(); v = v.Key("Parent") { + if r := v.Key("Rotate"); !r.IsNull() { + rotate = r.Int64() + } + } res := p.Page(pgnum).Resources() if res.Kind() != pdf.Dict { continue @@ -395,6 +440,13 @@ func extractPdfImgs(ctx context.Context, path string) (string, error) { if err != nil { return tempdir, fmt.Errorf("Error removing extracted image %s from PDF: %v\n", fn, err) } + + if rotate != 0 { + err = rotateImage(path, rotate) + if err != nil { + return tempdir, fmt.Errorf("Error rotating extracted image %s from PDF: %v\n", fn, err) + } + } } } // TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case @@ -410,7 +462,9 @@ func extractPdfImgs(ctx context.Context, path string) (string, error) { // rmIfNotImage attempts to decode a given file as an image. If it is // decode-able as PNG, then rename file extension from .jpg to .png, -// if it fails to be read as PNG or JPEG it will be deleted. +// if it is decode-able as TIFF then convert to PNG and rename file +// extension appropriately, if it fails to be read as PNG, TIFF or +// JPEG it will just be deleted. func rmIfNotImage(f string) error { r, err := os.Open(f) defer r.Close() @@ -421,9 +475,9 @@ func rmIfNotImage(f string) error { r.Close() if err == nil { b := strings.TrimSuffix(f, ".jpg") - err = os.Rename(f, b + ".png") + err = os.Rename(f, b+".png") if err != nil { - return fmt.Errorf("Error renaming %s to %s: %v", f, b + ".png", err) + return fmt.Errorf("Error renaming %s to %s: %v", f, b+".png", err) } return nil } @@ -434,18 +488,131 @@ func rmIfNotImage(f string) error { return fmt.Errorf("Failed to open image %s: %v\n", f, err) } _, err = jpeg.Decode(r) + r.Close() + if err == nil { + return nil + } + + r, err = os.Open(f) + defer r.Close() if err != nil { + return fmt.Errorf("Failed to open image %s: %v\n", f, err) + } + t, err := tiff.Decode(r) + if err == nil { + b := strings.TrimSuffix(f, ".jpg") + n, err := os.Create(b + ".png") + defer n.Close() + if err != nil { + return fmt.Errorf("Failed to create file to store new png %s from tiff %s: %v\n", b+".png", f, err) + } + err = png.Encode(n, t) + if err != nil { + return fmt.Errorf("Failed to encode tiff as png for %s: %v\n", f, err) + } r.Close() err = os.Remove(f) if err != nil { - return fmt.Errorf("Failed to remove invalid image %s: %v", f, err) + return fmt.Errorf("Failed to remove original tiff %s: %v\n", f, err) + } + return nil + } + + r.Close() + err = os.Remove(f) + if err != nil { + return fmt.Errorf("Failed to remove invalid image %s: %v", f, err) + } + + return nil +} + +// rotateImage rotates an image at the given path by the given angle +func rotateImage(path string, angle int64) error { + switch angle { + case 90: + // proceed with the rest of the function + case 180, 270: + // rotate the image again first, as many times as necessary. + // this is inefficient but easy. + err := rotateImage(path, angle-90) + if err != nil { + return fmt.Errorf("error with a rotation run: %w", err) } + default: + return fmt.Errorf("Rotation angle of %d is not supported", angle) + } + + r, err := os.Open(path) + defer r.Close() + if err != nil { + return fmt.Errorf("Failed to open image: %w", err) + } + img, err := png.Decode(r) + if err != nil { + r.Close() + r, err = os.Open(path) + defer r.Close() + if err != nil { + return fmt.Errorf("Failed to open image: %w", err) + } + img, err = jpeg.Decode(r) + } + if err != nil { + r.Close() + r, err = os.Open(path) + defer r.Close() + if err != nil { + return fmt.Errorf("Failed to open image: %w", err) + } + img, err = tiff.Decode(r) + } + if err != nil { + return fmt.Errorf("Failed to decode image as png, jpeg or tiff: %w", err) + } + + b := img.Bounds() + + orig := image.NewRGBA(b) + draw.Draw(orig, b, img, b.Min, draw.Src) + + newb := image.Rectangle{ + Min: image.Point{X: 0, Y: 0}, + Max: image.Point{X: b.Dy(), Y: b.Dx()}, + } + new := image.NewRGBA(newb) + + for x := b.Min.X; x < b.Max.X; x++ { + desty := newb.Min.Y + x + for y := b.Max.Y; y > b.Min.Y; y-- { + destx := b.Dy() - y + newb.Min.X + new.SetRGBA(destx, desty, orig.RGBAAt(x, y)) + } + } + + err = r.Close() + if err != nil { + return fmt.Errorf("Failed to close image: %w", err) + } + w, err := os.Create(path) + if err != nil { + return fmt.Errorf("Failed to create rotated image: %w", err) + } + defer w.Close() + + if !strings.HasSuffix(path, ".jpg") { + err = jpeg.Encode(w, new, nil) + } else { + err = png.Encode(w, new) + } + if err != nil { + return fmt.Errorf("Failed to encode rotated image: %w", err) } return nil } -func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, savedir string, tessdir string, nowipe bool, fullpdf bool) error { +func startProcess(ctx context.Context, logger *log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, savedir string, tessdir string, nowipe bool, fullpdf bool) error { cmd := exec.Command(tessCommand, "--help") pipeline.HideCmd(cmd) _, err := cmd.Output() @@ -464,7 +631,7 @@ func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bo } var conn Pipeliner - conn = &bookpipeline.LocalConn{Logger: &logger, TempDir: tempdir} + conn = &bookpipeline.LocalConn{Logger: logger, TempDir: tempdir} conn.Log("Setting up session") err = conn.Init() @@ -509,6 +676,11 @@ func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bo return fmt.Errorf("Error looking for .hocr files: %v", err) } + err = addFullTxt(hocrs, bookname) + if err != nil { + log.Fatalf("Error creating full txt version: %v", err) + } + for _, v := range hocrs { err = addTxtVersion(v) if err != nil { @@ -594,6 +766,32 @@ func addTxtVersion(hocrfn string) error { return nil } +func addFullTxt(hocrs []string, bookname string) error { + if len(hocrs) == 0 { + return nil + } + var full string + for i, v := range hocrs { + t, err := hocr.GetText(v) + if err != nil { + return fmt.Errorf("Error getting text from hocr file %s: %v", v, err) + } + if i > 0 { + full += "\n" + } + full += t + } + + dir := filepath.Dir(hocrs[0]) + fn := filepath.Join(dir, bookname+".txt") + err := ioutil.WriteFile(fn, []byte(full), 0644) + if err != nil { + return fmt.Errorf("Error creating text file %s: %v", fn, err) + } + + return nil +} + func uploadbook(ctx context.Context, dir string, name string, conn Pipeliner, nowipe bool) error { _, err := os.Stat(dir) if err != nil && !os.IsExist(err) { |