summary refs log tree commit diff
path: root/cmd/rescribe/main.go
diff options
context:
space:
mode:
Diffstat (limited to 'cmd/rescribe/main.go')
-rw-r--r-- cmd/rescribe/main.go 276
1 files changed, 237 insertions, 39 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index fd5b33b..96f2853 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -13,9 +13,10 @@ import (
"archive/zip"
"bytes"
"context"
- _ "embed"
"flag"
"fmt"
+ "image"
+ "image/draw"
"image/jpeg"
"image/png"
"io"
@@ -29,13 +30,14 @@ import (
"strings"
"time"
+ "golang.org/x/image/tiff"
"rescribe.xyz/bookpipeline"
"rescribe.xyz/bookpipeline/internal/pipeline"
"rescribe.xyz/pdf"
"rescribe.xyz/utils/pkg/hocr"
)
-const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd] [-t training] bookdir/book.pdf [savedir]
+const usage = `Usage: rescribe [-v] [-gui] [-systess] [-tesscmd cmd] [-gbookcmd cmd] [-t training] bookdir/book.pdf [savedir]
Process and OCR a book using the Rescribe pipeline on a local machine.
@@ -43,9 +45,6 @@ OCR results are saved into the bookdir directory unless savedir is
specified.
`
-//go:embed tessdata.20220322.zip
-var tessdatazip []byte
-
const QueueTimeoutSecs = 2 * 60
const PauseBetweenChecks = 1 * time.Second
const LogSaveTime = 1 * time.Minute
@@ -95,7 +94,7 @@ func resetTimer(t *time.Timer, d time.Duration) {
}
}
-// unpackTessZip unpacks a byte array of a zip file into a directory
+// unpackZip unpacks a byte array of a zip file into a directory
func unpackZip(b []byte, dir string) error {
br := bytes.NewReader(b)
zr, err := zip.NewReader(br, br.Size())
@@ -140,8 +139,10 @@ func unpackZip(b []byte, dir string) error {
func main() {
deftesscmd := "tesseract"
+ defgbookcmd := "getgbook"
if runtime.GOOS == "windows" {
deftesscmd = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
+ defgbookcmd = "getgbook.exe"
}
verbose := flag.Bool("v", false, "verbose")
@@ -153,6 +154,7 @@ These training files are included in rescribe, and are always available:
- lat.traineddata (Latin, modern print)
- rescribev9_fast.traineddata (Latin/English/French, printed ca 1500-1800)
`)
+ gbookcmd := flag.String("gbookcmd", defgbookcmd, "The getgbook executable to run. You may need to set this to the full path of getgbook.exe if you're on Windows.")
tesscmd := flag.String("tesscmd", deftesscmd, "The Tesseract executable to run. You may need to set this to the full path of Tesseract.exe if you're on Windows.")
wipe := flag.Bool("wipe", false, "Use wiper tool to remove noise like gutters from page before processing.")
fullpdf := flag.Bool("fullpdf", false, "Use highest image quality for searchable PDF (requires lots of RAM).")
@@ -187,7 +189,7 @@ These training files are included in rescribe, and are always available:
log.Fatalln("Error setting up tesseract directory:", err)
}
- if !*systess {
+ if !*systess && len(tesszip) > 0 {
err = unpackZip(tesszip, tessdir)
if err != nil {
log.Fatalln("Error unpacking embedded Tesseract zip:", err)
@@ -202,18 +204,31 @@ These training files are included in rescribe, and are always available:
}
}
- err = unpackZip(gbookzip, tessdir)
+ _, err = exec.LookPath(tessCommand)
if err != nil {
- log.Fatalln("Error unpacking embedded getgbook zip:", err)
+ log.Fatalf("No tesseract executable found [tried %s], either set -tesscmd and -systess on the command line or use the official build which includes an embedded copy of Tesseract.", tessCommand)
+ }
+
+ gbookCommand := *gbookcmd
+ if len(gbookzip) > 0 {
+ err = unpackZip(gbookzip, tessdir)
+ if err != nil {
+ log.Fatalln("Error unpacking embedded getgbook zip:", err)
+ }
+ switch runtime.GOOS {
+ case "darwin":
+ gbookCommand = filepath.Join(tessdir, "getgbook")
+ case "linux":
+ gbookCommand = filepath.Join(tessdir, "getgbook")
+ case "windows":
+ gbookCommand = filepath.Join(tessdir, "getgbook.exe")
+ }
}
- var gbookCommand string
- switch runtime.GOOS {
- case "darwin":
- gbookCommand = filepath.Join(tessdir, "getgbook")
- case "linux":
- gbookCommand = filepath.Join(tessdir, "getgbook")
- case "windows":
- gbookCommand = filepath.Join(tessdir, "getgbook.exe")
+
+ _, err = exec.LookPath(gbookCommand)
+ if err != nil {
+ log.Printf("No getgbook found [tried %s], google book downloading will be disabled, either set -gbookcmd on the command line or use the official build which includes an embedded getgbook.", gbookCommand)
+ gbookCommand = ""
}
tessdatadir := filepath.Join(tessdir, "tessdata")
@@ -221,25 +236,41 @@ These training files are included in rescribe, and are always available:
if err != nil {
log.Fatalln("Error setting up tessdata directory:", err)
}
- err = unpackZip(tessdatazip, tessdatadir)
- if err != nil {
- log.Fatalln("Error unpacking embedded tessdata zip:", err)
+ if len(tessdatazip) > 0 {
+ err = unpackZip(tessdatazip, tessdatadir)
+ if err != nil {
+ log.Fatalln("Error unpacking embedded tessdata zip:", err)
+ }
}
- // if trainingPath doesn't exist, set it to the embedded training instead
- _, err = os.Stat(trainingPath)
- if err != nil && !os.IsExist(err) {
- trainingPath = filepath.Base(trainingPath)
- trainingPath = filepath.Join(tessdatadir, trainingPath)
+ // copy training path to the tessdir directory, so that we can keep that a
+ // writeable space, which is needed opening other trainings in sandboxes
+ // like flatpak
+ in, err := os.Open(trainingPath)
+ trainingPath = filepath.Join(tessdatadir, filepath.Base(trainingPath))
+ if err != nil {
+ in, err = os.Open(trainingPath)
+ if err != nil {
+ log.Fatalf("Error opening training file %s: %v", trainingPath, err)
+ }
}
-
- f, err := os.Open(trainingPath)
+ defer in.Close()
+ newPath := trainingPath + ".new"
+ out, err := os.Create(newPath)
if err != nil {
- fmt.Fprintf(os.Stderr, "Error: Training files %s or %s could not be opened.\n", *training, trainingPath)
- fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n")
- os.Exit(1)
+ log.Fatalf("Error creating training file %s: %v", newPath, err)
+ }
+ defer out.Close()
+ _, err = io.Copy(out, in)
+ if err != nil {
+ log.Fatalf("Error copying training file to %s: %v", newPath, err)
+ }
+ in.Close()
+ out.Close()
+ err = os.Rename(newPath, trainingPath)
+ if err != nil {
+ log.Fatalf("Error moving new training file to %s: %v", trainingPath, err)
}
- f.Close()
abstraining, err := filepath.Abs(trainingPath)
if err != nil {
@@ -253,7 +284,7 @@ These training files are included in rescribe, and are always available:
}
if flag.NArg() < 1 || *usegui {
- err := startGui(*verboselog, tessCommand, gbookCommand, trainingName, tessdir)
+ err := startGui(verboselog, tessCommand, gbookCommand, trainingName, tessdir)
err = os.RemoveAll(tessdir)
if err != nil {
log.Printf("Error removing tesseract directory %s: %v", tessdir, err)
@@ -265,6 +296,14 @@ These training files are included in rescribe, and are always available:
return
}
+ f, err := os.Open(trainingPath)
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "Error: Training files %s or %s could not be opened.\n", *training, trainingPath)
+ fmt.Fprintf(os.Stderr, "Set the `-t` flag with path to a tesseract .traineddata file.\n")
+ os.Exit(1)
+ }
+ f.Close()
+
bookdir := flag.Arg(0)
bookname := strings.ReplaceAll(filepath.Base(bookdir), " ", "_")
savedir := bookdir
@@ -305,7 +344,7 @@ These training files are included in rescribe, and are always available:
ispdf = true
}
- err = startProcess(ctx, *verboselog, tessCommand, bookdir, bookname, trainingName, savedir, tessdir, !*wipe, *fullpdf)
+ err = startProcess(ctx, verboselog, tessCommand, bookdir, bookname, trainingName, savedir, tessdir, !*wipe, *fullpdf)
if err != nil {
log.Fatalln(err)
}
@@ -360,6 +399,12 @@ func extractPdfImgs(ctx context.Context, path string) (string, error) {
if p.Page(pgnum).V.IsNull() {
continue
}
+ var rotate int64
+ for v := p.Page(pgnum).V; !v.IsNull(); v = v.Key("Parent") {
+ if r := v.Key("Rotate"); !r.IsNull() {
+ rotate = r.Int64()
+ }
+ }
res := p.Page(pgnum).Resources()
if res.Kind() != pdf.Dict {
continue
@@ -395,6 +440,13 @@ func extractPdfImgs(ctx context.Context, path string) (string, error) {
if err != nil {
return tempdir, fmt.Errorf("Error removing extracted image %s from PDF: %v\n", fn, err)
}
+
+ if rotate != 0 {
+ err = rotateImage(path, rotate)
+ if err != nil {
+ return tempdir, fmt.Errorf("Error rotating extracted image %s from PDF: %v\n", fn, err)
+ }
+ }
}
}
// TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case
@@ -410,7 +462,9 @@ func extractPdfImgs(ctx context.Context, path string) (string, error) {
// rmIfNotImage attempts to decode a given file as an image. If it is
// decode-able as PNG, then rename file extension from .jpg to .png,
-// if it fails to be read as PNG or JPEG it will be deleted.
+// if it is decode-able as TIFF then convert to PNG and rename file
+// extension appropriately, if it fails to be read as PNG, TIFF or
+// JPEG it will just be deleted.
func rmIfNotImage(f string) error {
r, err := os.Open(f)
defer r.Close()
@@ -421,9 +475,9 @@ func rmIfNotImage(f string) error {
r.Close()
if err == nil {
b := strings.TrimSuffix(f, ".jpg")
- err = os.Rename(f, b + ".png")
+ err = os.Rename(f, b+".png")
if err != nil {
- return fmt.Errorf("Error renaming %s to %s: %v", f, b + ".png", err)
+ return fmt.Errorf("Error renaming %s to %s: %v", f, b+".png", err)
}
return nil
}
@@ -434,18 +488,131 @@ func rmIfNotImage(f string) error {
return fmt.Errorf("Failed to open image %s: %v\n", f, err)
}
_, err = jpeg.Decode(r)
+ r.Close()
+ if err == nil {
+ return nil
+ }
+
+ r, err = os.Open(f)
+ defer r.Close()
if err != nil {
+ return fmt.Errorf("Failed to open image %s: %v\n", f, err)
+ }
+ t, err := tiff.Decode(r)
+ if err == nil {
+ b := strings.TrimSuffix(f, ".jpg")
+ n, err := os.Create(b + ".png")
+ defer n.Close()
+ if err != nil {
+ return fmt.Errorf("Failed to create file to store new png %s from tiff %s: %v\n", b+".png", f, err)
+ }
+ err = png.Encode(n, t)
+ if err != nil {
+ return fmt.Errorf("Failed to encode tiff as png for %s: %v\n", f, err)
+ }
r.Close()
err = os.Remove(f)
if err != nil {
- return fmt.Errorf("Failed to remove invalid image %s: %v", f, err)
+ return fmt.Errorf("Failed to remove original tiff %s: %v\n", f, err)
+ }
+ return nil
+ }
+
+ r.Close()
+ err = os.Remove(f)
+ if err != nil {
+ return fmt.Errorf("Failed to remove invalid image %s: %v", f, err)
+ }
+
+ return nil
+}
+
+// rotateImage rotates the image file at path clockwise by angle degrees,
+// overwriting the file in place.
+//
+// angle must be a multiple of 90. It is normalised into the range
+// [0, 360), so negative values and values of 360 or more are accepted,
+// and an effective rotation of 0 leaves the file untouched. Any other
+// angle results in an error.
+//
+// The file is decoded as PNG, JPEG or TIFF (tried in that order). The
+// rotated image is written back as JPEG if path ends in ".jpg", and as
+// PNG otherwise.
+func rotateImage(path string, angle int64) error {
+	// normalise so that e.g. -90 and 450 are handled like 270 and 90
+	switch norm := ((angle % 360) + 360) % 360; norm {
+	case 0:
+		// nothing to rotate
+		return nil
+	case 90:
+		// proceed with the rest of the function
+	case 180, 270:
+		// rotate the image again first, as many times as necessary.
+		// this is inefficient but easy.
+		err := rotateImage(path, norm-90)
+		if err != nil {
+			return fmt.Errorf("error with a rotation run: %w", err)
}
+	default:
+		return fmt.Errorf("Rotation angle of %d is not supported", angle)
+	}
+
+	r, err := os.Open(path)
+	if err != nil {
+		return fmt.Errorf("Failed to open image: %w", err)
+	}
+
+	// try each supported format in turn, rewinding the file before each
+	// attempt as a previous failed decode will have read part of it
+	var img image.Image
+	for _, decode := range []func(io.Reader) (image.Image, error){png.Decode, jpeg.Decode, tiff.Decode} {
+		_, err = r.Seek(0, io.SeekStart)
+		if err != nil {
+			break
+		}
+		img, err = decode(r)
+		if err == nil {
+			break
+		}
+	}
+	// close the source now, on every path, before path is re-created below
+	cerr := r.Close()
+	if err != nil {
+		return fmt.Errorf("Failed to decode image as png, jpeg or tiff: %w", err)
+	}
+	if cerr != nil {
+		return fmt.Errorf("Failed to close image: %w", cerr)
+	}
+
+	b := img.Bounds()
+
+	// copy into an RGBA image so that pixels can be read the same way
+	// whatever concrete image type the decoder returned
+	orig := image.NewRGBA(b)
+	draw.Draw(orig, b, img, b.Min, draw.Src)
+
+	// the rotated image swaps width and height and is anchored at (0, 0)
+	rotated := image.NewRGBA(image.Rect(0, 0, b.Dy(), b.Dx()))
+
+	// a clockwise quarter turn maps the source pixel (x, y) to
+	// (height-1-y, x), both measured from the top-left corner. y covers
+	// [Min.Y, Max.Y) so that every source row is copied exactly once.
+	for y := b.Min.Y; y < b.Max.Y; y++ {
+		destx := b.Max.Y - 1 - y
+		for x := b.Min.X; x < b.Max.X; x++ {
+			rotated.SetRGBA(destx, x-b.Min.X, orig.RGBAAt(x, y))
+		}
+	}
+
+	w, err := os.Create(path)
+	if err != nil {
+		return fmt.Errorf("Failed to create rotated image: %w", err)
+	}
+
+	// keep the on-disk format in line with the file extension
+	if strings.HasSuffix(path, ".jpg") {
+		err = jpeg.Encode(w, rotated, nil)
+	} else {
+		err = png.Encode(w, rotated)
+	}
+	if err != nil {
+		// the encode error is the one worth reporting, so the close
+		// error is deliberately ignored here
+		w.Close()
+		return fmt.Errorf("Failed to encode rotated image: %w", err)
+	}
+	// a failed close can mean the image was not fully written
+	err = w.Close()
+	if err != nil {
+		return fmt.Errorf("Failed to close rotated image: %w", err)
}
return nil
}
-func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, savedir string, tessdir string, nowipe bool, fullpdf bool) error {
+func startProcess(ctx context.Context, logger *log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, savedir string, tessdir string, nowipe bool, fullpdf bool) error {
cmd := exec.Command(tessCommand, "--help")
pipeline.HideCmd(cmd)
_, err := cmd.Output()
@@ -464,7 +631,7 @@ func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bo
}
var conn Pipeliner
- conn = &bookpipeline.LocalConn{Logger: &logger, TempDir: tempdir}
+ conn = &bookpipeline.LocalConn{Logger: logger, TempDir: tempdir}
conn.Log("Setting up session")
err = conn.Init()
@@ -509,6 +676,11 @@ func startProcess(ctx context.Context, logger log.Logger, tessCommand string, bo
return fmt.Errorf("Error looking for .hocr files: %v", err)
}
+ err = addFullTxt(hocrs, bookname)
+ if err != nil {
+ log.Fatalf("Error creating full txt version: %v", err)
+ }
+
for _, v := range hocrs {
err = addTxtVersion(v)
if err != nil {
@@ -594,6 +766,32 @@ func addTxtVersion(hocrfn string) error {
return nil
}
+// addFullTxt concatenates the text returned by hocr.GetText for each
+// file in hocrs, in the order given and separated by single newlines,
+// and writes the result to bookname.txt in the directory containing the
+// first hocr file. It does nothing and returns nil if hocrs is empty.
+func addFullTxt(hocrs []string, bookname string) error {
+	if len(hocrs) == 0 {
+		return nil
+	}
+	// a Builder avoids re-copying the accumulated text for every page
+	var full strings.Builder
+	for i, v := range hocrs {
+		t, err := hocr.GetText(v)
+		if err != nil {
+			return fmt.Errorf("Error getting text from hocr file %s: %v", v, err)
+		}
+		if i > 0 {
+			full.WriteString("\n")
+		}
+		full.WriteString(t)
+	}
+
+	dir := filepath.Dir(hocrs[0])
+	fn := filepath.Join(dir, bookname+".txt")
+	err := ioutil.WriteFile(fn, []byte(full.String()), 0644)
+	if err != nil {
+		return fmt.Errorf("Error creating text file %s: %v", fn, err)
+	}
+
+	return nil
+}
+
func uploadbook(ctx context.Context, dir string, name string, conn Pipeliner, nowipe bool) error {
_, err := os.Stat(dir)
if err != nil && !os.IsExist(err) {