diff options
author | Nick White <git@njw.name> | 2020-11-16 16:44:42 +0000 |
---|---|---|
committer | Nick White <git@njw.name> | 2020-11-16 16:44:42 +0000 |
commit | 56c1cf041aec9cb2352a3bd4a4b46e65a3cc04c0 (patch) | |
tree | ce709d21a95a7171d0078878846199cc472fad51 /cmd/rescribe/main.go | |
parent | 6b5145f0b75c8d5719bf44d5f654b9a2d1e3b2cd (diff) |
[rescribe] Add txt output, only keep colour pdf, and reorganise files so they're more user-friendly
Diffstat (limited to 'cmd/rescribe/main.go')
-rw-r--r-- | cmd/rescribe/main.go | 50 |
1 files changed, 50 insertions, 0 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index 8ca4189..fe36aea 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -20,6 +20,7 @@ import (
 	"time"
 
 	"rescribe.xyz/bookpipeline"
+	"rescribe.xyz/utils/pkg/hocr"
 
 	"rescribe.xyz/bookpipeline/internal/pipeline"
 )
@@ -175,6 +176,55 @@ func main() {
 	if err != nil {
 		log.Fatalf("Error removing temporary directory %s: %v", tempdir, err)
 	}
+
+	hocrs, err := filepath.Glob(fmt.Sprintf("%s/*hocr", bookname))
+	if err != nil {
+		log.Fatalf("Error looking for .hocr files: %v", err)
+	}
+
+	for _, v := range hocrs {
+		err = addTxtVersion(v)
+		if err != nil {
+			log.Fatalf("Error creating txt version of %s: %v", v, err)
+		}
+
+		err = os.MkdirAll(filepath.Join(bookname, "hocr"), 0755)
+		if err != nil {
+			log.Fatalf("Error creating hocr directory: %v", err)
+		}
+
+		err = os.Rename(v, filepath.Join(bookname, "hocr", filepath.Base(v)))
+		if err != nil {
+			log.Fatalf("Error moving hocr %s to hocr directory: %v", v, err)
+		}
+	}
+
+	// For simplicity, remove .binarised.pdf and rename .colour.pdf to .pdf
+	_ = os.Remove(filepath.Join(bookname, bookname + ".binarised.pdf"))
+	_ = os.Rename(filepath.Join(bookname, bookname + ".colour.pdf"), filepath.Join(bookname, bookname + ".pdf"))
+}
+
+func addTxtVersion(hocrfn string) error {
+	dir := filepath.Dir(hocrfn)
+	err := os.MkdirAll(filepath.Join(dir, "text"), 0755)
+	if err != nil {
+		log.Fatalf("Error creating text directory: %v", err)
+	}
+
+	t, err := hocr.GetText(hocrfn)
+	if err != nil {
+		return fmt.Errorf("Error getting text from hocr file %s: %v", hocrfn, err)
+	}
+
+	basefn := strings.TrimSuffix(filepath.Base(hocrfn), ".hocr") + ".txt"
+	fn := filepath.Join(dir, "text", basefn)
+
+	err = ioutil.WriteFile(fn, []byte(t), 0644)
+	if err != nil {
+		return fmt.Errorf("Error creating text file %s: %v", fn, err)
+	}
+
+	return nil
 }
 
 func uploadbook(dir string, name string, conn Pipeliner) error {