summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Nick White <git@njw.name> 2020-11-09 12:40:57 +0000
committer: Nick White <git@njw.name> 2020-11-09 12:40:57 +0000
commit: 34b5735503edb9c5ab635c84cd356f19df7d7381 (patch)
tree: dd3c1ee9df2fcc7cf5d3247c2e523fb464513da0
parent: 48f817f0dfd3e89c372ac358418fe69b43eefa1b (diff)
Set hocr config options directly rather than relying on 'hocr' config file
This ensures that bookpipeline will still work even if TESSDATA_PREFIX has been set to a directory without configs in it.
-rw-r--r-- cmd/bookpipeline/main.go 2
1 file changed, 1 insertion, 1 deletion
diff --git a/cmd/bookpipeline/main.go b/cmd/bookpipeline/main.go
index 36295a6..b3ffc53 100644
--- a/cmd/bookpipeline/main.go
+++ b/cmd/bookpipeline/main.go
@@ -216,7 +216,7 @@ func ocr(training string) func(chan string, chan string, chan error, *log.Logger
for path := range toocr {
logger.Println("OCRing", path)
name := strings.Replace(path, ".png", "", 1)
- cmd := exec.Command("tesseract", "-l", training, path, name, "hocr")
+ cmd := exec.Command("tesseract", "-l", training, path, name, "-c", "tessedit_create_hocr=1", "-c", "hocr_font_info=0")
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr