summary | refs | log | tree | commit | diff
path: root/cmd/extracthocrlines
diff options
context:
space:
mode:
author: Nick White <git@njw.name> 2021-03-23 11:14:35 +0000
committer: Nick White <git@njw.name> 2021-03-23 11:14:35 +0000
commit: fa4f48ad54ec94c222269d335d40b21becff92a4 (patch)
tree: ca3ba1a4b46d1570607d9233455496e93e10517a /cmd/extracthocrlines
parent: 59d139893ab9044447e14fa091f50b48969847b5 (diff)
hocr: Add ability to specify a custom image path for hocr line extraction, and use it in extracthocrlines
Diffstat (limited to 'cmd/extracthocrlines')
-rw-r--r-- cmd/extracthocrlines/main.go 13
1 files changed, 11 insertions, 2 deletions
diff --git a/cmd/extracthocrlines/main.go b/cmd/extracthocrlines/main.go
index d765875..fa5ef28 100644
--- a/cmd/extracthocrlines/main.go
+++ b/cmd/extracthocrlines/main.go
@@ -14,12 +14,13 @@ import (
"log"
"os"
"path/filepath"
+ "strings"
"rescribe.xyz/utils/pkg/hocr"
"rescribe.xyz/utils/pkg/line"
)
-const usage = `Usage: extracthocrlines file.hocr [file.hocr]
+const usage = `Usage: extracthocrlines [-d] [-e] file.hocr [file.hocr]
Copies the text and corresponding image section for each line
of a HOCR file into separate files, which is useful for OCR
@@ -64,6 +65,7 @@ func main() {
flag.PrintDefaults()
}
dir := flag.String("d", ".", "Directory to save lines in")
+ embeddedimgpath := flag.Bool("e", false, "Use image path embedded in hOCR (rather than the path of the .hocr file with a .png suffix)")
flag.Parse()
if flag.NArg() < 1 {
flag.Usage()
@@ -71,7 +73,14 @@ func main() {
}
for _, f := range flag.Args() {
- newlines, err := hocr.GetLineDetails(f)
+ var err error
+ var newlines line.Details
+ if *embeddedimgpath {
+ newlines, err = hocr.GetLineDetails(f)
+ } else {
+ imgName := strings.TrimSuffix(f, ".hocr") + ".png"
+ newlines, err = hocr.GetLineDetailsCustomImg(f, imgName)
+ }
if err != nil {
log.Fatal(err)
}