summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick White <git@njw.name>2021-03-23 11:14:35 +0000
committerNick White <git@njw.name>2021-03-23 11:14:35 +0000
commitfa4f48ad54ec94c222269d335d40b21becff92a4 (patch)
treeca3ba1a4b46d1570607d9233455496e93e10517a
parent59d139893ab9044447e14fa091f50b48969847b5 (diff)
hocr: Add ability to specify a custom image path for hocr line extraction, and use it in extracthocrlines
-rw-r--r--cmd/extracthocrlines/main.go13
-rw-r--r--pkg/hocr/hocr.go10
-rw-r--r--pkg/hocr/lines.go48
3 files changed, 53 insertions, 18 deletions
diff --git a/cmd/extracthocrlines/main.go b/cmd/extracthocrlines/main.go
index d765875..fa5ef28 100644
--- a/cmd/extracthocrlines/main.go
+++ b/cmd/extracthocrlines/main.go
@@ -14,12 +14,13 @@ import (
"log"
"os"
"path/filepath"
+ "strings"
"rescribe.xyz/utils/pkg/hocr"
"rescribe.xyz/utils/pkg/line"
)
-const usage = `Usage: extracthocrlines file.hocr [file.hocr]
+const usage = `Usage: extracthocrlines [-d] [-e] file.hocr [file.hocr]
Copies the text and corresponding image section for each line
of a HOCR file into separate files, which is useful for OCR
@@ -64,6 +65,7 @@ func main() {
flag.PrintDefaults()
}
dir := flag.String("d", ".", "Directory to save lines in")
+ embeddedimgpath := flag.Bool("e", false, "Use image path embedded in hOCR (rather than the path of the .hocr file with a .png suffix)")
flag.Parse()
if flag.NArg() < 1 {
flag.Usage()
@@ -71,7 +73,14 @@ func main() {
}
for _, f := range flag.Args() {
- newlines, err := hocr.GetLineDetails(f)
+ var err error
+ var newlines line.Details
+ if *embeddedimgpath {
+ newlines, err = hocr.GetLineDetails(f)
+ } else {
+ imgName := strings.TrimSuffix(f, ".hocr") + ".png"
+ newlines, err = hocr.GetLineDetailsCustomImg(f, imgName)
+ }
if err != nil {
log.Fatal(err)
}
diff --git a/pkg/hocr/hocr.go b/pkg/hocr/hocr.go
index 6cd1df1..e84c2b1 100644
--- a/pkg/hocr/hocr.go
+++ b/pkg/hocr/hocr.go
@@ -58,16 +58,6 @@ func wordConf(s string) (float64, error) {
return strconv.ParseFloat(conf[1], 64)
}
-// Returns the image path for a page from a ocr_page title
-func imagePath(s string) (string, error) {
- re, err := regexp.Compile(`image ["']([^"']+)["']`)
- if err != nil {
- return "", err
- }
- m := re.FindStringSubmatch(s)
- return m[1], nil
-}
-
// BoxCoords parses bbox coordinate strings
func BoxCoords(s string) ([4]int, error) {
var coords [4]int
diff --git a/pkg/hocr/lines.go b/pkg/hocr/lines.go
index 1c759e0..5794243 100644
--- a/pkg/hocr/lines.go
+++ b/pkg/hocr/lines.go
@@ -16,11 +16,22 @@ import (
"io/ioutil"
"os"
"path/filepath"
+ "regexp"
"strings"
"rescribe.xyz/utils/pkg/line"
)
+// imagePathRe matches the image property of an ocr_page title,
+// capturing the quoted path.
+var imagePathRe = regexp.MustCompile(`image ["']([^"']+)["']`)
+
+// imagePathFromTitle returns the image path embedded in the title
+// attribute s of an ocr_page. It returns an error, rather than
+// panicking, when s holds no quoted image path.
+func imagePathFromTitle(s string) (string, error) {
+	// FindStringSubmatch returns nil when nothing matches, so m[1]
+	// must not be indexed before this check.
+	m := imagePathRe.FindStringSubmatch(s)
+	if m == nil {
+		return "", fmt.Errorf("no image path found in page title %q", s)
+	}
+	return m[1], nil
+}
+
// LineText extracts the text from an OcrLine
func LineText(l OcrLine) string {
linetext := ""
@@ -54,11 +65,17 @@ func LineText(l OcrLine) string {
return linetext
}
-func parseLineDetails(h Hocr, dir string) (line.Details, error) {
+// parseLineDetails parses a Hocr struct into a line.Details
+// struct, including extracted image segments for each line.
+// The image location is obtained by calling imgPath with the
+// page's title; imgPath can be either imagePathFromTitle, which
+// returns the image path embedded in the title attribute of a
+// hocr page, or a custom handler.
+func parseLineDetails(h Hocr, dir string, imgPath func(string) (string, error)) (line.Details, error) {
lines := make(line.Details, 0)
for _, p := range h.Pages {
- imgpath, err := imagePath(p.Title)
+ imgpath, err := imgPath(p.Title)
if err != nil {
return lines, err
}
@@ -68,7 +85,7 @@ func parseLineDetails(h Hocr, dir string) (line.Details, error) {
var gray *image.Gray
pngf, err := os.Open(imgpath)
if err != nil {
- fmt.Fprintf(os.Stderr, "Warning: error opening image %s: %v", imgpath, err)
+ fmt.Fprintf(os.Stderr, "Warning: error opening image %s: %v\n", imgpath, err)
}
defer pngf.Close()
img, _, err = image.Decode(pngf)
@@ -99,7 +116,7 @@ func parseLineDetails(h Hocr, dir string) (line.Details, error) {
ln.Name = l.Id
ln.Avgconf = (totalconf / float64(num)) / 100
ln.Text = LineText(l)
- imgpath, err := imagePath(p.Title)
+ imgpath, err := imgPath(p.Title)
if err != nil {
return lines, err
}
@@ -131,7 +148,26 @@ func GetLineDetails(hocrfn string) (line.Details, error) {
return newlines, err
}
- return parseLineDetails(h, filepath.Dir(hocrfn))
+ return parseLineDetails(h, filepath.Dir(hocrfn), imagePathFromTitle)
+}
+
+// GetLineDetailsCustomImg is a variant of GetLineDetails: it reads
+// and parses the hocr file hocrfn, but uses the provided image path
+// imgfn for line image extracts, rather than the image name
+// embedded in the .hocr
+func GetLineDetailsCustomImg(hocrfn string, imgfn string) (line.Details, error) {
+	var empty line.Details
+	data, err := ioutil.ReadFile(hocrfn)
+	if err != nil {
+		return empty, err
+	}
+	parsed, err := Parse(data)
+	if err != nil {
+		return empty, err
+	}
+
+	// Ignore the page title and always hand back the caller's image path.
+	fixedImg := func(string) (string, error) { return imgfn, nil }
+	return parseLineDetails(parsed, filepath.Dir(hocrfn), fixedImg)
}
// GetLineBasics parses a hocr file and returns a corresponding
@@ -149,5 +185,5 @@ func GetLineBasics(hocrfn string) (line.Details, error) {
return newlines, err
}
- return parseLineDetails(h, filepath.Dir(hocrfn))
+ return parseLineDetails(h, filepath.Dir(hocrfn), imagePathFromTitle)
}