summaryrefslogtreecommitdiff
path: root/pkg/hocr
diff options
context:
space:
mode:
authorNick White <git@njw.name>2021-03-23 11:14:35 +0000
committerNick White <git@njw.name>2021-03-23 11:14:35 +0000
commitfa4f48ad54ec94c222269d335d40b21becff92a4 (patch)
treeca3ba1a4b46d1570607d9233455496e93e10517a /pkg/hocr
parent59d139893ab9044447e14fa091f50b48969847b5 (diff)
hocr: Add ability to specify a custom image path for hocr line extraction, and use it in extracthocrlines
Diffstat (limited to 'pkg/hocr')
-rw-r--r--pkg/hocr/hocr.go10
-rw-r--r--pkg/hocr/lines.go48
2 files changed, 42 insertions, 16 deletions
diff --git a/pkg/hocr/hocr.go b/pkg/hocr/hocr.go
index 6cd1df1..e84c2b1 100644
--- a/pkg/hocr/hocr.go
+++ b/pkg/hocr/hocr.go
@@ -58,16 +58,6 @@ func wordConf(s string) (float64, error) {
return strconv.ParseFloat(conf[1], 64)
}
-// Returns the image path for a page from a ocr_page title
-func imagePath(s string) (string, error) {
- re, err := regexp.Compile(`image ["']([^"']+)["']`)
- if err != nil {
- return "", err
- }
- m := re.FindStringSubmatch(s)
- return m[1], nil
-}
-
// BoxCoords parses bbox coordinate strings
func BoxCoords(s string) ([4]int, error) {
var coords [4]int
diff --git a/pkg/hocr/lines.go b/pkg/hocr/lines.go
index 1c759e0..5794243 100644
--- a/pkg/hocr/lines.go
+++ b/pkg/hocr/lines.go
@@ -16,11 +16,22 @@ import (
"io/ioutil"
"os"
"path/filepath"
+ "regexp"
"strings"
"rescribe.xyz/utils/pkg/line"
)
+// imagePathFromTitle returns the image path for a page from an ocr_page title
+func imagePathFromTitle(s string) (string, error) {
+ re, err := regexp.Compile(`image ["']([^"']+)["']`)
+ if err != nil {
+ return "", err
+ }
+ m := re.FindStringSubmatch(s)
+ return m[1], nil
+}
+
// LineText extracts the text from an OcrLine
func LineText(l OcrLine) string {
linetext := ""
@@ -54,11 +65,17 @@ func LineText(l OcrLine) string {
return linetext
}
-func parseLineDetails(h Hocr, dir string) (line.Details, error) {
+// parseLineDetails parses a Hocr struct into a line.Details
+// struct, including extracted image segments for each line.
+// The image location is taken from imgPath, which can either
+// be imagePathFromTitle (see above) which loads the image
+// path embedded in the title attribute of a hocr page, or
+// a custom handler.
+func parseLineDetails(h Hocr, dir string, imgPath func(string) (string, error)) (line.Details, error) {
lines := make(line.Details, 0)
for _, p := range h.Pages {
- imgpath, err := imagePath(p.Title)
+ imgpath, err := imgPath(p.Title)
if err != nil {
return lines, err
}
@@ -68,7 +85,7 @@ func parseLineDetails(h Hocr, dir string) (line.Details, error) {
var gray *image.Gray
pngf, err := os.Open(imgpath)
if err != nil {
- fmt.Fprintf(os.Stderr, "Warning: error opening image %s: %v", imgpath, err)
+ fmt.Fprintf(os.Stderr, "Warning: error opening image %s: %v\n", imgpath, err)
}
defer pngf.Close()
img, _, err = image.Decode(pngf)
@@ -99,7 +116,7 @@ func parseLineDetails(h Hocr, dir string) (line.Details, error) {
ln.Name = l.Id
ln.Avgconf = (totalconf / float64(num)) / 100
ln.Text = LineText(l)
- imgpath, err := imagePath(p.Title)
+ imgpath, err := imgPath(p.Title)
if err != nil {
return lines, err
}
@@ -131,7 +148,26 @@ func GetLineDetails(hocrfn string) (line.Details, error) {
return newlines, err
}
- return parseLineDetails(h, filepath.Dir(hocrfn))
+ return parseLineDetails(h, filepath.Dir(hocrfn), imagePathFromTitle)
+}
+
+// GetLineDetailsCustomImg is a variant of GetLineDetails that
+// uses a provided image path for line image extracts, rather
+// than the image name embedded in the .hocr
+func GetLineDetailsCustomImg(hocrfn string, imgfn string) (line.Details, error) {
+ var newlines line.Details
+
+ file, err := ioutil.ReadFile(hocrfn)
+ if err != nil {
+ return newlines, err
+ }
+
+ h, err := Parse(file)
+ if err != nil {
+ return newlines, err
+ }
+
+ return parseLineDetails(h, filepath.Dir(hocrfn), func(s string) (string, error) {return imgfn, nil})
}
// GetLineBasics parses a hocr file and returns a corresponding
@@ -149,5 +185,5 @@ func GetLineBasics(hocrfn string) (line.Details, error) {
return newlines, err
}
- return parseLineDetails(h, filepath.Dir(hocrfn))
+ return parseLineDetails(h, filepath.Dir(hocrfn), imagePathFromTitle)
}