diff options
author | Nick White <git@njw.name> | 2021-03-23 11:14:35 +0000 |
---|---|---|
committer | Nick White <git@njw.name> | 2021-03-23 11:14:35 +0000 |
commit | fa4f48ad54ec94c222269d335d40b21becff92a4 (patch) | |
tree | ca3ba1a4b46d1570607d9233455496e93e10517a /pkg/hocr | |
parent | 59d139893ab9044447e14fa091f50b48969847b5 (diff) |
hocr: Add ability to specify a custom image path for hocr line extraction, and use it in extracthocrlines
Diffstat (limited to 'pkg/hocr')
-rw-r--r-- | pkg/hocr/hocr.go | 10 | ||||
-rw-r--r-- | pkg/hocr/lines.go | 48 |
2 files changed, 42 insertions, 16 deletions
diff --git a/pkg/hocr/hocr.go b/pkg/hocr/hocr.go index 6cd1df1..e84c2b1 100644 --- a/pkg/hocr/hocr.go +++ b/pkg/hocr/hocr.go @@ -58,16 +58,6 @@ func wordConf(s string) (float64, error) { return strconv.ParseFloat(conf[1], 64) } -// Returns the image path for a page from a ocr_page title -func imagePath(s string) (string, error) { - re, err := regexp.Compile(`image ["']([^"']+)["']`) - if err != nil { - return "", err - } - m := re.FindStringSubmatch(s) - return m[1], nil -} - // BoxCoords parses bbox coordinate strings func BoxCoords(s string) ([4]int, error) { var coords [4]int diff --git a/pkg/hocr/lines.go b/pkg/hocr/lines.go index 1c759e0..5794243 100644 --- a/pkg/hocr/lines.go +++ b/pkg/hocr/lines.go @@ -16,11 +16,22 @@ import ( "io/ioutil" "os" "path/filepath" + "regexp" "strings" "rescribe.xyz/utils/pkg/line" ) +// Returns the image path for a page from a ocr_page title +func imagePathFromTitle(s string) (string, error) { + re, err := regexp.Compile(`image ["']([^"']+)["']`) + if err != nil { + return "", err + } + m := re.FindStringSubmatch(s) + return m[1], nil +} + // LineText extracts the text from an OcrLine func LineText(l OcrLine) string { linetext := "" @@ -54,11 +65,17 @@ func LineText(l OcrLine) string { return linetext } -func parseLineDetails(h Hocr, dir string) (line.Details, error) { +// parseLineDetails parses a Hocr struct into a line.Details +// struct, including extracted image segments for each line. +// The image location is taken from imgPath, which can either +// be imagePathFromTitle (see above) which loads the image +// path embedded in the title attribute of a hocr page, or +// a custom handler. +func parseLineDetails(h Hocr, dir string, imgPath func(string) (string, error)) (line.Details, error) { lines := make(line.Details, 0) for _, p := range h.Pages { - imgpath, err := imagePath(p.Title) + imgpath, err := imgPath(p.Title) if err != nil { return lines, err } @@ -68,7 +85,7 @@ func parseLineDetails(h Hocr, dir string) (line.Details, error) { var gray *image.Gray pngf, err := os.Open(imgpath) if err != nil { - fmt.Fprintf(os.Stderr, "Warning: error opening image %s: %v", imgpath, err) + fmt.Fprintf(os.Stderr, "Warning: error opening image %s: %v\n", imgpath, err) } defer pngf.Close() img, _, err = image.Decode(pngf) @@ -99,7 +116,7 @@ func parseLineDetails(h Hocr, dir string) (line.Details, error) { ln.Name = l.Id ln.Avgconf = (totalconf / float64(num)) / 100 ln.Text = LineText(l) - imgpath, err := imagePath(p.Title) + imgpath, err := imgPath(p.Title) if err != nil { return lines, err } @@ -131,7 +148,26 @@ func GetLineDetails(hocrfn string) (line.Details, error) { return newlines, err } - return parseLineDetails(h, filepath.Dir(hocrfn)) + return parseLineDetails(h, filepath.Dir(hocrfn), imagePathFromTitle) +} + +// GetLineDetailsCustomImg is a variant of GetLineDetails that +// uses a provided image path for line image extracts, rather +// than the image name embedded in the .hocr +func GetLineDetailsCustomImg(hocrfn string, imgfn string) (line.Details, error) { + var newlines line.Details + + file, err := ioutil.ReadFile(hocrfn) + if err != nil { + return newlines, err + } + + h, err := Parse(file) + if err != nil { + return newlines, err + } + + return parseLineDetails(h, filepath.Dir(hocrfn), func(s string) (string, error) {return imgfn, nil}) } // GetLineBasics parses a hocr file and returns a corresponding @@ -149,5 +185,5 @@ func GetLineBasics(hocrfn string) (line.Details, error) { return newlines, err } - return parseLineDetails(h, filepath.Dir(hocrfn)) + return parseLineDetails(h, filepath.Dir(hocrfn), imagePathFromTitle) } |