summaryrefslogtreecommitdiff
path: root/pkg/hocr/hocr.go
diff options
context:
space:
mode:
author: Nick White <git@njw.name>, 2021-02-09 16:58:32 +0000
committer: Nick White <git@njw.name>, 2021-02-09 17:34:45 +0000
commit: 13ce8fc4b45073e1f81a39c4923e44420509be73 (patch)
tree: a60db25d66f33bf1716b8c9ba0467a6604b3418f /pkg/hocr/hocr.go
parent: 89866846f395115dd7ab576077067783b8119e66 (diff)
hocr: Use image specified in ocr_page title, so can support multipage hocrs cleanly
Diffstat (limited to 'pkg/hocr/hocr.go')
-rw-r--r-- pkg/hocr/hocr.go 60
1 files changed, 40 insertions, 20 deletions
diff --git a/pkg/hocr/hocr.go b/pkg/hocr/hocr.go
index 6b43558..6cd1df1 100644
--- a/pkg/hocr/hocr.go
+++ b/pkg/hocr/hocr.go
@@ -16,7 +16,12 @@ import (
)
+// Hocr is the top-level structure a hOCR document is decoded into. Pages
+// holds one entry per element matched by the xml path body>div.
type Hocr struct {
- Lines []OcrLine `xml:"body>div>div>p>span"`
+ Pages []Page `xml:"body>div"`
+}
+
+// Page is a single page element of a hOCR document. Lines holds the
+// elements matched by the xml path div>p>span beneath the page, and Title
+// holds the page element's title attribute (the ocr_page title, which can
+// name the page image).
+type Page struct {
+ Lines []OcrLine `xml:"div>p>span"`
+ Title string `xml:"title,attr"`
}
type OcrLine struct {
@@ -53,6 +58,16 @@ func wordConf(s string) (float64, error) {
return strconv.ParseFloat(conf[1], 64)
}
+// imagePathRe matches the image property of an ocr_page title and captures
+// the single- or double-quoted path that follows the word "image".
+var imagePathRe = regexp.MustCompile(`image ["']([^"']+)["']`)
+
+// missingImageError is returned by imagePath when a title has no image
+// property; its value is the title that was searched.
+type missingImageError string
+
+// Error implements the error interface.
+func (e missingImageError) Error() string {
+	return "no image path in ocr_page title " + strconv.Quote(string(e))
+}
+
+// imagePath returns the image path for a page from an ocr_page title.
+//
+// s is the title attribute of the page element. The result is the quoted
+// text following "image". If s contains no such property, an empty string
+// and a missingImageError are returned rather than panicking on a nil
+// match.
+func imagePath(s string) (string, error) {
+	m := imagePathRe.FindStringSubmatch(s)
+	if m == nil {
+		// FindStringSubmatch returns nil when nothing matches, so m[1]
+		// must not be read in that case.
+		return "", missingImageError(s)
+	}
+	return m[1], nil
+}
+
// BoxCoords parses bbox coordinate strings
func BoxCoords(s string) ([4]int, error) {
var coords [4]int
@@ -102,9 +117,10 @@ func GetText(hocrfn string) (string, error) {
return s, err
}
-
- for _, l := range h.Lines {
- s += LineText(l) + "\n"
+ for _, p := range h.Pages {
+ for _, l := range p.Lines {
+ s += LineText(l) + "\n"
+ }
}
return s, nil
}
@@ -123,14 +139,16 @@ func GetAvgConf(hocrfn string) (float64, error) {
}
var total, num float64
- for _, l := range h.Lines {
- for _, w := range l.Words {
- c, err := wordConf(w.Title)
- if err != nil {
- return 0, err
+ for _, p := range h.Pages {
+ for _, l := range p.Lines {
+ for _, w := range l.Words {
+ c, err := wordConf(w.Title)
+ if err != nil {
+ return 0, err
+ }
+ total += c
+ num++
}
- total += c
- num++
}
}
if num == 0 {
@@ -155,15 +173,17 @@ func GetWordConfs(hocrfn string) ([]float64, error) {
return confs, err
}
- for _, l := range h.Lines {
- for _, w := range l.Words {
- c, err := wordConf(w.Title)
- if err != nil {
- return confs, err
- }
- confs = append(confs, c)
- }
- }
+ for _, p := range h.Pages {
+ for _, l := range p.Lines {
+ for _, w := range l.Words {
+ c, err := wordConf(w.Title)
+ if err != nil {
+ return confs, err
+ }
+ confs = append(confs, c)
+ }
+ }
+ }
return confs, nil
}