diff options
author | Nick White <git@njw.name> | 2021-02-09 16:58:32 +0000 |
---|---|---|
committer | Nick White <git@njw.name> | 2021-02-09 17:34:45 +0000 |
commit | 13ce8fc4b45073e1f81a39c4923e44420509be73 (patch) | |
tree | a60db25d66f33bf1716b8c9ba0467a6604b3418f /pkg/hocr/hocr.go | |
parent | 89866846f395115dd7ab576077067783b8119e66 (diff) |
hocr: Use the image specified in the ocr_page title, so that multipage hOCR files can be supported cleanly
Diffstat (limited to 'pkg/hocr/hocr.go')
-rw-r--r-- | pkg/hocr/hocr.go | 60 |
1 file changed, 40 insertions, 20 deletions
diff --git a/pkg/hocr/hocr.go b/pkg/hocr/hocr.go index 6b43558..6cd1df1 100644 --- a/pkg/hocr/hocr.go +++ b/pkg/hocr/hocr.go @@ -16,7 +16,12 @@ import ( ) type Hocr struct { - Lines []OcrLine `xml:"body>div>div>p>span"` + Pages []Page `xml:"body>div"` +} + +type Page struct { + Lines []OcrLine `xml:"div>p>span"` + Title string `xml:"title,attr"` } type OcrLine struct { @@ -53,6 +58,16 @@ func wordConf(s string) (float64, error) { return strconv.ParseFloat(conf[1], 64) } +// Returns the image path for a page from a ocr_page title +func imagePath(s string) (string, error) { + re, err := regexp.Compile(`image ["']([^"']+)["']`) + if err != nil { + return "", err + } + m := re.FindStringSubmatch(s) + return m[1], nil +} + // BoxCoords parses bbox coordinate strings func BoxCoords(s string) ([4]int, error) { var coords [4]int @@ -102,9 +117,10 @@ func GetText(hocrfn string) (string, error) { return s, err } - - for _, l := range h.Lines { - s += LineText(l) + "\n" + for _, p := range h.Pages { + for _, l := range p.Lines { + s += LineText(l) + "\n" + } } return s, nil } @@ -123,14 +139,16 @@ func GetAvgConf(hocrfn string) (float64, error) { } var total, num float64 - for _, l := range h.Lines { - for _, w := range l.Words { - c, err := wordConf(w.Title) - if err != nil { - return 0, err + for _, p := range h.Pages { + for _, l := range p.Lines { + for _, w := range l.Words { + c, err := wordConf(w.Title) + if err != nil { + return 0, err + } + total += c + num++ } - total += c - num++ } } if num == 0 { @@ -155,15 +173,17 @@ func GetWordConfs(hocrfn string) ([]float64, error) { return confs, err } - for _, l := range h.Lines { - for _, w := range l.Words { - c, err := wordConf(w.Title) - if err != nil { - return confs, err - } - confs = append(confs, c) - } - } + for _, p := range h.Pages { + for _, l := range p.Lines { + for _, w := range l.Words { + c, err := wordConf(w.Title) + if err != nil { + return confs, err + } + confs = append(confs, c) + } + } + } return 
confs, nil } |