summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick White <git@njw.name>2021-02-09 16:58:32 +0000
committerNick White <git@njw.name>2021-02-09 17:34:45 +0000
commit13ce8fc4b45073e1f81a39c4923e44420509be73 (patch)
treea60db25d66f33bf1716b8c9ba0467a6604b3418f
parent89866846f395115dd7ab576077067783b8119e66 (diff)
hocr: Use image specified in ocr_page title, so can support multipage hocrs cleanly
-rw-r--r--pkg/hocr/hocr.go60
-rw-r--r--pkg/hocr/lines.go90
2 files changed, 90 insertions, 60 deletions
diff --git a/pkg/hocr/hocr.go b/pkg/hocr/hocr.go
index 6b43558..6cd1df1 100644
--- a/pkg/hocr/hocr.go
+++ b/pkg/hocr/hocr.go
@@ -16,7 +16,12 @@ import (
)
type Hocr struct {
- Lines []OcrLine `xml:"body>div>div>p>span"`
+ Pages []Page `xml:"body>div"`
+}
+
+type Page struct {
+ Lines []OcrLine `xml:"div>p>span"`
+ Title string `xml:"title,attr"`
}
type OcrLine struct {
@@ -53,6 +58,16 @@ func wordConf(s string) (float64, error) {
return strconv.ParseFloat(conf[1], 64)
}
+// imagePath returns the image path for a page from an ocr_page title
+func imagePath(s string) (string, error) {
+ re, err := regexp.Compile(`image ["']([^"']+)["']`)
+ if err != nil {
+ return "", err
+ }
+ m := re.FindStringSubmatch(s)
+ return m[1], nil
+}
+
// BoxCoords parses bbox coordinate strings
func BoxCoords(s string) ([4]int, error) {
var coords [4]int
@@ -102,9 +117,10 @@ func GetText(hocrfn string) (string, error) {
return s, err
}
-
- for _, l := range h.Lines {
- s += LineText(l) + "\n"
+ for _, p := range h.Pages {
+ for _, l := range p.Lines {
+ s += LineText(l) + "\n"
+ }
}
return s, nil
}
@@ -123,14 +139,16 @@ func GetAvgConf(hocrfn string) (float64, error) {
}
var total, num float64
- for _, l := range h.Lines {
- for _, w := range l.Words {
- c, err := wordConf(w.Title)
- if err != nil {
- return 0, err
+ for _, p := range h.Pages {
+ for _, l := range p.Lines {
+ for _, w := range l.Words {
+ c, err := wordConf(w.Title)
+ if err != nil {
+ return 0, err
+ }
+ total += c
+ num++
}
- total += c
- num++
}
}
if num == 0 {
@@ -155,15 +173,17 @@ func GetWordConfs(hocrfn string) ([]float64, error) {
return confs, err
}
- for _, l := range h.Lines {
- for _, w := range l.Words {
- c, err := wordConf(w.Title)
- if err != nil {
- return confs, err
- }
- confs = append(confs, c)
- }
- }
+ for _, p := range h.Pages {
+ for _, l := range p.Lines {
+ for _, w := range l.Words {
+ c, err := wordConf(w.Title)
+ if err != nil {
+ return confs, err
+ }
+ confs = append(confs, c)
+ }
+ }
+ }
return confs, nil
}
diff --git a/pkg/hocr/lines.go b/pkg/hocr/lines.go
index 942bd01..393f0ec 100644
--- a/pkg/hocr/lines.go
+++ b/pkg/hocr/lines.go
@@ -8,6 +8,7 @@ package hocr
// be sorted easily
import (
+ "fmt"
"image"
"image/draw"
_ "image/jpeg"
@@ -21,7 +22,7 @@ import (
)
// LineText extracts the text from an OcrLine
-func LineText(l OcrLine) (string) {
+func LineText(l OcrLine) string {
linetext := ""
linetext = l.Text
@@ -53,37 +54,60 @@ func LineText(l OcrLine) (string) {
return linetext
}
-func parseLineDetails(h Hocr, i *image.Gray, name string) (line.Details, error) {
+func parseLineDetails(h Hocr, dir string, name string) (line.Details, error) {
lines := make(line.Details, 0)
- for _, l := range h.Lines {
- totalconf := float64(0)
- num := 0
- for _, w := range l.Words {
- c, err := wordConf(w.Title)
- if err != nil {
- return lines, err
- }
- num++
- totalconf += c
+ for _, p := range h.Pages {
+ imgpath, err := imagePath(p.Title)
+ if err != nil {
+ return lines, err
}
+ imgpath = filepath.Join(dir, filepath.Base(imgpath))
- coords, err := BoxCoords(l.Title)
+ var img image.Image
+ var gray *image.Gray
+ pngf, err := os.Open(imgpath)
if err != nil {
- return lines, err
+ fmt.Fprintf(os.Stderr, "Warning: error opening image %s: %v", imgpath, err)
+ }
+ defer pngf.Close()
+ img, _, err = image.Decode(pngf)
+ if err == nil {
+ b := img.Bounds()
+ gray = image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy()))
+ draw.Draw(gray, b, img, b.Min, draw.Src)
}
- var ln line.Detail
- ln.Name = l.Id
- ln.Avgconf = (totalconf / float64(num)) / 100
- ln.Text = LineText(l)
- ln.OcrName = name
- if i != nil {
- var imgd line.ImgDirect
- imgd.Img = i.SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3]))
- ln.Img = imgd
+ for _, l := range p.Lines {
+ totalconf := float64(0)
+ num := 0
+ for _, w := range l.Words {
+ c, err := wordConf(w.Title)
+ if err != nil {
+ return lines, err
+ }
+ num++
+ totalconf += c
+ }
+
+ coords, err := BoxCoords(l.Title)
+ if err != nil {
+ return lines, err
+ }
+
+ var ln line.Detail
+ ln.Name = l.Id
+ ln.Avgconf = (totalconf / float64(num)) / 100
+ ln.Text = LineText(l)
+ ln.OcrName = name
+ if gray != nil {
+ var imgd line.ImgDirect
+ imgd.Img = gray.SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3]))
+ ln.Img = imgd
+ }
+ lines = append(lines, ln)
}
- lines = append(lines, ln)
+ pngf.Close()
}
return lines, nil
}
@@ -103,22 +127,8 @@ func GetLineDetails(hocrfn string) (line.Details, error) {
return newlines, err
}
- var img image.Image
- var gray *image.Gray
- pngfn := strings.Replace(hocrfn, ".hocr", ".png", 1)
- pngf, err := os.Open(pngfn)
- if err == nil {
- defer pngf.Close()
- img, _, err = image.Decode(pngf)
- if err == nil {
- b := img.Bounds()
- gray = image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy()))
- draw.Draw(gray, b, img, b.Min, draw.Src)
- }
- }
-
n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1)
- return parseLineDetails(h, gray, n)
+ return parseLineDetails(h, filepath.Dir(hocrfn), n)
}
// GetLineBasics parses a hocr file and returns a corresponding
@@ -137,5 +147,5 @@ func GetLineBasics(hocrfn string) (line.Details, error) {
}
n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1)
- return parseLineDetails(h, nil, n)
+ return parseLineDetails(h, filepath.Dir(hocrfn), n)
}