From 13ce8fc4b45073e1f81a39c4923e44420509be73 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Tue, 9 Feb 2021 16:58:32 +0000
Subject: hocr: Use image specified in ocr_page title, so can support multipage hocrs cleanly

---
 pkg/hocr/hocr.go  |  68 ++++++++++++++++++++---------
 pkg/hocr/lines.go | 123 +++++++++++++++++++++++++++++++++--------------------
 2 files changed, 125 insertions(+), 66 deletions(-)

diff --git a/pkg/hocr/hocr.go b/pkg/hocr/hocr.go
index 6b43558..6cd1df1 100644
--- a/pkg/hocr/hocr.go
+++ b/pkg/hocr/hocr.go
@@ -16,7 +16,14 @@ import (
 )
 
 type Hocr struct {
-	Lines []OcrLine `xml:"body>div>div>p>span"`
+	Pages []Page `xml:"body>div"`
+}
+
+// Page is a div directly inside the hocr body, expected to be an
+// ocr_page element. Title holds its title attribute.
+type Page struct {
+	Lines []OcrLine `xml:"div>p>span"`
+	Title string    `xml:"title,attr"`
 }
 
 type OcrLine struct {
@@ -53,6 +60,22 @@ func wordConf(s string) (float64, error) {
 	return strconv.ParseFloat(conf[1], 64)
 }
 
+// imagePath returns the image path named by the image property of an
+// ocr_page title. The property is optional in hOCR, so a title without
+// one yields an empty path and a nil error rather than a failure.
+func imagePath(s string) (string, error) {
+	re, err := regexp.Compile(`image ["']([^"']+)["']`)
+	if err != nil {
+		return "", err
+	}
+	m := re.FindStringSubmatch(s)
+	if m == nil {
+		// Indexing m[1] here would panic with index out of range.
+		return "", nil
+	}
+	return m[1], nil
+}
+
 // BoxCoords parses bbox coordinate strings
 func BoxCoords(s string) ([4]int, error) {
 	var coords [4]int
@@ -102,9 +125,10 @@ func GetText(hocrfn string) (string, error) {
 		return s, err
 	}
 
-
-	for _, l := range h.Lines {
-		s += LineText(l) + "\n"
+	for _, p := range h.Pages {
+		for _, l := range p.Lines {
+			s += LineText(l) + "\n"
+		}
 	}
 	return s, nil
 }
@@ -123,14 +147,16 @@ func GetAvgConf(hocrfn string) (float64, error) {
 	}
 
 	var total, num float64
-	for _, l := range h.Lines {
-		for _, w := range l.Words {
-			c, err := wordConf(w.Title)
-			if err != nil {
-				return 0, err
+	for _, p := range h.Pages {
+		for _, l := range p.Lines {
+			for _, w := range l.Words {
+				c, err := wordConf(w.Title)
+				if err != nil {
+					return 0, err
+				}
+				total += c
+				num++
 			}
-			total += c
-			num++
 		}
 	}
 	if num == 0 {
@@ -155,15 +181,17 @@ func GetWordConfs(hocrfn string) ([]float64, error) {
 		return confs, err
 	}
 
-	for _, l := range h.Lines {
-		for _, w := range l.Words {
-			c, err := wordConf(w.Title)
-			if err != nil {
-				return confs, err
-			}
-			confs = append(confs, c)
-		}
-	}
+	for _, p := range h.Pages {
+		for _, l := range p.Lines {
+			for _, w := range l.Words {
+				c, err := wordConf(w.Title)
+				if err != nil {
+					return confs, err
+				}
+				confs = append(confs, c)
+			}
+		}
+	}
 
 	return confs, nil
 }
diff --git a/pkg/hocr/lines.go b/pkg/hocr/lines.go
index 942bd01..393f0ec 100644
--- a/pkg/hocr/lines.go
+++ b/pkg/hocr/lines.go
@@ -8,6 +8,7 @@ package hocr
 // be sorted easily
 
 import (
+	"fmt"
 	"image"
 	"image/draw"
 	_ "image/jpeg"
@@ -21,7 +22,7 @@ import (
 )
 
 // LineText extracts the text from an OcrLine
-func LineText(l OcrLine) (string) {
+func LineText(l OcrLine) string {
 	linetext := ""
 
 	linetext = l.Text
@@ -53,37 +54,81 @@ func LineText(l OcrLine) (string) {
 	return linetext
 }
 
-func parseLineDetails(h Hocr, i *image.Gray, name string) (line.Details, error) {
+// pageImage loads the image named in an ocr_page title from dir and
+// converts it to grayscale. It returns a nil image and a nil error if
+// the title names no image, or if the image cannot be opened or
+// decoded, so the text of the page can still be used without it.
+func pageImage(title string, dir string) (*image.Gray, error) {
+	imgpath, err := imagePath(title)
+	if err != nil {
+		return nil, err
+	}
+	if imgpath == "" {
+		return nil, nil
+	}
+	imgpath = filepath.Join(dir, filepath.Base(imgpath))
+
+	pngf, err := os.Open(imgpath)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Warning: error opening image %s: %v\n", imgpath, err)
+		return nil, nil
+	}
+	// Closed on return from this function, so at most one page image
+	// is open at a time however many pages the hocr has.
+	defer pngf.Close()
+
+	img, _, err := image.Decode(pngf)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Warning: error decoding image %s: %v\n", imgpath, err)
+		return nil, nil
+	}
+	b := img.Bounds()
+	gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy()))
+	draw.Draw(gray, b, img, b.Min, draw.Src)
+	return gray, nil
+}
+
+// parseLineDetails builds a line.Detail for each line of each page in
+// h. Page images are looked up in dir, and name is recorded as the
+// OcrName of every line.
+func parseLineDetails(h Hocr, dir string, name string) (line.Details, error) {
 	lines := make(line.Details, 0)
 
-	for _, l := range h.Lines {
-		totalconf := float64(0)
-		num := 0
-		for _, w := range l.Words {
-			c, err := wordConf(w.Title)
-			if err != nil {
-				return lines, err
-			}
-			num++
-			totalconf += c
-		}
-
-		coords, err := BoxCoords(l.Title)
-		if err != nil {
-			return lines, err
-		}
-
-		var ln line.Detail
-		ln.Name = l.Id
-		ln.Avgconf = (totalconf / float64(num)) / 100
-		ln.Text = LineText(l)
-		ln.OcrName = name
-		if i != nil {
-			var imgd line.ImgDirect
-			imgd.Img = i.SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3]))
-			ln.Img = imgd
-		}
-		lines = append(lines, ln)
+	for _, p := range h.Pages {
+		gray, err := pageImage(p.Title, dir)
+		if err != nil {
+			return lines, err
+		}
+
+		for _, l := range p.Lines {
+			totalconf := float64(0)
+			num := 0
+			for _, w := range l.Words {
+				c, err := wordConf(w.Title)
+				if err != nil {
+					return lines, err
+				}
+				num++
+				totalconf += c
+			}
+
+			coords, err := BoxCoords(l.Title)
+			if err != nil {
+				return lines, err
+			}
+
+			var ln line.Detail
+			ln.Name = l.Id
+			ln.Avgconf = (totalconf / float64(num)) / 100
+			ln.Text = LineText(l)
+			ln.OcrName = name
+			if gray != nil {
+				var imgd line.ImgDirect
+				imgd.Img = gray.SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3]))
+				ln.Img = imgd
+			}
+			lines = append(lines, ln)
+		}
 	}
 	return lines, nil
 }
@@ -103,22 +148,8 @@ func GetLineDetails(hocrfn string) (line.Details, error) {
 		return newlines, err
 	}
 
-	var img image.Image
-	var gray *image.Gray
-	pngfn := strings.Replace(hocrfn, ".hocr", ".png", 1)
-	pngf, err := os.Open(pngfn)
-	if err == nil {
-		defer pngf.Close()
-		img, _, err = image.Decode(pngf)
-		if err == nil {
-			b := img.Bounds()
-			gray = image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy()))
-			draw.Draw(gray, b, img, b.Min, draw.Src)
-		}
-	}
-
 	n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1)
-	return parseLineDetails(h, gray, n)
+	return parseLineDetails(h, filepath.Dir(hocrfn), n)
 }
 
 // GetLineBasics parses a hocr file and returns a corresponding
@@ -137,5 +168,5 @@ func GetLineBasics(hocrfn string) (line.Details, error) {
 	}
 
 	n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1)
-	return parseLineDetails(h, nil, n)
+	return parseLineDetails(h, filepath.Dir(hocrfn), n)
 }
-- 
cgit v1.2.1-24-ge1ad