summary refs log tree commit diff
path: root/parse
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-01-23 20:47:33 +0000
committer Nick White <git@njw.name> 2019-01-23 20:47:33 +0000
commit d256f967a26ceeb7c3987a1fc447b126a35054f9 (patch)
tree f80c1e3e2c3757c59ad51dec98de5b6e82a426fa /parse
parent c41aa16d8a3d35ce4185184ee50536bf2089a120 (diff)
Separate out hocr parts from line parts
Diffstat (limited to 'parse')
-rw-r--r-- parse/hocr/hocr.go 147
-rw-r--r-- parse/line.go 51
2 files changed, 198 insertions, 0 deletions
diff --git a/parse/hocr/hocr.go b/parse/hocr/hocr.go
new file mode 100644
index 0000000..a281a7a
--- /dev/null
+++ b/parse/hocr/hocr.go
@@ -0,0 +1,147 @@
+package hocr
+
+// TODO: consider making GetLineDetails() a method of Hocr, so that we could do a
+// similar thing with prob format files too.
+// TODO: Parse line name to zero pad line numbers, so they come out in the correct order
+
import (
	"encoding/xml"
	"fmt"
	"image"
	"regexp"
	"strconv"
	"strings"

	"git.rescribe.xyz/testingtools/parse"
)
+
+type Hocr struct {
+ Lines []OcrLine `xml:"body>div>div>p>span"`
+}
+
+type OcrLine struct {
+ Class string `xml:"class,attr"`
+ Id string `xml:"id,attr"`
+ Title string `xml:"title,attr"`
+ Words []OcrWord `xml:"span"`
+ Text string `xml:",chardata"`
+}
+
+type OcrWord struct {
+ Class string `xml:"class,attr"`
+ Id string `xml:"id,attr"`
+ Title string `xml:"title,attr"`
+ Chars []OcrChar `xml:"span"`
+ Text string `xml:",chardata"`
+}
+
// OcrChar is one span nested inside a word. Class, Id and Title mirror the
// span's attributes of the same name and Text holds the span's own character
// data. Chars holds any further spans nested inside this one, so the type is
// recursive.
type OcrChar struct {
	Class string    `xml:"class,attr"`
	Id    string    `xml:"id,attr"`
	Title string    `xml:"title,attr"`
	Chars []OcrChar `xml:"span"`
	Text  string    `xml:",chardata"`
}
+
// wordConfRe matches an x_wconf property and captures its numeric value.
// It is compiled once at package initialisation rather than on every call.
var wordConfRe = regexp.MustCompile(`x_wconf ([0-9.]+)`)

// wordConf returns the confidence for a word based on the x_wconf value
// found in s (the word's title attribute).
//
// It returns an error when s contains no x_wconf property, or when the
// captured value is not a valid float (for example "1.2.3").
func wordConf(s string) (float64, error) {
	m := wordConfRe.FindStringSubmatch(s)
	if m == nil {
		// FindStringSubmatch returns nil on no match; indexing it would panic.
		return 0, fmt.Errorf("no x_wconf value found in %q", s)
	}
	return strconv.ParseFloat(m[1], 64)
}
+
// bboxRe matches a bbox property followed by four space-separated
// non-negative integers. It is compiled once at package initialisation
// rather than on every call.
var bboxRe = regexp.MustCompile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`)

// boxCoords extracts the four integers of the first bbox property found in
// s (an element's title attribute), in the order in which they appear.
//
// It returns an error, together with a zeroed array, when s contains no
// bbox property or when one of the numbers does not fit in an int.
func boxCoords(s string) ([4]int, error) {
	var coords [4]int
	m := bboxRe.FindStringSubmatch(s)
	if m == nil {
		// FindStringSubmatch returns nil on no match; indexing it would panic.
		return coords, fmt.Errorf("no bbox found in %q", s)
	}
	for i := range coords {
		// m[0] is the whole match, so the captured numbers start at m[1].
		c, err := strconv.Atoi(m[i+1])
		if err != nil {
			return [4]int{}, err
		}
		coords[i] = c
	}
	return coords, nil
}
+
// noText reports whether s is empty or made up solely of spaces and
// newlines.
func noText(s string) bool {
	isContent := func(r rune) bool {
		return r != ' ' && r != '\n'
	}
	// No rune other than a space or newline means there is nothing to keep.
	return strings.IndexFunc(s, isContent) == -1
}
+
+func Parse(b []byte) (Hocr, error) {
+ var hocr Hocr
+
+ err := xml.Unmarshal(b, &hocr)
+ if err != nil {
+ return hocr, err
+ }
+
+ return hocr, nil
+}
+
+func GetLineDetails(h Hocr, i image.Image, name string) (parse.LineDetails, error) {
+ lines := make(parse.LineDetails, 0)
+
+ for _, l := range h.Lines {
+ totalconf := float64(0)
+ num := 0
+ for _, w := range l.Words {
+ c, err := wordConf(w.Title)
+ if err != nil {
+ return lines, err
+ }
+ num++
+ totalconf += c
+ }
+
+ coords, err := boxCoords(l.Title)
+ if err != nil {
+ return lines, err
+ }
+
+ var line parse.LineDetail
+ line.Name = l.Id
+ line.Avgconf = (totalconf/float64(num)) / 100
+ linetext := ""
+
+ linetext = l.Text
+ if(noText(linetext)) {
+ linetext = ""
+ for _, w := range l.Words {
+ if(w.Class != "ocrx_word") {
+ continue
+ }
+ linetext += w.Text + " "
+ }
+ }
+ if(noText(linetext)) {
+ linetext = ""
+ for _, w := range l.Words {
+ if(w.Class != "ocrx_word") {
+ continue
+ }
+ for _, c := range w.Chars {
+ if(c.Class != "ocrx_cinfo") {
+ continue
+ }
+ linetext += c.Text
+ }
+ linetext += " "
+ }
+ }
+ line.Text = strings.TrimRight(linetext, " ")
+ line.Text += "\n"
+ line.Hocrname = name
+ var imgd parse.ImgDirect
+ imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3]))
+ line.Img = imgd
+ lines = append(lines, line)
+ }
+ return lines, nil
+}
diff --git a/parse/line.go b/parse/line.go
new file mode 100644
index 0000000..3ddde76
--- /dev/null
+++ b/parse/line.go
@@ -0,0 +1,51 @@
+package parse
+
+// TODO: integrate in line-conf-buckets linedetail
+// TODO: add BucketUp() function here that does what both line-conf-buckets-tess.go
+// and line-conf-buckets.go do
+// TODO: consider naming this package line, and separating it from hocr and prob
+
+import (
+ "image"
+ "image/png"
+ "io"
+)
+
+type LineDetail struct {
+ Name string
+ Avgconf float64
+ Img CopyableLine
+ Text string
+ Hocrname string
+}
+
// CopyableLine is implemented by anything that can write a line to an
// io.Writer.
type CopyableLine interface {
	CopyLineTo(w io.Writer) error
}
+
// ImgDirect is a CopyableLine that keeps the line image in memory as an
// image.Image.
type ImgDirect struct {
	Img image.Image
}

// CopyLineTo writes the held image to w encoded as PNG and returns any error
// reported by the encoder.
func (d ImgDirect) CopyLineTo(w io.Writer) error {
	return png.Encode(w, d.Img)
}
+
+type LineDetails []LineDetail
+
+// Used by sort.Sort.
+func (l LineDetails) Len() int { return len(l) }
+
+// Used by sort.Sort.
+func (l LineDetails) Less(i, j int) bool {
+ return l[i].Avgconf < l[j].Avgconf
+}
+
+// Used by sort.Sort.
+func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] }