diff options
author | Nick White <git@njw.name> | 2019-10-08 15:49:52 +0100 |
---|---|---|
committer | Nick White <git@njw.name> | 2019-10-08 15:49:52 +0100 |
commit | 69aae6b93dcadd9e4895f86fe661ee80e79dcf9e (patch) | |
tree | bfc42b832a6d6e3051631eb6cf530e1400b2c080 /pkg/hocr/hocr.go | |
parent | d43c11bf653bfe3c1ad1ed277f1ec08bf155cf98 (diff) |
Remove parts that have been moved elsewhere, and rename to rescribe.xyz/utils
bookpipeline is now at rescribe.xyz/bookpipeline
preproc is now at rescribe.xyz/preproc
integralimg is now at rescribe.xyz/preproc/integralimg
Diffstat (limited to 'pkg/hocr/hocr.go')
-rw-r--r-- | pkg/hocr/hocr.go | 129 |
1 files changed, 129 insertions, 0 deletions
diff --git a/pkg/hocr/hocr.go b/pkg/hocr/hocr.go new file mode 100644 index 0000000..dcd0494 --- /dev/null +++ b/pkg/hocr/hocr.go @@ -0,0 +1,129 @@ +package hocr + +import ( + "encoding/xml" + "errors" + "io/ioutil" + "regexp" + "strconv" + "strings" +) + +type Hocr struct { + Lines []OcrLine `xml:"body>div>div>p>span"` +} + +type OcrLine struct { + Class string `xml:"class,attr"` + Id string `xml:"id,attr"` + Title string `xml:"title,attr"` + Words []OcrWord `xml:"span"` + Text string `xml:",chardata"` +} + +type OcrWord struct { + Class string `xml:"class,attr"` + Id string `xml:"id,attr"` + Title string `xml:"title,attr"` + Chars []OcrChar `xml:"span"` + Text string `xml:",chardata"` +} + +type OcrChar struct { + Class string `xml:"class,attr"` + Id string `xml:"id,attr"` + Title string `xml:"title,attr"` + Chars []OcrChar `xml:"span"` + Text string `xml:",chardata"` +} + +// Returns the confidence for a word based on its x_wconf value +func wordConf(s string) (float64, error) { + re, err := regexp.Compile(`x_wconf ([0-9.]+)`) + if err != nil { + return 0.0, err + } + conf := re.FindStringSubmatch(s) + return strconv.ParseFloat(conf[1], 64) +} + +func boxCoords(s string) ([4]int, error) { + var coords [4]int + re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`) + if err != nil { + return coords, err + } + coordstr := re.FindStringSubmatch(s) + for i := range coords { + c, err := strconv.Atoi(coordstr[i+1]) + if err != nil { + return coords, err + } + coords[i] = c + } + return coords, nil +} + +func noText(s string) bool { + t := strings.Trim(s, " \n") + return len(t) == 0 +} + +func Parse(b []byte) (Hocr, error) { + var hocr Hocr + + err := xml.Unmarshal(b, &hocr) + if err != nil { + return hocr, err + } + + return hocr, nil +} + +func GetText(hocrfn string) (string, error) { + var s string + + file, err := ioutil.ReadFile(hocrfn) + if err != nil { + return s, err + } + + h, err := Parse(file) + if err != nil { + return s, err + } + + + for _, l := range h.Lines { + s += getLineText(l) + } + return s, nil +} + +func GetAvgConf(hocrfn string) (float64, error) { + file, err := ioutil.ReadFile(hocrfn) + if err != nil { + return 0, err + } + + h, err := Parse(file) + if err != nil { + return 0, err + } + + var total, num float64 + for _, l := range h.Lines { + for _, w := range l.Words { + c, err := wordConf(w.Title) + if err != nil { + return 0, err + } + total += c + num++ + } + } + if num == 0 { + return 0, errors.New("No words found") + } + return total / num, nil +} |