diff options
author | Nick White <git@njw.name> | 2019-10-08 12:52:33 +0100 |
---|---|---|
committer | Nick White <git@njw.name> | 2019-10-08 12:52:33 +0100 |
commit | 7482157a03ed3e9d7f45e54a126b391001f34948 (patch) | |
tree | 52f87b9ca159fe4c04a0349de95ea9de82692b3c /lib/hocr/hocr.go | |
parent | d43c11bf653bfe3c1ad1ed277f1ec08bf155cf98 (diff) |
Separate out bookpipeline from catch-all go.git repo, and rename to rescribe.xyz/bookpipeline
The dependencies from the go.git repo will follow in due course.
Diffstat (limited to 'lib/hocr/hocr.go')
-rw-r--r-- | lib/hocr/hocr.go | 129 |
1 files changed, 0 insertions, 129 deletions
```diff
diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go
deleted file mode 100644
index dcd0494..0000000
--- a/lib/hocr/hocr.go
+++ /dev/null
@@ -1,129 +0,0 @@
-package hocr
-
-import (
-	"encoding/xml"
-	"errors"
-	"io/ioutil"
-	"regexp"
-	"strconv"
-	"strings"
-)
-
-type Hocr struct {
-	Lines []OcrLine `xml:"body>div>div>p>span"`
-}
-
-type OcrLine struct {
-	Class string    `xml:"class,attr"`
-	Id    string    `xml:"id,attr"`
-	Title string    `xml:"title,attr"`
-	Words []OcrWord `xml:"span"`
-	Text  string    `xml:",chardata"`
-}
-
-type OcrWord struct {
-	Class string    `xml:"class,attr"`
-	Id    string    `xml:"id,attr"`
-	Title string    `xml:"title,attr"`
-	Chars []OcrChar `xml:"span"`
-	Text  string    `xml:",chardata"`
-}
-
-type OcrChar struct {
-	Class string    `xml:"class,attr"`
-	Id    string    `xml:"id,attr"`
-	Title string    `xml:"title,attr"`
-	Chars []OcrChar `xml:"span"`
-	Text  string    `xml:",chardata"`
-}
-
-// Returns the confidence for a word based on its x_wconf value
-func wordConf(s string) (float64, error) {
-	re, err := regexp.Compile(`x_wconf ([0-9.]+)`)
-	if err != nil {
-		return 0.0, err
-	}
-	conf := re.FindStringSubmatch(s)
-	return strconv.ParseFloat(conf[1], 64)
-}
-
-func boxCoords(s string) ([4]int, error) {
-	var coords [4]int
-	re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`)
-	if err != nil {
-		return coords, err
-	}
-	coordstr := re.FindStringSubmatch(s)
-	for i := range coords {
-		c, err := strconv.Atoi(coordstr[i+1])
-		if err != nil {
-			return coords, err
-		}
-		coords[i] = c
-	}
-	return coords, nil
-}
-
-func noText(s string) bool {
-	t := strings.Trim(s, " \n")
-	return len(t) == 0
-}
-
-func Parse(b []byte) (Hocr, error) {
-	var hocr Hocr
-
-	err := xml.Unmarshal(b, &hocr)
-	if err != nil {
-		return hocr, err
-	}
-
-	return hocr, nil
-}
-
-func GetText(hocrfn string) (string, error) {
-	var s string
-
-	file, err := ioutil.ReadFile(hocrfn)
-	if err != nil {
-		return s, err
-	}
-
-	h, err := Parse(file)
-	if err != nil {
-		return s, err
-	}
-
-
-	for _, l := range h.Lines {
-		s += getLineText(l)
-	}
-	return s, nil
-}
-
-func GetAvgConf(hocrfn string) (float64, error) {
-	file, err := ioutil.ReadFile(hocrfn)
-	if err != nil {
-		return 0, err
-	}
-
-	h, err := Parse(file)
-	if err != nil {
-		return 0, err
-	}
-
-	var total, num float64
-	for _, l := range h.Lines {
-		for _, w := range l.Words {
-			c, err := wordConf(w.Title)
-			if err != nil {
-				return 0, err
-			}
-			total += c
-			num++
-		}
-	}
-	if num == 0 {
-		return 0, errors.New("No words found")
-	}
-	return total / num, nil
-}
```