summaryrefslogtreecommitdiff
path: root/hocr/hocr.go
diff options
context:
space:
mode:
authorNick White <git@njw.name>2019-01-23 20:47:33 +0000
committerNick White <git@njw.name>2019-01-23 20:47:33 +0000
commitd256f967a26ceeb7c3987a1fc447b126a35054f9 (patch)
treef80c1e3e2c3757c59ad51dec98de5b6e82a426fa /hocr/hocr.go
parentc41aa16d8a3d35ce4185184ee50536bf2089a120 (diff)
Separate out hocr parts from line parts
Diffstat (limited to 'hocr/hocr.go')
-rw-r--r--hocr/hocr.go188
1 files changed, 0 insertions, 188 deletions
diff --git a/hocr/hocr.go b/hocr/hocr.go
deleted file mode 100644
index 0c1295c..0000000
--- a/hocr/hocr.go
+++ /dev/null
@@ -1,188 +0,0 @@
-package hocr
-
-// TODO: separate out linedetail to a general structure that can incorporate
-// line-conf-buckets too, in a different file (and rename package to
-// something more generic). Do this using the CopyableLine interface
-// TODO: Parse line name to zero pad line numbers, so they come out in the correct order
-
-import (
- "encoding/xml"
- "image"
- "image/png"
- "io"
- "regexp"
- "strconv"
- "strings"
-)
-
-// TODO: move the linedetail stuff out to a separate file, and create a new
-// CopyableLine implementing struct for ocropy, which will just store
-// a file location
-type LineDetail struct {
- Name string
- Avgconf float64
- Img CopyableLine
- Text string
- Hocrname string
-}
-
// CopyableLine is implemented by any line representation that can write
// itself out to a writer.
type CopyableLine interface {
	// CopyLineTo writes the line to w and returns any error encountered.
	CopyLineTo(w io.Writer) error
}
-
// ImgDirect keeps a line's image in memory; its CopyLineTo method has the
// signature required by CopyableLine.
type ImgDirect struct {
	img image.Image
}

// CopyLineTo encodes the held image as PNG and writes it to w, returning
// whatever error the encoder reports.
func (i ImgDirect) CopyLineTo(w io.Writer) error {
	return png.Encode(w, i.img)
}
-
-type LineDetails []LineDetail
-
-// Used by sort.Sort.
-func (l LineDetails) Len() int { return len(l) }
-
-// Used by sort.Sort.
-func (l LineDetails) Less(i, j int) bool {
- return l[i].Avgconf < l[j].Avgconf
-}
-
-// Used by sort.Sort.
-func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
-
-type Hocr struct {
- Lines []OcrLine `xml:"body>div>div>p>span"`
-}
-
-type OcrLine struct {
- Class string `xml:"class,attr"`
- Id string `xml:"id,attr"`
- Title string `xml:"title,attr"`
- Words []OcrWord `xml:"span"`
- Text string `xml:",chardata"`
-}
-
-type OcrWord struct {
- Class string `xml:"class,attr"`
- Id string `xml:"id,attr"`
- Title string `xml:"title,attr"`
- Chars []OcrChar `xml:"span"`
- Text string `xml:",chardata"`
-}
-
// OcrChar is a span element nested inside a word. It may itself contain
// further nested spans, which are decoded recursively.
type OcrChar struct {
	// Class is the element's class attribute.
	Class string `xml:"class,attr"`
	// Id is the element's id attribute.
	Id string `xml:"id,attr"`
	// Title is the element's title attribute.
	Title string `xml:"title,attr"`
	// Chars holds the span elements nested directly inside this one.
	Chars []OcrChar `xml:"span"`
	// Text is the character data found directly inside the element.
	Text string `xml:",chardata"`
}
-
// wordConfRe captures the numeric value that follows "x_wconf " in a title
// attribute. It is compiled once rather than on every wordConf call.
var wordConfRe = regexp.MustCompile(`x_wconf ([0-9.]+)`)

// missingConfError is returned by wordConf when a title attribute contains
// no x_wconf value.
type missingConfError struct {
	title string
}

// Error implements the error interface.
func (e missingConfError) Error() string {
	return "hocr: no x_wconf value in title " + strconv.Quote(e.title)
}

// wordConf returns the confidence for a word based on the x_wconf value in
// its title attribute s.
//
// It returns a missingConfError when s has no x_wconf value, and the
// strconv.ParseFloat error when the captured value is not a valid number
// (for example "1.2.3").
func wordConf(s string) (float64, error) {
	m := wordConfRe.FindStringSubmatch(s)
	if m == nil {
		// No match: indexing m[1] here would panic.
		return 0, missingConfError{title: s}
	}
	return strconv.ParseFloat(m[1], 64)
}
-
// bboxRe captures the four integers that follow "bbox " in a title
// attribute. It is compiled once rather than on every boxCoords call.
var bboxRe = regexp.MustCompile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`)

// missingBboxError is returned by boxCoords when a title attribute contains
// no complete bbox property.
type missingBboxError struct {
	title string
}

// Error implements the error interface.
func (e missingBboxError) Error() string {
	return "hocr: no bbox in title " + strconv.Quote(e.title)
}

// boxCoords returns the four integers of the bbox property in the title
// attribute s, in the order they appear.
//
// It returns a missingBboxError when s has no bbox with four integers, and
// the strconv.Atoi error when a value does not fit in an int. The returned
// array is all zeros whenever the error is non-nil.
func boxCoords(s string) ([4]int, error) {
	var coords [4]int
	m := bboxRe.FindStringSubmatch(s)
	if m == nil {
		// No match: indexing m[i+1] below would panic.
		return coords, missingBboxError{title: s}
	}
	for i := range coords {
		c, err := strconv.Atoi(m[i+1])
		if err != nil {
			return [4]int{}, err
		}
		coords[i] = c
	}
	return coords, nil
}
-
// noText reports whether s consists of nothing but spaces and newlines
// (or is empty).
func noText(s string) bool {
	return strings.Trim(s, " \n") == ""
}
-
-func Parse(b []byte) (Hocr, error) {
- var hocr Hocr
-
- err := xml.Unmarshal(b, &hocr)
- if err != nil {
- return hocr, err
- }
-
- return hocr, nil
-}
-
-func GetLineDetails(h Hocr, i image.Image, name string) (LineDetails, error) {
- lines := make(LineDetails, 0)
-
- for _, l := range h.Lines {
- totalconf := float64(0)
- num := 0
- for _, w := range l.Words {
- c, err := wordConf(w.Title)
- if err != nil {
- return lines, err
- }
- num++
- totalconf += c
- }
-
- coords, err := boxCoords(l.Title)
- if err != nil {
- return lines, err
- }
-
- var line LineDetail
- line.Name = l.Id
- line.Avgconf = (totalconf/float64(num)) / 100
- linetext := ""
-
- linetext = l.Text
- if(noText(linetext)) {
- linetext = ""
- for _, w := range l.Words {
- if(w.Class != "ocrx_word") {
- continue
- }
- linetext += w.Text + " "
- }
- }
- if(noText(linetext)) {
- linetext = ""
- for _, w := range l.Words {
- if(w.Class != "ocrx_word") {
- continue
- }
- for _, c := range w.Chars {
- if(c.Class != "ocrx_cinfo") {
- continue
- }
- linetext += c.Text
- }
- linetext += " "
- }
- }
- line.Text = strings.TrimRight(linetext, " ")
- line.Text += "\n"
- line.Hocrname = name
- var imgd ImgDirect
- imgd.img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3]))
- line.Img = imgd
- lines = append(lines, line)
- }
- return lines, nil
-}