From d256f967a26ceeb7c3987a1fc447b126a35054f9 Mon Sep 17 00:00:00 2001 From: Nick White Date: Wed, 23 Jan 2019 20:47:33 +0000 Subject: Separate out hocr parts from line parts --- hocr/hocr.go | 188 ----------------------- line-conf-buckets-tess/line-conf-buckets-tess.go | 5 +- parse/hocr/hocr.go | 147 ++++++++++++++++++ parse/line.go | 51 ++++++ 4 files changed, 201 insertions(+), 190 deletions(-) delete mode 100644 hocr/hocr.go create mode 100644 parse/hocr/hocr.go create mode 100644 parse/line.go diff --git a/hocr/hocr.go b/hocr/hocr.go deleted file mode 100644 index 0c1295c..0000000 --- a/hocr/hocr.go +++ /dev/null @@ -1,188 +0,0 @@ -package hocr - -// TODO: separate out linedetail to a general structure that can incorporate -// line-conf-buckets too, in a different file (and rename package to -// something more generic). Do this using the CopyableLine interface -// TODO: Parse line name to zero pad line numbers, so they come out in the correct order - -import ( - "encoding/xml" - "image" - "image/png" - "io" - "regexp" - "strconv" - "strings" -) - -// TODO: move the linedetail stuff out to a separate file, and create a new -// CopyableLine implementing struct for ocropy, which will just store -// a file location -type LineDetail struct { - Name string - Avgconf float64 - Img CopyableLine - Text string - Hocrname string -} - -type CopyableLine interface { - CopyLineTo(io.Writer) (error) -} - -type ImgDirect struct { - img image.Image -} - -func (i ImgDirect) CopyLineTo(w io.Writer) (error) { - err := png.Encode(w, i.img) - if err != nil { - return err - } - return nil -} - -type LineDetails []LineDetail - -// Used by sort.Sort. -func (l LineDetails) Len() int { return len(l) } - -// Used by sort.Sort. -func (l LineDetails) Less(i, j int) bool { - return l[i].Avgconf < l[j].Avgconf -} - -// Used by sort.Sort. -func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] } - -type Hocr struct { - Lines []OcrLine `xml:"body>div>div>p>span"` -} - -type OcrLine struct { - Class string `xml:"class,attr"` - Id string `xml:"id,attr"` - Title string `xml:"title,attr"` - Words []OcrWord `xml:"span"` - Text string `xml:",chardata"` -} - -type OcrWord struct { - Class string `xml:"class,attr"` - Id string `xml:"id,attr"` - Title string `xml:"title,attr"` - Chars []OcrChar `xml:"span"` - Text string `xml:",chardata"` -} - -type OcrChar struct { - Class string `xml:"class,attr"` - Id string `xml:"id,attr"` - Title string `xml:"title,attr"` - Chars []OcrChar `xml:"span"` - Text string `xml:",chardata"` -} - -// Returns the confidence for a word based on its x_wconf value -func wordConf(s string) (float64, error) { - re, err := regexp.Compile(`x_wconf ([0-9.]+)`) - if err != nil { - return 0.0, err - } - conf := re.FindStringSubmatch(s) - return strconv.ParseFloat(conf[1], 64) -} - -func boxCoords(s string) ([4]int, error) { - var coords [4]int - re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`) - if err != nil { - return coords, err - } - coordstr := re.FindStringSubmatch(s) - for i := range coords { - c, err := strconv.Atoi(coordstr[i+1]) - if err != nil { - return coords, err - } - coords[i] = c - } - return coords, nil -} - -func noText(s string) bool { - t := strings.Trim(s, " \n") - return len(t) == 0 -} - -func Parse(b []byte) (Hocr, error) { - var hocr Hocr - - err := xml.Unmarshal(b, &hocr) - if err != nil { - return hocr, err - } - - return hocr, nil -} - -func GetLineDetails(h Hocr, i image.Image, name string) (LineDetails, error) { - lines := make(LineDetails, 0) - - for _, l := range h.Lines { - totalconf := float64(0) - num := 0 - for _, w := range l.Words { - c, err := wordConf(w.Title) - if err != nil { - return lines, err - } - num++ - totalconf += c - } - - coords, err := boxCoords(l.Title) - if err != nil { - return lines, err - } - - var line LineDetail - line.Name = l.Id - line.Avgconf = (totalconf/float64(num)) / 100 - linetext := "" - - linetext = l.Text - if(noText(linetext)) { - linetext = "" - for _, w := range l.Words { - if(w.Class != "ocrx_word") { - continue - } - linetext += w.Text + " " - } - } - if(noText(linetext)) { - linetext = "" - for _, w := range l.Words { - if(w.Class != "ocrx_word") { - continue - } - for _, c := range w.Chars { - if(c.Class != "ocrx_cinfo") { - continue - } - linetext += c.Text - } - linetext += " " - } - } - line.Text = strings.TrimRight(linetext, " ") - line.Text += "\n" - line.Hocrname = name - var imgd ImgDirect - imgd.img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) - line.Img = imgd - lines = append(lines, line) - } - return lines, nil -} diff --git a/line-conf-buckets-tess/line-conf-buckets-tess.go b/line-conf-buckets-tess/line-conf-buckets-tess.go index facd108..b24bdec 100644 --- a/line-conf-buckets-tess/line-conf-buckets-tess.go +++ b/line-conf-buckets-tess/line-conf-buckets-tess.go @@ -17,7 +17,8 @@ import ( "strconv" "strings" - "git.rescribe.xyz/testingtools/hocr" + "git.rescribe.xyz/testingtools/parse" + "git.rescribe.xyz/testingtools/parse/hocr" ) func main() { @@ -33,7 +34,7 @@ func main() { os.Exit(1) } - lines := make(hocr.LineDetails, 0) + lines := make(parse.LineDetails, 0) for _, f := range flag.Args() { file, err := ioutil.ReadFile(f) diff --git a/parse/hocr/hocr.go b/parse/hocr/hocr.go new file mode 100644 index 0000000..a281a7a --- /dev/null +++ b/parse/hocr/hocr.go @@ -0,0 +1,147 @@ +package hocr + +// TODO: consider making GetLineDetails() a function of Hocr, so could do a +// similar thing with prob format files too. +// TODO: Parse line name to zero pad line numbers, so they come out in the correct order + +import ( + "encoding/xml" + "image" + "regexp" + "strconv" + "strings" + + "git.rescribe.xyz/testingtools/parse" +) + +type Hocr struct { + Lines []OcrLine `xml:"body>div>div>p>span"` +} + +type OcrLine struct { + Class string `xml:"class,attr"` + Id string `xml:"id,attr"` + Title string `xml:"title,attr"` + Words []OcrWord `xml:"span"` + Text string `xml:",chardata"` +} + +type OcrWord struct { + Class string `xml:"class,attr"` + Id string `xml:"id,attr"` + Title string `xml:"title,attr"` + Chars []OcrChar `xml:"span"` + Text string `xml:",chardata"` +} + +type OcrChar struct { + Class string `xml:"class,attr"` + Id string `xml:"id,attr"` + Title string `xml:"title,attr"` + Chars []OcrChar `xml:"span"` + Text string `xml:",chardata"` +} + +// Returns the confidence for a word based on its x_wconf value +func wordConf(s string) (float64, error) { + re, err := regexp.Compile(`x_wconf ([0-9.]+)`) + if err != nil { + return 0.0, err + } + conf := re.FindStringSubmatch(s) + return strconv.ParseFloat(conf[1], 64) +} + +func boxCoords(s string) ([4]int, error) { + var coords [4]int + re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`) + if err != nil { + return coords, err + } + coordstr := re.FindStringSubmatch(s) + for i := range coords { + c, err := strconv.Atoi(coordstr[i+1]) + if err != nil { + return coords, err + } + coords[i] = c + } + return coords, nil +} + +func noText(s string) bool { + t := strings.Trim(s, " \n") + return len(t) == 0 +} + +func Parse(b []byte) (Hocr, error) { + var hocr Hocr + + err := xml.Unmarshal(b, &hocr) + if err != nil { + return hocr, err + } + + return hocr, nil +} + +func GetLineDetails(h Hocr, i image.Image, name string) (parse.LineDetails, error) { + lines := make(parse.LineDetails, 0) + + for _, l := range h.Lines { + totalconf := float64(0) + num := 0 + for _, w := range l.Words { + c, err := wordConf(w.Title) + if err != nil { + return lines, err + } + num++ + totalconf += c + } + + coords, err := boxCoords(l.Title) + if err != nil { + return lines, err + } + + var line parse.LineDetail + line.Name = l.Id + line.Avgconf = (totalconf/float64(num)) / 100 + linetext := "" + + linetext = l.Text + if(noText(linetext)) { + linetext = "" + for _, w := range l.Words { + if(w.Class != "ocrx_word") { + continue + } + linetext += w.Text + " " + } + } + if(noText(linetext)) { + linetext = "" + for _, w := range l.Words { + if(w.Class != "ocrx_word") { + continue + } + for _, c := range w.Chars { + if(c.Class != "ocrx_cinfo") { + continue + } + linetext += c.Text + } + linetext += " " + } + } + line.Text = strings.TrimRight(linetext, " ") + line.Text += "\n" + line.Hocrname = name + var imgd parse.ImgDirect + imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) + line.Img = imgd + lines = append(lines, line) + } + return lines, nil +} diff --git a/parse/line.go b/parse/line.go new file mode 100644 index 0000000..3ddde76 --- /dev/null +++ b/parse/line.go @@ -0,0 +1,51 @@ +package parse + +// TODO: integrate in line-conf-buckets linedetail +// TODO: add BucketUp() function here that does what both line-conf-buckets-tess.go +// and line-conf-buckets.go do +// TODO: consider naming this package line, and separating it from hocr and prob + +import ( + "image" + "image/png" + "io" +) + +type LineDetail struct { + Name string + Avgconf float64 + Img CopyableLine + Text string + Hocrname string +} + +type CopyableLine interface { + CopyLineTo(io.Writer) (error) +} + +// This is an implementation of the CopyableLine interface that +// stores the image directly as an image.Image +type ImgDirect struct { + Img image.Image +} + +func (i ImgDirect) CopyLineTo(w io.Writer) (error) { + err := png.Encode(w, i.Img) + if err != nil { + return err + } + return nil +} + +type LineDetails []LineDetail + +// Used by sort.Sort. +func (l LineDetails) Len() int { return len(l) } + +// Used by sort.Sort. +func (l LineDetails) Less(i, j int) bool { + return l[i].Avgconf < l[j].Avgconf +} + +// Used by sort.Sort. +func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] } -- cgit v1.2.1-24-ge1ad