From d256f967a26ceeb7c3987a1fc447b126a35054f9 Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.name>
Date: Wed, 23 Jan 2019 20:47:33 +0000
Subject: Separate out hocr parts from line parts

---
 hocr/hocr.go                                     | 188 -----------------------
 line-conf-buckets-tess/line-conf-buckets-tess.go |   5 +-
 parse/hocr/hocr.go                               | 147 ++++++++++++++++++
 parse/line.go                                    |  51 ++++++
 4 files changed, 201 insertions(+), 190 deletions(-)
 delete mode 100644 hocr/hocr.go
 create mode 100644 parse/hocr/hocr.go
 create mode 100644 parse/line.go

diff --git a/hocr/hocr.go b/hocr/hocr.go
deleted file mode 100644
index 0c1295c..0000000
--- a/hocr/hocr.go
+++ /dev/null
@@ -1,188 +0,0 @@
-package hocr
-
-// TODO: separate out linedetail to a general structure that can incorporate
-//       line-conf-buckets too, in a different file (and rename package to
-//       something more generic). Do this using the CopyableLine interface
-// TODO: Parse line name to zero pad line numbers, so they come out in the correct order
-
-import (
-	"encoding/xml"
-	"image"
-	"image/png"
-	"io"
-	"regexp"
-	"strconv"
-	"strings"
-)
-
-// TODO: move the linedetail stuff out to a separate file, and create a new
-//       CopyableLine implementing struct for ocropy, which will just store
-//       a file location
-type LineDetail struct {
-	Name string
-	Avgconf float64
-	Img CopyableLine
-	Text string
-	Hocrname string
-}
-
-type CopyableLine interface {
-	CopyLineTo(io.Writer) (error)
-}
-
-type ImgDirect struct {
-	img image.Image
-}
-
-func (i ImgDirect) CopyLineTo(w io.Writer) (error) {
-	err := png.Encode(w, i.img)
-	if err != nil {
-		return err
-	}
-	return nil
-}
-
-type LineDetails []LineDetail
-
-// Used by sort.Sort.
-func (l LineDetails) Len() int { return len(l) }
-
-// Used by sort.Sort.
-func (l LineDetails) Less(i, j int) bool {
-	return l[i].Avgconf < l[j].Avgconf
-}
-
-// Used by sort.Sort.
-func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
-
-type Hocr struct {
-	Lines []OcrLine `xml:"body>div>div>p>span"`
-}
-
-type OcrLine struct {
-	Class string `xml:"class,attr"`
-	Id string `xml:"id,attr"`
-	Title string `xml:"title,attr"`
-	Words []OcrWord `xml:"span"`
-	Text string `xml:",chardata"`
-}
-
-type OcrWord struct {
-	Class string `xml:"class,attr"`
-	Id string `xml:"id,attr"`
-	Title string `xml:"title,attr"`
-	Chars []OcrChar `xml:"span"`
-	Text string `xml:",chardata"`
-}
-
-type OcrChar struct {
-	Class string `xml:"class,attr"`
-	Id string `xml:"id,attr"`
-	Title string `xml:"title,attr"`
-	Chars []OcrChar `xml:"span"`
-	Text string `xml:",chardata"`
-}
-
-// Returns the confidence for a word based on its x_wconf value
-func wordConf(s string) (float64, error) {
-	re, err := regexp.Compile(`x_wconf ([0-9.]+)`)
-	if err != nil {
-		return 0.0, err
-	}
-	conf := re.FindStringSubmatch(s)
-	return strconv.ParseFloat(conf[1], 64)
-}
-
-func boxCoords(s string) ([4]int, error) {
-	var coords [4]int
-	re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`)
-	if err != nil {
-		return coords, err
-	}
-	coordstr := re.FindStringSubmatch(s)
-	for i := range coords {
-		c, err := strconv.Atoi(coordstr[i+1])
-		if err != nil {
-			return coords, err
-		}
-		coords[i] = c
-	}
-	return coords, nil
-}
-
-func noText(s string) bool {
-	t := strings.Trim(s, " \n")
-	return len(t) == 0
-}
-
-func Parse(b []byte) (Hocr, error) {
-	var hocr Hocr
-
-	err := xml.Unmarshal(b, &hocr)
-	if err != nil {
-		return hocr, err
-	}
-
-	return hocr, nil
-}
-
-func GetLineDetails(h Hocr, i image.Image, name string) (LineDetails, error) {
-	lines := make(LineDetails, 0)
-
-	for _, l := range h.Lines {
-		totalconf := float64(0)
-		num := 0
-		for _, w := range l.Words {
-			c, err := wordConf(w.Title)
-			if err != nil {
-				return lines, err
-			}
-			num++
-			totalconf += c
-		}
-
-		coords, err := boxCoords(l.Title)
-		if err != nil {
-			return lines, err
-		}
-
-		var line LineDetail
-		line.Name = l.Id
-		line.Avgconf = (totalconf/float64(num)) / 100
-		linetext := ""
-
-		linetext = l.Text
-		if(noText(linetext)) {
-			linetext = ""
-			for _, w := range l.Words {
-				if(w.Class != "ocrx_word") {
-					continue
-				}
-				linetext += w.Text + " "
-			}
-		}
-		if(noText(linetext)) {
-			linetext = ""
-			for _, w := range l.Words {
-				if(w.Class != "ocrx_word") {
-					continue
-				}
-				for _, c := range w.Chars {
-					if(c.Class != "ocrx_cinfo") {
-						continue
-					}
-					linetext += c.Text
-				}
-				linetext += " "
-			}
-		}
-		line.Text = strings.TrimRight(linetext, " ")
-		line.Text += "\n"
-		line.Hocrname = name
-		var imgd ImgDirect
-		imgd.img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3]))
-		line.Img = imgd
-		lines = append(lines, line)
-	}
-	return lines, nil
-}
diff --git a/line-conf-buckets-tess/line-conf-buckets-tess.go b/line-conf-buckets-tess/line-conf-buckets-tess.go
index facd108..b24bdec 100644
--- a/line-conf-buckets-tess/line-conf-buckets-tess.go
+++ b/line-conf-buckets-tess/line-conf-buckets-tess.go
@@ -17,7 +17,8 @@ import (
 	"strconv"
 	"strings"
 
-	"git.rescribe.xyz/testingtools/hocr"
+	"git.rescribe.xyz/testingtools/parse"
+	"git.rescribe.xyz/testingtools/parse/hocr"
 )
 
 func main() {
@@ -33,7 +34,7 @@ func main() {
 		os.Exit(1)
 	}
 
-	lines := make(hocr.LineDetails, 0)
+	lines := make(parse.LineDetails, 0)
 
 	for _, f := range flag.Args() {
 		file, err := ioutil.ReadFile(f)
diff --git a/parse/hocr/hocr.go b/parse/hocr/hocr.go
new file mode 100644
index 0000000..a281a7a
--- /dev/null
+++ b/parse/hocr/hocr.go
@@ -0,0 +1,147 @@
+package hocr
+
+// TODO: consider making GetLineDetails() a function of Hocr, so could do a
+//       similar thing with prob format files too.
+// TODO: Parse line name to zero pad line numbers, so they come out in the correct order
+
+import (
+	"encoding/xml"
+	"image"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"git.rescribe.xyz/testingtools/parse"
+)
+
+type Hocr struct { // Hocr is the structure Parse unmarshals an hOCR XML document into.
+	Lines []OcrLine `xml:"body>div>div>p>span"` // one entry per span found at body>div>div>p
+}
+
+type OcrLine struct { // OcrLine is one span at body>div>div>p, as collected in Hocr.Lines.
+	Class string `xml:"class,attr"`
+	Id string `xml:"id,attr"` // becomes the line's Name in GetLineDetails
+	Title string `xml:"title,attr"` // GetLineDetails reads the "bbox" coordinates from this
+	Words []OcrWord `xml:"span"` // spans nested directly inside the line
+	Text string `xml:",chardata"` // character data directly inside the line span
+}
+
+type OcrWord struct { // OcrWord is a span nested directly inside an OcrLine.
+	Class string `xml:"class,attr"` // the text fallbacks in GetLineDetails skip words whose class is not "ocrx_word"
+	Id string `xml:"id,attr"`
+	Title string `xml:"title,attr"` // GetLineDetails reads the "x_wconf" confidence from this
+	Chars []OcrChar `xml:"span"` // spans nested directly inside the word
+	Text string `xml:",chardata"` // character data directly inside the word span
+}
+
+type OcrChar struct { // OcrChar is a span nested inside an OcrWord, or inside another OcrChar.
+	Class string `xml:"class,attr"` // GetLineDetails only takes text from chars whose class is "ocrx_cinfo"
+	Id string `xml:"id,attr"`
+	Title string `xml:"title,attr"`
+	Chars []OcrChar `xml:"span"` // further nested spans; not read by GetLineDetails
+	Text string `xml:",chardata"` // character data directly inside the span
+}
+
+// wordConf returns the confidence for a word based on the x_wconf value in s.
+func wordConf(s string) (float64, error) {
+	re, err := regexp.Compile(`x_wconf ([0-9.]+)`)
+	if err != nil {
+		return 0.0, err
+	}
+	conf := re.FindStringSubmatch(s) // nil if s contains no "x_wconf <number>"
+	return strconv.ParseFloat(conf[1], 64) // NOTE(review): conf[1] panics when there was no match
+}
+
+func boxCoords(s string) ([4]int, error) { // boxCoords extracts the four integers that follow "bbox" in s.
+	var coords [4]int
+	re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`)
+	if err != nil {
+		return coords, err
+	}
+	coordstr := re.FindStringSubmatch(s) // [0] is the whole match, [1] to [4] the numbers; nil if no match
+	for i := range coords {
+		c, err := strconv.Atoi(coordstr[i+1]) // NOTE(review): panics when s had no bbox (coordstr is nil)
+		if err != nil {
+			return coords, err // coords may be partially filled here
+		}
+		coords[i] = c
+	}
+	return coords, nil
+}
+
+func noText(s string) bool { // noText reports whether s is empty once spaces and newlines are trimmed.
+	t := strings.Trim(s, " \n") // only ' ' and '\n' are stripped; tabs and '\r' count as text
+	return len(t) == 0
+}
+
+func Parse(b []byte) (Hocr, error) { // Parse unmarshals the XML in b into a Hocr.
+	var hocr Hocr
+
+	err := xml.Unmarshal(b, &hocr)
+	if err != nil {
+		return hocr, err // hocr is returned in whatever state xml.Unmarshal left it
+	}
+
+	return hocr, nil
+}
+
+func GetLineDetails(h Hocr, i image.Image, name string) (parse.LineDetails, error) { // builds one parse.LineDetail per line in h
+	lines := make(parse.LineDetails, 0)
+
+	for _, l := range h.Lines {
+		totalconf := float64(0)
+		num := 0
+		for _, w := range l.Words { // sum x_wconf over every child span, whatever its class
+			c, err := wordConf(w.Title)
+			if err != nil {
+				return lines, err // lines built so far are returned alongside the error
+			}
+			num++
+			totalconf += c
+		}
+
+		coords, err := boxCoords(l.Title)
+		if err != nil {
+			return lines, err
+		}
+
+		var line parse.LineDetail
+		line.Name = l.Id
+		line.Avgconf = (totalconf/float64(num)) / 100 // mean x_wconf / 100; NOTE(review): NaN (0/0) for a line with no words
+		linetext := ""
+
+		linetext = l.Text // first choice: character data directly inside the line span
+		if(noText(linetext)) { // fallback 1: join the text of the ocrx_word spans with spaces
+			linetext = ""
+			for _, w := range l.Words {
+				if(w.Class != "ocrx_word") {
+					continue
+				}
+				linetext += w.Text + " "
+			}
+		}
+		if(noText(linetext)) { // fallback 2: build each word from its ocrx_cinfo character spans
+			linetext = ""
+			for _, w := range l.Words {
+				if(w.Class != "ocrx_word") {
+					continue
+				}
+				for _, c := range w.Chars {
+					if(c.Class != "ocrx_cinfo") {
+						continue
+					}
+					linetext += c.Text
+				}
+				linetext += " "
+			}
+		}
+		line.Text = strings.TrimRight(linetext, " ") // drop the trailing separator spaces
+		line.Text += "\n"
+		line.Hocrname = name
+		var imgd parse.ImgDirect
+		imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) // NOTE(review): unchecked assertion, panics unless i is *image.Gray
+		line.Img = imgd
+		lines = append(lines, line)
+	}
+	return lines, nil
+}
diff --git a/parse/line.go b/parse/line.go
new file mode 100644
index 0000000..3ddde76
--- /dev/null
+++ b/parse/line.go
@@ -0,0 +1,51 @@
+package parse
+
+// TODO: integrate in line-conf-buckets linedetail
+// TODO: add BucketUp() function here that does what both line-conf-buckets-tess.go
+//       and line-conf-buckets.go do
+// TODO: consider naming this package line, and separating it from hocr and prob
+
+import (
+	"image"
+	"image/png"
+	"io"
+)
+
+type LineDetail struct { // LineDetail holds the details recorded for a single line.
+	Name string
+	Avgconf float64 // the value LineDetails.Less orders lines by
+	Img CopyableLine // written out through its CopyLineTo method
+	Text string
+	Hocrname string
+}
+
+type CopyableLine interface { // CopyableLine is anything that can copy a line to an io.Writer.
+	CopyLineTo(io.Writer) (error)
+}
+
+// ImgDirect is an implementation of the CopyableLine interface that
+// stores the image directly as an image.Image.
+type ImgDirect struct {
+	Img image.Image
+}
+
+func (i ImgDirect) CopyLineTo(w io.Writer) (error) { // CopyLineTo writes i.Img to w, encoded as PNG.
+	err := png.Encode(w, i.Img)
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+type LineDetails []LineDetail // has Len, Less and Swap methods so it can be passed to sort.Sort
+
+// Len returns the number of lines in l. Used by sort.Sort.
+func (l LineDetails) Len() int { return len(l) }
+
+// Less reports whether line i has a lower Avgconf than line j. Used by sort.Sort.
+func (l LineDetails) Less(i, j int) bool {
+	return l[i].Avgconf < l[j].Avgconf
+}
+
+// Swap exchanges lines i and j. Used by sort.Sort.
+func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
-- 
cgit v1.2.1-24-ge1ad