From 30c088b90e7b6a25d93cbdad7564ff063e62afd3 Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.name>
Date: Fri, 25 Jan 2019 09:55:55 +0000
Subject: Reorganisation and cleanup

---
 lib/hocr/hocr.go  |  79 ++++++++++++++++++++++++++++++++++++++++
 lib/hocr/lines.go | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/line/line.go  |  64 ++++++++++++++++++++++++++++++++
 lib/prob/prob.go  |  69 +++++++++++++++++++++++++++++++++++
 4 files changed, 319 insertions(+)
 create mode 100644 lib/hocr/hocr.go
 create mode 100644 lib/hocr/lines.go
 create mode 100644 lib/line/line.go
 create mode 100644 lib/prob/prob.go

(limited to 'lib')

diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go
new file mode 100644
index 0000000..0d10819
--- /dev/null
+++ b/lib/hocr/hocr.go
@@ -0,0 +1,79 @@
+package hocr
+
+import (
+	"encoding/xml"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+type Hocr struct { // Hocr holds what Parse extracts from a hOCR document
+	Lines []OcrLine `xml:"body>div>div>p>span"` // spans found at body>div>div>p
+}
+
+type OcrLine struct { // OcrLine is one of the spans collected in Hocr.Lines
+	Class string    `xml:"class,attr"` // class attribute of the span
+	Id    string    `xml:"id,attr"`    // id attribute of the span
+	Title string    `xml:"title,attr"` // title attribute of the span
+	Words []OcrWord `xml:"span"`       // child spans
+	Text  string    `xml:",chardata"`  // character data inside the span
+}
+
+type OcrWord struct { // OcrWord is one child span of an OcrLine
+	Class string    `xml:"class,attr"` // class attribute of the span
+	Id    string    `xml:"id,attr"`    // id attribute of the span
+	Title string    `xml:"title,attr"` // title attribute of the span
+	Chars []OcrChar `xml:"span"`       // child spans
+	Text  string    `xml:",chardata"`  // character data inside the span
+}
+
+type OcrChar struct { // OcrChar is one child span of an OcrWord or OcrChar
+	Class string    `xml:"class,attr"` // class attribute of the span
+	Id    string    `xml:"id,attr"`    // id attribute of the span
+	Title string    `xml:"title,attr"` // title attribute of the span
+	Chars []OcrChar `xml:"span"`       // child spans
+	Text  string    `xml:",chardata"`  // character data inside the span
+}
+
+var wordConfRe = regexp.MustCompile(`x_wconf ([0-9.]+)`) // compiled once, not per call
+
+// wordConf returns the x_wconf confidence in s, or a *strconv.NumError if s has none.
+func wordConf(s string) (float64, error) {
+	if m := wordConfRe.FindStringSubmatch(s); m != nil {
+		return strconv.ParseFloat(m[1], 64)
+	}
+	return 0, &strconv.NumError{Func: "ParseFloat", Num: s, Err: strconv.ErrSyntax}
+}
+
+var bboxRe = regexp.MustCompile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`)
+
+// boxCoords returns the four bbox numbers in s in order. If s has no bbox,
+// or a number overflows int, it returns a zero array and a *strconv.NumError.
+func boxCoords(s string) (coords [4]int, err error) {
+	m := bboxRe.FindStringSubmatch(s)
+	if m == nil {
+		return coords, &strconv.NumError{Func: "Atoi", Num: s, Err: strconv.ErrSyntax}
+	}
+	for i := range coords {
+		if coords[i], err = strconv.Atoi(m[i+1]); err != nil {
+			return [4]int{}, err
+		}
+	}
+	return coords, nil
+}
+
+// noText reports whether s is empty once spaces and newlines are trimmed.
+func noText(s string) bool {
+	return strings.Trim(s, " \n") == ""
+}
+
+// Parse unmarshals the hOCR XML in b into a Hocr, collecting the spans
+// selected by the xml tag on Hocr.Lines.
+//
+// The error is the one returned by xml.Unmarshal. When it is not nil,
+// the Hocr returned alongside it may be only partly filled in.
+func Parse(b []byte) (Hocr, error) {
+	var parsed Hocr
+	err := xml.Unmarshal(b, &parsed)
+	return parsed, err
+}
diff --git a/lib/hocr/lines.go b/lib/hocr/lines.go
new file mode 100644
index 0000000..4902b40
--- /dev/null
+++ b/lib/hocr/lines.go
@@ -0,0 +1,107 @@
+package hocr
+
+// TODO: Parse line name to zero pad line numbers, so they can
+//       be sorted easily
+// TODO: have same filename format as .prob uses, so include base
+//       dirname, and don't include line numbers if there's only
+//       one line in the hocr
+
+import (
+	"image"
+	"image/png"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"git.rescribe.xyz/testingtools/lib/line"
+)
+
+// parseLineDetails returns a line.Detail for each line of h: its id, the
+// average x_wconf of its words divided by 100, its text (from the line
+// itself, else from its ocrx_word spans, else from their ocrx_cinfo spans)
+// and the part of i inside its bbox. name is stored as each OcrName. i may
+// be any image with a SubImage method, not only *image.Gray; for others an
+// error is returned. A line without words gets Avgconf 0 rather than NaN.
+func parseLineDetails(h Hocr, i image.Image, name string) (line.Details, error) {
+	lines := make(line.Details, 0, len(h.Lines))
+	img, ok := i.(interface{ SubImage(image.Rectangle) image.Image })
+	if !ok {
+		return lines, png.UnsupportedError("image type without SubImage")
+	}
+
+	for _, l := range h.Lines {
+		total := 0.0
+		for _, w := range l.Words {
+			c, err := wordConf(w.Title)
+			if err != nil {
+				return lines, err
+			}
+			total += c
+		}
+		coords, err := boxCoords(l.Title)
+		if err != nil {
+			return lines, err
+		}
+
+		ln := line.Detail{Name: l.Id, OcrName: name}
+		if n := len(l.Words); n > 0 { // 0/0 would make Avgconf NaN
+			ln.Avgconf = total / float64(n) / 100
+		}
+		text := l.Text
+		if noText(text) {
+			text = ""
+			for _, w := range l.Words {
+				if w.Class == "ocrx_word" {
+					text += w.Text + " "
+				}
+			}
+		}
+		if noText(text) {
+			text = ""
+			for _, w := range l.Words {
+				if w.Class != "ocrx_word" {
+					continue
+				}
+				for _, c := range w.Chars {
+					if c.Class == "ocrx_cinfo" {
+						text += c.Text
+					}
+				}
+				text += " "
+			}
+		}
+		ln.Text = strings.TrimRight(text, " ") + "\n"
+		ln.Img = line.ImgDirect{Img: img.SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3]))}
+		lines = append(lines, ln)
+	}
+	return lines, nil
+}
+
+// GetLineDetails reads the hOCR file hocrfn and the PNG next to it (hocrfn
+// with its .hocr suffix replaced by .png) and returns the details of each
+// line in it. Only a trailing .hocr is stripped, so a ".hocr" earlier in
+// the path, such as in a directory name, is left alone.
+func GetLineDetails(hocrfn string) (line.Details, error) {
+	file, err := ioutil.ReadFile(hocrfn)
+	if err != nil {
+		return nil, err
+	}
+	h, err := Parse(file)
+	if err != nil {
+		return nil, err
+	}
+
+	pngf, err := os.Open(strings.TrimSuffix(hocrfn, ".hocr") + ".png")
+	if err != nil {
+		return nil, err
+	}
+	defer pngf.Close()
+	img, err := png.Decode(pngf)
+	if err != nil {
+		return nil, err
+	}
+
+	// The OCR name is the file name without directory or .hocr suffix.
+	return parseLineDetails(h, img, strings.TrimSuffix(filepath.Base(hocrfn), ".hocr"))
+}
diff --git a/lib/line/line.go b/lib/line/line.go
new file mode 100644
index 0000000..3adac0a
--- /dev/null
+++ b/lib/line/line.go
@@ -0,0 +1,64 @@
+package line
+
+import (
+	"image"
+	"image/png"
+	"io"
+	"os"
+)
+
+type Detail struct { // Detail describes a single line of OCR output
+	Name    string      // name identifying the line
+	Avgconf float64     // average confidence; Details sorts on this
+	Img     CopyableImg // where the line's image can be copied from
+	Text    string      // text of the line
+	OcrName string      // NOTE(review): presumably names the OCR output the line is from
+}
+
+type CopyableImg interface { // CopyableImg is a line image that can be written out
+	CopyLineTo(io.Writer) error // writes the line image to the given writer
+}
+
+// Details is a list of lines. Its methods let sort.Sort put the lines
+// in order of ascending Avgconf.
+type Details []Detail
+
+// Len returns the number of lines, for sort.Sort.
+func (d Details) Len() int { return len(d) }
+
+// Less reports whether line i has a lower Avgconf than line j, for sort.Sort.
+func (d Details) Less(i, j int) bool { return d[i].Avgconf < d[j].Avgconf }
+
+// Swap exchanges lines i and j, for sort.Sort.
+func (d Details) Swap(i, j int) { d[i], d[j] = d[j], d[i] }
+
+// ImgDirect is an implementation of the CopyableImg interface that
+// holds the line image in memory, as an image.Image.
+type ImgDirect struct {
+	Img image.Image // the line image itself
+}
+
+// CopyLineTo encodes Img as a PNG and writes the result to w. Any error
+// from the PNG encoder is returned as is.
+func (i ImgDirect) CopyLineTo(w io.Writer) error {
+	// png.Encode already yields nil on success, so its result is
+	// returned directly.
+	return png.Encode(w, i.Img)
+}
+
+// ImgPath is an implementation of the CopyableImg interface that
+// holds only the path of the file the line image is stored in.
+type ImgPath struct {
+	Path string // location of the image file
+}
+
+// CopyLineTo copies the file at Path to w as is, without decoding it.
+func (i ImgPath) CopyLineTo(w io.Writer) error {
+	src, err := os.Open(i.Path)
+	if err != nil {
+		return err
+	}
+	defer src.Close()
+	_, err = io.Copy(w, src)
+	return err
+}
diff --git a/lib/prob/prob.go b/lib/prob/prob.go
new file mode 100644
index 0000000..0299a96
--- /dev/null
+++ b/lib/prob/prob.go
@@ -0,0 +1,69 @@
+package prob
+
+import (
+	"io/ioutil"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"git.rescribe.xyz/testingtools/lib/line"
+)
+
+// getLineAvg returns the mean of the confidences in the .prob file f. A
+// confidence is the second of exactly two whitespace-separated fields on
+// a line; other lines, and confidences that are not numbers, are skipped.
+// It returns 0 if there are no confidences, or an error if f is unreadable.
+func getLineAvg(f string) (float64, error) {
+	data, err := ioutil.ReadFile(f)
+	if err != nil {
+		return 0, err
+	}
+	var sum float64
+	var count int
+	for _, row := range strings.Split(string(data), "\n") {
+		cols := strings.Fields(row)
+		if len(cols) != 2 {
+			continue
+		}
+		// Unparsable confidences are skipped rather than treated as errors.
+		if v, err := strconv.ParseFloat(cols[1], 64); err == nil {
+			sum += v
+			count++
+		}
+	}
+	if count == 0 {
+		return 0, nil
+	}
+	return sum / float64(count), nil
+}
+
+// GetLineDetails returns the details of the line described by the .prob
+// file probfn, as a Details with one entry; it only processes one line at
+// a time. The confidence is the average from probfn, the text is read
+// from the matching .txt file, and the image is the matching .bin.png
+// file, which is referred to by path and not opened here.
+//
+// Only a trailing .prob is stripped to find the matching files, so a
+// ".prob" earlier in the path, such as in a directory name, is kept.
+func GetLineDetails(probfn string) (line.Details, error) {
+	lines := make(line.Details, 0)
+	avg, err := getLineAvg(probfn)
+	if err != nil {
+		return lines, err
+	}
+
+	filebase := strings.TrimSuffix(probfn, ".prob")
+	txt, err := ioutil.ReadFile(filebase + ".txt")
+	if err != nil {
+		return lines, err
+	}
+
+	// Name is the file name of the line and OcrName the directory it is in.
+	return append(lines, line.Detail{
+		Name:    filepath.Base(filebase),
+		Avgconf: avg,
+		Img:     line.ImgPath{Path: filebase + ".bin.png"},
+		Text:    string(txt),
+		OcrName: filepath.Dir(filebase),
+	}), nil
+}
-- 
cgit v1.2.1-24-ge1ad