diff options
| author | Nick White <git@njw.name> | 2019-01-23 20:47:33 +0000 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2019-01-23 20:47:33 +0000 | 
| commit | d256f967a26ceeb7c3987a1fc447b126a35054f9 (patch) | |
| tree | f80c1e3e2c3757c59ad51dec98de5b6e82a426fa /hocr | |
| parent | c41aa16d8a3d35ce4185184ee50536bf2089a120 (diff) | |
Separate out hocr parts from line parts
Diffstat (limited to 'hocr')
| -rw-r--r-- | hocr/hocr.go | 188 | 
1 files changed, 0 insertions, 188 deletions
| diff --git a/hocr/hocr.go b/hocr/hocr.go deleted file mode 100644 index 0c1295c..0000000 --- a/hocr/hocr.go +++ /dev/null @@ -1,188 +0,0 @@ -package hocr - -// TODO: separate out linedetail to a general structure that can incorporate -//       line-conf-buckets too, in a different file (and rename package to -//       something more generic). Do this using the CopyableLine interface -// TODO: Parse line name to zero pad line numbers, so they come out in the correct order - -import ( -	"encoding/xml" -	"image" -	"image/png" -	"io" -	"regexp" -	"strconv" -	"strings" -) - -// TODO: move the linedetail stuff out to a separate file, and create a new -//       CopyableLine implementing struct for ocropy, which will just store -//       a file location -type LineDetail struct { -	Name string -	Avgconf float64 -	Img CopyableLine -	Text string -	Hocrname string -} - -type CopyableLine interface { -	CopyLineTo(io.Writer) (error) -} - -type ImgDirect struct { -	img image.Image -} - -func (i ImgDirect) CopyLineTo(w io.Writer) (error) { -	err := png.Encode(w, i.img) -	if err != nil { -		return err -	} -	return nil -} - -type LineDetails []LineDetail - -// Used by sort.Sort. -func (l LineDetails) Len() int { return len(l) } - -// Used by sort.Sort. -func (l LineDetails) Less(i, j int) bool { -	return l[i].Avgconf < l[j].Avgconf -} - -// Used by sort.Sort. -func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] } - -type Hocr struct { -	Lines []OcrLine `xml:"body>div>div>p>span"` -} - -type OcrLine struct { -	Class string `xml:"class,attr"` -	Id string `xml:"id,attr"` -	Title string `xml:"title,attr"` -	Words []OcrWord `xml:"span"` -	Text string `xml:",chardata"` -} - -type OcrWord struct { -	Class string `xml:"class,attr"` -	Id string `xml:"id,attr"` -	Title string `xml:"title,attr"` -	Chars []OcrChar `xml:"span"` -	Text string `xml:",chardata"` -} - -type OcrChar struct { -	Class string `xml:"class,attr"` -	Id string `xml:"id,attr"` -	Title string `xml:"title,attr"` -	Chars []OcrChar `xml:"span"` -	Text string `xml:",chardata"` -} - -// Returns the confidence for a word based on its x_wconf value -func wordConf(s string) (float64, error) { -	re, err := regexp.Compile(`x_wconf ([0-9.]+)`) -	if err != nil { -		return 0.0, err -	} -	conf := re.FindStringSubmatch(s) -	return strconv.ParseFloat(conf[1], 64) -} - -func boxCoords(s string) ([4]int, error) { -	var coords [4]int -	re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`) -	if err != nil { -		return coords, err -	} -	coordstr := re.FindStringSubmatch(s) -	for i := range coords { -		c, err := strconv.Atoi(coordstr[i+1]) -		if err != nil { -			return coords, err -		} -		coords[i] = c -	} -	return coords, nil -} - -func noText(s string) bool { -	t := strings.Trim(s, " \n") -	return len(t) == 0 -} - -func Parse(b []byte) (Hocr, error) { -	var hocr Hocr - -	err := xml.Unmarshal(b, &hocr) -	if err != nil { -		return hocr, err -	} - -	return hocr, nil -} - -func GetLineDetails(h Hocr, i image.Image, name string) (LineDetails, error) { -	lines := make(LineDetails, 0) - -	for _, l := range h.Lines { -		totalconf := float64(0) -		num := 0 -		for _, w := range l.Words { -			c, err := wordConf(w.Title) -			if err != nil { -				return lines, err -			} -			num++ -			totalconf += c -		} - -		coords, err := boxCoords(l.Title) -		if err != nil { -			return lines, err -		} - -		var line LineDetail -		line.Name = l.Id -		line.Avgconf = (totalconf/float64(num)) / 100 -		linetext := "" - -		linetext = l.Text -		if(noText(linetext)) { -			linetext = "" -			for _, w := range l.Words { -				if(w.Class != "ocrx_word") { -					continue -				} -				linetext += w.Text + " " -			} -		} -		if(noText(linetext)) { -			linetext = "" -			for _, w := range l.Words { -				if(w.Class != "ocrx_word") { -					continue -				} -				for _, c := range w.Chars { -					if(c.Class != "ocrx_cinfo") { -						continue -					} -					linetext += c.Text -				} -				linetext += " " -			} -		} -		line.Text = strings.TrimRight(linetext, " ") -		line.Text += "\n" -		line.Hocrname = name -		var imgd ImgDirect -		imgd.img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) -		line.Img = imgd -		lines = append(lines, line) -	} -	return lines, nil -} | 
