diff options
Diffstat (limited to 'parse/hocr')
| -rw-r--r-- | parse/hocr/hocr.go | 181 | 
1 files changed, 0 insertions, 181 deletions
| diff --git a/parse/hocr/hocr.go b/parse/hocr/hocr.go deleted file mode 100644 index 81250a9..0000000 --- a/parse/hocr/hocr.go +++ /dev/null @@ -1,181 +0,0 @@ -package hocr - -// TODO: Parse line name to zero pad line numbers, so they can -//       be sorted easily -// TODO: have same filename format as .prob uses, so include base -//       dirname, and don't include line numbers if there's only -//       one line in the hocr - -import ( -	"encoding/xml" -	"image" -	"image/png" -	"io/ioutil" -	"os" -	"path/filepath" -	"regexp" -	"strconv" -	"strings" - -	"git.rescribe.xyz/testingtools/parse" -) - -type Hocr struct { -	Lines []OcrLine `xml:"body>div>div>p>span"` -} - -type OcrLine struct { -	Class string `xml:"class,attr"` -	Id string `xml:"id,attr"` -	Title string `xml:"title,attr"` -	Words []OcrWord `xml:"span"` -	Text string `xml:",chardata"` -} - -type OcrWord struct { -	Class string `xml:"class,attr"` -	Id string `xml:"id,attr"` -	Title string `xml:"title,attr"` -	Chars []OcrChar `xml:"span"` -	Text string `xml:",chardata"` -} - -type OcrChar struct { -	Class string `xml:"class,attr"` -	Id string `xml:"id,attr"` -	Title string `xml:"title,attr"` -	Chars []OcrChar `xml:"span"` -	Text string `xml:",chardata"` -} - -// Returns the confidence for a word based on its x_wconf value -func wordConf(s string) (float64, error) { -	re, err := regexp.Compile(`x_wconf ([0-9.]+)`) -	if err != nil { -		return 0.0, err -	} -	conf := re.FindStringSubmatch(s) -	return strconv.ParseFloat(conf[1], 64) -} - -func boxCoords(s string) ([4]int, error) { -	var coords [4]int -	re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`) -	if err != nil { -		return coords, err -	} -	coordstr := re.FindStringSubmatch(s) -	for i := range coords { -		c, err := strconv.Atoi(coordstr[i+1]) -		if err != nil { -			return coords, err -		} -		coords[i] = c -	} -	return coords, nil -} - -func noText(s string) bool { -	t := strings.Trim(s, " \n") -	return len(t) == 0 -} - -func Parse(b []byte) (Hocr, error) { -	var hocr Hocr - -	err := xml.Unmarshal(b, &hocr) -	if err != nil { -		return hocr, err -	} - -	return hocr, nil -} - -func parseLineDetails(h Hocr, i image.Image, name string) (parse.LineDetails, error) { -	lines := make(parse.LineDetails, 0) - -	for _, l := range h.Lines { -		totalconf := float64(0) -		num := 0 -		for _, w := range l.Words { -			c, err := wordConf(w.Title) -			if err != nil { -				return lines, err -			} -			num++ -			totalconf += c -		} - -		coords, err := boxCoords(l.Title) -		if err != nil { -			return lines, err -		} - -		var line parse.LineDetail -		line.Name = l.Id -		line.Avgconf = (totalconf/float64(num)) / 100 -		linetext := "" - -		linetext = l.Text -		if(noText(linetext)) { -			linetext = "" -			for _, w := range l.Words { -				if(w.Class != "ocrx_word") { -					continue -				} -				linetext += w.Text + " " -			} -		} -		if(noText(linetext)) { -			linetext = "" -			for _, w := range l.Words { -				if(w.Class != "ocrx_word") { -					continue -				} -				for _, c := range w.Chars { -					if(c.Class != "ocrx_cinfo") { -						continue -					} -					linetext += c.Text -				} -				linetext += " " -			} -		} -		line.Text = strings.TrimRight(linetext, " ") -		line.Text += "\n" -		line.OcrName = name -		var imgd parse.ImgDirect -		imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) -		line.Img = imgd -		lines = append(lines, line) -	} -	return lines, nil -} - -func GetLineDetails(hocrfn string) (parse.LineDetails, error) { -	var newlines parse.LineDetails - -	file, err := ioutil.ReadFile(hocrfn) -	if err != nil { -		return newlines, err -	} - -	h, err := Parse(file) -	if err != nil { -		return newlines, err -	} - -	pngfn := strings.Replace(hocrfn, ".hocr", ".png", 1) -	pngf, err := os.Open(pngfn) -	if err != nil { -		return newlines, err -	} -	defer pngf.Close() -	img, err := png.Decode(pngf) -	if err != nil { -		return newlines, err -	} - -	n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) -	return parseLineDetails(h, img, n) -} | 
