From 30c088b90e7b6a25d93cbdad7564ff063e62afd3 Mon Sep 17 00:00:00 2001 From: Nick White Date: Fri, 25 Jan 2019 09:55:55 +0000 Subject: Reorganisation and cleanup --- lib/hocr/lines.go | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 lib/hocr/lines.go (limited to 'lib/hocr/lines.go') diff --git a/lib/hocr/lines.go b/lib/hocr/lines.go new file mode 100644 index 0000000..4902b40 --- /dev/null +++ b/lib/hocr/lines.go @@ -0,0 +1,107 @@ +package hocr + +// TODO: Parse line name to zero pad line numbers, so they can +// be sorted easily +// TODO: have same filename format as .prob uses, so include base +// dirname, and don't include line numbers if there's only +// one line in the hocr + +import ( + "image" + "image/png" + "io/ioutil" + "os" + "path/filepath" + "strings" + + "git.rescribe.xyz/testingtools/lib/line" +) + +func parseLineDetails(h Hocr, i image.Image, name string) (line.Details, error) { + lines := make(line.Details, 0) + + for _, l := range h.Lines { + totalconf := float64(0) + num := 0 + for _, w := range l.Words { + c, err := wordConf(w.Title) + if err != nil { + return lines, err + } + num++ + totalconf += c + } + + coords, err := boxCoords(l.Title) + if err != nil { + return lines, err + } + + var ln line.Detail + ln.Name = l.Id + ln.Avgconf = (totalconf/float64(num)) / 100 + linetext := "" + + linetext = l.Text + if(noText(linetext)) { + linetext = "" + for _, w := range l.Words { + if(w.Class != "ocrx_word") { + continue + } + linetext += w.Text + " " + } + } + if(noText(linetext)) { + linetext = "" + for _, w := range l.Words { + if(w.Class != "ocrx_word") { + continue + } + for _, c := range w.Chars { + if(c.Class != "ocrx_cinfo") { + continue + } + linetext += c.Text + } + linetext += " " + } + } + ln.Text = strings.TrimRight(linetext, " ") + ln.Text += "\n" + ln.OcrName = name + var imgd line.ImgDirect + imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) + ln.Img = imgd + lines = append(lines, ln) + } + return lines, nil +} + +func GetLineDetails(hocrfn string) (line.Details, error) { + var newlines line.Details + + file, err := ioutil.ReadFile(hocrfn) + if err != nil { + return newlines, err + } + + h, err := Parse(file) + if err != nil { + return newlines, err + } + + pngfn := strings.Replace(hocrfn, ".hocr", ".png", 1) + pngf, err := os.Open(pngfn) + if err != nil { + return newlines, err + } + defer pngf.Close() + img, err := png.Decode(pngf) + if err != nil { + return newlines, err + } + + n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) + return parseLineDetails(h, img, n) +} -- cgit v1.2.1-24-ge1ad