From cd1fb1c9f6e1384ac0add8904425e6f92b17a704 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 25 Feb 2019 13:01:28 +0000 Subject: Generalise get text from hocr lines --- lib/hocr/hocr.go | 29 ++----------------------- lib/hocr/lines.go | 63 ++++++++++++++++++++++++++++++------------------------- 2 files changed, 36 insertions(+), 56 deletions(-) diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go index fbf1523..f6316d8 100644 --- a/lib/hocr/hocr.go +++ b/lib/hocr/hocr.go @@ -92,34 +92,9 @@ func GetText(hocrfn string) (string, error) { return s, err } + for _, l := range h.Lines { - linetext := l.Text - if noText(linetext) { - linetext = "" - for _, w := range l.Words { - if w.Class != "ocrx_word" { - continue - } - linetext += w.Text + " " - } - } - if noText(linetext) { - linetext = "" - for _, w := range l.Words { - if w.Class != "ocrx_word" { - continue - } - for _, c := range w.Chars { - if c.Class != "ocrx_cinfo" { - continue - } - linetext += c.Text - } - linetext += " " - } - } - linetext = strings.TrimRight(linetext, " ") + "\n" - s += linetext + s += getLineText(l) } return s, nil } diff --git a/lib/hocr/lines.go b/lib/hocr/lines.go index 00acd1f..c60a619 100644 --- a/lib/hocr/lines.go +++ b/lib/hocr/lines.go @@ -14,6 +14,39 @@ import ( "rescribe.xyz/go.git/lib/line" ) +func getLineText(l OcrLine) (string) { + linetext := "" + + linetext = l.Text + if noText(linetext) { + linetext = "" + for _, w := range l.Words { + if w.Class != "ocrx_word" { + continue + } + linetext += w.Text + " " + } + } + if noText(linetext) { + linetext = "" + for _, w := range l.Words { + if w.Class != "ocrx_word" { + continue + } + for _, c := range w.Chars { + if c.Class != "ocrx_cinfo" { + continue + } + linetext += c.Text + } + linetext += " " + } + } + linetext = strings.TrimRight(linetext, " ") + linetext += "\n" + return linetext +} + func parseLineDetails(h Hocr, i image.Image, name string) (line.Details, error) { lines := make(line.Details, 0) @@ -37,35 +70,7 @@ func parseLineDetails(h Hocr, i image.Image, name string) (line.Details, error) var ln line.Detail ln.Name = l.Id ln.Avgconf = (totalconf / float64(num)) / 100 - linetext := "" - - linetext = l.Text - if noText(linetext) { - linetext = "" - for _, w := range l.Words { - if w.Class != "ocrx_word" { - continue - } - linetext += w.Text + " " - } - } - if noText(linetext) { - linetext = "" - for _, w := range l.Words { - if w.Class != "ocrx_word" { - continue - } - for _, c := range w.Chars { - if c.Class != "ocrx_cinfo" { - continue - } - linetext += c.Text - } - linetext += " " - } - } - ln.Text = strings.TrimRight(linetext, " ") - ln.Text += "\n" + ln.Text = getLineText(l) ln.OcrName = name var imgd line.ImgDirect imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) -- cgit v1.2.1-24-ge1ad