From 3c4c5f7c202b7c54ca8d23e7bd7bff4a4bb696cc Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 25 Feb 2019 12:09:06 +0000 Subject: Add tool to extract plain text from hocr --- lib/hocr/hocr.go | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'lib') diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go index bbcf8a2..fbf1523 100644 --- a/lib/hocr/hocr.go +++ b/lib/hocr/hocr.go @@ -2,6 +2,7 @@ package hocr import ( "encoding/xml" + "io/ioutil" "regexp" "strconv" "strings" @@ -77,3 +78,48 @@ func Parse(b []byte) (Hocr, error) { return hocr, nil } + +func GetText(hocrfn string) (string, error) { + var s string + + file, err := ioutil.ReadFile(hocrfn) + if err != nil { + return s, err + } + + h, err := Parse(file) + if err != nil { + return s, err + } + + for _, l := range h.Lines { + linetext := l.Text + if noText(linetext) { + linetext = "" + for _, w := range l.Words { + if w.Class != "ocrx_word" { + continue + } + linetext += w.Text + " " + } + } + if noText(linetext) { + linetext = "" + for _, w := range l.Words { + if w.Class != "ocrx_word" { + continue + } + for _, c := range w.Chars { + if c.Class != "ocrx_cinfo" { + continue + } + linetext += c.Text + } + linetext += " " + } + } + linetext = strings.TrimRight(linetext, " ") + "\n" + s += linetext + } + return s, nil +} -- cgit v1.2.1-24-ge1ad