diff options
Diffstat (limited to 'lib/hocr/hocr.go')
-rw-r--r-- | lib/hocr/hocr.go | 46 |
1 files changed, 46 insertions, 0 deletions
diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go index bbcf8a2..fbf1523 100644 --- a/lib/hocr/hocr.go +++ b/lib/hocr/hocr.go @@ -2,6 +2,7 @@ package hocr import ( "encoding/xml" + "io/ioutil" "regexp" "strconv" "strings" @@ -77,3 +78,48 @@ func Parse(b []byte) (Hocr, error) { return hocr, nil } + +func GetText(hocrfn string) (string, error) { + var s string + + file, err := ioutil.ReadFile(hocrfn) + if err != nil { + return s, err + } + + h, err := Parse(file) + if err != nil { + return s, err + } + + for _, l := range h.Lines { + linetext := l.Text + if noText(linetext) { + linetext = "" + for _, w := range l.Words { + if w.Class != "ocrx_word" { + continue + } + linetext += w.Text + " " + } + } + if noText(linetext) { + linetext = "" + for _, w := range l.Words { + if w.Class != "ocrx_word" { + continue + } + for _, c := range w.Chars { + if c.Class != "ocrx_cinfo" { + continue + } + linetext += c.Text + } + linetext += " " + } + } + linetext = strings.TrimRight(linetext, " ") + "\n" + s += linetext + } + return s, nil +} |