diff options
author | Nick White <git@njw.name> | 2019-02-25 12:09:06 +0000 |
---|---|---|
committer | Nick White <git@njw.name> | 2019-02-25 12:29:59 +0000 |
commit | 3c4c5f7c202b7c54ca8d23e7bd7bff4a4bb696cc (patch) | |
tree | 3e2be0e72fb4fbdac80c4df0edb3c67adac68625 /lib | |
parent | d8ca7b584b693a2a61dd88767a81d99bc48aca32 (diff) |
Add tool to extract plain text from hocr
Diffstat (limited to 'lib')
-rw-r--r-- | lib/hocr/hocr.go | 46 |
1 file changed, 46 insertions, 0 deletions
diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go
index bbcf8a2..fbf1523 100644
--- a/lib/hocr/hocr.go
+++ b/lib/hocr/hocr.go
@@ -2,6 +2,7 @@ package hocr
 
 import (
 	"encoding/xml"
+	"io/ioutil"
 	"regexp"
 	"strconv"
 	"strings"
@@ -77,3 +78,48 @@ func Parse(b []byte) (Hocr, error) {
 
 	return hocr, nil
 }
+
+func GetText(hocrfn string) (string, error) {
+	var s string
+
+	file, err := ioutil.ReadFile(hocrfn)
+	if err != nil {
+		return s, err
+	}
+
+	h, err := Parse(file)
+	if err != nil {
+		return s, err
+	}
+
+	for _, l := range h.Lines {
+		linetext := l.Text
+		if noText(linetext) {
+			linetext = ""
+			for _, w := range l.Words {
+				if w.Class != "ocrx_word" {
+					continue
+				}
+				linetext += w.Text + " "
+			}
+		}
+		if noText(linetext) {
+			linetext = ""
+			for _, w := range l.Words {
+				if w.Class != "ocrx_word" {
+					continue
+				}
+				for _, c := range w.Chars {
+					if c.Class != "ocrx_cinfo" {
+						continue
+					}
+					linetext += c.Text
+				}
+				linetext += " "
+			}
+		}
+		linetext = strings.TrimRight(linetext, " ") + "\n"
+		s += linetext
+	}
+	return s, nil
+}