summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-02-25 12:09:06 +0000
committer Nick White <git@njw.name> 2019-02-25 12:29:59 +0000
commit 3c4c5f7c202b7c54ca8d23e7bd7bff4a4bb696cc (patch)
tree 3e2be0e72fb4fbdac80c4df0edb3c67adac68625 /lib
parent d8ca7b584b693a2a61dd88767a81d99bc48aca32 (diff)
Add tool to extract plain text from hocr
Diffstat (limited to 'lib')
-rw-r--r-- lib/hocr/hocr.go 46
1 files changed, 46 insertions, 0 deletions
diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go
index bbcf8a2..fbf1523 100644
--- a/lib/hocr/hocr.go
+++ b/lib/hocr/hocr.go
@@ -2,6 +2,7 @@ package hocr
import (
"encoding/xml"
+ "io/ioutil"
"regexp"
"strconv"
"strings"
@@ -77,3 +78,48 @@ func Parse(b []byte) (Hocr, error) {
return hocr, nil
}
+
+func GetText(hocrfn string) (string, error) {
+ var s string
+
+ file, err := ioutil.ReadFile(hocrfn)
+ if err != nil {
+ return s, err
+ }
+
+ h, err := Parse(file)
+ if err != nil {
+ return s, err
+ }
+
+ for _, l := range h.Lines {
+ linetext := l.Text
+ if noText(linetext) {
+ linetext = ""
+ for _, w := range l.Words {
+ if w.Class != "ocrx_word" {
+ continue
+ }
+ linetext += w.Text + " "
+ }
+ }
+ if noText(linetext) {
+ linetext = ""
+ for _, w := range l.Words {
+ if w.Class != "ocrx_word" {
+ continue
+ }
+ for _, c := range w.Chars {
+ if c.Class != "ocrx_cinfo" {
+ continue
+ }
+ linetext += c.Text
+ }
+ linetext += " "
+ }
+ }
+ linetext = strings.TrimRight(linetext, " ") + "\n"
+ s += linetext
+ }
+ return s, nil
+}