summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Nick White <git@njw.name> 2019-02-25 12:09:06 +0000
committer: Nick White <git@njw.name> 2019-02-25 12:29:59 +0000
commit: 3c4c5f7c202b7c54ca8d23e7bd7bff4a4bb696cc (patch)
tree: 3e2be0e72fb4fbdac80c4df0edb3c67adac68625
parent: d8ca7b584b693a2a61dd88767a81d99bc48aca32 (diff)
Add tool to extract plain text from hocr
-rw-r--r-- hocrtotxt/main.go 30
-rw-r--r-- lib/hocr/hocr.go 46
2 files changed, 76 insertions, 0 deletions
diff --git a/hocrtotxt/main.go b/hocrtotxt/main.go
new file mode 100644
index 0000000..6821a9e
--- /dev/null
+++ b/hocrtotxt/main.go
@@ -0,0 +1,30 @@
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "os"
+
+ "rescribe.xyz/go.git/lib/hocr"
+)
+
+// main prints the text of the hOCR file named by the single positional
+// command-line argument to standard output. With any other number of
+// arguments it prints the usage message and exits with status 1.
+func main() {
+	// Install a usage message describing the positional argument.
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage: hocrtotxt hocrfile\n")
+		fmt.Fprintf(os.Stderr, "Prints the text from a hocr file.\n")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+
+	args := flag.Args()
+	if len(args) != 1 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	text, err := hocr.GetText(args[0])
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	fmt.Printf("%s\n", text)
+}
diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go
index bbcf8a2..fbf1523 100644
--- a/lib/hocr/hocr.go
+++ b/lib/hocr/hocr.go
@@ -2,6 +2,7 @@ package hocr
import (
"encoding/xml"
+ "io/ioutil"
"regexp"
"strconv"
"strings"
@@ -77,3 +78,48 @@ func Parse(b []byte) (Hocr, error) {
return hocr, nil
}
+
+// GetText reads the hOCR file named hocrfn, parses it with Parse, and
+// returns its text with one newline-terminated line of output per parsed
+// line.
+//
+// For each line, the line's own Text is used unless noText reports that it
+// holds no text. In that case the text is rebuilt from the Text of the
+// line's words of class "ocrx_word", each followed by a space. If noText
+// still reports no text, it is rebuilt again from the Text of the
+// "ocrx_cinfo" characters inside those words, with a space after each word.
+// Trailing spaces are trimmed from every line before the newline is added.
+//
+// If the file cannot be read or parsed, GetText returns the empty string
+// and the underlying error.
+func GetText(hocrfn string) (string, error) {
+	file, err := ioutil.ReadFile(hocrfn)
+	if err != nil {
+		return "", err
+	}
+
+	h, err := Parse(file)
+	if err != nil {
+		return "", err
+	}
+
+	// Accumulate with strings.Builder rather than += so that the cost
+	// stays linear in the size of the output.
+	var text strings.Builder
+	for _, l := range h.Lines {
+		linetext := l.Text
+		if noText(linetext) {
+			// Fall back to the text stored on the line's words.
+			var words strings.Builder
+			for _, w := range l.Words {
+				if w.Class != "ocrx_word" {
+					continue
+				}
+				words.WriteString(w.Text)
+				words.WriteByte(' ')
+			}
+			linetext = words.String()
+		}
+		if noText(linetext) {
+			// Fall back further to the characters stored inside each word.
+			var chars strings.Builder
+			for _, w := range l.Words {
+				if w.Class != "ocrx_word" {
+					continue
+				}
+				for _, c := range w.Chars {
+					if c.Class != "ocrx_cinfo" {
+						continue
+					}
+					chars.WriteString(c.Text)
+				}
+				chars.WriteByte(' ')
+			}
+			linetext = chars.String()
+		}
+		text.WriteString(strings.TrimRight(linetext, " "))
+		text.WriteByte('\n')
+	}
+	return text.String(), nil
+}