From 3c4c5f7c202b7c54ca8d23e7bd7bff4a4bb696cc Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.name>
Date: Mon, 25 Feb 2019 12:09:06 +0000
Subject: Add tool to extract plain text from hocr

---
 hocrtotxt/main.go | 30 ++++++++++++++++++++++++++++++
 lib/hocr/hocr.go  | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 hocrtotxt/main.go

diff --git a/hocrtotxt/main.go b/hocrtotxt/main.go
new file mode 100644
index 0000000..6821a9e
--- /dev/null
+++ b/hocrtotxt/main.go
@@ -0,0 +1,30 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+
+	"rescribe.xyz/go.git/lib/hocr"
+)
+
+// main prints the text of the hocr file named by its single argument; any
+// other argument count prints the usage message and exits with status 1.
+func main() {
+	flag.Usage = func() {
+		fmt.Fprint(os.Stderr, "Usage: hocrtotxt hocrfile\n")
+		fmt.Fprint(os.Stderr, "Prints the text from a hocr file.\n")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+	if len(flag.Args()) != 1 {
+		flag.Usage()
+		os.Exit(1)
+	}
+	txt, err := hocr.GetText(flag.Arg(0))
+	if err != nil {
+		log.Fatal(err) // logs the error and exits with status 1
+	}
+	fmt.Println(txt)
+}
diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go
index bbcf8a2..fbf1523 100644
--- a/lib/hocr/hocr.go
+++ b/lib/hocr/hocr.go
@@ -2,6 +2,7 @@ package hocr
 
 import (
 	"encoding/xml"
+	"io/ioutil"
 	"regexp"
 	"strconv"
 	"strings"
@@ -77,3 +78,48 @@ func Parse(b []byte) (Hocr, error) {
 
 	return hocr, nil
 }
+
+// GetText reads and parses the hocr file hocrfn and returns its text, one
+// newline-terminated line per hocr line with trailing spaces removed. When
+// noText is true for a line's own Text, the text is rebuilt from its ocrx_word
+// words and, if noText is still true, from those words' ocrx_cinfo characters.
+func GetText(hocrfn string) (string, error) {
+	file, err := ioutil.ReadFile(hocrfn)
+	if err != nil {
+		return "", err
+	}
+	h, err := Parse(file)
+	if err != nil {
+		return "", err
+	}
+	var out strings.Builder
+	for _, l := range h.Lines {
+		linetext := l.Text
+		if noText(linetext) {
+			var words strings.Builder
+			for _, w := range l.Words {
+				if w.Class == "ocrx_word" {
+					words.WriteString(w.Text + " ")
+				}
+			}
+			linetext = words.String()
+		}
+		if noText(linetext) {
+			var chars strings.Builder
+			for _, w := range l.Words {
+				if w.Class != "ocrx_word" {
+					continue
+				}
+				for _, c := range w.Chars {
+					if c.Class == "ocrx_cinfo" {
+						chars.WriteString(c.Text)
+					}
+				}
+				chars.WriteByte(' ')
+			}
+			linetext = chars.String()
+		}
+		out.WriteString(strings.TrimRight(linetext, " ") + "\n")
+	}
+	return out.String(), nil
+}
-- 
cgit v1.2.1-24-ge1ad