From 3c4c5f7c202b7c54ca8d23e7bd7bff4a4bb696cc Mon Sep 17 00:00:00 2001
From: Nick White
Date: Mon, 25 Feb 2019 12:09:06 +0000
Subject: Add tool to extract plain text from hocr

---
 hocrtotxt/main.go | 31 +++++++++++++++++++++++++++++++
 lib/hocr/hocr.go  | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 hocrtotxt/main.go

diff --git a/hocrtotxt/main.go b/hocrtotxt/main.go
new file mode 100644
index 0000000..6821a9e
--- /dev/null
+++ b/hocrtotxt/main.go
@@ -0,0 +1,31 @@
+// Command hocrtotxt prints the text from a hocr file.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+
+	"rescribe.xyz/go.git/lib/hocr"
+)
+
+func main() {
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage: hocrtotxt hocrfile\n")
+		fmt.Fprintf(os.Stderr, "Prints the text from a hocr file.\n")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+	if flag.NArg() != 1 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	text, err := hocr.GetText(flag.Arg(0))
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	fmt.Printf("%s\n", text)
+}
diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go
index bbcf8a2..fbf1523 100644
--- a/lib/hocr/hocr.go
+++ b/lib/hocr/hocr.go
@@ -2,6 +2,7 @@ package hocr
 
 import (
 	"encoding/xml"
+	"io/ioutil"
 	"regexp"
 	"strconv"
 	"strings"
@@ -77,3 +78,63 @@ func Parse(b []byte) (Hocr, error) {
 
 	return hocr, nil
 }
+
+// GetText reads the hocr file named hocrfn, parses it with Parse and
+// returns its plain text, one newline-terminated line per hocr line.
+//
+// Each line starts from its own Text. If noText reports true for that,
+// the Text of the line's "ocrx_word" words is used, each followed by a
+// space; if noText still reports true, the Text of the "ocrx_cinfo"
+// characters of those words is used, with a space after each word.
+// Trailing spaces are trimmed from every line. On error the returned
+// string is empty.
+func GetText(hocrfn string) (string, error) {
+	file, err := ioutil.ReadFile(hocrfn)
+	if err != nil {
+		return "", err
+	}
+
+	h, err := Parse(file)
+	if err != nil {
+		return "", err
+	}
+
+	// Accumulate in builders rather than with += so that large
+	// documents are not re-copied for every line appended.
+	var text strings.Builder
+	for _, l := range h.Lines {
+		linetext := l.Text
+		if noText(linetext) {
+			// Fall back to joining the line's words.
+			var words strings.Builder
+			for _, w := range l.Words {
+				if w.Class != "ocrx_word" {
+					continue
+				}
+				words.WriteString(w.Text)
+				words.WriteString(" ")
+			}
+			linetext = words.String()
+		}
+		if noText(linetext) {
+			// Fall back further to joining the characters of each word.
+			var chars strings.Builder
+			for _, w := range l.Words {
+				if w.Class != "ocrx_word" {
+					continue
+				}
+				for _, c := range w.Chars {
+					if c.Class != "ocrx_cinfo" {
+						continue
+					}
+					chars.WriteString(c.Text)
+				}
+				chars.WriteString(" ")
+			}
+			linetext = chars.String()
+		}
+		text.WriteString(strings.TrimRight(linetext, " "))
+		text.WriteString("\n")
+	}
+	return text.String(), nil
+}
-- 
cgit v1.2.1-24-ge1ad