diff options
author | Nick White <git@njw.name> | 2019-02-25 12:09:06 +0000 |
---|---|---|
committer | Nick White <git@njw.name> | 2019-02-25 12:29:59 +0000 |
commit | 3c4c5f7c202b7c54ca8d23e7bd7bff4a4bb696cc (patch) | |
tree | 3e2be0e72fb4fbdac80c4df0edb3c67adac68625 | |
parent | d8ca7b584b693a2a61dd88767a81d99bc48aca32 (diff) |
Add tool to extract plain text from hocr
-rw-r--r-- | hocrtotxt/main.go | 30 | ||||
-rw-r--r-- | lib/hocr/hocr.go | 46 |
2 files changed, 76 insertions, 0 deletions
diff --git a/hocrtotxt/main.go b/hocrtotxt/main.go
new file mode 100644
index 0000000..6821a9e
--- /dev/null
+++ b/hocrtotxt/main.go
@@ -0,0 +1,30 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+
+	"rescribe.xyz/go.git/lib/hocr"
+)
+
+func main() {
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage: hocrtotxt hocrfile\n")
+		fmt.Fprintf(os.Stderr, "Prints the text from a hocr file.\n")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+	if flag.NArg() != 1 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	text, err := hocr.GetText(flag.Arg(0))
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	fmt.Printf("%s\n", text)
+}
diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go
index bbcf8a2..fbf1523 100644
--- a/lib/hocr/hocr.go
+++ b/lib/hocr/hocr.go
@@ -2,6 +2,7 @@ package hocr
 
 import (
 	"encoding/xml"
+	"io/ioutil"
 	"regexp"
 	"strconv"
 	"strings"
@@ -77,3 +78,48 @@ func Parse(b []byte) (Hocr, error) {
 
 	return hocr, nil
 }
+
+func GetText(hocrfn string) (string, error) {
+	var s string
+
+	file, err := ioutil.ReadFile(hocrfn)
+	if err != nil {
+		return s, err
+	}
+
+	h, err := Parse(file)
+	if err != nil {
+		return s, err
+	}
+
+	for _, l := range h.Lines {
+		linetext := l.Text
+		if noText(linetext) {
+			linetext = ""
+			for _, w := range l.Words {
+				if w.Class != "ocrx_word" {
+					continue
+				}
+				linetext += w.Text + " "
+			}
+		}
+		if noText(linetext) {
+			linetext = ""
+			for _, w := range l.Words {
+				if w.Class != "ocrx_word" {
+					continue
+				}
+				for _, c := range w.Chars {
+					if c.Class != "ocrx_cinfo" {
+						continue
+					}
+					linetext += c.Text
+				}
+				linetext += " "
+			}
+		}
+		linetext = strings.TrimRight(linetext, " ") + "\n"
+		s += linetext
+	}
+	return s, nil
+}