diff options
author | Nick White <git@njw.name> | 2020-02-27 17:45:16 +0000 |
---|---|---|
committer | Nick White <git@njw.name> | 2020-02-27 17:45:16 +0000 |
commit | 3880414bbf2d6f2cd05e208abf919ae5ceabeddc (patch) | |
tree | dee30a151048de65a3e42cfdae7739c4502e148f /cmd/dehyphenate/main.go | |
parent | cda45588cfb796fdd2af27b1851685270df2c02b (diff) |
Reorganise all commands to be behind cmd/
Diffstat (limited to 'cmd/dehyphenate/main.go')
-rw-r--r-- | cmd/dehyphenate/main.go | 63 |
1 files changed, 63 insertions, 0 deletions
diff --git a/cmd/dehyphenate/main.go b/cmd/dehyphenate/main.go new file mode 100644 index 0000000..b2bd6f9 --- /dev/null +++ b/cmd/dehyphenate/main.go @@ -0,0 +1,63 @@ +package main + +import ( + "encoding/xml" + "flag" + "fmt" + "io/ioutil" + "log" + "os" + + "rescribe.xyz/utils/pkg/hocr" +) + +// BUGS: +// - loses all elements not captured in hocr structure such as html headings +// might be best to copy the header and footer separately and put the hocr in between, but would still need to ensure all elements are captured +// - loses any formatting; doesn't need to be identical, but e.g. linebreaks after elements would be handy +// - need to handle OcrChar + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: dehyphenate hocrin hocrout\n") + fmt.Fprintf(os.Stderr, "Dehyphenates a hocr file.\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() != 2 { + flag.Usage() + os.Exit(1) + } + + in, err := ioutil.ReadFile(flag.Arg(0)) + if err != nil { + log.Fatalf("Error reading %s: %v", flag.Arg(1), err) + } + h, err := hocr.Parse(in) + if err != nil { + log.Fatal(err) + } + + for i, l := range h.Lines { + w := l.Words[len(l.Words)-1] + if len(w.Chars) == 0 { + if len(w.Text) > 0 && w.Text[len(w.Text) - 1] == '-' { + h.Lines[i].Words[len(l.Words)-1].Text = w.Text[0:len(w.Text)-1] + h.Lines[i+1].Words[0].Text + h.Lines[i+1].Words[0].Text = "" + } + } else { + log.Printf("TODO: handle OcrChar") + } + } + + f, err := os.Create(flag.Arg(1)) + if err != nil { + log.Fatalf("Error creating file %s: %v", flag.Arg(1), err) + } + defer f.Close() + e := xml.NewEncoder(f) + err = e.Encode(h) + if err != nil { + log.Fatalf("Error encoding XML: %v", err) + } +} |