From 3880414bbf2d6f2cd05e208abf919ae5ceabeddc Mon Sep 17 00:00:00 2001 From: Nick White Date: Thu, 27 Feb 2020 17:45:16 +0000 Subject: Reorganise all commands to be behind cmd/ --- dehyphenate/main.go | 63 ----------------------------------------------------- 1 file changed, 63 deletions(-) delete mode 100644 dehyphenate/main.go (limited to 'dehyphenate') diff --git a/dehyphenate/main.go b/dehyphenate/main.go deleted file mode 100644 index b2bd6f9..0000000 --- a/dehyphenate/main.go +++ /dev/null @@ -1,63 +0,0 @@ -package main - -import ( - "encoding/xml" - "flag" - "fmt" - "io/ioutil" - "log" - "os" - - "rescribe.xyz/utils/pkg/hocr" -) - -// BUGS: -// - loses all elements not captured in hocr structure such as html headings -// might be best to copy the header and footer separately and put the hocr in between, but would still need to ensure all elements are captured -// - loses any formatting; doesn't need to be identical, but e.g. linebreaks after elements would be handy -// - need to handle OcrChar - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: dehyphenate hocrin hocrout\n") - fmt.Fprintf(os.Stderr, "Dehyphenates a hocr file.\n") - flag.PrintDefaults() - } - flag.Parse() - if flag.NArg() != 2 { - flag.Usage() - os.Exit(1) - } - - in, err := ioutil.ReadFile(flag.Arg(0)) - if err != nil { - log.Fatalf("Error reading %s: %v", flag.Arg(1), err) - } - h, err := hocr.Parse(in) - if err != nil { - log.Fatal(err) - } - - for i, l := range h.Lines { - w := l.Words[len(l.Words)-1] - if len(w.Chars) == 0 { - if len(w.Text) > 0 && w.Text[len(w.Text) - 1] == '-' { - h.Lines[i].Words[len(l.Words)-1].Text = w.Text[0:len(w.Text)-1] + h.Lines[i+1].Words[0].Text - h.Lines[i+1].Words[0].Text = "" - } - } else { - log.Printf("TODO: handle OcrChar") - } - } - - f, err := os.Create(flag.Arg(1)) - if err != nil { - log.Fatalf("Error creating file %s: %v", flag.Arg(1), err) - } - defer f.Close() - e := xml.NewEncoder(f) - err = e.Encode(h) - if err != nil { - log.Fatalf("Error encoding XML: %v", err) - } -} -- cgit v1.2.1-24-ge1ad