From 8077bcde1f07d697d78f478f04e251b96bff3bbf Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 25 Aug 2020 15:39:49 +0100 Subject: Add text mode for dehyphenate tool --- cmd/dehyphenate/main.go | 69 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 18 deletions(-) (limited to 'cmd/dehyphenate') diff --git a/cmd/dehyphenate/main.go b/cmd/dehyphenate/main.go index 90a6cda..ed8eb13 100644 --- a/cmd/dehyphenate/main.go +++ b/cmd/dehyphenate/main.go @@ -9,9 +9,11 @@ import ( "encoding/xml" "flag" "fmt" + "io" "io/ioutil" "log" "os" + "strings" "rescribe.xyz/utils/pkg/hocr" ) @@ -24,10 +26,11 @@ import ( func main() { flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: dehyphenate hocrin hocrout\n") - fmt.Fprintf(os.Stderr, "Dehyphenates a hocr file.\n") + fmt.Fprintf(os.Stderr, "Usage: dehyphenate [-hocr] in out\n") + fmt.Fprintf(os.Stderr, "Dehyphenates a file.\n") flag.PrintDefaults() } + usehocr := flag.Bool("hocr", false, "process hocr files, rather than plain text") flag.Parse() if flag.NArg() != 2 { flag.Usage() @@ -38,21 +41,43 @@ func main() { if err != nil { log.Fatalf("Error reading %s: %v", flag.Arg(1), err) } - h, err := hocr.Parse(in) - if err != nil { - log.Fatal(err) - } - for i, l := range h.Lines { - w := l.Words[len(l.Words)-1] - if len(w.Chars) == 0 { - if len(w.Text) > 0 && w.Text[len(w.Text) - 1] == '-' { - h.Lines[i].Words[len(l.Words)-1].Text = w.Text[0:len(w.Text)-1] + h.Lines[i+1].Words[0].Text - h.Lines[i+1].Words[0].Text = "" + var finaltxt string + var h hocr.Hocr + + if *usehocr { + h, err = hocr.Parse(in) + if err != nil { + log.Fatal(err) + } + + for i, l := range h.Lines { + w := l.Words[len(l.Words)-1] + if len(w.Chars) == 0 { + if len(w.Text) > 0 && w.Text[len(w.Text) - 1] == '-' { + h.Lines[i].Words[len(l.Words)-1].Text = w.Text[0:len(w.Text)-1] + h.Lines[i+1].Words[0].Text + h.Lines[i+1].Words[0].Text = "" + } + } else { + log.Printf("TODO: handle OcrChar") } - } else { - log.Printf("TODO: handle OcrChar") } + } else { + var newlines []string + lines := strings.Split(string(in), "\n") + for i, line := range lines { + words := strings.Split(line, " ") + last := words[len(words)-1] + if len(last) > 0 && last[len(last) - 1] == '-' { + nextwords := strings.Split(lines[i+1], " ") + line = line[0:len(line)-1] + nextwords[0] + if len(nextwords) > 1 { + lines[i+1] = strings.Join(nextwords[1:], " ") + } + } + newlines = append(newlines, line) + } + finaltxt = strings.Join(newlines, "\n") } f, err := os.Create(flag.Arg(1)) @@ -60,9 +85,17 @@ func main() { log.Fatalf("Error creating file %s: %v", flag.Arg(1), err) } defer f.Close() - e := xml.NewEncoder(f) - err = e.Encode(h) - if err != nil { - log.Fatalf("Error encoding XML: %v", err) + + if *usehocr { + e := xml.NewEncoder(f) + err = e.Encode(h) + if err != nil { + log.Fatalf("Error encoding XML: %v", err) + } + } else { + _, err := io.WriteString(f, finaltxt) + if err != nil { + log.Fatalf("Error writing to file: %v", err) + } } } -- cgit v1.2.1-24-ge1ad