summary | refs | log | tree | commit | diff
path: root/cmd
diff options
context:
space:
mode:
author: Nick White <git@njw.name> 2020-08-25 15:39:49 +0100
committer: Nick White <git@njw.name> 2020-08-25 15:39:49 +0100
commit: 8077bcde1f07d697d78f478f04e251b96bff3bbf (patch)
tree: ca0b6fccb7500f2e84d2428818abe7288f9fec45 /cmd
parent: 19e74f4157970997835e9a3fb6941455129a2cd0 (diff)
Add text mode for dehyphenate tool
Diffstat (limited to 'cmd')
-rw-r--r-- cmd/dehyphenate/main.go 69
1 files changed, 51 insertions, 18 deletions
diff --git a/cmd/dehyphenate/main.go b/cmd/dehyphenate/main.go
index 90a6cda..ed8eb13 100644
--- a/cmd/dehyphenate/main.go
+++ b/cmd/dehyphenate/main.go
@@ -9,9 +9,11 @@ import (
"encoding/xml"
"flag"
"fmt"
+ "io"
"io/ioutil"
"log"
"os"
+ "strings"
"rescribe.xyz/utils/pkg/hocr"
)
@@ -24,10 +26,11 @@ import (
func main() {
flag.Usage = func() {
- fmt.Fprintf(os.Stderr, "Usage: dehyphenate hocrin hocrout\n")
- fmt.Fprintf(os.Stderr, "Dehyphenates a hocr file.\n")
+ fmt.Fprintf(os.Stderr, "Usage: dehyphenate [-hocr] in out\n")
+ fmt.Fprintf(os.Stderr, "Dehyphenates a file.\n")
flag.PrintDefaults()
}
+ usehocr := flag.Bool("hocr", false, "process hocr files, rather than plain text")
flag.Parse()
if flag.NArg() != 2 {
flag.Usage()
@@ -38,21 +41,43 @@ func main() {
if err != nil {
log.Fatalf("Error reading %s: %v", flag.Arg(1), err)
}
- h, err := hocr.Parse(in)
- if err != nil {
- log.Fatal(err)
- }
- for i, l := range h.Lines {
- w := l.Words[len(l.Words)-1]
- if len(w.Chars) == 0 {
- if len(w.Text) > 0 && w.Text[len(w.Text) - 1] == '-' {
- h.Lines[i].Words[len(l.Words)-1].Text = w.Text[0:len(w.Text)-1] + h.Lines[i+1].Words[0].Text
- h.Lines[i+1].Words[0].Text = ""
+ var finaltxt string
+ var h hocr.Hocr
+
+ if *usehocr {
+ h, err = hocr.Parse(in)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for i, l := range h.Lines {
+ w := l.Words[len(l.Words)-1]
+ if len(w.Chars) == 0 {
+ if len(w.Text) > 0 && w.Text[len(w.Text) - 1] == '-' {
+ h.Lines[i].Words[len(l.Words)-1].Text = w.Text[0:len(w.Text)-1] + h.Lines[i+1].Words[0].Text
+ h.Lines[i+1].Words[0].Text = ""
+ }
+ } else {
+ log.Printf("TODO: handle OcrChar")
}
- } else {
- log.Printf("TODO: handle OcrChar")
}
+ } else {
+ var newlines []string
+ lines := strings.Split(string(in), "\n")
+ for i, line := range lines {
+ words := strings.Split(line, " ")
+ last := words[len(words)-1]
+ if len(last) > 0 && last[len(last) - 1] == '-' {
+ nextwords := strings.Split(lines[i+1], " ")
+ line = line[0:len(line)-1] + nextwords[0]
+ if len(nextwords) > 1 {
+ lines[i+1] = strings.Join(nextwords[1:], " ")
+ }
+ }
+ newlines = append(newlines, line)
+ }
+ finaltxt = strings.Join(newlines, "\n")
}
f, err := os.Create(flag.Arg(1))
@@ -60,9 +85,17 @@ func main() {
log.Fatalf("Error creating file %s: %v", flag.Arg(1), err)
}
defer f.Close()
- e := xml.NewEncoder(f)
- err = e.Encode(h)
- if err != nil {
- log.Fatalf("Error encoding XML: %v", err)
+
+ if *usehocr {
+ e := xml.NewEncoder(f)
+ err = e.Encode(h)
+ if err != nil {
+ log.Fatalf("Error encoding XML: %v", err)
+ }
+ } else {
+ _, err := io.WriteString(f, finaltxt)
+ if err != nil {
+ log.Fatalf("Error writing to file: %v", err)
+ }
}
}