diff options
| author | Nick White <git@njw.name> | 2019-04-17 17:51:34 +0100 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2019-04-17 17:51:34 +0100 | 
| commit | 75173a030b1f3eb8a1e161ffd1732daa40a5e178 (patch) | |
| tree | 92987fc25dfb1699a04279a664a93e69e5226a8d | |
| parent | 66eaf329b5cb774a996a00d9b4c2bfa593ee7fa9 (diff) | |
Add basic dehyphenate tool
| -rw-r--r-- | dehyphenate/main.go | 63 | 
1 files changed, 63 insertions, 0 deletions
| diff --git a/dehyphenate/main.go b/dehyphenate/main.go new file mode 100644 index 0000000..4393c8f --- /dev/null +++ b/dehyphenate/main.go @@ -0,0 +1,63 @@ +package main + +import ( +	"encoding/xml" +	"flag" +	"fmt" +	"io/ioutil" +	"log" +	"os" + +	"rescribe.xyz/go.git/lib/hocr" +) + +// BUGS: +// - loses all elements not captured in hocr structure such as html headings +//   might be best to copy the header and footer separately and put the hocr in between, but would still need to ensure all elements are captured +// - loses any formatting; doesn't need to be identical, but e.g. linebreaks after elements would be handy +// - need to handle OcrChar + +func main() { +	flag.Usage = func() { +		fmt.Fprintf(os.Stderr, "Usage: dehyphenate hocrin hocrout\n") +		fmt.Fprintf(os.Stderr, "Dehyphenates a hocr file.\n") +		flag.PrintDefaults() +	} +	flag.Parse() +	if flag.NArg() != 2 { +		flag.Usage() +		os.Exit(1) +	} + +	in, err := ioutil.ReadFile(flag.Arg(0)) +	if err != nil { +		log.Fatalf("Error reading %s: %v", flag.Arg(1), err) +	} +	h, err := hocr.Parse(in) +	if err != nil { +		log.Fatal(err) +	} + +	for i, l := range h.Lines { +		w := l.Words[len(l.Words)-1] +		if len(w.Chars) == 0 { +			if len(w.Text) > 0 && w.Text[len(w.Text) - 1] == '-' { +				h.Lines[i].Words[len(l.Words)-1].Text = w.Text[0:len(w.Text)-1] + h.Lines[i+1].Words[0].Text +				h.Lines[i+1].Words[0].Text = "" +			} +		} else { +			log.Printf("TODO: handle OcrChar") +		} +	} + +	f, err := os.Create(flag.Arg(1)) +	if err != nil { +		log.Fatalf("Error creating file %s: %v", flag.Arg(1), err) +	} +	defer f.Close() +	e := xml.NewEncoder(f) +	err = e.Encode(h) +	if err != nil { +		log.Fatalf("Error encoding XML: %v", err) +	} +} | 
