summaryrefslogtreecommitdiff
path: root/cmd/dehyphenate/main.go
blob: b2bd6f9e635e888f86b2544bf3666b2a16ac8a43 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
package main

import (
	"encoding/xml"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"os"

	"rescribe.xyz/utils/pkg/hocr"
)

// BUGS:
// - loses all elements not captured in hocr structure such as html headings
//   might be best to copy the header and footer separately and put the hocr in between, but would still need to ensure all elements are captured
// - loses any formatting; doesn't need to be identical, but e.g. linebreaks after elements would be handy
// - need to handle OcrChar

// main reads the hOCR file named by the first argument, joins words that
// are split by a trailing hyphen at the end of a line with the first word
// of the following line, and writes the result as XML to the file named
// by the second argument.
func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: dehyphenate hocrin hocrout\n")
		fmt.Fprintf(os.Stderr, "Dehyphenates a hocr file.\n")
		flag.PrintDefaults()
	}
	flag.Parse()
	if flag.NArg() != 2 {
		flag.Usage()
		os.Exit(1)
	}

	in, err := ioutil.ReadFile(flag.Arg(0))
	if err != nil {
		// Report the input path (argument 0), which is the file that failed.
		log.Fatalf("Error reading %s: %v", flag.Arg(0), err)
	}
	h, err := hocr.Parse(in)
	if err != nil {
		log.Fatalf("Error parsing %s: %v", flag.Arg(0), err)
	}

	for i := range h.Lines {
		words := h.Lines[i].Words
		if len(words) == 0 {
			// A line without words has no trailing word to inspect.
			continue
		}
		w := words[len(words)-1]
		if len(w.Chars) != 0 {
			log.Printf("TODO: handle OcrChar")
			continue
		}
		if len(w.Text) == 0 || w.Text[len(w.Text)-1] != '-' {
			continue
		}
		// A hyphen can only be resolved if there is a following line that
		// has at least one word to join with.
		if i+1 >= len(h.Lines) || len(h.Lines[i+1].Words) == 0 {
			continue
		}
		next := h.Lines[i+1].Words
		// Drop the trailing hyphen, append the continuation, and blank the
		// continuation word so its text is not emitted twice.
		words[len(words)-1].Text = w.Text[:len(w.Text)-1] + next[0].Text
		next[0].Text = ""
	}

	f, err := os.Create(flag.Arg(1))
	if err != nil {
		log.Fatalf("Error creating file %s: %v", flag.Arg(1), err)
	}
	if err := xml.NewEncoder(f).Encode(h); err != nil {
		// Best-effort close: the encode error is the one worth reporting
		// and the process exits immediately afterwards.
		f.Close()
		log.Fatalf("Error encoding XML: %v", err)
	}
	// Close explicitly rather than via defer so that a failure to write
	// the data back is reported instead of being silently dropped.
	if err := f.Close(); err != nil {
		log.Fatalf("Error closing file %s: %v", flag.Arg(1), err)
	}
}