summaryrefslogtreecommitdiff
path: root/cmd/dehyphenate/main.go
blob: ed8eb1385e8ab240840807c375dd2d5c2c82590d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
// Copyright 2019 Nick White.
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.

// dehyphenate does basic dehyphenation on a plain text or hocr file
package main

import (
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"os"
	"strings"

	"rescribe.xyz/utils/pkg/hocr"
)

// BUGS:
// - loses all elements not captured in hocr structure such as html headings
//   might be best to copy the header and footer separately and put the hocr in between, but would still need to ensure all elements are captured
// - loses any formatting; doesn't need to be identical, but e.g. linebreaks after elements would be handy
// - need to handle OcrChar

// dehyphenateText joins words that were hyphenated across a line break in
// plain text. For every line ending in '-' that is followed by a line
// starting with a non-empty word, the hyphen is dropped, that word is
// appended to the line, and the word is removed from the following line.
// The number of lines is preserved: a following line that consisted of a
// single word becomes empty. A trailing hyphen on the final line, or one
// followed by an empty line or a line starting with a space, is left as is.
func dehyphenateText(txt string) string {
	lines := strings.Split(txt, "\n")
	// Stop one short of the end: the final line has no successor to join.
	for i := 0; i+1 < len(lines); i++ {
		if !strings.HasSuffix(lines[i], "-") {
			continue
		}
		// Split the next line into its first word and the remainder.
		next := strings.SplitN(lines[i+1], " ", 2)
		if next[0] == "" {
			continue
		}
		lines[i] = strings.TrimSuffix(lines[i], "-") + next[0]
		// Always remove the joined word from the next line, even when it
		// was the only word there, so that it is not emitted twice.
		if len(next) > 1 {
			lines[i+1] = next[1]
		} else {
			lines[i+1] = ""
		}
	}
	return strings.Join(lines, "\n")
}

// dehyphenateHocr joins words hyphenated across line breaks in h, in place.
// For each line whose last word ends in '-', the hyphen is dropped, the text
// of the first word of the following line is appended, and that first word's
// text is cleared. Lines without words are skipped, and a hyphen on the
// final line or before a line without words is left untouched. Words that
// contain OcrChar elements are not handled yet and are only logged.
func dehyphenateHocr(h *hocr.Hocr) {
	for i := range h.Lines {
		n := len(h.Lines[i].Words)
		if n == 0 {
			continue
		}
		if len(h.Lines[i].Words[n-1].Chars) != 0 {
			log.Printf("TODO: handle OcrChar")
			continue
		}
		txt := h.Lines[i].Words[n-1].Text
		if !strings.HasSuffix(txt, "-") {
			continue
		}
		// Nothing to join with on the final line or before a wordless line.
		if i+1 >= len(h.Lines) || len(h.Lines[i+1].Words) == 0 {
			continue
		}
		h.Lines[i].Words[n-1].Text = strings.TrimSuffix(txt, "-") + h.Lines[i+1].Words[0].Text
		h.Lines[i+1].Words[0].Text = ""
	}
}

// main reads the file named by the first argument, dehyphenates it (as hocr
// if -hocr is given, otherwise as plain text) and writes the result to the
// file named by the second argument. Any error is fatal.
func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: dehyphenate [-hocr] in out\n")
		fmt.Fprintf(os.Stderr, "Dehyphenates a file.\n")
		flag.PrintDefaults()
	}
	usehocr := flag.Bool("hocr", false, "process hocr files, rather than plain text")
	flag.Parse()
	if flag.NArg() != 2 {
		flag.Usage()
		os.Exit(1)
	}

	in, err := ioutil.ReadFile(flag.Arg(0))
	if err != nil {
		log.Fatalf("Error reading %s: %v", flag.Arg(0), err)
	}

	var finaltxt string
	var h hocr.Hocr

	if *usehocr {
		h, err = hocr.Parse(in)
		if err != nil {
			log.Fatal(err)
		}
		dehyphenateHocr(&h)
	} else {
		finaltxt = dehyphenateText(string(in))
	}

	f, err := os.Create(flag.Arg(1))
	if err != nil {
		log.Fatalf("Error creating file %s: %v", flag.Arg(1), err)
	}

	if *usehocr {
		e := xml.NewEncoder(f)
		err = e.Encode(h)
		if err != nil {
			log.Fatalf("Error encoding XML: %v", err)
		}
	} else {
		_, err = io.WriteString(f, finaltxt)
		if err != nil {
			log.Fatalf("Error writing to file: %v", err)
		}
	}

	// Close explicitly rather than with defer: log.Fatalf exits without
	// running deferred calls, and a failed close can mean lost output.
	if err := f.Close(); err != nil {
		log.Fatalf("Error closing file %s: %v", flag.Arg(1), err)
	}
}