summaryrefslogtreecommitdiff
path: root/cmd/dehyphenate/main.go
blob: 4d48e089fb6cd587ff799560102de09bd7ced969 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
// Copyright 2019 Nick White.
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.

// dehyphenate does basic dehyphenation on a plain text or hocr file
package main

import (
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"os"
	"strings"

	"rescribe.xyz/utils/pkg/hocr"
)

// BUGS:
// - loses all elements not captured in the hocr structure, such as html headings.
//   It might be best to copy the header and footer separately and put the hocr
//   in between, but that would still require ensuring all elements are captured.
// - loses any formatting; the output doesn't need to be identical, but e.g. linebreaks after elements would be handy
// - need to handle OcrChar

// dehyphenateString joins words that were split across lines. For each
// line that ends in a hyphen and is followed by another line, the
// hyphen is replaced with the first space-separated word of the
// following line, and that word is removed from the following line.
//
// A hyphen at the end of the final line is left alone, whether or not
// the input ends with a newline. If the following line is empty the
// hyphen is removed and nothing is appended.
func dehyphenateString(in string) string {
	lines := strings.Split(in, "\n")

	// Index of the final real line. Splitting input that ends in a
	// newline leaves an empty string as the last element, which is
	// not a line that a word can be pulled from.
	final := len(lines) - 1
	if strings.HasSuffix(in, "\n") {
		final--
	}

	newlines := make([]string, 0, len(lines))
	for i, line := range lines {
		if i < final && strings.HasSuffix(line, "-") {
			// Split the following line into its first word and the rest.
			next := strings.SplitN(lines[i+1], " ", 2)
			line = line[:len(line)-1] + next[0]
			// Store the remainder back so that the next iteration
			// sees the line without the word that was moved up.
			if len(next) > 1 {
				lines[i+1] = next[1]
			} else {
				lines[i+1] = ""
			}
		}
		newlines = append(newlines, line)
	}
	return strings.Join(newlines, "\n")
}

// main parses the command line, reads the input file, dehyphenates it
// either as plain text or (with -hocr) as hocr, and writes the result
// to the output file. Exactly two positional arguments are required:
// the input path and the output path. Any failure is logged and the
// program exits.
func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: dehyphenate [-hocr] in out\n")
		fmt.Fprintf(os.Stderr, "Dehyphenates a file.\n")
		flag.PrintDefaults()
	}
	usehocr := flag.Bool("hocr", false, "process hocr files, rather than plain text")
	flag.Parse()
	if flag.NArg() != 2 {
		flag.Usage()
		os.Exit(1)
	}

	in, err := ioutil.ReadFile(flag.Arg(0))
	if err != nil {
		// Report the input path, which is the file that failed to read.
		log.Fatalf("Error reading %s: %v", flag.Arg(0), err)
	}

	var finaltxt string
	var h hocr.Hocr

	if *usehocr {
		h, err = hocr.Parse(in)
		if err != nil {
			log.Fatal(err)
		}

		for _, p := range h.Pages {
			for i, l := range p.Lines {
				// A line with no words has nothing to dehyphenate.
				if len(l.Words) == 0 {
					continue
				}
				last := len(l.Words) - 1
				w := l.Words[last]
				if len(w.Chars) != 0 {
					log.Printf("TODO: handle OcrChar")
					continue
				}
				if len(w.Text) == 0 || w.Text[len(w.Text)-1] != '-' {
					continue
				}
				// The final line of a page, or a line followed by one
				// with no words, has no word to join with; leave the
				// hyphen in place rather than indexing out of range.
				if i+1 >= len(p.Lines) || len(p.Lines[i+1].Words) == 0 {
					continue
				}
				// Write through p.Lines rather than the loop copies so
				// that the change is stored in the parsed document.
				p.Lines[i].Words[last].Text = w.Text[0:len(w.Text)-1] + p.Lines[i+1].Words[0].Text
				p.Lines[i+1].Words[0].Text = ""
			}
		}
	} else {
		finaltxt = dehyphenateString(string(in))
	}

	f, err := os.Create(flag.Arg(1))
	if err != nil {
		log.Fatalf("Error creating file %s: %v", flag.Arg(1), err)
	}

	if *usehocr {
		e := xml.NewEncoder(f)
		if err := e.Encode(h); err != nil {
			log.Fatalf("Error encoding XML: %v", err)
		}
	} else {
		if _, err := io.WriteString(f, finaltxt); err != nil {
			log.Fatalf("Error writing to file: %v", err)
		}
	}

	// Close explicitly and check the result: a deferred Close would
	// discard its error, and the output is not known to be complete
	// until Close succeeds.
	if err := f.Close(); err != nil {
		log.Fatalf("Error closing file %s: %v", flag.Arg(1), err)
	}
}