1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
// Copyright 2019 Nick White.
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
// dehyphenate does basic dehyphenation on a plain text file, or on
// a hocr file if the -hocr flag is given
package main
import (
"encoding/xml"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"os"
"strings"
"rescribe.xyz/utils/pkg/hocr"
)
// BUGS:
// - loses all elements not captured in hocr structure such as html headings
// might be best to copy the header and footer separately and put the hocr in between, but would still need to ensure all elements are captured
// - loses any formatting; doesn't need to be identical, but e.g. linebreaks after elements would be handy
// - need to handle OcrChar
// dehyphenateString replaces a hyphen at the end of a line
// with the first word from the following line, and removes
// that word from its line.
//
// Lines are separated by "\n" and words by single spaces. A trailing
// hyphen is only removed when the following line starts with a word
// to join on to: if there is no following line, or it is empty or
// starts with a space, the line is left untouched so that no text is
// lost. The number of lines is never changed; a line whose only word
// was moved up is left empty. Input does not need to end in a newline.
func dehyphenateString(in string) string {
	lines := strings.Split(in, "\n")
	newlines := make([]string, 0, len(lines))
	for i, line := range lines {
		// lines[i+1] may be rewritten below; range reads each element on
		// its own iteration, so the shortened line is what is seen next.
		if strings.HasSuffix(line, "-") && i+1 < len(lines) {
			next := lines[i+1]
			// split the next line into its first word and the remainder
			first, rest := next, ""
			if sp := strings.Index(next, " "); sp >= 0 {
				first, rest = next[:sp], next[sp+1:]
			}
			// an empty first word also covers the empty final element
			// that strings.Split produces for a trailing newline
			if first != "" {
				line = strings.TrimSuffix(line, "-") + first
				lines[i+1] = rest
			}
		}
		newlines = append(newlines, line)
	}
	return strings.Join(newlines, "\n")
}
// main parses the command line, dehyphenates the input file and
// writes the result to the output file.
//
// Usage: dehyphenate [-hocr] in out
//
// Without -hocr the input is treated as plain text and passed through
// dehyphenateString. With -hocr the input is parsed with hocr.Parse;
// for each line whose last word ends in a hyphen, the hyphen is
// replaced with the text of the first word of the next line on the
// same page, and that word's text is cleared. The result is then
// encoded as XML. Any error is fatal.
func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: dehyphenate [-hocr] in out\n")
		fmt.Fprintf(os.Stderr, "Dehyphenates a file.\n")
		flag.PrintDefaults()
	}
	usehocr := flag.Bool("hocr", false, "process hocr files, rather than plain text")
	flag.Parse()
	if flag.NArg() != 2 {
		flag.Usage()
		os.Exit(1)
	}

	in, err := ioutil.ReadFile(flag.Arg(0))
	if err != nil {
		// report the input path, which is the file that failed to read
		log.Fatalf("Error reading %s: %v", flag.Arg(0), err)
	}

	var finaltxt string
	var h hocr.Hocr
	if *usehocr {
		h, err = hocr.Parse(in)
		if err != nil {
			log.Fatal(err)
		}
		for _, p := range h.Pages {
			for i := range p.Lines {
				nwords := len(p.Lines[i].Words)
				if nwords == 0 {
					// a line with no words has nothing to dehyphenate
					continue
				}
				w := p.Lines[i].Words[nwords-1]
				if len(w.Chars) != 0 {
					log.Printf("TODO: handle OcrChar")
					continue
				}
				if !strings.HasSuffix(w.Text, "-") {
					continue
				}
				// only join when there is a following line on this page
				// with a first word to take; otherwise keep the hyphen
				if i+1 >= len(p.Lines) || len(p.Lines[i+1].Words) == 0 {
					continue
				}
				nextText := p.Lines[i+1].Words[0].Text
				if nextText == "" {
					continue
				}
				p.Lines[i].Words[nwords-1].Text = strings.TrimSuffix(w.Text, "-") + nextText
				p.Lines[i+1].Words[0].Text = ""
			}
		}
	} else {
		finaltxt = dehyphenateString(string(in))
	}

	f, err := os.Create(flag.Arg(1))
	if err != nil {
		log.Fatalf("Error creating file %s: %v", flag.Arg(1), err)
	}

	if *usehocr {
		e := xml.NewEncoder(f)
		err = e.Encode(h)
		if err != nil {
			log.Fatalf("Error encoding XML: %v", err)
		}
	} else {
		_, err = io.WriteString(f, finaltxt)
		if err != nil {
			log.Fatalf("Error writing to file: %v", err)
		}
	}

	// closed explicitly rather than deferred, so that a failure to
	// flush the output to disk is reported instead of silently ignored
	if err = f.Close(); err != nil {
		log.Fatalf("Error closing file %s: %v", flag.Arg(1), err)
	}
}
|