1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
// Copyright 2019 Nick White.
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
// eeboxmltohocr converts the XML from an EEBO download to hOCR, which
// can be easily incorporated into a searchable PDF.
package main
import (
"bufio"
"flag"
"fmt"
"io"
"log"
"os"
"regexp"
"strconv"
"strings"
)
// splitByPb is a bufio.SplitFunc that cuts the input at every "<pb"
// marker. The token is everything before the marker; only the '<' is
// consumed, so each following token begins with "pb". When no marker
// is present, the remaining data is returned as the last token at EOF,
// otherwise more data is requested.
func splitByPb(data []byte, atEOF bool) (advance int, token []byte, err error) {
	pos := strings.Index(string(data), "<pb")
	switch {
	case atEOF && len(data) == 0:
		// Nothing left to hand out.
		return 0, nil, nil
	case pos >= 0:
		// Step past the '<' only; the "pb..." text stays in the stream.
		return pos + 1, data[:pos], nil
	case atEOF:
		// Trailing section with no further marker.
		return len(data), data, nil
	default:
		// No marker seen yet; ask the scanner for more input.
		return 0, nil, nil
	}
}
// Page holds the plain text collected for one page number. addPage
// keeps one entry per number and appends further text to it.
type Page struct {
// number is the integer parsed from the pb element's facs attribute.
number int
// text is the accumulated page text with markup already stripped.
text string
}
// addPage appends text to every entry of *pgs whose number equals
// number. If no entry matched, a new Page holding number and text is
// appended to *pgs instead.
func addPage(pgs *[]Page, number int, text string) {
	found := false
	for i := range *pgs {
		p := &(*pgs)[i]
		if p.number != number {
			continue
		}
		p.text += text
		found = true
	}
	if !found {
		*pgs = append(*pgs, Page{number: number, text: text})
	}
}
// Patterns applied to each "<pb"-delimited section of the input. They
// are compiled once here rather than on every loop iteration.
var (
	// pbFacsRe captures the last colon-separated field of a pb
	// element's facs="tcp:...:N" attribute.
	pbFacsRe = regexp.MustCompile(`pb [^>]*facs="tcp:.*?:(.*?)"`)
	// gapRe matches a complete <gap>...</gap> element, newlines included.
	gapRe = regexp.MustCompile(`(?s)<gap[ >].+?</gap>`)
	// tagRe matches any remaining markup tag that sits on a single line.
	tagRe = regexp.MustCompile(`<.+?>`)
)

// maxSectionSize is the largest "<pb"-delimited section the scanner
// will buffer. bufio.Scanner's default limit is 64 KiB, beyond which
// Scan stops with bufio.ErrTooLong.
const maxSectionSize = 64 << 20

// extractPage parses one "<pb"-delimited section. It returns the page
// number from the facs attribute and the section's text with gap
// elements and tags removed. ok is false when the section has no
// numeric facs page number or no text is left after stripping.
func extractPage(section string) (number int, text string, ok bool) {
	m := pbFacsRe.FindStringSubmatch(section)
	if len(m) <= 1 {
		return 0, "", false
	}
	n, err := strconv.Atoi(m[1])
	if err != nil {
		return 0, "", false
	}

	// Skip the rest of the pb tag itself. If there is no '>', Index
	// returns -1 and the whole section is kept.
	content := section[strings.Index(section, ">")+1:]
	content = gapRe.ReplaceAllString(content, "")
	content = tagRe.ReplaceAllString(content, "")
	content = strings.TrimLeft(content, " \n")
	if len(content) == 0 {
		return 0, "", false
	}
	return n, content, true
}

// main reads the XML file named by the first argument, gathers the text
// of each page, and writes one hOCR file per page using the second
// argument as the output file name prefix.
func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: eeboxmltohocr in.xml outbase\n")
		flag.PrintDefaults()
	}
	flag.Parse()
	if flag.NArg() < 2 {
		flag.Usage()
		os.Exit(1)
	}

	f, err := os.Open(flag.Arg(0))
	if err != nil {
		log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err)
	}
	// Register the close only once the open is known to have succeeded.
	defer f.Close()

	scanner := bufio.NewScanner(f)
	// Raise the token limit so a long section does not abort the scan.
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxSectionSize)
	scanner.Split(splitByPb)

	var pgs []Page
	for scanner.Scan() {
		number, text, ok := extractPage(scanner.Text())
		if !ok {
			continue
		}
		addPage(&pgs, number, text)
	}
	// Scan returns false on both EOF and failure; only Err tells them
	// apart, and without this check truncated input would go unnoticed.
	if err := scanner.Err(); err != nil {
		log.Fatalf("Could not read file %s: %v\n", flag.Arg(0), err)
	}

	for _, pg := range pgs {
		// The file index is one less than the facs number (presumably so
		// that output files are numbered from zero - TODO confirm).
		fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number-1)
		out, err := os.Create(fn)
		if err != nil {
			log.Fatalf("Could not create file %s: %v\n", fn, err)
		}
		_, err = io.WriteString(out, hocrHeader+pg.text+hocrFooter)
		// Close right away instead of deferring: a defer inside this loop
		// would keep every output file open until main returns. A Close
		// failure is reported as a write failure.
		if cerr := out.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			log.Fatalf("Could not write file %s: %v\n", fn, err)
		}
	}
}
// hocrHeader is the fixed text written before each page's text. It holds
// the XML declaration, the XHTML doctype and head, and the opening tags
// of one ocr_page, ocr_carea, ocr_par, ocr_line and ocrx_word element,
// all with the same bounding box "0 0 600 1200". The elements are left
// open here and are closed by hocrFooter, so the page text ends up
// inside the single ocrx_word span.
const hocrHeader = `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
<meta name='ocr-system' content='tesseract 4.0.0' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='bbox 0 0 600 1200'>
<div class='ocr_carea' id='block_1_1' title="bbox 0 0 600 1200">
<p class='ocr_par' id='par_1_1' lang='lat' title="bbox 0 0 600 1200">
<span class='ocr_line' id='line_1_1' title="bbox 0 0 600 1200"
>
<span class='ocrx_word' id='word_1_1' title='bbox 0 0 600 1200'>`
// hocrFooter is the fixed text written after each page's text. It closes
// the two spans, the paragraph and the two divs opened by hocrHeader,
// followed by the body and html elements.
const hocrFooter = `</span>
</span>
</p>
</div>
</div>
</body>
</html>`
|