1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
package hocr
import (
"encoding/xml"
"errors"
"io/ioutil"
"regexp"
"strconv"
"strings"
)
// Hocr is the top-level value that Parse fills in when unmarshalling an hOCR
// document with encoding/xml.
type Hocr struct {
// Lines holds one entry per <span> element found at the path
// body>div>div>p>span. NOTE(review): presumably these are the line-level
// spans of the hOCR output; confirm against the producer's markup.
Lines []OcrLine `xml:"body>div>div>p>span"`
}
// OcrLine is one <span> element collected into Hocr.Lines, together with the
// <span> elements nested directly inside it.
type OcrLine struct {
// Class, Id and Title are the element's class, id and title attributes,
// stored as the raw attribute strings.
Class string `xml:"class,attr"`
Id string `xml:"id,attr"`
Title string `xml:"title,attr"`
// Words holds the child <span> elements of this element.
Words []OcrWord `xml:"span"`
// Text is the character data of this element itself.
Text string `xml:",chardata"`
}
// OcrWord is a <span> element nested inside an OcrLine.
type OcrWord struct {
// Class, Id and Title are the element's class, id and title attributes,
// stored as the raw attribute strings. GetAvgConf passes Title to wordConf
// to read the word's x_wconf value.
Class string `xml:"class,attr"`
Id string `xml:"id,attr"`
Title string `xml:"title,attr"`
// Chars holds the child <span> elements of this element.
Chars []OcrChar `xml:"span"`
// Text is the character data of this element itself.
Text string `xml:",chardata"`
}
// OcrChar is a <span> element nested inside an OcrWord. The type is
// recursive: any <span> elements below it are decoded as OcrChar values too.
type OcrChar struct {
// Class, Id and Title are the element's class, id and title attributes,
// stored as the raw attribute strings.
Class string `xml:"class,attr"`
Id string `xml:"id,attr"`
Title string `xml:"title,attr"`
// Chars holds the child <span> elements of this element.
Chars []OcrChar `xml:"span"`
// Text is the character data of this element itself.
Text string `xml:",chardata"`
}
// wordConfRe matches an x_wconf property and captures its numeric value.
// It is compiled once at package initialisation rather than on every call.
var wordConfRe = regexp.MustCompile(`x_wconf ([0-9.]+)`)

// wordConf returns the confidence for a word based on the x_wconf value
// found in s (GetAvgConf passes a word's title attribute).
//
// It returns an error if s contains no x_wconf property, or if the captured
// digits-and-dots sequence is not a valid float (for example "1.2.3").
func wordConf(s string) (float64, error) {
	m := wordConfRe.FindStringSubmatch(s)
	if m == nil {
		// FindStringSubmatch returns nil when nothing matches; indexing the
		// result unconditionally would panic.
		return 0, errors.New("hocr: no x_wconf value in " + strconv.Quote(s))
	}
	return strconv.ParseFloat(m[1], 64)
}
// bboxRe matches a bbox property followed by four unsigned integers and
// captures each of them. It is compiled once at package initialisation
// rather than on every call.
var bboxRe = regexp.MustCompile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`)

// boxCoords extracts the four integers that follow "bbox" in s and returns
// them in the order they appear.
//
// It returns an error if s contains no bbox property or if one of the numbers
// does not fit in an int. On error the returned array is all zeros.
func boxCoords(s string) ([4]int, error) {
	var coords [4]int

	m := bboxRe.FindStringSubmatch(s)
	if m == nil {
		// FindStringSubmatch returns nil when nothing matches; indexing the
		// result unconditionally would panic.
		return coords, errors.New("hocr: no bbox value in " + strconv.Quote(s))
	}

	for i := range coords {
		// m[0] is the whole match, so the captures start at index 1.
		c, err := strconv.Atoi(m[i+1])
		if err != nil {
			return [4]int{}, err
		}
		coords[i] = c
	}
	return coords, nil
}
// noText reports whether s is empty or consists solely of spaces and
// newlines. Other whitespace, such as tabs or carriage returns, counts as
// text.
func noText(s string) bool {
	isContent := func(r rune) bool { return r != ' ' && r != '\n' }
	return strings.IndexFunc(s, isContent) < 0
}
// Parse unmarshals the XML in b into a Hocr value using encoding/xml.
//
// The error from xml.Unmarshal is returned unchanged; when it is non-nil the
// accompanying Hocr holds whatever had been decoded up to that point and
// should not be relied upon.
func Parse(b []byte) (Hocr, error) {
	var doc Hocr
	err := xml.Unmarshal(b, &doc)
	return doc, err
}
// GetText reads the hOCR file at path hocrfn, parses it, and returns the
// concatenation of getLineText's result for every line, in document order.
//
// It returns an empty string and the underlying error if the file cannot be
// read or parsed.
func GetText(hocrfn string) (string, error) {
	file, err := ioutil.ReadFile(hocrfn)
	if err != nil {
		return "", err
	}

	h, err := Parse(file)
	if err != nil {
		return "", err
	}

	// Accumulate in a Builder: repeated string += copies the whole result on
	// every iteration, which is quadratic in the document size.
	var text strings.Builder
	for _, l := range h.Lines {
		text.WriteString(getLineText(l))
	}
	return text.String(), nil
}
// GetAvgConf reads the hOCR file at path hocrfn, parses it, and returns the
// arithmetic mean of the confidence that wordConf extracts from the title
// attribute of every word on every line.
//
// It returns 0 and an error if the file cannot be read or parsed, if wordConf
// fails for any word, or if the document contains no words at all.
func GetAvgConf(hocrfn string) (float64, error) {
	data, err := ioutil.ReadFile(hocrfn)
	if err != nil {
		return 0, err
	}

	doc, err := Parse(data)
	if err != nil {
		return 0, err
	}

	sum := 0.0
	count := 0
	for i := range doc.Lines {
		words := doc.Lines[i].Words
		for j := range words {
			conf, err := wordConf(words[j].Title)
			if err != nil {
				return 0, err
			}
			sum += conf
			count++
		}
	}

	// Guard against dividing by zero when nothing was recognised.
	if count == 0 {
		return 0, errors.New("No words found")
	}
	return sum / float64(count), nil
}
|