summaryrefslogtreecommitdiff
path: root/lib/hocr/hocr.go
blob: 0d10819fa255110cc5569a13bec03d5e89cf67ed (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
package hocr

import (
	"encoding/xml"
	"fmt"
	"regexp"
	"strconv"
	"strings"
)

// Hocr is the value Parse decodes a document into. Only span elements
// located at the path body>div>div>p>span are collected; everything else
// in the document is ignored by the decoder.
type Hocr struct {
	// Lines holds one entry per span element matched by the tag's path.
	Lines []OcrLine `xml:"body>div>div>p>span"`
}

// OcrLine is a span element collected into Hocr.Lines, together with the
// span elements nested directly inside it.
// NOTE(review): presumably corresponds to an hOCR line-level span; confirm
// against the documents being parsed.
type OcrLine struct {
	// Class, Id and Title are the values of the element's class, id and
	// title attributes.
	Class string `xml:"class,attr"`
	Id string `xml:"id,attr"`
	Title string `xml:"title,attr"`
	// Words holds the child span elements of this span.
	Words []OcrWord `xml:"span"`
	// Text is the character data of the element itself.
	Text string `xml:",chardata"`
}

// OcrWord is a span element nested inside an OcrLine, together with the
// span elements nested directly inside it.
// NOTE(review): presumably corresponds to an hOCR word-level span; confirm
// against the documents being parsed.
type OcrWord struct {
	// Class, Id and Title are the values of the element's class, id and
	// title attributes.
	Class string `xml:"class,attr"`
	Id string `xml:"id,attr"`
	Title string `xml:"title,attr"`
	// Chars holds the child span elements of this span.
	Chars []OcrChar `xml:"span"`
	// Text is the character data of the element itself.
	Text string `xml:",chardata"`
}

// OcrChar is a span element nested inside an OcrWord. The type is
// recursive: any span elements nested inside it are decoded as OcrChar
// values as well, to arbitrary depth.
type OcrChar struct {
	// Class, Id and Title are the values of the element's class, id and
	// title attributes.
	Class string `xml:"class,attr"`
	Id string `xml:"id,attr"`
	Title string `xml:"title,attr"`
	// Chars holds the child span elements of this span.
	Chars []OcrChar `xml:"span"`
	// Text is the character data of the element itself.
	Text string `xml:",chardata"`
}

// wordConfRe captures the numeric text that follows "x_wconf ". It is
// compiled once at package initialisation rather than on every call.
var wordConfRe = regexp.MustCompile(`x_wconf ([0-9.]+)`)

// wordConf returns the confidence for a word based on the x_wconf value
// found in s.
//
// It returns an error when s contains no x_wconf entry, or when the captured
// text is not a valid float (the pattern admits strings such as "1.2.3").
func wordConf(s string) (float64, error) {
	m := wordConfRe.FindStringSubmatch(s)
	if m == nil {
		// FindStringSubmatch yields nil when nothing matches; indexing the
		// result unconditionally would panic.
		return 0.0, fmt.Errorf("hocr: no x_wconf value in %q", s)
	}
	return strconv.ParseFloat(m[1], 64)
}

// bboxRe captures the four unsigned integers that follow "bbox ". It is
// compiled once at package initialisation rather than on every call.
var bboxRe = regexp.MustCompile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`)

// boxCoords extracts the four integers of the first "bbox a b c d" entry
// found in s and returns them in the order they appear.
//
// It returns an error when s contains no bbox entry, or when one of the
// numbers cannot be converted by strconv.Atoi (for example because it is out
// of range). On error the returned array is all zeros.
func boxCoords(s string) ([4]int, error) {
	var coords [4]int
	m := bboxRe.FindStringSubmatch(s)
	if m == nil {
		// FindStringSubmatch yields nil when nothing matches; indexing the
		// result unconditionally would panic.
		return coords, fmt.Errorf("hocr: no bbox value in %q", s)
	}
	for i := range coords {
		// m[0] is the whole match, so the capture groups start at index 1.
		c, err := strconv.Atoi(m[i+1])
		if err != nil {
			// Do not hand back a half-filled array alongside an error.
			return [4]int{}, err
		}
		coords[i] = c
	}
	return coords, nil
}

// noText reports whether s is empty or consists solely of space and newline
// characters. Other whitespace, such as tabs, counts as text.
func noText(s string) bool {
	return strings.Trim(s, " \n") == ""
}

// Parse decodes the XML in b into a Hocr value using xml.Unmarshal.
//
// If decoding fails, the error from xml.Unmarshal is returned together with
// whatever had been decoded into the value up to that point.
func Parse(b []byte) (Hocr, error) {
	parsed := Hocr{}
	if err := xml.Unmarshal(b, &parsed); err != nil {
		return parsed, err
	}
	return parsed, nil
}