summaryrefslogtreecommitdiff
path: root/eeboxmltohocr/main.go
blob: 2761cd92e9a8ad38222d27f40e80b4e06e3a89d7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package main

import (
	"bufio"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"regexp"
	"strconv"
	"strings"
)

// splitByPb is a bufio.SplitFunc that cuts the input at each occurrence of
// the '<pb' marker. The returned token is everything before the marker, and
// only the marker's leading '<' is consumed, so every token after the first
// begins with "pb". When no marker is present the remaining input is returned
// as a final token at EOF; otherwise more data is requested.
func splitByPb(data []byte, atEOF bool) (advance int, token []byte, err error) {
	const marker = "<pb"

	pos := strings.Index(string(data), marker)
	switch {
	case atEOF && len(data) == 0:
		// Nothing left to hand out.
		return 0, nil, nil
	case pos >= 0:
		// Step one byte past the start of the marker so the same
		// marker is not found again on the next call.
		return pos + 1, data[:pos], nil
	case atEOF:
		// Trailing section with no further marker: return all of it.
		return len(data), data, nil
	default:
		// No marker in the buffer yet; ask the scanner for more data.
		return 0, nil, nil
	}
}

// Page holds the text collected for one page number.
type Page struct {
	number int    // page number the text belongs to
	text   string // accumulated text for that page
}

// addPage records text for the page with the given number. If *pgs already
// contains a page with that number the text is appended to it; otherwise a
// new Page is appended to the end of *pgs, so pages stay in first-seen order.
//
// Only the first page with a matching number is updated. A slice built
// exclusively through addPage never holds two pages with the same number.
func addPage(pgs *[]Page, number int, text string) {
	for i := range *pgs {
		if (*pgs)[i].number == number {
			(*pgs)[i].text += text
			return
		}
	}
	*pgs = append(*pgs, Page{number: number, text: text})
}

// main reads the XML file named by the first argument, splits it at each
// '<pb' marker, and writes one hOCR file per page number found.
//
// For every segment, the page number is taken from the pb element's
// facs="tcp:...:N" attribute; segments where that attribute is missing or N
// is not an integer are skipped. <gap>...</gap> elements and all remaining
// tags are stripped from the segment, and text for segments that share a page
// number is concatenated. Each page is written to "<outbase>-NNN.hocr", where
// outbase is the second argument and NNN is the page number minus one.
func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: eeboxmltohocr in.xml outbase\n")
		flag.PrintDefaults()
	}
	flag.Parse()
	if flag.NArg() < 2 {
		flag.Usage()
		os.Exit(1)
	}

	// Compile the patterns once rather than on every segment.
	pbFacs := regexp.MustCompile(`pb [^>]*facs="tcp:.*?:(.*?)"`)
	gapElem := regexp.MustCompile(`(?s)<gap[ >].+?</gap>`)
	anyTag := regexp.MustCompile(`<.+?>`)

	f, err := os.Open(flag.Arg(0))
	if err != nil {
		log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err)
	}
	defer f.Close()

	// A segment between two '<pb' markers can exceed the scanner's default
	// 64 KiB token limit, so allow the buffer to grow well beyond it.
	const maxSegmentSize = 64 * 1024 * 1024
	scanner := bufio.NewScanner(f)
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxSegmentSize)
	scanner.Split(splitByPb)

	var pgs []Page

	for scanner.Scan() {
		t := scanner.Text()

		// splitByPb consumes only the '<' of each marker, so a page
		// segment begins with "pb ...".
		m := pbFacs.FindStringSubmatch(t)
		if len(m) <= 1 {
			continue
		}
		pgnum, err := strconv.Atoi(m[1])
		if err != nil {
			continue
		}

		// Drop everything up to and including the first '>' (the end
		// of the pb tag); if there is none the whole segment is kept.
		content := t[strings.Index(t, ">")+1:]
		ungap := gapElem.ReplaceAllString(content, "")
		unxml := anyTag.ReplaceAllString(ungap, "")

		finaltxt := strings.TrimLeft(unxml, " \n")
		if len(finaltxt) == 0 {
			continue
		}

		addPage(&pgs, pgnum, finaltxt)
	}
	// Scan returns false on read errors and over-long segments as well as
	// at EOF; without this check such failures would truncate the output
	// silently.
	if err := scanner.Err(); err != nil {
		log.Fatalf("Could not read file %s: %v\n", flag.Arg(0), err)
	}

	for _, pg := range pgs {
		fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number-1)
		out, err := os.Create(fn)
		if err != nil {
			log.Fatalf("Could not create file %s: %v\n", fn, err)
		}

		// Close each file as soon as it is written instead of deferring,
		// so open descriptors do not pile up across pages, and report a
		// failed Close as a failed write.
		_, err = io.WriteString(out, hocrHeader+pg.text+hocrFooter)
		if cerr := out.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			log.Fatalf("Could not write file %s: %v\n", fn, err)
		}
	}
}

// hocrHeader is the fixed text written before a page's text in each output
// file. It opens the XHTML document and the nested ocr_page, ocr_carea,
// ocr_par, ocr_line and ocrx_word elements, all with the same fixed bounding
// box of 0 0 600 1200, and leaves the ocrx_word span open so that the page
// text becomes its content.
const hocrHeader = `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
  <title></title>
  <meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
  <meta name='ocr-system' content='tesseract 4.0.0' />
  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
 </head>
 <body>
  <div class='ocr_page' id='page_1' title='bbox 0 0 600 1200'>
   <div class='ocr_carea' id='block_1_1' title="bbox 0 0 600 1200">
    <p class='ocr_par' id='par_1_1' lang='lat' title="bbox 0 0 600 1200">
     <span class='ocr_line' id='line_1_1' title="bbox 0 0 600 1200"
>
      <span class='ocrx_word' id='word_1_1' title='bbox 0 0 600 1200'>`

// hocrFooter is the fixed text written after a page's text in each output
// file. It closes the ocrx_word and ocr_line spans, the paragraph, the two
// divs, the body and the html element opened by hocrHeader.
const hocrFooter = `</span>
     </span>
    </p>
   </div>
  </div>
 </body>
</html>`