summary | refs | log | tree | commit | diff
path: root/eeboxmltohocr
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-06-11 17:15:35 +0100
committer Nick White <git@njw.name> 2019-06-11 17:15:35 +0100
commit 6d9659e826eb6977a5ee4b4abc8be902a91294f1 (patch)
tree aafd9a840e19da722aad2db5a468a9b963d58fbf /eeboxmltohocr
parent 566896d9a2f89d079cbc118706611849abacc28f (diff)
Name hocrs as pdfimages does, and preserve entities for hocr
Diffstat (limited to 'eeboxmltohocr')
-rw-r--r-- eeboxmltohocr/main.go 6
1 files changed, 2 insertions, 4 deletions
diff --git a/eeboxmltohocr/main.go b/eeboxmltohocr/main.go
index 63502d4..2761cd9 100644
--- a/eeboxmltohocr/main.go
+++ b/eeboxmltohocr/main.go
@@ -84,10 +84,8 @@ func main() {
content := t[strings.Index(t, ">")+1:]
ungap := regexp.MustCompile(`(?s)<gap[ >].+?</gap>`).ReplaceAllString(content, "")
unxml := regexp.MustCompile(`<.+?>`).ReplaceAllString(ungap, "")
- // could of course make this much more elaborate
- unentity := regexp.MustCompile(`&amp;`).ReplaceAllString(unxml, "&")
- finaltxt := strings.TrimLeft(unentity, " \n")
+ finaltxt := strings.TrimLeft(unxml, " \n")
if len(finaltxt) == 0 {
continue
}
@@ -96,7 +94,7 @@ func main() {
}
for _, pg := range pgs {
- fn := fmt.Sprintf("%s_%03d.hocr", flag.Arg(1), pg.number)
+ fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number - 1)
f, err := os.Create(fn)
if err != nil {
log.Fatalf("Could not create file %s: %v\n", fn, err)