From 6d9659e826eb6977a5ee4b4abc8be902a91294f1 Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 11 Jun 2019 17:15:35 +0100 Subject: Name hocrs as pdfimages does, and preserve entities for hocr --- eeboxmltohocr/main.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/eeboxmltohocr/main.go b/eeboxmltohocr/main.go index 63502d4..2761cd9 100644 --- a/eeboxmltohocr/main.go +++ b/eeboxmltohocr/main.go @@ -84,10 +84,8 @@ func main() { content := t[strings.Index(t, ">")+1:] ungap := regexp.MustCompile(`(?s)].+?`).ReplaceAllString(content, "") unxml := regexp.MustCompile(`<.+?>`).ReplaceAllString(ungap, "") - // could of course make this much more elaborate - unentity := regexp.MustCompile(`&amp;`).ReplaceAllString(unxml, "&") - finaltxt := strings.TrimLeft(unentity, " \n") + finaltxt := strings.TrimLeft(unxml, " \n") if len(finaltxt) == 0 { continue } @@ -96,7 +94,7 @@ func main() { } for _, pg := range pgs { - fn := fmt.Sprintf("%s_%03d.hocr", flag.Arg(1), pg.number) + fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number - 1) f, err := os.Create(fn) if err != nil { log.Fatalf("Could not create file %s: %v\n", fn, err) -- cgit v1.2.1-24-ge1ad