diff options
| author | Nick White <git@njw.name> | 2019-06-11 17:15:35 +0100 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2019-06-11 17:15:35 +0100 | 
| commit | 6d9659e826eb6977a5ee4b4abc8be902a91294f1 (patch) | |
| tree | aafd9a840e19da722aad2db5a468a9b963d58fbf /eeboxmltohocr | |
| parent | 566896d9a2f89d079cbc118706611849abacc28f (diff) | |
Name hocrs as pdfimages does, and preserve entities for hocr
Diffstat (limited to 'eeboxmltohocr')
| -rw-r--r-- | eeboxmltohocr/main.go | 6 | 
1 files changed, 2 insertions, 4 deletions
```diff
diff --git a/eeboxmltohocr/main.go b/eeboxmltohocr/main.go
index 63502d4..2761cd9 100644
--- a/eeboxmltohocr/main.go
+++ b/eeboxmltohocr/main.go
@@ -84,10 +84,8 @@ func main() {
 		content := t[strings.Index(t, ">")+1:]
 		ungap := regexp.MustCompile(`(?s)<gap[ >].+?</gap>`).ReplaceAllString(content, "")
 		unxml := regexp.MustCompile(`<.+?>`).ReplaceAllString(ungap, "")
-		// could of course make this much more elaborate
-		unentity := regexp.MustCompile(`&amp;`).ReplaceAllString(unxml, "&")
 
-		finaltxt := strings.TrimLeft(unentity, " \n")
+		finaltxt := strings.TrimLeft(unxml, " \n")
 		if len(finaltxt) == 0 {
 			continue
 		}
@@ -96,7 +94,7 @@ func main() {
 	}
 
 	for _, pg := range pgs {
-		fn := fmt.Sprintf("%s_%03d.hocr", flag.Arg(1), pg.number)
+		fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number - 1)
 		f, err := os.Create(fn)
 		if err != nil {
 			log.Fatalf("Could not create file %s: %v\n", fn, err)
```
