diff options
author | Nick White <git@njw.name> | 2019-10-08 12:52:33 +0100 |
---|---|---|
committer | Nick White <git@njw.name> | 2019-10-08 12:52:33 +0100 |
commit | 7482157a03ed3e9d7f45e54a126b391001f34948 (patch) | |
tree | 52f87b9ca159fe4c04a0349de95ea9de82692b3c /eeboxmltohocr | |
parent | d43c11bf653bfe3c1ad1ed277f1ec08bf155cf98 (diff) |
Separate out bookpipeline from catch-all go.git repo, and rename to rescribe.xyz/bookpipeline
The dependencies from the go.git repo will follow in due course.
Diffstat (limited to 'eeboxmltohocr')
-rw-r--r-- | eeboxmltohocr/main.go | 135 |
1 files changed, 0 insertions, 135 deletions
// Source recovered from the deleted-file diff of eeboxmltohocr/main.go
// (blob 2761cd9).

// Command eeboxmltohocr splits an XML file into pages at its '<pb' tags and
// writes the plain text of each page into a minimal hOCR file.
//
// Usage:
//
//	eeboxmltohocr in.xml outbase
//
// Each page whose '<pb' tag carries a facs="tcp:...:N" attribute with a
// numeric N is written to outbase-XXX.hocr, where XXX is N-1 zero-padded to
// three digits.
package main

import (
	"bufio"
	"bytes"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"regexp"
	"strconv"
	"strings"
)

// maxSectionSize is the largest section between two '<pb' tokens that the
// scanner will accept. The bufio.Scanner default (64 KiB) is too small to be
// safe for a whole page of marked-up text, and exceeding it stops the scan.
const maxSectionSize = 64 * 1024 * 1024

// The regular expressions are compiled once rather than once per section.
var (
	// pbFacsRe captures the page number from the facs attribute of a pb tag.
	pbFacsRe = regexp.MustCompile(`pb [^>]*facs="tcp:.*?:(.*?)"`)
	// gapRe matches a whole <gap>...</gap> element, including its content.
	gapRe = regexp.MustCompile(`(?s)<gap[ >].+?</gap>`)
	// tagRe matches any single remaining tag.
	tagRe = regexp.MustCompile(`<.+?>`)
)

// pbToken is the token that starts a new section.
var pbToken = []byte("<pb")

// splitByPb is a split function for the scanner that splits by the
// '<pb' token.
//
// The returned token is everything before the next '<pb'. Only the leading
// '<' of that '<pb' is consumed, so every section after the first starts
// with "pb".
func splitByPb(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	if i := bytes.Index(data, pbToken); i >= 0 {
		return i + 1, data[0:i], nil
	}
	// If we're at EOF, we have a final section, so just return the lot.
	if atEOF {
		return len(data), data, nil
	}
	// Request more data.
	return 0, nil, nil
}

// Page holds the accumulated text of one numbered page.
type Page struct {
	number int
	text   string
}

// addPage appends text to the page in pgs that has the given number, or
// appends a new page to pgs if there is no such page yet.
func addPage(pgs *[]Page, number int, text string) {
	for i := range *pgs {
		if (*pgs)[i].number == number {
			(*pgs)[i].text += text
			return
		}
	}
	*pgs = append(*pgs, Page{number: number, text: text})
}

// parseSection extracts the page number and plain text from one section
// produced by splitByPb. ok is false if the section has no pb tag with a
// numeric facs page number, or if no text is left once the markup has been
// removed.
func parseSection(section string) (number int, text string, ok bool) {
	m := pbFacsRe.FindStringSubmatch(section)
	if len(m) <= 1 {
		return 0, "", false
	}
	number, err := strconv.Atoi(m[1])
	if err != nil {
		return 0, "", false
	}

	// Skip to the end of the pb tag itself. If there is no '>' at all,
	// Index returns -1 and the whole section is used.
	content := section[strings.Index(section, ">")+1:]
	// Drop gap elements together with their content before stripping the
	// remaining tags, so that the gap descriptions do not end up in the text.
	content = gapRe.ReplaceAllString(content, "")
	content = tagRe.ReplaceAllString(content, "")

	text = strings.TrimLeft(content, " \n")
	if len(text) == 0 {
		return 0, "", false
	}
	return number, text, true
}

// readPages reads XML from r and returns the text of each page, in order of
// first appearance. Sections sharing a page number are concatenated. It
// returns an error if reading fails or a section exceeds maxSectionSize.
func readPages(r io.Reader) ([]Page, error) {
	scanner := bufio.NewScanner(r)
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxSectionSize)
	scanner.Split(splitByPb)

	var pgs []Page
	for scanner.Scan() {
		if number, text, ok := parseSection(scanner.Text()); ok {
			addPage(&pgs, number, text)
		}
	}
	// Without this check a read error or an oversized section would
	// silently truncate the output.
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return pgs, nil
}

func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: eeboxmltohocr in.xml outbase\n")
		flag.PrintDefaults()
	}
	flag.Parse()
	if flag.NArg() < 2 {
		flag.Usage()
		os.Exit(1)
	}

	f, err := os.Open(flag.Arg(0))
	if err != nil {
		log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err)
	}
	pgs, err := readPages(f)
	// The input was only read, so a close error cannot lose data.
	f.Close()
	if err != nil {
		log.Fatalf("Could not read file %s: %v\n", flag.Arg(0), err)
	}

	for _, pg := range pgs {
		fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number-1)
		out, err := os.Create(fn)
		if err != nil {
			log.Fatalf("Could not create file %s: %v\n", fn, err)
		}

		_, err = io.WriteString(out, hocrHeader+pg.text+hocrFooter)
		// Close each file as soon as it is written, rather than deferring
		// every close to the end of main, and report a failed close since
		// it can mean the data never reached the disk.
		if cerr := out.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			log.Fatalf("Could not write file %s: %v\n", fn, err)
		}
	}
}

// hocrHeader is written before the page text. It opens a single page, block,
// paragraph, line and word element, so the whole page text ends up inside
// one ocrx_word span.
const hocrHeader = `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
  <title></title>
  <meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
  <meta name='ocr-system' content='tesseract 4.0.0' />
  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
 </head>
 <body>
  <div class='ocr_page' id='page_1' title='bbox 0 0 600 1200'>
   <div class='ocr_carea' id='block_1_1' title="bbox 0 0 600 1200">
    <p class='ocr_par' id='par_1_1' lang='lat' title="bbox 0 0 600 1200">
     <span class='ocr_line' id='line_1_1' title="bbox 0 0 600 1200"
>
      <span class='ocrx_word' id='word_1_1' title='bbox 0 0 600 1200'>`

// hocrFooter is written after the page text and closes every element that
// hocrHeader left open.
const hocrFooter = `</span>
     </span>
    </p>
   </div>
  </div>
 </body>
</html>`