diff options
Diffstat (limited to 'eeboxmltohocr')
-rw-r--r-- | eeboxmltohocr/main.go | 137 |
1 files changed, 137 insertions, 0 deletions
diff --git a/eeboxmltohocr/main.go b/eeboxmltohocr/main.go new file mode 100644 index 0000000..63502d4 --- /dev/null +++ b/eeboxmltohocr/main.go @@ -0,0 +1,137 @@ +package main + +import ( + "bufio" + "flag" + "fmt" + "io" + "log" + "os" + "regexp" + "strconv" + "strings" +) + +// splitByPb is a split function for the scanner that splits by the +// '<pb' token. +func splitByPb(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + if i := strings.Index(string(data[:]), "<pb"); i >= 0 { + return i + 1, data[0:i], nil + } + // If we're at EOF, we have a final section, so just return the lot. + if atEOF { + return len(data), data, nil + } + // Request more data. + return 0, nil, nil +} + +type Page struct { + number int + text string +} + +func addPage(pgs *[]Page, number int, text string) { + added := 0 + for i, pg := range *pgs { + if pg.number == number { + (*pgs)[i].text = pg.text + text + added = 1 + } + } + if added == 0 { + newpg := Page{number, text} + *pgs = append(*pgs, newpg) + } +} + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: eeboxmltohocr in.xml outbase\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() < 2 { + flag.Usage() + os.Exit(1) + } + + f, err := os.Open(flag.Arg(0)) + defer f.Close() + if err != nil { + log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) + } + scanner := bufio.NewScanner(f) + + scanner.Split(splitByPb) + + var pgs []Page + + for scanner.Scan() { + t := scanner.Text() + r := regexp.MustCompile(`pb [^>]*facs="tcp:.*?:(.*?)"`).FindStringSubmatch(t) + if len(r) <= 1 { + continue + } + pgnum, err := strconv.Atoi(r[1]) + if err != nil { + continue + } + + content := t[strings.Index(t, ">")+1:] + ungap := regexp.MustCompile(`(?s)<gap[ >].+?</gap>`).ReplaceAllString(content, "") + unxml := regexp.MustCompile(`<.+?>`).ReplaceAllString(ungap, "") + // could of course make this much more elaborate + unentity := regexp.MustCompile(`&`).ReplaceAllString(unxml, "&") + + finaltxt := strings.TrimLeft(unentity, " \n") + if len(finaltxt) == 0 { + continue + } + + addPage(&pgs, pgnum, finaltxt) + } + + for _, pg := range pgs { + fn := fmt.Sprintf("%s_%03d.hocr", flag.Arg(1), pg.number) + f, err := os.Create(fn) + if err != nil { + log.Fatalf("Could not create file %s: %v\n", fn, err) + } + defer f.Close() + + _, err = io.WriteString(f, hocrHeader + pg.text + hocrFooter) + if err != nil { + log.Fatalf("Could not write file %s: %v\n", fn, err) + } + } +} + +const hocrHeader = `<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head> + <title></title> + <meta http-equiv="Content-Type" content="text/html;charset=utf-8"/> + <meta name='ocr-system' content='tesseract 4.0.0' /> + <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/> + </head> + <body> + <div class='ocr_page' id='page_1' title='bbox 0 0 600 1200'> + <div class='ocr_carea' id='block_1_1' title="bbox 0 0 600 1200"> + <p class='ocr_par' id='par_1_1' lang='lat' title="bbox 0 0 600 1200"> + <span class='ocr_line' id='line_1_1' title="bbox 0 0 600 1200" +> + <span class='ocrx_word' id='word_1_1' title='bbox 0 0 600 1200'>` + +const hocrFooter = `</span> + </span> + </p> + </div> + </div> + </body> +</html>` |