summary refs log tree commit diff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-06-11 15:49:31 +0100
committer Nick White <git@njw.name> 2019-06-11 15:49:31 +0100
commit 566896d9a2f89d079cbc118706611849abacc28f (patch)
tree b22787cc502eafc0ef519776598cd95cae3f0314
parent b401f1fadc934b22bc3c9d9f467d50c97b1cc3d8 (diff)
Add basic utility to turn an eebo xml into a set of hocr files (for hocr2pdf)
-rw-r--r-- eeboxmltohocr/main.go 137
1 files changed, 137 insertions, 0 deletions
diff --git a/eeboxmltohocr/main.go b/eeboxmltohocr/main.go
new file mode 100644
index 0000000..63502d4
--- /dev/null
+++ b/eeboxmltohocr/main.go
@@ -0,0 +1,137 @@
+package main
+
import (
	"bufio"
	"bytes"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"regexp"
	"strconv"
	"strings"
)
+
// splitByPb is a bufio.SplitFunc that cuts the input at every "<pb"
// marker.
//
// Each token is the data that precedes the next "<pb". Only the "<" of the
// marker is consumed, so every token after the first begins with "pb". When
// no marker is left and atEOF is set, the remaining data is returned as the
// final token; otherwise more data is requested.
func splitByPb(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	// Search the byte slice directly: converting the buffer to a string
	// would copy all of it on every call.
	if i := bytes.Index(data, []byte("<pb")); i >= 0 {
		return i + 1, data[:i], nil
	}
	// If we're at EOF, we have a final section, so just return the lot.
	if atEOF {
		return len(data), data, nil
	}
	// Request more data.
	return 0, nil, nil
}
+
// Page holds the text collected for one numbered page.
type Page struct {
	number int    // identifies the page; addPage keeps one Page per number
	text   string // text gathered so far, in the order it was added
}

// addPage appends text to the page in *pgs whose number matches, or, when
// there is no such page yet, appends a new Page to the end of *pgs.
//
// Because new pages are only added when no match exists, numbers stay unique
// in a slice built solely through addPage, so the search stops at the first
// match.
func addPage(pgs *[]Page, number int, text string) {
	// Index into the slice so the stored element is updated rather than a copy.
	for i := range *pgs {
		if (*pgs)[i].number == number {
			(*pgs)[i].text += text
			return
		}
	}
	*pgs = append(*pgs, Page{number: number, text: text})
}
+
+func main() {
+ flag.Usage = func() {
+ fmt.Fprintf(os.Stderr, "Usage: eeboxmltohocr in.xml outbase\n")
+ flag.PrintDefaults()
+ }
+ flag.Parse()
+ if flag.NArg() < 2 {
+ flag.Usage()
+ os.Exit(1)
+ }
+
+ f, err := os.Open(flag.Arg(0))
+ defer f.Close()
+ if err != nil {
+ log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err)
+ }
+ scanner := bufio.NewScanner(f)
+
+ scanner.Split(splitByPb)
+
+ var pgs []Page
+
+ for scanner.Scan() {
+ t := scanner.Text()
+ r := regexp.MustCompile(`pb [^>]*facs="tcp:.*?:(.*?)"`).FindStringSubmatch(t)
+ if len(r) <= 1 {
+ continue
+ }
+ pgnum, err := strconv.Atoi(r[1])
+ if err != nil {
+ continue
+ }
+
+ content := t[strings.Index(t, ">")+1:]
+ ungap := regexp.MustCompile(`(?s)<gap[ >].+?</gap>`).ReplaceAllString(content, "")
+ unxml := regexp.MustCompile(`<.+?>`).ReplaceAllString(ungap, "")
+ // could of course make this much more elaborate
+ unentity := regexp.MustCompile(`&amp;`).ReplaceAllString(unxml, "&")
+
+ finaltxt := strings.TrimLeft(unentity, " \n")
+ if len(finaltxt) == 0 {
+ continue
+ }
+
+ addPage(&pgs, pgnum, finaltxt)
+ }
+
+ for _, pg := range pgs {
+ fn := fmt.Sprintf("%s_%03d.hocr", flag.Arg(1), pg.number)
+ f, err := os.Create(fn)
+ if err != nil {
+ log.Fatalf("Could not create file %s: %v\n", fn, err)
+ }
+ defer f.Close()
+
+ _, err = io.WriteString(f, hocrHeader + pg.text + hocrFooter)
+ if err != nil {
+ log.Fatalf("Could not write file %s: %v\n", fn, err)
+ }
+ }
+}
+
// Fixed hOCR wrapper placed around the text of a page: the page text goes
// between hocrHeader and hocrFooter.
const (
	// hocrHeader is the start of the document, ending just after the opening
	// tag of a single ocrx_word span. The page, area, paragraph, line and
	// word elements it opens all carry the same fixed bbox.
	hocrHeader = `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
 <title></title>
 <meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
 <meta name='ocr-system' content='tesseract 4.0.0' />
 <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
 </head>
 <body>
 <div class='ocr_page' id='page_1' title='bbox 0 0 600 1200'>
 <div class='ocr_carea' id='block_1_1' title="bbox 0 0 600 1200">
 <p class='ocr_par' id='par_1_1' lang='lat' title="bbox 0 0 600 1200">
 <span class='ocr_line' id='line_1_1' title="bbox 0 0 600 1200"
>
 <span class='ocrx_word' id='word_1_1' title='bbox 0 0 600 1200'>`

	// hocrFooter closes, innermost first, every element that hocrHeader
	// leaves open.
	hocrFooter = `</span>
 </span>
 </p>
 </div>
 </div>
 </body>
</html>`
)