From 3880414bbf2d6f2cd05e208abf919ae5ceabeddc Mon Sep 17 00:00:00 2001 From: Nick White Date: Thu, 27 Feb 2020 17:45:16 +0000 Subject: Reorganise all commands to be behind cmd/ --- cmd/eeboxmltohocr/main.go | 135 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 cmd/eeboxmltohocr/main.go (limited to 'cmd/eeboxmltohocr') diff --git a/cmd/eeboxmltohocr/main.go b/cmd/eeboxmltohocr/main.go new file mode 100644 index 0000000..2761cd9 --- /dev/null +++ b/cmd/eeboxmltohocr/main.go @@ -0,0 +1,135 @@ +package main + +import ( + "bufio" + "flag" + "fmt" + "io" + "log" + "os" + "regexp" + "strconv" + "strings" +) + +// splitByPb is a split function for the scanner that splits by the +// '= 0 { + return i + 1, data[0:i], nil + } + // If we're at EOF, we have a final section, so just return the lot. + if atEOF { + return len(data), data, nil + } + // Request more data. + return 0, nil, nil +} + +type Page struct { + number int + text string +} + +func addPage(pgs *[]Page, number int, text string) { + added := 0 + for i, pg := range *pgs { + if pg.number == number { + (*pgs)[i].text = pg.text + text + added = 1 + } + } + if added == 0 { + newpg := Page{number, text} + *pgs = append(*pgs, newpg) + } +} + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: eeboxmltohocr in.xml outbase\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() < 2 { + flag.Usage() + os.Exit(1) + } + + f, err := os.Open(flag.Arg(0)) + defer f.Close() + if err != nil { + log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) + } + scanner := bufio.NewScanner(f) + + scanner.Split(splitByPb) + + var pgs []Page + + for scanner.Scan() { + t := scanner.Text() + r := regexp.MustCompile(`pb [^>]*facs="tcp:.*?:(.*?)"`).FindStringSubmatch(t) + if len(r) <= 1 { + continue + } + pgnum, err := strconv.Atoi(r[1]) + if err != nil { + continue + } + + content := t[strings.Index(t, ">")+1:] + ungap := regexp.MustCompile(`(?s)].+?`).ReplaceAllString(content, "") + unxml := regexp.MustCompile(`<.+?>`).ReplaceAllString(ungap, "") + + finaltxt := strings.TrimLeft(unxml, " \n") + if len(finaltxt) == 0 { + continue + } + + addPage(&pgs, pgnum, finaltxt) + } + + for _, pg := range pgs { + fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number - 1) + f, err := os.Create(fn) + if err != nil { + log.Fatalf("Could not create file %s: %v\n", fn, err) + } + defer f.Close() + + _, err = io.WriteString(f, hocrHeader + pg.text + hocrFooter) + if err != nil { + log.Fatalf("Could not write file %s: %v\n", fn, err) + } + } +} + +const hocrHeader = ` + + + + + + + + + +
+
+

+ + ` + +const hocrFooter = ` + +

+
+
+ +` -- cgit v1.2.1-24-ge1ad