summaryrefslogtreecommitdiff
path: root/cmd
diff options
context:
space:
mode:
authorNick White <git@njw.name>2020-02-27 17:45:16 +0000
committerNick White <git@njw.name>2020-02-27 17:45:16 +0000
commit3880414bbf2d6f2cd05e208abf919ae5ceabeddc (patch)
treedee30a151048de65a3e42cfdae7739c4502e148f /cmd
parentcda45588cfb796fdd2af27b1851685270df2c02b (diff)
Reorganise all commands to be behind cmd/
Diffstat (limited to 'cmd')
-rw-r--r--cmd/avg-lines/html.go61
-rw-r--r--cmd/avg-lines/main.go69
-rw-r--r--cmd/boxtotxt/main.go44
-rw-r--r--cmd/bucket-lines/bucket.go131
-rw-r--r--cmd/bucket-lines/main.go89
-rw-r--r--cmd/dehyphenate/main.go63
-rw-r--r--cmd/eeboxmltohocr/main.go135
-rw-r--r--cmd/fonttobytes/main.go49
-rw-r--r--cmd/hocrtotxt/main.go30
-rw-r--r--cmd/pare-gt/main.go147
-rw-r--r--cmd/pare-gt/main_test.go78
-rw-r--r--cmd/pgconf/main.go30
12 files changed, 926 insertions, 0 deletions
diff --git a/cmd/avg-lines/html.go b/cmd/avg-lines/html.go
new file mode 100644
index 0000000..97d8ec9
--- /dev/null
+++ b/cmd/avg-lines/html.go
@@ -0,0 +1,61 @@
+package main
+
+import (
+	"fmt"
+	"html"
+	"os"
+	"path/filepath"
+
+	"rescribe.xyz/utils/pkg/line"
+)
+
+// copylineimg writes the image of line l to a newly created file named fn.
+//
+// It returns an error instead of panicking when the line carries no image,
+// and it reports a failure to close the file so that a truncated image is
+// not silently accepted.
+func copylineimg(fn string, l line.Detail) (err error) {
+	if l.Img == nil {
+		return fmt.Errorf("no image available for line %s %s", l.OcrName, l.Name)
+	}
+
+	f, err := os.Create(fn)
+	if err != nil {
+		return err
+	}
+	// Keep the first error: a copy failure takes precedence over a close
+	// failure, but a close failure alone must still be reported.
+	defer func() {
+		if cerr := f.Close(); err == nil {
+			err = cerr
+		}
+	}()
+
+	return l.Img.CopyLineTo(f)
+}
+
+// htmlout writes an HTML report of lines into the directory dir, creating
+// the directory if necessary. The report is dir/index.html, a table with one
+// row per line showing its average confidence, its names, its image and its
+// text; each line image is copied into dir alongside the index.
+//
+// All text taken from the line details is HTML-escaped before being written,
+// as OCR output can contain characters such as '<' and '&'. The first error
+// encountered is returned, including a failure to close index.html.
+func htmlout(dir string, lines line.Details) (err error) {
+	if err = os.MkdirAll(dir, 0700); err != nil {
+		return err
+	}
+
+	f, err := os.Create(filepath.Join(dir, "index.html"))
+	if err != nil {
+		return err
+	}
+	defer func() {
+		if cerr := f.Close(); err == nil {
+			err = cerr
+		}
+	}()
+
+	_, err = fmt.Fprint(f, "<!DOCTYPE html><html><head><meta charset='UTF-8'><title></title>"+
+		"<style>td {border: 1px solid #444}</style></head><body>\n<table>\n")
+	if err != nil {
+		return err
+	}
+
+	for _, l := range lines {
+		// The image is stored next to index.html, so the bare file name
+		// doubles as the relative link target.
+		img := filepath.Base(l.OcrName) + "_" + l.Name + ".png"
+		if err = copylineimg(filepath.Join(dir, img), l); err != nil {
+			return err
+		}
+		_, err = fmt.Fprintf(f, "<tr>\n"+
+			"<td><h1>%.4f%%</h1></td>\n"+
+			"<td>%s %s</td>\n"+
+			"<td><img src='%s' width='100%%' /><br />%s</td>\n"+
+			"</tr>\n",
+			l.Avgconf,
+			html.EscapeString(l.OcrName),
+			html.EscapeString(l.Name),
+			html.EscapeString(img),
+			html.EscapeString(l.Text))
+		if err != nil {
+			return err
+		}
+	}
+
+	_, err = fmt.Fprint(f, "</table>\n</body></html>\n")
+	return err
+}
diff --git a/cmd/avg-lines/main.go b/cmd/avg-lines/main.go
new file mode 100644
index 0000000..f7cedab
--- /dev/null
+++ b/cmd/avg-lines/main.go
@@ -0,0 +1,69 @@
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "os"
+ "path/filepath"
+ "sort"
+
+ "rescribe.xyz/utils/pkg/hocr"
+ "rescribe.xyz/utils/pkg/line"
+ "rescribe.xyz/utils/pkg/prob"
+)
+
+// main reads line details from every .prob and .hocr file named on the
+// command line and reports the average confidence of each line, either as
+// plain text on standard output or, with -html, as an HTML report written
+// to the given directory. Lines are sorted unless -nosort is given.
+func main() {
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage: avg-lines [-html dir] [-nosort] [prob1] [hocr1] [prob2] [...]\n")
+		fmt.Fprintf(os.Stderr, "Prints a report of the average confidence for each line, sorted\n")
+		fmt.Fprintf(os.Stderr, "from worst to best.\n")
+		fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n")
+		fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n")
+		fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n")
+		fmt.Fprintf(os.Stderr, "option.\n\n")
+		flag.PrintDefaults()
+	}
+	var html = flag.String("html", "", "Output in html format to the specified directory")
+	var nosort = flag.Bool("nosort", false, "Don't sort lines by confidence")
+	flag.Parse()
+	if flag.NArg() < 1 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	lines := make(line.Details, 0)
+
+	for _, f := range flag.Args() {
+		var newlines line.Details
+		var err error
+		switch filepath.Ext(f) {
+		case ".prob":
+			newlines, err = prob.GetLineDetails(f)
+		case ".hocr":
+			newlines, err = hocr.GetLineDetails(f)
+		default:
+			log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f)
+			continue
+		}
+		if err != nil {
+			log.Fatal(err)
+		}
+
+		lines = append(lines, newlines...)
+	}
+
+	if !*nosort {
+		sort.Sort(lines)
+	}
+
+	if *html == "" {
+		for _, l := range lines {
+			fmt.Printf("%s %s: %.2f%%\n", l.OcrName, l.Name, l.Avgconf)
+		}
+		return
+	}
+
+	// A failed report must not look like success to the caller.
+	if err := htmlout(*html, lines); err != nil {
+		log.Fatal(err)
+	}
+}
diff --git a/cmd/boxtotxt/main.go b/cmd/boxtotxt/main.go
new file mode 100644
index 0000000..058eb05
--- /dev/null
+++ b/cmd/boxtotxt/main.go
@@ -0,0 +1,44 @@
+package main
+
+import (
+ "bufio"
+ "flag"
+ "fmt"
+ "log"
+ "os"
+ "strings"
+)
+
+// main prints the first character of every line of the box file named by
+// the single command line argument, skipping lines that start with a tab,
+// followed by a final newline.
+func main() {
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage: boxtotxt in.box\n")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+	if flag.NArg() != 1 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	f, err := os.Open(flag.Arg(0))
+	if err != nil {
+		log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err)
+	}
+	// Only defer the close once the open is known to have succeeded.
+	defer f.Close()
+
+	scanner := bufio.NewScanner(f)
+
+	for scanner.Scan() {
+		// Splitting on "" with a limit of 2 yields the first UTF-8
+		// character and the remainder, without exploding the whole line.
+		s := strings.SplitN(scanner.Text(), "", 2)
+		if len(s) < 1 || s[0] == "\t" {
+			continue
+		}
+		fmt.Printf("%s", s[0])
+	}
+	// Scan returns false on read errors as well as at EOF, so the two
+	// cases have to be told apart here.
+	if err := scanner.Err(); err != nil {
+		log.Fatalf("Could not read file %s: %v\n", flag.Arg(0), err)
+	}
+
+	fmt.Printf("\n")
+}
diff --git a/cmd/bucket-lines/bucket.go b/cmd/bucket-lines/bucket.go
new file mode 100644
index 0000000..7b6fc4f
--- /dev/null
+++ b/cmd/bucket-lines/bucket.go
@@ -0,0 +1,131 @@
+package main
+
+import (
+ "fmt"
+ "io"
+ "os"
+ "path/filepath"
+ "sort"
+ "strconv"
+
+ "rescribe.xyz/utils/pkg/line"
+)
+
+// BucketSpec names a bucket and gives the minimum line confidence needed
+// for a line to be placed in it.
+type BucketSpec struct {
+	Min  float64
+	Name string
+}
+
+// BucketSpecs is a list of bucket specifications. It implements
+// sort.Interface, ordering the buckets by ascending Min.
+type BucketSpecs []BucketSpec
+
+// Len returns the number of bucket specifications.
+func (b BucketSpecs) Len() int {
+	return len(b)
+}
+
+// Swap exchanges the specifications at positions i and j.
+func (b BucketSpecs) Swap(i, j int) {
+	tmp := b[i]
+	b[i] = b[j]
+	b[j] = tmp
+}
+
+// Less reports whether the specification at i has a lower Min than the
+// one at j.
+func (b BucketSpecs) Less(i, j int) bool {
+	return b[j].Min > b[i].Min
+}
+
+// BucketStat records how many lines were placed into the bucket with the
+// given name.
+type BucketStat struct {
+	name string
+	num  int
+}
+
+// BucketStats is a list of per-bucket counts. It implements sort.Interface,
+// ordering the entries by ascending line count.
+type BucketStats []BucketStat
+
+// Len returns the number of entries.
+func (b BucketStats) Len() int {
+	return len(b)
+}
+
+// Swap exchanges the entries at positions i and j.
+func (b BucketStats) Swap(i, j int) {
+	tmp := b[i]
+	b[i] = b[j]
+	b[j] = tmp
+}
+
+// Less reports whether the entry at i counts fewer lines than the one at j.
+func (b BucketStats) Less(i, j int) bool {
+	return b[j].num > b[i].num
+}
+
+// writeLineFile creates the file name, fills it by calling write, and closes
+// it again. It returns the first error encountered, including one from
+// closing the file, so that a short write is never reported as success.
+func writeLineFile(name string, write func(f *os.File) error) error {
+	f, err := os.Create(name)
+	if err != nil {
+		return err
+	}
+	if err := write(f); err != nil {
+		f.Close()
+		return err
+	}
+	return f.Close()
+}
+
+// bucketLine copies the image and text for a line into a directory based on
+// the line confidence, as defined by buckets.
+//
+// The bucket chosen is the last entry of buckets whose Min does not exceed
+// the line's Avgconf, so buckets is expected to be sorted by ascending Min.
+// The files are written to dirname/<bucket>/ as a .png and .txt pair. The
+// name of the chosen bucket is returned; it is empty, and nothing is
+// written, if no bucket matched.
+func bucketLine(l line.Detail, buckets BucketSpecs, dirname string) (string, error) {
+	var bucket string
+	for _, b := range buckets {
+		if l.Avgconf >= b.Min {
+			bucket = b.Name
+		}
+	}
+	if bucket == "" {
+		return bucket, nil
+	}
+
+	// The confidence is appended to the file name with its first two
+	// characters dropped (presumably the leading "0." of a value below 1).
+	avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64)
+	if len(avgstr) > 2 {
+		avgstr = avgstr[2:]
+	}
+
+	dir := filepath.Join(dirname, bucket)
+	base := filepath.Join(dir, l.OcrName+"_"+l.Name+"_"+avgstr)
+
+	if err := os.MkdirAll(dir, 0700); err != nil {
+		return bucket, err
+	}
+
+	err := writeLineFile(base+".png", func(f *os.File) error {
+		return l.Img.CopyLineTo(f)
+	})
+	if err != nil {
+		return bucket, err
+	}
+
+	err = writeLineFile(base+".txt", func(f *os.File) error {
+		_, werr := io.WriteString(f, l.Text)
+		return werr
+	})
+	return bucket, err
+}
+
+// BucketUp copies line images and text into directories based on their
+// confidence, as defined by buckets, and returns statistics of where lines
+// went in the process.
+//
+// Both lines and buckets are sorted in place before processing. The
+// returned statistics hold one entry per bucket name that received at least
+// one line, in the order the names were first seen; lines that matched no
+// bucket are counted under the empty name. On error no statistics are
+// returned.
+func BucketUp(lines line.Details, buckets BucketSpecs, dirname string) (BucketStats, error) {
+	var stats BucketStats
+	// index maps a bucket name to its position in stats. A direct lookup
+	// is used because stats is not kept in any order a binary search could
+	// rely on.
+	index := make(map[string]int)
+
+	sort.Sort(lines)
+	sort.Sort(buckets)
+	for _, l := range lines {
+		bname, err := bucketLine(l, buckets, dirname)
+		if err != nil {
+			return nil, err
+		}
+
+		i, ok := index[bname]
+		if !ok {
+			i = len(stats)
+			index[bname] = i
+			stats = append(stats, BucketStat{name: bname})
+		}
+		stats[i].num++
+	}
+
+	return stats, nil
+}
+
+// Prints statistics of where lines went when bucketing
+func PrintBucketStats(w io.Writer, stats BucketStats) {
+ var total int
+ for _, s := range stats {
+ total += s.num
+ }
+
+ fmt.Fprintf(w, "Copied %d lines\n", total)
+ fmt.Fprintf(w, "---------------------------------\n")
+ sort.Sort(stats)
+ for _, s := range stats {
+ fmt.Fprintf(w, "Lines in %7s: %2d%%\n", s.name, 100*s.num/total)
+ }
+}
diff --git a/cmd/bucket-lines/main.go b/cmd/bucket-lines/main.go
new file mode 100644
index 0000000..af81b44
--- /dev/null
+++ b/cmd/bucket-lines/main.go
@@ -0,0 +1,89 @@
+package main
+
+import (
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "log"
+ "os"
+ "path/filepath"
+
+ "rescribe.xyz/utils/pkg/hocr"
+ "rescribe.xyz/utils/pkg/line"
+ "rescribe.xyz/utils/pkg/prob"
+)
+
+// main gathers line details from the .hocr and .prob files named on the
+// command line, keeps the lines that have an image, copies them into
+// confidence buckets below the -d directory and prints statistics of where
+// the lines went. The default buckets can be replaced with -s.
+func main() {
+	// Default buckets, used unless a specs file is given.
+	b := BucketSpecs{
+		{Min: 0, Name: "bad"},
+		{Min: 0.95, Name: "95to98"},
+		{Min: 0.98, Name: "98plus"},
+	}
+
+	dir := flag.String("d", "buckets", "Directory to store the buckets")
+	specs := flag.String("s", "", "JSON file describing specs to bucket into")
+	flag.Usage = func() {
+		fmt.Fprint(os.Stderr, "Usage: bucket-lines [-d dir] [-s specs.json] [hocr1] [prob1] [hocr2] [...]\n"+
+			"Copies image-text line pairs into different directories according\n"+
+			"to the average character probability for the line.\n"+
+			"Both .hocr and .prob files can be processed.\n"+
+			"For .hocr files, the x_wconf data is used to calculate confidence.\n"+
+			"The .prob files are generated using ocropy-rpred's --probabilities\n"+
+			"option.\n"+
+			"The .prob and .hocr files are assumed to be in the same directory\n"+
+			"as the line's image and text files.\n\n")
+		flag.PrintDefaults()
+		fmt.Fprint(os.Stderr, "\nAn example specs.json file would be the following:\n"+
+			"[{\"min\": 0, \"name\": \"terrible\"}, {\"min\": 0.80, \"name\": \"ok\"}, {\"min\": 0.98, \"name\": \"great\"}]\n")
+	}
+	flag.Parse()
+	if flag.NArg() < 1 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	if *specs != "" {
+		js, err := ioutil.ReadFile(*specs)
+		if err != nil {
+			log.Fatal(err)
+		}
+		if err := json.Unmarshal(js, &b); err != nil {
+			log.Fatal(err)
+		}
+	}
+
+	var lines line.Details
+	for _, f := range flag.Args() {
+		var (
+			newlines line.Details
+			err      error
+		)
+		switch filepath.Ext(f) {
+		case ".hocr":
+			newlines, err = hocr.GetLineDetails(f)
+		case ".prob":
+			newlines, err = prob.GetLineDetails(f)
+		default:
+			log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f)
+			continue
+		}
+		if err != nil {
+			log.Fatal(err)
+		}
+
+		// Lines without an image cannot be copied into a bucket.
+		for i := range newlines {
+			if newlines[i].Img == nil {
+				continue
+			}
+			lines = append(lines, newlines[i])
+		}
+	}
+
+	stats, err := BucketUp(lines, b, *dir)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	PrintBucketStats(os.Stdout, stats)
+}
diff --git a/cmd/dehyphenate/main.go b/cmd/dehyphenate/main.go
new file mode 100644
index 0000000..b2bd6f9
--- /dev/null
+++ b/cmd/dehyphenate/main.go
@@ -0,0 +1,63 @@
+package main
+
+import (
+ "encoding/xml"
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "log"
+ "os"
+
+ "rescribe.xyz/utils/pkg/hocr"
+)
+
+// BUGS:
+// - loses all elements not captured in hocr structure such as html headings
+// might be best to copy the header and footer separately and put the hocr in between, but would still need to ensure all elements are captured
+// - loses any formatting; doesn't need to be identical, but e.g. linebreaks after elements would be handy
+// - need to handle OcrChar
+
+// main reads the hocr file given as the first argument, joins words that
+// were hyphenated across a line break, and writes the result as XML to the
+// file given as the second argument.
+//
+// A word is joined when it is the last word of a line, has no character
+// level data and ends in '-': the hyphen is removed, the text of the first
+// word of the following line is appended to it, and that following word is
+// emptied. Lines without words, and a hyphen with no following word to join
+// to, are left untouched.
+func main() {
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage: dehyphenate hocrin hocrout\n")
+		fmt.Fprintf(os.Stderr, "Dehyphenates a hocr file.\n")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+	if flag.NArg() != 2 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	in, err := ioutil.ReadFile(flag.Arg(0))
+	if err != nil {
+		log.Fatalf("Error reading %s: %v", flag.Arg(0), err)
+	}
+	h, err := hocr.Parse(in)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	for i := range h.Lines {
+		n := len(h.Lines[i].Words)
+		if n == 0 {
+			continue
+		}
+		w := h.Lines[i].Words[n-1]
+		if len(w.Chars) != 0 {
+			log.Printf("TODO: handle OcrChar")
+			continue
+		}
+		if len(w.Text) == 0 || w.Text[len(w.Text)-1] != '-' {
+			continue
+		}
+		// There must be a following line with at least one word to take
+		// the second half of the hyphenated word from.
+		if i+1 >= len(h.Lines) || len(h.Lines[i+1].Words) == 0 {
+			continue
+		}
+		h.Lines[i].Words[n-1].Text = w.Text[:len(w.Text)-1] + h.Lines[i+1].Words[0].Text
+		h.Lines[i+1].Words[0].Text = ""
+	}
+
+	f, err := os.Create(flag.Arg(1))
+	if err != nil {
+		log.Fatalf("Error creating file %s: %v", flag.Arg(1), err)
+	}
+	e := xml.NewEncoder(f)
+	err = e.Encode(h)
+	if err != nil {
+		log.Fatalf("Error encoding XML: %v", err)
+	}
+	// Close explicitly so that a failure to flush the output is reported.
+	if err := f.Close(); err != nil {
+		log.Fatalf("Error closing file %s: %v", flag.Arg(1), err)
+	}
+}
diff --git a/cmd/eeboxmltohocr/main.go b/cmd/eeboxmltohocr/main.go
new file mode 100644
index 0000000..2761cd9
--- /dev/null
+++ b/cmd/eeboxmltohocr/main.go
@@ -0,0 +1,135 @@
+package main
+
+import (
+ "bufio"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "regexp"
+ "strconv"
+ "strings"
+)
+
+// splitByPb is a bufio.SplitFunc that cuts the input at each occurrence of
+// the '<pb' token. The token returned is the data before the match; only
+// the '<' of the match is consumed, so the next token starts with 'pb'.
+// Once the input is exhausted whatever remains is returned as the final
+// token.
+func splitByPb(data []byte, atEOF bool) (advance int, token []byte, err error) {
+	if len(data) == 0 && atEOF {
+		return 0, nil, nil
+	}
+
+	idx := strings.Index(string(data), "<pb")
+	switch {
+	case idx >= 0:
+		return idx + 1, data[:idx], nil
+	case atEOF:
+		// No further markers: the rest of the input is the last section.
+		return len(data), data, nil
+	}
+
+	// No marker yet and more input may follow: ask for more data.
+	return 0, nil, nil
+}
+
+// Page holds the text collected for one page number.
+type Page struct {
+	number int
+	text   string
+}
+
+// addPage appends text to every page in pgs that already has the given
+// number. If there is no such page, a new page holding text is added to the
+// end of pgs.
+func addPage(pgs *[]Page, number int, text string) {
+	pages := *pgs
+	found := false
+	for i := range pages {
+		if pages[i].number != number {
+			continue
+		}
+		pages[i].text += text
+		found = true
+	}
+	if found {
+		return
+	}
+	*pgs = append(pages, Page{number: number, text: text})
+}
+
+// main splits the XML file given as the first argument at its '<pb' page
+// break markers, strips gaps and markup from the text of each page, and
+// writes one hocr file per page number, named from the second argument, the
+// page number minus one, and a .hocr suffix. Text from several sections
+// with the same page number ends up in the same file.
+func main() {
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage: eeboxmltohocr in.xml outbase\n")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+	if flag.NArg() < 2 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	f, err := os.Open(flag.Arg(0))
+	if err != nil {
+		log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err)
+	}
+	// Only defer the close once the open is known to have succeeded.
+	defer f.Close()
+
+	// Compile the patterns once rather than for every page.
+	pbRe := regexp.MustCompile(`pb [^>]*facs="tcp:.*?:(.*?)"`)
+	gapRe := regexp.MustCompile(`(?s)<gap[ >].+?</gap>`)
+	tagRe := regexp.MustCompile(`<.+?>`)
+
+	// A section between two page breaks can be larger than the scanner's
+	// default token limit, so allow the buffer to grow well beyond it.
+	const maxSectionSize = 16 * 1024 * 1024
+	scanner := bufio.NewScanner(f)
+	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxSectionSize)
+	scanner.Split(splitByPb)
+
+	var pgs []Page
+
+	for scanner.Scan() {
+		t := scanner.Text()
+		r := pbRe.FindStringSubmatch(t)
+		if len(r) <= 1 {
+			continue
+		}
+		pgnum, err := strconv.Atoi(r[1])
+		if err != nil {
+			continue
+		}
+
+		// Everything after the end of the pb tag is the page content.
+		content := t[strings.Index(t, ">")+1:]
+		ungap := gapRe.ReplaceAllString(content, "")
+		unxml := tagRe.ReplaceAllString(ungap, "")
+
+		finaltxt := strings.TrimLeft(unxml, " \n")
+		if len(finaltxt) == 0 {
+			continue
+		}
+
+		addPage(&pgs, pgnum, finaltxt)
+	}
+	// Scan also stops on read errors and oversized sections; without this
+	// check the remaining pages would be dropped silently.
+	if err := scanner.Err(); err != nil {
+		log.Fatalf("Could not read file %s: %v\n", flag.Arg(0), err)
+	}
+
+	for _, pg := range pgs {
+		fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number-1)
+		out, err := os.Create(fn)
+		if err != nil {
+			log.Fatalf("Could not create file %s: %v\n", fn, err)
+		}
+
+		// Close each file as soon as it is written, rather than keeping
+		// every page open until exit, and treat a failed close as a
+		// failed write.
+		_, err = io.WriteString(out, hocrHeader+pg.text+hocrFooter)
+		if cerr := out.Close(); err == nil {
+			err = cerr
+		}
+		if err != nil {
+			log.Fatalf("Could not write file %s: %v\n", fn, err)
+		}
+	}
+}
+
+// hocrHeader is the fixed text written before the text of each page. It
+// contains the XML and XHTML preamble and opens a single page, area,
+// paragraph, line and word element, all with the same fixed bounding box.
+const hocrHeader = `<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <title></title>
+ <meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
+ <meta name='ocr-system' content='tesseract 4.0.0' />
+ <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
+ </head>
+ <body>
+ <div class='ocr_page' id='page_1' title='bbox 0 0 600 1200'>
+ <div class='ocr_carea' id='block_1_1' title="bbox 0 0 600 1200">
+ <p class='ocr_par' id='par_1_1' lang='lat' title="bbox 0 0 600 1200">
+ <span class='ocr_line' id='line_1_1' title="bbox 0 0 600 1200"
+>
+ <span class='ocrx_word' id='word_1_1' title='bbox 0 0 600 1200'>`
+
+// hocrFooter is the fixed text written after the text of each page. It
+// closes the elements opened by hocrHeader.
+const hocrFooter = `</span>
+ </span>
+ </p>
+ </div>
+ </div>
+ </body>
+</html>`
diff --git a/cmd/fonttobytes/main.go b/cmd/fonttobytes/main.go
new file mode 100644
index 0000000..8310e0f
--- /dev/null
+++ b/cmd/fonttobytes/main.go
@@ -0,0 +1,49 @@
+package main
+
+import (
+ "bytes"
+ "compress/zlib"
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "log"
+ "os"
+)
+
+// main reads the font file named by the single command line argument,
+// compresses it with zlib and prints the compressed bytes to standard
+// output as a Go []byte literal.
+func main() {
+	flag.Usage = func() {
+		fmt.Fprintln(flag.CommandLine.Output(), "Usage: fonttobytes font.ttf")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+
+	if flag.NArg() != 1 {
+		flag.Usage()
+		return
+	}
+
+	f, err := os.Open(flag.Arg(0))
+	if err != nil {
+		log.Fatalln("Failed to open file", flag.Arg(0), err)
+	}
+	defer f.Close()
+	fontbytes, err := ioutil.ReadAll(f)
+	if err != nil {
+		log.Fatalln("Failed to read file", flag.Arg(0), err)
+	}
+
+	var compressed bytes.Buffer
+	w := zlib.NewWriter(&compressed)
+	if _, err := w.Write(fontbytes); err != nil {
+		log.Fatalln("Failed to compress file", flag.Arg(0), err)
+	}
+	// Close flushes the remaining compressed data, so its error matters.
+	if err := w.Close(); err != nil {
+		log.Fatalln("Failed to compress file", flag.Arg(0), err)
+	}
+
+	// This could be done with %+v in printf, but using the decimal rather than
+	// hex output saves quite a few bytes, so we do that instead.
+	// The literal is assembled in memory and written in one go, rather
+	// than issuing a separate write to stdout for every byte.
+	var out bytes.Buffer
+	out.WriteString("[]byte{")
+	for i, b := range compressed.Bytes() {
+		if i > 0 {
+			out.WriteString(", ")
+		}
+		fmt.Fprintf(&out, "%d", b)
+	}
+	out.WriteString("}\n")
+	if _, err := os.Stdout.Write(out.Bytes()); err != nil {
+		log.Fatalln("Failed to write output", err)
+	}
+}
diff --git a/cmd/hocrtotxt/main.go b/cmd/hocrtotxt/main.go
new file mode 100644
index 0000000..6716a9e
--- /dev/null
+++ b/cmd/hocrtotxt/main.go
@@ -0,0 +1,30 @@
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "os"
+
+ "rescribe.xyz/utils/pkg/hocr"
+)
+
+// main prints the text of the hocr file named by the single command line
+// argument, followed by a newline.
+func main() {
+	flag.Usage = func() {
+		out := os.Stderr
+		fmt.Fprintf(out, "Usage: hocrtotxt hocrfile\n")
+		fmt.Fprintf(out, "Prints the text from a hocr file.\n")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+
+	args := flag.Args()
+	if len(args) != 1 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	text, err := hocr.GetText(args[0])
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	fmt.Printf("%s\n", text)
+}
diff --git a/cmd/pare-gt/main.go b/cmd/pare-gt/main.go
new file mode 100644
index 0000000..a4d9600
--- /dev/null
+++ b/cmd/pare-gt/main.go
@@ -0,0 +1,147 @@
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "math/rand"
+ "os"
+ "path"
+ "path/filepath"
+ "sort"
+ "strings"
+)
+
+// usage is the help text printed by flag.Usage, ahead of the flag defaults.
+const usage = `Usage: pare-gt [-n num] gtdir movedir
+
+Moves some of the ground truth from gt-dir into movedir,
+ensuring that the same proportions of each ground truth
+source are represented in the moved section. Proportion of
+ground truth source is calculated by taking the prefix of
+the filename up to the first '-' character.
+`
+
+// Prefixes maps a prefix string to the list of file names (without their
+// extension) that carry that prefix.
+type Prefixes = map[string][]string
+
+// walker returns a filepath.WalkFunc which records every .txt file it is
+// given in prefixes. The file is stored, with its extension removed, under
+// the part of its base name before the first '-', or under the empty
+// prefix if the base name has no '-'. Directories and other files are
+// ignored, and any error passed in by the walk is returned unchanged.
+func walker(prefixes *Prefixes) filepath.WalkFunc {
+	return func(fpath string, info os.FileInfo, err error) error {
+		switch {
+		case err != nil:
+			return err
+		case info.IsDir():
+			return nil
+		}
+
+		ext := path.Ext(fpath)
+		if ext != ".txt" {
+			return nil
+		}
+
+		var prefix string
+		base := path.Base(fpath)
+		if cut := strings.Index(base, "-"); cut > -1 {
+			prefix = base[:cut]
+		}
+
+		found := *prefixes
+		found[prefix] = append(found[prefix], strings.TrimSuffix(fpath, ext))
+		return nil
+	}
+}
+
+// inStrSlice reports whether s is one of the elements of sl.
+func inStrSlice(sl []string, s string) bool {
+	for i := 0; i < len(sl); i++ {
+		if sl[i] == s {
+			return true
+		}
+	}
+	return false
+}
+
+// samplePrefixes picks random file names from each prefix, in proportion to
+// how much of the whole set that prefix makes up, aiming for perctosample%
+// of all files in total.
+//
+// Prefixes holding a single file are skipped. Every other prefix
+// contributes at least one file and at most all but one of its files. The
+// file lists in prefixes are sorted in place and the prefixes are visited
+// in sorted order, so the sequence of random draws does not depend on map
+// iteration order.
+func samplePrefixes(perctosample int, prefixes Prefixes) (filestomove []string) {
+	total := 0
+	keys := []string(nil)
+	for key, files := range prefixes {
+		// needed for determinism
+		sort.Strings(files)
+		keys = append(keys, key)
+		total += len(files)
+	}
+	// This ensures the map is looped over deterministically
+	sort.Strings(keys)
+
+	sample := (total * perctosample) / 100
+
+	for _, key := range keys {
+		files := prefixes[key]
+		n := len(files)
+		if n == 1 {
+			continue
+		}
+
+		numtoget := int(float64(sample) / float64(total) * float64(n))
+		if numtoget >= n {
+			numtoget = n - 1
+		}
+		if numtoget < 1 {
+			numtoget = 1
+		}
+
+		for picked := 0; picked < numtoget; picked++ {
+			// Keep drawing until a file that has not been chosen yet
+			// comes up.
+			var selected string
+			for {
+				selected = files[rand.Int()%n]
+				if !inStrSlice(filestomove, selected) {
+					break
+				}
+			}
+			filestomove = append(filestomove, selected)
+		}
+	}
+
+	return filestomove
+}
+
+func main() {
+ flag.Usage = func() {
+ fmt.Fprintf(flag.CommandLine.Output(), usage)
+ flag.PrintDefaults()
+ }
+ numtopare := flag.Int("n", 10, "Percentage of the ground truth to pare away.")
+ flag.Parse()
+ if flag.NArg() != 2 {
+ flag.Usage()
+ os.Exit(1)
+ }
+
+ for _, d := range flag.Args() {
+ info, err := os.Stat(d)
+ if err != nil || !info.IsDir() {
+ log.Fatalln("Error accessing directory", flag.Arg(0), err)
+ }
+ }
+
+ var prefixes Prefixes
+ prefixes = make(Prefixes)
+ err := filepath.Walk(flag.Arg(0), walker(&prefixes))
+ if err != nil {
+ log.Fatalln("Failed to walk", flag.Arg(0), err)
+ }
+
+ filestomove := samplePrefixes(*numtopare, prefixes)
+
+ for _, f := range filestomove {
+ fmt.Println("Moving ground truth", f)
+ b := path.Base(f)
+ for _, ext := range []string{".txt", ".png"} {
+ err = os.Rename(f+ext, path.Join(flag.Arg(1), b+ext))
+ if err != nil {
+ log.Fatalln("Error moving file", f+ext, err)
+ }
+ }
+ }
+}
diff --git a/cmd/pare-gt/main_test.go b/cmd/pare-gt/main_test.go
new file mode 100644
index 0000000..c381a86
--- /dev/null
+++ b/cmd/pare-gt/main_test.go
@@ -0,0 +1,78 @@
+package main
+
+import (
+ "fmt"
+ "testing"
+)
+
+// TestSamplePrefixes checks that samplePrefixes returns the expected files,
+// in the expected order, for a fixed set of prefixes at several sampling
+// percentages.
+//
+// NOTE(review): the expected values appear to depend on the random source
+// used by samplePrefixes producing a fixed sequence, and on the cases being
+// run in this order, since each call continues from where the previous one
+// left off - confirm this holds for the Go version in use.
+func TestSamplePrefixes(t *testing.T) {
+ // Three ground truth sources of different sizes, keyed by prefix.
+ prefixes := Prefixes{
+ "1471-Orthographia": {
+ "1471-Orthographia-Tortellius_00001.txt",
+ "1471-Orthographia-Tortellius_00002.txt",
+ "1471-Orthographia-Tortellius_00003.txt",
+ "1471-Orthographia-Tortellius_00004.txt",
+ "1471-Orthographia-Tortellius_00005.txt",
+ "1471-Orthographia-Tortellius_00006.txt",
+ "1471-Orthographia-Tortellius_00007.txt",
+ "1471-Orthographia-Tortellius_00008.txt",
+ "1471-Orthographia-Tortellius_00009.txt",
+ "1471-Orthographia-Tortellius_000010.txt",
+ "1471-Orthographia-Tortellius_000011.txt",
+ "1471-Orthographia-Tortellius_000012.txt",
+ "1471-Orthographia-Tortellius_000013.txt",
+ "1471-Orthographia-Tortellius_000014.txt",
+ "1471-Orthographia-Tortellius_000015.txt",
+ "1471-Orthographia-Tortellius_000016.txt",
+ "1471-Orthographia-Tortellius_000017.txt",
+ "1471-Orthographia-Tortellius_000018.txt",
+ "1471-Orthographia-Tortellius_000019.txt",
+ "1471-Orthographia-Tortellius_000020.txt",
+ },
+ "Kallimachos_1509": {
+ "Kallimachos_1509-ShipOfFools-Barclay_00121.txt",
+ "Kallimachos_1509-ShipOfFools-Barclay_00122.txt",
+ "Kallimachos_1509-ShipOfFools-Barclay_00123.txt",
+ "Kallimachos_1509-ShipOfFools-Barclay_00124.txt",
+ "Kallimachos_1509-ShipOfFools-Barclay_00125.txt",
+ "Kallimachos_1509-ShipOfFools-Barclay_00126.txt",
+ },
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4": {
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_12_49.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_1_415.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_14_6628571428571429.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_16_865.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_17_62.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_18_6366666666666666.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt",
+ },
+ }
+
+ // Each case gives a percentage to sample and the exact files, in order,
+ // that samplePrefixes is expected to return for it.
+ cases := []struct {
+ perc int
+ expected []string
+ }{
+ {1, []string{"1471-Orthographia-Tortellius_000019.txt", "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt"}},
+ {10, []string{"1471-Orthographia-Tortellius_00002.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt"}},
+ {20, []string{"1471-Orthographia-Tortellius_00008.txt", "1471-Orthographia-Tortellius_000017.txt", "1471-Orthographia-Tortellius_00006.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt"}},
+ }
+
+ for _, c := range cases {
+ t.Run(fmt.Sprintf("%d%%", c.perc), func(t *testing.T) {
+ actual := samplePrefixes(c.perc, prefixes)
+ // Compare lengths first so the element comparison below cannot
+ // index past the end of actual.
+ if len(c.expected) != len(actual) {
+ t.Fatalf("Number of files picked (%d) differs from expected (%d):\nExpected: %s\nActual: %s\n", len(actual), len(c.expected), c.expected, actual)
+ return
+ }
+ for i, v := range c.expected {
+ if actual[i] != v {
+ t.Fatalf("Difference in expected and actual files (first difference is in index %d of actual):\n\nExpected:\n%s\n\nActual:\n%s\n", i, c.expected, actual)
+ }
+ }
+ })
+ }
+}
diff --git a/cmd/pgconf/main.go b/cmd/pgconf/main.go
new file mode 100644
index 0000000..dbc6af8
--- /dev/null
+++ b/cmd/pgconf/main.go
@@ -0,0 +1,30 @@
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "os"
+
+ "rescribe.xyz/utils/pkg/hocr"
+)
+
+// main prints the average confidence of the hocr file named by the single
+// command line argument, rounded to a whole number.
+func main() {
+	flag.Usage = func() {
+		out := os.Stderr
+		fmt.Fprintf(out, "Usage: pgconf hocr\n")
+		fmt.Fprintf(out, "Prints the total confidence for a page, as an average of the confidence of each word.\n")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+
+	args := flag.Args()
+	if len(args) != 1 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	name := args[0]
+	avg, err := hocr.GetAvgConf(name)
+	if err != nil {
+		log.Fatalf("Error retreiving confidence for %s: %v\n", name, err)
+	}
+
+	fmt.Printf("%0.0f\n", avg)
+}