From 3880414bbf2d6f2cd05e208abf919ae5ceabeddc Mon Sep 17 00:00:00 2001 From: Nick White Date: Thu, 27 Feb 2020 17:45:16 +0000 Subject: Reorganise all commands to be behind cmd/ --- cmd/avg-lines/html.go | 61 +++++++++++++++++++ cmd/avg-lines/main.go | 69 +++++++++++++++++++++ cmd/boxtotxt/main.go | 44 ++++++++++++++ cmd/bucket-lines/bucket.go | 131 ++++++++++++++++++++++++++++++++++++++++ cmd/bucket-lines/main.go | 89 +++++++++++++++++++++++++++ cmd/dehyphenate/main.go | 63 +++++++++++++++++++ cmd/eeboxmltohocr/main.go | 135 +++++++++++++++++++++++++++++++++++++++++ cmd/fonttobytes/main.go | 49 +++++++++++++++ cmd/hocrtotxt/main.go | 30 +++++++++ cmd/pare-gt/main.go | 147 +++++++++++++++++++++++++++++++++++++++++++++ cmd/pare-gt/main_test.go | 78 ++++++++++++++++++++++++ cmd/pgconf/main.go | 30 +++++++++ 12 files changed, 926 insertions(+) create mode 100644 cmd/avg-lines/html.go create mode 100644 cmd/avg-lines/main.go create mode 100644 cmd/boxtotxt/main.go create mode 100644 cmd/bucket-lines/bucket.go create mode 100644 cmd/bucket-lines/main.go create mode 100644 cmd/dehyphenate/main.go create mode 100644 cmd/eeboxmltohocr/main.go create mode 100644 cmd/fonttobytes/main.go create mode 100644 cmd/hocrtotxt/main.go create mode 100644 cmd/pare-gt/main.go create mode 100644 cmd/pare-gt/main_test.go create mode 100644 cmd/pgconf/main.go (limited to 'cmd') diff --git a/cmd/avg-lines/html.go b/cmd/avg-lines/html.go new file mode 100644 index 0000000..97d8ec9 --- /dev/null +++ b/cmd/avg-lines/html.go @@ -0,0 +1,61 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + + "rescribe.xyz/utils/pkg/line" +) + +func copylineimg(fn string, l line.Detail) error { + f, err := os.Create(fn) + if err != nil { + return err + } + defer f.Close() + + return l.Img.CopyLineTo(f) +} + +func htmlout(dir string, lines line.Details) error { + err := os.MkdirAll(dir, 0700) + if err != nil { + return err + } + + fn := filepath.Join(dir, "index.html") + f, err := os.Create(fn) + if err != nil { + return err + } + defer f.Close() + + _, err = fmt.Fprintf(f, ""+ + "\n\n") + if err != nil { + return err + } + for _, l := range lines { + fn = filepath.Base(l.OcrName) + "_" + l.Name + ".png" + err = copylineimg(filepath.Join(dir, fn), l) + if err != nil { + return err + } + _, err = fmt.Fprintf(f, "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n", + l.Avgconf, l.OcrName, l.Name, fn, l.Text) + if err != nil { + return err + } + } + _, err = fmt.Fprintf(f, "

%.4f%%

%s %s
%s
\n\n") + if err != nil { + return err + } + + return nil +} diff --git a/cmd/avg-lines/main.go b/cmd/avg-lines/main.go new file mode 100644 index 0000000..f7cedab --- /dev/null +++ b/cmd/avg-lines/main.go @@ -0,0 +1,69 @@ +package main + +import ( + "flag" + "fmt" + "log" + "os" + "path/filepath" + "sort" + + "rescribe.xyz/utils/pkg/hocr" + "rescribe.xyz/utils/pkg/line" + "rescribe.xyz/utils/pkg/prob" +) + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: avg-lines [-html dir] [-nosort] [prob1] [hocr1] [prob2] [...]\n") + fmt.Fprintf(os.Stderr, "Prints a report of the average confidence for each line, sorted\n") + fmt.Fprintf(os.Stderr, "from worst to best.\n") + fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n") + fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n") + fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") + fmt.Fprintf(os.Stderr, "option.\n\n") + flag.PrintDefaults() + } + var html = flag.String("html", "", "Output in html format to the specified directory") + var nosort = flag.Bool("nosort", false, "Don't sort lines by confidence") + flag.Parse() + if flag.NArg() < 1 { + flag.Usage() + os.Exit(1) + } + + var err error + lines := make(line.Details, 0) + + for _, f := range flag.Args() { + var newlines line.Details + switch ext := filepath.Ext(f); ext { + case ".prob": + newlines, err = prob.GetLineDetails(f) + case ".hocr": + newlines, err = hocr.GetLineDetails(f) + default: + log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) + continue + } + if err != nil { + log.Fatal(err) + } + + for _, l := range newlines { + lines = append(lines, l) + } + } + + if *nosort == false { + sort.Sort(lines) + } + + if *html == "" { + for _, l := range lines { + fmt.Printf("%s %s: %.2f%%\n", l.OcrName, l.Name, l.Avgconf) + } + } else { + htmlout(*html, lines) + } +} diff --git a/cmd/boxtotxt/main.go b/cmd/boxtotxt/main.go new file mode 100644 index 0000000..058eb05 --- /dev/null +++ b/cmd/boxtotxt/main.go @@ -0,0 +1,44 @@ +package main + +import ( + "bufio" + "flag" + "fmt" + "log" + "os" + "strings" +) + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: boxtotxt in.box\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() != 1 { + flag.Usage() + os.Exit(1) + } + + f, err := os.Open(flag.Arg(0)) + defer f.Close() + if err != nil { + log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) + } + + scanner := bufio.NewScanner(f) + + for scanner.Scan() { + t := scanner.Text() + s := strings.Split(t, "") + if len(s) < 1 { + continue + } + if s[0] == "\t" { + continue + } + fmt.Printf("%s", s[0]) + } + + fmt.Printf("\n") +} diff --git a/cmd/bucket-lines/bucket.go b/cmd/bucket-lines/bucket.go new file mode 100644 index 0000000..7b6fc4f --- /dev/null +++ b/cmd/bucket-lines/bucket.go @@ -0,0 +1,131 @@ +package main + +import ( + "fmt" + "io" + "os" + "path/filepath" + "sort" + "strconv" + + "rescribe.xyz/utils/pkg/line" +) + +type BucketSpec struct { + Min float64 + Name string +} +type BucketSpecs []BucketSpec + +func (b BucketSpecs) Len() int { return len(b) } +func (b BucketSpecs) Swap(i, j int) { b[i], b[j] = b[j], b[i] } +func (b BucketSpecs) Less(i, j int) bool { return b[i].Min < b[j].Min } + +type BucketStat struct { + name string + num int +} +type BucketStats []BucketStat + +func (b BucketStats) Len() int { return len(b) } +func (b BucketStats) Swap(i, j int) { b[i], b[j] = b[j], b[i] } +func (b BucketStats) Less(i, j int) bool { return b[i].num < b[j].num } + +// Copies the image and text for a line into a directory based on +// the line confidence, as defined by the buckets struct +func bucketLine(l line.Detail, buckets BucketSpecs, dirname string) (string, error) { + var bucket string + + todir := "" + for _, b := range buckets { + if l.Avgconf >= b.Min { + todir = b.Name + bucket = b.Name + } + } + + if todir == "" { + return bucket, nil + } + + avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64) + if len(avgstr) > 2 { + avgstr = avgstr[2:] + } + + base := filepath.Join(dirname, todir, l.OcrName+"_"+l.Name+"_"+avgstr) + + err := os.MkdirAll(filepath.Join(dirname, todir), 0700) + if err != nil { + return bucket, err + } + + f, err := os.Create(base + ".png") + if err != nil { + return bucket, err + } + defer f.Close() + + err = l.Img.CopyLineTo(f) + if err != nil { + return bucket, err + } + + f, err = os.Create(base + ".txt") + if err != nil { + return bucket, err + } + defer f.Close() + + _, err = io.WriteString(f, l.Text) + if err != nil { + return bucket, err + } + + return bucket, err +} + +// Copies line images and text into directories based on their +// confidence, as defined by the buckets struct, and returns +// statistics of whire lines went in the process. +func BucketUp(lines line.Details, buckets BucketSpecs, dirname string) (BucketStats, error) { + var all []string + var stats BucketStats + + sort.Sort(lines) + sort.Sort(buckets) + for _, l := range lines { + bname, err := bucketLine(l, buckets, dirname) + if err != nil { + return stats, err + } + all = append(all, bname) + } + + for _, b := range all { + i := sort.Search(len(stats), func(i int) bool { return stats[i].name == b }) + if i == len(stats) { + newstat := BucketStat{b, 0} + stats = append(stats, newstat) + i = len(stats) - 1 + } + stats[i].num++ + } + + return stats, nil +} + +// Prints statistics of where lines went when bucketing +func PrintBucketStats(w io.Writer, stats BucketStats) { + var total int + for _, s := range stats { + total += s.num + } + + fmt.Fprintf(w, "Copied %d lines\n", total) + fmt.Fprintf(w, "---------------------------------\n") + sort.Sort(stats) + for _, s := range stats { + fmt.Fprintf(w, "Lines in %7s: %2d%%\n", s.name, 100*s.num/total) + } +} diff --git a/cmd/bucket-lines/main.go b/cmd/bucket-lines/main.go new file mode 100644 index 0000000..af81b44 --- /dev/null +++ b/cmd/bucket-lines/main.go @@ -0,0 +1,89 @@ +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io/ioutil" + "log" + "os" + "path/filepath" + + "rescribe.xyz/utils/pkg/hocr" + "rescribe.xyz/utils/pkg/line" + "rescribe.xyz/utils/pkg/prob" +) + +func main() { + b := BucketSpecs{ + // minimum confidence, name + {0, "bad"}, + {0.95, "95to98"}, + {0.98, "98plus"}, + } + + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: bucket-lines [-d dir] [-s specs.json] [hocr1] [prob1] [hocr2] [...]\n") + fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n") + fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n") + fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n") + fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n") + fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") + fmt.Fprintf(os.Stderr, "option.\n") + fmt.Fprintf(os.Stderr, "The .prob and .hocr files are assumed to be in the same directory\n") + fmt.Fprintf(os.Stderr, "as the line's image and text files.\n\n") + flag.PrintDefaults() + fmt.Fprintf(os.Stderr, "\nAn example specs.json file would be the following:\n") + fmt.Fprintf(os.Stderr, "[{\"min\": 0, \"name\": \"terrible\"}, {\"min\": 0.80, \"name\": \"ok\"}, {\"min\": 0.98, \"name\": \"great\"}]\n") + } + dir := flag.String("d", "buckets", "Directory to store the buckets") + specs := flag.String("s", "", "JSON file describing specs to bucket into") + flag.Parse() + if flag.NArg() < 1 { + flag.Usage() + os.Exit(1) + } + + if *specs != "" { + js, err := ioutil.ReadFile(*specs) + if err != nil { + log.Fatal(err) + } + err = json.Unmarshal(js, &b) + if err != nil { + log.Fatal(err) + } + } + + var err error + lines := make(line.Details, 0) + + for _, f := range flag.Args() { + var newlines line.Details + switch ext := filepath.Ext(f); ext { + case ".prob": + newlines, err = prob.GetLineDetails(f) + case ".hocr": + newlines, err = hocr.GetLineDetails(f) + default: + log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) + continue + } + if err != nil { + log.Fatal(err) + } + + for _, l := range newlines { + if l.Img != nil { + lines = append(lines, l) + } + } + } + + stats, err := BucketUp(lines, b, *dir) + if err != nil { + log.Fatal(err) + } + + PrintBucketStats(os.Stdout, stats) +} diff --git a/cmd/dehyphenate/main.go b/cmd/dehyphenate/main.go new file mode 100644 index 0000000..b2bd6f9 --- /dev/null +++ b/cmd/dehyphenate/main.go @@ -0,0 +1,63 @@ +package main + +import ( + "encoding/xml" + "flag" + "fmt" + "io/ioutil" + "log" + "os" + + "rescribe.xyz/utils/pkg/hocr" +) + +// BUGS: +// - loses all elements not captured in hocr structure such as html headings +// might be best to copy the header and footer separately and put the hocr in between, but would still need to ensure all elements are captured +// - loses any formatting; doesn't need to be identical, but e.g. linebreaks after elements would be handy +// - need to handle OcrChar + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: dehyphenate hocrin hocrout\n") + fmt.Fprintf(os.Stderr, "Dehyphenates a hocr file.\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() != 2 { + flag.Usage() + os.Exit(1) + } + + in, err := ioutil.ReadFile(flag.Arg(0)) + if err != nil { + log.Fatalf("Error reading %s: %v", flag.Arg(1), err) + } + h, err := hocr.Parse(in) + if err != nil { + log.Fatal(err) + } + + for i, l := range h.Lines { + w := l.Words[len(l.Words)-1] + if len(w.Chars) == 0 { + if len(w.Text) > 0 && w.Text[len(w.Text) - 1] == '-' { + h.Lines[i].Words[len(l.Words)-1].Text = w.Text[0:len(w.Text)-1] + h.Lines[i+1].Words[0].Text + h.Lines[i+1].Words[0].Text = "" + } + } else { + log.Printf("TODO: handle OcrChar") + } + } + + f, err := os.Create(flag.Arg(1)) + if err != nil { + log.Fatalf("Error creating file %s: %v", flag.Arg(1), err) + } + defer f.Close() + e := xml.NewEncoder(f) + err = e.Encode(h) + if err != nil { + log.Fatalf("Error encoding XML: %v", err) + } +} diff --git a/cmd/eeboxmltohocr/main.go b/cmd/eeboxmltohocr/main.go new file mode 100644 index 0000000..2761cd9 --- /dev/null +++ b/cmd/eeboxmltohocr/main.go @@ -0,0 +1,135 @@ +package main + +import ( + "bufio" + "flag" + "fmt" + "io" + "log" + "os" + "regexp" + "strconv" + "strings" +) + +// splitByPb is a split function for the scanner that splits by the +// '= 0 { + return i + 1, data[0:i], nil + } + // If we're at EOF, we have a final section, so just return the lot. + if atEOF { + return len(data), data, nil + } + // Request more data. + return 0, nil, nil +} + +type Page struct { + number int + text string +} + +func addPage(pgs *[]Page, number int, text string) { + added := 0 + for i, pg := range *pgs { + if pg.number == number { + (*pgs)[i].text = pg.text + text + added = 1 + } + } + if added == 0 { + newpg := Page{number, text} + *pgs = append(*pgs, newpg) + } +} + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: eeboxmltohocr in.xml outbase\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() < 2 { + flag.Usage() + os.Exit(1) + } + + f, err := os.Open(flag.Arg(0)) + defer f.Close() + if err != nil { + log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) + } + scanner := bufio.NewScanner(f) + + scanner.Split(splitByPb) + + var pgs []Page + + for scanner.Scan() { + t := scanner.Text() + r := regexp.MustCompile(`pb [^>]*facs="tcp:.*?:(.*?)"`).FindStringSubmatch(t) + if len(r) <= 1 { + continue + } + pgnum, err := strconv.Atoi(r[1]) + if err != nil { + continue + } + + content := t[strings.Index(t, ">")+1:] + ungap := regexp.MustCompile(`(?s)].+?`).ReplaceAllString(content, "") + unxml := regexp.MustCompile(`<.+?>`).ReplaceAllString(ungap, "") + + finaltxt := strings.TrimLeft(unxml, " \n") + if len(finaltxt) == 0 { + continue + } + + addPage(&pgs, pgnum, finaltxt) + } + + for _, pg := range pgs { + fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number - 1) + f, err := os.Create(fn) + if err != nil { + log.Fatalf("Could not create file %s: %v\n", fn, err) + } + defer f.Close() + + _, err = io.WriteString(f, hocrHeader + pg.text + hocrFooter) + if err != nil { + log.Fatalf("Could not write file %s: %v\n", fn, err) + } + } +} + +const hocrHeader = ` + + + + + + + + + +
+
+

+ + ` + +const hocrFooter = ` + +

+
+
+ +` diff --git a/cmd/fonttobytes/main.go b/cmd/fonttobytes/main.go new file mode 100644 index 0000000..8310e0f --- /dev/null +++ b/cmd/fonttobytes/main.go @@ -0,0 +1,49 @@ +package main + +import ( + "bytes" + "compress/zlib" + "flag" + "fmt" + "io/ioutil" + "log" + "os" +) + +func main() { + flag.Usage = func() { + fmt.Fprintln(flag.CommandLine.Output(), "Usage: fonttobytes font.ttf") + flag.PrintDefaults() + } + flag.Parse() + + if flag.NArg() != 1 { + flag.Usage() + return + } + + f, err := os.Open(flag.Arg(0)) + if err != nil { + log.Fatalln("Failed to open file", flag.Arg(0), err) + } + fontbytes, err := ioutil.ReadAll(f) + if err != nil { + log.Fatalln("Failed to read file", flag.Arg(0), err) + } + + var compressed bytes.Buffer + w := zlib.NewWriter(&compressed) + w.Write(fontbytes) + w.Close() + + // This could be done with %+v in printf, but using the decimal rather than + // hex output saves quite a few bytes, so we do that instead. + fmt.Printf("[]byte{") + for i, b := range compressed.Bytes() { + if i > 0 { + fmt.Printf(", ") + } + fmt.Printf("%d", b) + } + fmt.Printf("}\n") +} diff --git a/cmd/hocrtotxt/main.go b/cmd/hocrtotxt/main.go new file mode 100644 index 0000000..6716a9e --- /dev/null +++ b/cmd/hocrtotxt/main.go @@ -0,0 +1,30 @@ +package main + +import ( + "flag" + "fmt" + "log" + "os" + + "rescribe.xyz/utils/pkg/hocr" +) + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: hocrtotxt hocrfile\n") + fmt.Fprintf(os.Stderr, "Prints the text from a hocr file.\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() != 1 { + flag.Usage() + os.Exit(1) + } + + text, err := hocr.GetText(flag.Arg(0)) + if err != nil { + log.Fatal(err) + } + + fmt.Printf("%s\n", text) +} diff --git a/cmd/pare-gt/main.go b/cmd/pare-gt/main.go new file mode 100644 index 0000000..a4d9600 --- /dev/null +++ b/cmd/pare-gt/main.go @@ -0,0 +1,147 @@ +package main + +import ( + "flag" + "fmt" + "log" + "math/rand" + "os" + "path" + "path/filepath" + "sort" + "strings" +) + +const usage = `Usage: pare-gt [-n num] gtdir movedir + +Moves some of the ground truth from gt-dir into movedir, +ensuring that the same proportions of each ground truth +source are represented in the moved section. Proportion of +ground truth source is calculated by taking the prefix of +the filename up to the first '-' character. +` + +// Prefixes is a map of the prefix string to a list of filenames +type Prefixes = map[string][]string + +// walker adds any .txt path to prefixes map, under the appropriate +// prefix (blank if no '-' separator was found) +func walker(prefixes *Prefixes) filepath.WalkFunc { + return func(fpath string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if info.IsDir() { + return nil + } + ext := path.Ext(fpath) + if ext != ".txt" { + return nil + } + base := path.Base(fpath) + idx := strings.Index(base, "-") + var prefix string + if idx > -1 { + prefix = base[0:idx] + } + noext := strings.TrimSuffix(fpath, ext) + (*prefixes)[prefix] = append((*prefixes)[prefix], noext) + return nil + } +} + +// inStrSlice checks whether a given string is part of a slice of +// strings +func inStrSlice(sl []string, s string) bool { + for _, v := range sl { + if s == v { + return true + } + } + return false +} + +// samplePrefixes selects random samples for each prefix, proportional +// to the amount of that prefix there are in the whole set, so that a +// total of perctosample% are sampled. +func samplePrefixes(perctosample int, prefixes Prefixes) (filestomove []string) { + var total, sample int + var keys []string + for i, v := range prefixes { + total += len(v) + // needed for determinism + sort.Strings(prefixes[i]) + keys = append(keys, i) + } + + sample = (total * perctosample) / 100 + + // This ensures the map is looped over deterministically + sort.Strings(keys) + for _, key := range keys { + prefix := prefixes[key] + len := len(prefix) + if len == 1 { + continue + } + numtoget := int(float64(sample) / float64(total) * float64(len)) + if numtoget >= len { + numtoget = len - 1 + } + if numtoget < 1 { + numtoget = 1 + } + for i := 0; i < numtoget; i++ { + var selected string + selected = prefix[rand.Int()%len] + // pick a different random selection if the first one is + // already in the filestomove slice + for inStrSlice(filestomove, selected) { + selected = prefix[rand.Int()%len] + } + filestomove = append(filestomove, selected) + } + } + + return +} + +func main() { + flag.Usage = func() { + fmt.Fprintf(flag.CommandLine.Output(), usage) + flag.PrintDefaults() + } + numtopare := flag.Int("n", 10, "Percentage of the ground truth to pare away.") + flag.Parse() + if flag.NArg() != 2 { + flag.Usage() + os.Exit(1) + } + + for _, d := range flag.Args() { + info, err := os.Stat(d) + if err != nil || !info.IsDir() { + log.Fatalln("Error accessing directory", flag.Arg(0), err) + } + } + + var prefixes Prefixes + prefixes = make(Prefixes) + err := filepath.Walk(flag.Arg(0), walker(&prefixes)) + if err != nil { + log.Fatalln("Failed to walk", flag.Arg(0), err) + } + + filestomove := samplePrefixes(*numtopare, prefixes) + + for _, f := range filestomove { + fmt.Println("Moving ground truth", f) + b := path.Base(f) + for _, ext := range []string{".txt", ".png"} { + err = os.Rename(f+ext, path.Join(flag.Arg(1), b+ext)) + if err != nil { + log.Fatalln("Error moving file", f+ext, err) + } + } + } +} diff --git a/cmd/pare-gt/main_test.go b/cmd/pare-gt/main_test.go new file mode 100644 index 0000000..c381a86 --- /dev/null +++ b/cmd/pare-gt/main_test.go @@ -0,0 +1,78 @@ +package main + +import ( + "fmt" + "testing" +) + +func TestSamplePrefixes(t *testing.T) { + prefixes := Prefixes{ + "1471-Orthographia": { + "1471-Orthographia-Tortellius_00001.txt", + "1471-Orthographia-Tortellius_00002.txt", + "1471-Orthographia-Tortellius_00003.txt", + "1471-Orthographia-Tortellius_00004.txt", + "1471-Orthographia-Tortellius_00005.txt", + "1471-Orthographia-Tortellius_00006.txt", + "1471-Orthographia-Tortellius_00007.txt", + "1471-Orthographia-Tortellius_00008.txt", + "1471-Orthographia-Tortellius_00009.txt", + "1471-Orthographia-Tortellius_000010.txt", + "1471-Orthographia-Tortellius_000011.txt", + "1471-Orthographia-Tortellius_000012.txt", + "1471-Orthographia-Tortellius_000013.txt", + "1471-Orthographia-Tortellius_000014.txt", + "1471-Orthographia-Tortellius_000015.txt", + "1471-Orthographia-Tortellius_000016.txt", + "1471-Orthographia-Tortellius_000017.txt", + "1471-Orthographia-Tortellius_000018.txt", + "1471-Orthographia-Tortellius_000019.txt", + "1471-Orthographia-Tortellius_000020.txt", + }, + "Kallimachos_1509": { + "Kallimachos_1509-ShipOfFools-Barclay_00121.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00123.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00124.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00125.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", + }, + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4": { + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_12_49.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_1_415.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_14_6628571428571429.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_16_865.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_17_62.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_18_6366666666666666.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt", + }, + } + + cases := []struct { + perc int + expected []string + }{ + {1, []string{"1471-Orthographia-Tortellius_000019.txt", "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt"}}, + {10, []string{"1471-Orthographia-Tortellius_00002.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt"}}, + {20, []string{"1471-Orthographia-Tortellius_00008.txt", "1471-Orthographia-Tortellius_000017.txt", "1471-Orthographia-Tortellius_00006.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt"}}, + } + + for _, c := range cases { + t.Run(fmt.Sprintf("%d%%", c.perc), func(t *testing.T) { + actual := samplePrefixes(c.perc, prefixes) + if len(c.expected) != len(actual) { + t.Fatalf("Number of files picked (%d) differs from expected (%d):\nExpected: %s\nActual: %s\n", len(actual), len(c.expected), c.expected, actual) + return + } + for i, v := range c.expected { + if actual[i] != v { + t.Fatalf("Difference in expected and actual files (first difference is in index %d of actual):\n\nExpected:\n%s\n\nActual:\n%s\n", i, c.expected, actual) + } + } + }) + } +} diff --git a/cmd/pgconf/main.go b/cmd/pgconf/main.go new file mode 100644 index 0000000..dbc6af8 --- /dev/null +++ b/cmd/pgconf/main.go @@ -0,0 +1,30 @@ +package main + +import ( + "flag" + "fmt" + "log" + "os" + + "rescribe.xyz/utils/pkg/hocr" +) + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: pgconf hocr\n") + fmt.Fprintf(os.Stderr, "Prints the total confidence for a page, as an average of the confidence of each word.\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() != 1 { + flag.Usage() + os.Exit(1) + } + + avg, err := hocr.GetAvgConf(flag.Arg(0)) + if err != nil { + log.Fatalf("Error retreiving confidence for %s: %v\n", flag.Arg(0), err) + } + + fmt.Printf("%0.0f\n", avg) +} -- cgit v1.2.1-24-ge1ad