From 3880414bbf2d6f2cd05e208abf919ae5ceabeddc Mon Sep 17 00:00:00 2001 From: Nick White Date: Thu, 27 Feb 2020 17:45:16 +0000 Subject: Reorganise all commands to be behind cmd/ --- avg-lines/html.go | 61 ------------------- avg-lines/main.go | 69 --------------------- boxtotxt/main.go | 44 -------------- bucket-lines/bucket.go | 131 ---------------------------------------- bucket-lines/main.go | 89 --------------------------- cmd/avg-lines/html.go | 61 +++++++++++++++++++ cmd/avg-lines/main.go | 69 +++++++++++++++++++++ cmd/boxtotxt/main.go | 44 ++++++++++++++ cmd/bucket-lines/bucket.go | 131 ++++++++++++++++++++++++++++++++++++++++ cmd/bucket-lines/main.go | 89 +++++++++++++++++++++++++++ cmd/dehyphenate/main.go | 63 +++++++++++++++++++ cmd/eeboxmltohocr/main.go | 135 +++++++++++++++++++++++++++++++++++++++++ cmd/fonttobytes/main.go | 49 +++++++++++++++ cmd/hocrtotxt/main.go | 30 +++++++++ cmd/pare-gt/main.go | 147 +++++++++++++++++++++++++++++++++++++++++++++ cmd/pare-gt/main_test.go | 78 ++++++++++++++++++++++++ cmd/pgconf/main.go | 30 +++++++++ dehyphenate/main.go | 63 ------------------- doc.go | 2 - eeboxmltohocr/main.go | 135 ----------------------------------------- fonttobytes/main.go | 49 --------------- hocrtotxt/main.go | 30 --------- pare-gt/main.go | 147 --------------------------------------------- pare-gt/main_test.go | 78 ------------------------ pgconf/main.go | 30 --------- 25 files changed, 926 insertions(+), 928 deletions(-) delete mode 100644 avg-lines/html.go delete mode 100644 avg-lines/main.go delete mode 100644 boxtotxt/main.go delete mode 100644 bucket-lines/bucket.go delete mode 100644 bucket-lines/main.go create mode 100644 cmd/avg-lines/html.go create mode 100644 cmd/avg-lines/main.go create mode 100644 cmd/boxtotxt/main.go create mode 100644 cmd/bucket-lines/bucket.go create mode 100644 cmd/bucket-lines/main.go create mode 100644 cmd/dehyphenate/main.go create mode 100644 cmd/eeboxmltohocr/main.go create mode 100644 cmd/fonttobytes/main.go create mode 100644 cmd/hocrtotxt/main.go create mode 100644 cmd/pare-gt/main.go create mode 100644 cmd/pare-gt/main_test.go create mode 100644 cmd/pgconf/main.go delete mode 100644 dehyphenate/main.go delete mode 100644 doc.go delete mode 100644 eeboxmltohocr/main.go delete mode 100644 fonttobytes/main.go delete mode 100644 hocrtotxt/main.go delete mode 100644 pare-gt/main.go delete mode 100644 pare-gt/main_test.go delete mode 100644 pgconf/main.go diff --git a/avg-lines/html.go b/avg-lines/html.go deleted file mode 100644 index 97d8ec9..0000000 --- a/avg-lines/html.go +++ /dev/null @@ -1,61 +0,0 @@ -package main - -import ( - "fmt" - "os" - "path/filepath" - - "rescribe.xyz/utils/pkg/line" -) - -func copylineimg(fn string, l line.Detail) error { - f, err := os.Create(fn) - if err != nil { - return err - } - defer f.Close() - - return l.Img.CopyLineTo(f) -} - -func htmlout(dir string, lines line.Details) error { - err := os.MkdirAll(dir, 0700) - if err != nil { - return err - } - - fn := filepath.Join(dir, "index.html") - f, err := os.Create(fn) - if err != nil { - return err - } - defer f.Close() - - _, err = fmt.Fprintf(f, ""+ - "\n\n") - if err != nil { - return err - } - for _, l := range lines { - fn = filepath.Base(l.OcrName) + "_" + l.Name + ".png" - err = copylineimg(filepath.Join(dir, fn), l) - if err != nil { - return err - } - _, err = fmt.Fprintf(f, "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n", - l.Avgconf, l.OcrName, l.Name, fn, l.Text) - if err != nil { - return err - } - } - _, err = fmt.Fprintf(f, "

%.4f%%

%s %s
%s
\n\n") - if err != nil { - return err - } - - return nil -} diff --git a/avg-lines/main.go b/avg-lines/main.go deleted file mode 100644 index f7cedab..0000000 --- a/avg-lines/main.go +++ /dev/null @@ -1,69 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "log" - "os" - "path/filepath" - "sort" - - "rescribe.xyz/utils/pkg/hocr" - "rescribe.xyz/utils/pkg/line" - "rescribe.xyz/utils/pkg/prob" -) - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: avg-lines [-html dir] [-nosort] [prob1] [hocr1] [prob2] [...]\n") - fmt.Fprintf(os.Stderr, "Prints a report of the average confidence for each line, sorted\n") - fmt.Fprintf(os.Stderr, "from worst to best.\n") - fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n") - fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n") - fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") - fmt.Fprintf(os.Stderr, "option.\n\n") - flag.PrintDefaults() - } - var html = flag.String("html", "", "Output in html format to the specified directory") - var nosort = flag.Bool("nosort", false, "Don't sort lines by confidence") - flag.Parse() - if flag.NArg() < 1 { - flag.Usage() - os.Exit(1) - } - - var err error - lines := make(line.Details, 0) - - for _, f := range flag.Args() { - var newlines line.Details - switch ext := filepath.Ext(f); ext { - case ".prob": - newlines, err = prob.GetLineDetails(f) - case ".hocr": - newlines, err = hocr.GetLineDetails(f) - default: - log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) - continue - } - if err != nil { - log.Fatal(err) - } - - for _, l := range newlines { - lines = append(lines, l) - } - } - - if *nosort == false { - sort.Sort(lines) - } - - if *html == "" { - for _, l := range lines { - fmt.Printf("%s %s: %.2f%%\n", l.OcrName, l.Name, l.Avgconf) - } - } else { - htmlout(*html, lines) - } -} diff --git a/boxtotxt/main.go b/boxtotxt/main.go deleted file mode 100644 index 058eb05..0000000 --- a/boxtotxt/main.go +++ /dev/null @@ -1,44 +0,0 @@ -package main - -import ( - "bufio" - "flag" - "fmt" - "log" - "os" - "strings" -) - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: boxtotxt in.box\n") - flag.PrintDefaults() - } - flag.Parse() - if flag.NArg() != 1 { - flag.Usage() - os.Exit(1) - } - - f, err := os.Open(flag.Arg(0)) - defer f.Close() - if err != nil { - log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) - } - - scanner := bufio.NewScanner(f) - - for scanner.Scan() { - t := scanner.Text() - s := strings.Split(t, "") - if len(s) < 1 { - continue - } - if s[0] == "\t" { - continue - } - fmt.Printf("%s", s[0]) - } - - fmt.Printf("\n") -} diff --git a/bucket-lines/bucket.go b/bucket-lines/bucket.go deleted file mode 100644 index 7b6fc4f..0000000 --- a/bucket-lines/bucket.go +++ /dev/null @@ -1,131 +0,0 @@ -package main - -import ( - "fmt" - "io" - "os" - "path/filepath" - "sort" - "strconv" - - "rescribe.xyz/utils/pkg/line" -) - -type BucketSpec struct { - Min float64 - Name string -} -type BucketSpecs []BucketSpec - -func (b BucketSpecs) Len() int { return len(b) } -func (b BucketSpecs) Swap(i, j int) { b[i], b[j] = b[j], b[i] } -func (b BucketSpecs) Less(i, j int) bool { return b[i].Min < b[j].Min } - -type BucketStat struct { - name string - num int -} -type BucketStats []BucketStat - -func (b BucketStats) Len() int { return len(b) } -func (b BucketStats) Swap(i, j int) { b[i], b[j] = b[j], b[i] } -func (b BucketStats) Less(i, j int) bool { return b[i].num < b[j].num } - -// Copies the image and text for a line into a directory based on -// the line confidence, as defined by the buckets struct -func bucketLine(l line.Detail, buckets BucketSpecs, dirname string) (string, error) { - var bucket string - - todir := "" - for _, b := range buckets { - if l.Avgconf >= b.Min { - todir = b.Name - bucket = b.Name - } - } - - if todir == "" { - return bucket, nil - } - - avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64) - if len(avgstr) > 2 { - avgstr = avgstr[2:] - } - - base := filepath.Join(dirname, todir, l.OcrName+"_"+l.Name+"_"+avgstr) - - err := os.MkdirAll(filepath.Join(dirname, todir), 0700) - if err != nil { - return bucket, err - } - - f, err := os.Create(base + ".png") - if err != nil { - return bucket, err - } - defer f.Close() - - err = l.Img.CopyLineTo(f) - if err != nil { - return bucket, err - } - - f, err = os.Create(base + ".txt") - if err != nil { - return bucket, err - } - defer f.Close() - - _, err = io.WriteString(f, l.Text) - if err != nil { - return bucket, err - } - - return bucket, err -} - -// Copies line images and text into directories based on their -// confidence, as defined by the buckets struct, and returns -// statistics of whire lines went in the process. -func BucketUp(lines line.Details, buckets BucketSpecs, dirname string) (BucketStats, error) { - var all []string - var stats BucketStats - - sort.Sort(lines) - sort.Sort(buckets) - for _, l := range lines { - bname, err := bucketLine(l, buckets, dirname) - if err != nil { - return stats, err - } - all = append(all, bname) - } - - for _, b := range all { - i := sort.Search(len(stats), func(i int) bool { return stats[i].name == b }) - if i == len(stats) { - newstat := BucketStat{b, 0} - stats = append(stats, newstat) - i = len(stats) - 1 - } - stats[i].num++ - } - - return stats, nil -} - -// Prints statistics of where lines went when bucketing -func PrintBucketStats(w io.Writer, stats BucketStats) { - var total int - for _, s := range stats { - total += s.num - } - - fmt.Fprintf(w, "Copied %d lines\n", total) - fmt.Fprintf(w, "---------------------------------\n") - sort.Sort(stats) - for _, s := range stats { - fmt.Fprintf(w, "Lines in %7s: %2d%%\n", s.name, 100*s.num/total) - } -} diff --git a/bucket-lines/main.go b/bucket-lines/main.go deleted file mode 100644 index af81b44..0000000 --- a/bucket-lines/main.go +++ /dev/null @@ -1,89 +0,0 @@ -package main - -import ( - "encoding/json" - "flag" - "fmt" - "io/ioutil" - "log" - "os" - "path/filepath" - - "rescribe.xyz/utils/pkg/hocr" - "rescribe.xyz/utils/pkg/line" - "rescribe.xyz/utils/pkg/prob" -) - -func main() { - b := BucketSpecs{ - // minimum confidence, name - {0, "bad"}, - {0.95, "95to98"}, - {0.98, "98plus"}, - } - - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: bucket-lines [-d dir] [-s specs.json] [hocr1] [prob1] [hocr2] [...]\n") - fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n") - fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n") - fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n") - fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n") - fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") - fmt.Fprintf(os.Stderr, "option.\n") - fmt.Fprintf(os.Stderr, "The .prob and .hocr files are assumed to be in the same directory\n") - fmt.Fprintf(os.Stderr, "as the line's image and text files.\n\n") - flag.PrintDefaults() - fmt.Fprintf(os.Stderr, "\nAn example specs.json file would be the following:\n") - fmt.Fprintf(os.Stderr, "[{\"min\": 0, \"name\": \"terrible\"}, {\"min\": 0.80, \"name\": \"ok\"}, {\"min\": 0.98, \"name\": \"great\"}]\n") - } - dir := flag.String("d", "buckets", "Directory to store the buckets") - specs := flag.String("s", "", "JSON file describing specs to bucket into") - flag.Parse() - if flag.NArg() < 1 { - flag.Usage() - os.Exit(1) - } - - if *specs != "" { - js, err := ioutil.ReadFile(*specs) - if err != nil { - log.Fatal(err) - } - err = json.Unmarshal(js, &b) - if err != nil { - log.Fatal(err) - } - } - - var err error - lines := make(line.Details, 0) - - for _, f := range flag.Args() { - var newlines line.Details - switch ext := filepath.Ext(f); ext { - case ".prob": - newlines, err = prob.GetLineDetails(f) - case ".hocr": - newlines, err = hocr.GetLineDetails(f) - default: - log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) - continue - } - if err != nil { - log.Fatal(err) - } - - for _, l := range newlines { - if l.Img != nil { - lines = append(lines, l) - } - } - } - - stats, err := BucketUp(lines, b, *dir) - if err != nil { - log.Fatal(err) - } - - PrintBucketStats(os.Stdout, stats) -} diff --git a/cmd/avg-lines/html.go b/cmd/avg-lines/html.go new file mode 100644 index 0000000..97d8ec9 --- /dev/null +++ b/cmd/avg-lines/html.go @@ -0,0 +1,61 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + + "rescribe.xyz/utils/pkg/line" +) + +func copylineimg(fn string, l line.Detail) error { + f, err := os.Create(fn) + if err != nil { + return err + } + defer f.Close() + + return l.Img.CopyLineTo(f) +} + +func htmlout(dir string, lines line.Details) error { + err := os.MkdirAll(dir, 0700) + if err != nil { + return err + } + + fn := filepath.Join(dir, "index.html") + f, err := os.Create(fn) + if err != nil { + return err + } + defer f.Close() + + _, err = fmt.Fprintf(f, ""+ + "\n\n") + if err != nil { + return err + } + for _, l := range lines { + fn = filepath.Base(l.OcrName) + "_" + l.Name + ".png" + err = copylineimg(filepath.Join(dir, fn), l) + if err != nil { + return err + } + _, err = fmt.Fprintf(f, "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n", + l.Avgconf, l.OcrName, l.Name, fn, l.Text) + if err != nil { + return err + } + } + _, err = fmt.Fprintf(f, "

%.4f%%

%s %s
%s
\n\n") + if err != nil { + return err + } + + return nil +} diff --git a/cmd/avg-lines/main.go b/cmd/avg-lines/main.go new file mode 100644 index 0000000..f7cedab --- /dev/null +++ b/cmd/avg-lines/main.go @@ -0,0 +1,69 @@ +package main + +import ( + "flag" + "fmt" + "log" + "os" + "path/filepath" + "sort" + + "rescribe.xyz/utils/pkg/hocr" + "rescribe.xyz/utils/pkg/line" + "rescribe.xyz/utils/pkg/prob" +) + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: avg-lines [-html dir] [-nosort] [prob1] [hocr1] [prob2] [...]\n") + fmt.Fprintf(os.Stderr, "Prints a report of the average confidence for each line, sorted\n") + fmt.Fprintf(os.Stderr, "from worst to best.\n") + fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n") + fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n") + fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") + fmt.Fprintf(os.Stderr, "option.\n\n") + flag.PrintDefaults() + } + var html = flag.String("html", "", "Output in html format to the specified directory") + var nosort = flag.Bool("nosort", false, "Don't sort lines by confidence") + flag.Parse() + if flag.NArg() < 1 { + flag.Usage() + os.Exit(1) + } + + var err error + lines := make(line.Details, 0) + + for _, f := range flag.Args() { + var newlines line.Details + switch ext := filepath.Ext(f); ext { + case ".prob": + newlines, err = prob.GetLineDetails(f) + case ".hocr": + newlines, err = hocr.GetLineDetails(f) + default: + log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) + continue + } + if err != nil { + log.Fatal(err) + } + + for _, l := range newlines { + lines = append(lines, l) + } + } + + if *nosort == false { + sort.Sort(lines) + } + + if *html == "" { + for _, l := range lines { + fmt.Printf("%s %s: %.2f%%\n", l.OcrName, l.Name, l.Avgconf) + } + } else { + htmlout(*html, lines) + } +} diff --git a/cmd/boxtotxt/main.go b/cmd/boxtotxt/main.go new file mode 100644 index 0000000..058eb05 --- /dev/null +++ b/cmd/boxtotxt/main.go @@ -0,0 +1,44 @@ +package main + +import ( + "bufio" + "flag" + "fmt" + "log" + "os" + "strings" +) + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: boxtotxt in.box\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() != 1 { + flag.Usage() + os.Exit(1) + } + + f, err := os.Open(flag.Arg(0)) + defer f.Close() + if err != nil { + log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) + } + + scanner := bufio.NewScanner(f) + + for scanner.Scan() { + t := scanner.Text() + s := strings.Split(t, "") + if len(s) < 1 { + continue + } + if s[0] == "\t" { + continue + } + fmt.Printf("%s", s[0]) + } + + fmt.Printf("\n") +} diff --git a/cmd/bucket-lines/bucket.go b/cmd/bucket-lines/bucket.go new file mode 100644 index 0000000..7b6fc4f --- /dev/null +++ b/cmd/bucket-lines/bucket.go @@ -0,0 +1,131 @@ +package main + +import ( + "fmt" + "io" + "os" + "path/filepath" + "sort" + "strconv" + + "rescribe.xyz/utils/pkg/line" +) + +type BucketSpec struct { + Min float64 + Name string +} +type BucketSpecs []BucketSpec + +func (b BucketSpecs) Len() int { return len(b) } +func (b BucketSpecs) Swap(i, j int) { b[i], b[j] = b[j], b[i] } +func (b BucketSpecs) Less(i, j int) bool { return b[i].Min < b[j].Min } + +type BucketStat struct { + name string + num int +} +type BucketStats []BucketStat + +func (b BucketStats) Len() int { return len(b) } +func (b BucketStats) Swap(i, j int) { b[i], b[j] = b[j], b[i] } +func (b BucketStats) Less(i, j int) bool { return b[i].num < b[j].num } + +// Copies the image and text for a line into a directory based on +// the line confidence, as defined by the buckets struct +func bucketLine(l line.Detail, buckets BucketSpecs, dirname string) (string, error) { + var bucket string + + todir := "" + for _, b := range buckets { + if l.Avgconf >= b.Min { + todir = b.Name + bucket = b.Name + } + } + + if todir == "" { + return bucket, nil + } + + avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64) + if len(avgstr) > 2 { + avgstr = avgstr[2:] + } + + base := filepath.Join(dirname, todir, l.OcrName+"_"+l.Name+"_"+avgstr) + + err := os.MkdirAll(filepath.Join(dirname, todir), 0700) + if err != nil { + return bucket, err + } + + f, err := os.Create(base + ".png") + if err != nil { + return bucket, err + } + defer f.Close() + + err = l.Img.CopyLineTo(f) + if err != nil { + return bucket, err + } + + f, err = os.Create(base + ".txt") + if err != nil { + return bucket, err + } + defer f.Close() + + _, err = io.WriteString(f, l.Text) + if err != nil { + return bucket, err + } + + return bucket, err +} + +// Copies line images and text into directories based on their +// confidence, as defined by the buckets struct, and returns +// statistics of whire lines went in the process. +func BucketUp(lines line.Details, buckets BucketSpecs, dirname string) (BucketStats, error) { + var all []string + var stats BucketStats + + sort.Sort(lines) + sort.Sort(buckets) + for _, l := range lines { + bname, err := bucketLine(l, buckets, dirname) + if err != nil { + return stats, err + } + all = append(all, bname) + } + + for _, b := range all { + i := sort.Search(len(stats), func(i int) bool { return stats[i].name == b }) + if i == len(stats) { + newstat := BucketStat{b, 0} + stats = append(stats, newstat) + i = len(stats) - 1 + } + stats[i].num++ + } + + return stats, nil +} + +// Prints statistics of where lines went when bucketing +func PrintBucketStats(w io.Writer, stats BucketStats) { + var total int + for _, s := range stats { + total += s.num + } + + fmt.Fprintf(w, "Copied %d lines\n", total) + fmt.Fprintf(w, "---------------------------------\n") + sort.Sort(stats) + for _, s := range stats { + fmt.Fprintf(w, "Lines in %7s: %2d%%\n", s.name, 100*s.num/total) + } +} diff --git a/cmd/bucket-lines/main.go b/cmd/bucket-lines/main.go new file mode 100644 index 0000000..af81b44 --- /dev/null +++ b/cmd/bucket-lines/main.go @@ -0,0 +1,89 @@ +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io/ioutil" + "log" + "os" + "path/filepath" + + "rescribe.xyz/utils/pkg/hocr" + "rescribe.xyz/utils/pkg/line" + "rescribe.xyz/utils/pkg/prob" +) + +func main() { + b := BucketSpecs{ + // minimum confidence, name + {0, "bad"}, + {0.95, "95to98"}, + {0.98, "98plus"}, + } + + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: bucket-lines [-d dir] [-s specs.json] [hocr1] [prob1] [hocr2] [...]\n") + fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n") + fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n") + fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n") + fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n") + fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") + fmt.Fprintf(os.Stderr, "option.\n") + fmt.Fprintf(os.Stderr, "The .prob and .hocr files are assumed to be in the same directory\n") + fmt.Fprintf(os.Stderr, "as the line's image and text files.\n\n") + flag.PrintDefaults() + fmt.Fprintf(os.Stderr, "\nAn example specs.json file would be the following:\n") + fmt.Fprintf(os.Stderr, "[{\"min\": 0, \"name\": \"terrible\"}, {\"min\": 0.80, \"name\": \"ok\"}, {\"min\": 0.98, \"name\": \"great\"}]\n") + } + dir := flag.String("d", "buckets", "Directory to store the buckets") + specs := flag.String("s", "", "JSON file describing specs to bucket into") + flag.Parse() + if flag.NArg() < 1 { + flag.Usage() + os.Exit(1) + } + + if *specs != "" { + js, err := ioutil.ReadFile(*specs) + if err != nil { + log.Fatal(err) + } + err = json.Unmarshal(js, &b) + if err != nil { + log.Fatal(err) + } + } + + var err error + lines := make(line.Details, 0) + + for _, f := range flag.Args() { + var newlines line.Details + switch ext := filepath.Ext(f); ext { + case ".prob": + newlines, err = prob.GetLineDetails(f) + case ".hocr": + newlines, err = hocr.GetLineDetails(f) + default: + log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) + continue + } + if err != nil { + log.Fatal(err) + } + + for _, l := range newlines { + if l.Img != nil { + lines = append(lines, l) + } + } + } + + stats, err := BucketUp(lines, b, *dir) + if err != nil { + log.Fatal(err) + } + + PrintBucketStats(os.Stdout, stats) +} diff --git a/cmd/dehyphenate/main.go b/cmd/dehyphenate/main.go new file mode 100644 index 0000000..b2bd6f9 --- /dev/null +++ b/cmd/dehyphenate/main.go @@ -0,0 +1,63 @@ +package main + +import ( + "encoding/xml" + "flag" + "fmt" + "io/ioutil" + "log" + "os" + + "rescribe.xyz/utils/pkg/hocr" +) + +// BUGS: +// - loses all elements not captured in hocr structure such as html headings +// might be best to copy the header and footer separately and put the hocr in between, but would still need to ensure all elements are captured +// - loses any formatting; doesn't need to be identical, but e.g. linebreaks after elements would be handy +// - need to handle OcrChar + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: dehyphenate hocrin hocrout\n") + fmt.Fprintf(os.Stderr, "Dehyphenates a hocr file.\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() != 2 { + flag.Usage() + os.Exit(1) + } + + in, err := ioutil.ReadFile(flag.Arg(0)) + if err != nil { + log.Fatalf("Error reading %s: %v", flag.Arg(1), err) + } + h, err := hocr.Parse(in) + if err != nil { + log.Fatal(err) + } + + for i, l := range h.Lines { + w := l.Words[len(l.Words)-1] + if len(w.Chars) == 0 { + if len(w.Text) > 0 && w.Text[len(w.Text) - 1] == '-' { + h.Lines[i].Words[len(l.Words)-1].Text = w.Text[0:len(w.Text)-1] + h.Lines[i+1].Words[0].Text + h.Lines[i+1].Words[0].Text = "" + } + } else { + log.Printf("TODO: handle OcrChar") + } + } + + f, err := os.Create(flag.Arg(1)) + if err != nil { + log.Fatalf("Error creating file %s: %v", flag.Arg(1), err) + } + defer f.Close() + e := xml.NewEncoder(f) + err = e.Encode(h) + if err != nil { + log.Fatalf("Error encoding XML: %v", err) + } +} diff --git a/cmd/eeboxmltohocr/main.go b/cmd/eeboxmltohocr/main.go new file mode 100644 index 0000000..2761cd9 --- /dev/null +++ b/cmd/eeboxmltohocr/main.go @@ -0,0 +1,135 @@ +package main + +import ( + "bufio" + "flag" + "fmt" + "io" + "log" + "os" + "regexp" + "strconv" + "strings" +) + +// splitByPb is a split function for the scanner that splits by the +// '= 0 { + return i + 1, data[0:i], nil + } + // If we're at EOF, we have a final section, so just return the lot. + if atEOF { + return len(data), data, nil + } + // Request more data. + return 0, nil, nil +} + +type Page struct { + number int + text string +} + +func addPage(pgs *[]Page, number int, text string) { + added := 0 + for i, pg := range *pgs { + if pg.number == number { + (*pgs)[i].text = pg.text + text + added = 1 + } + } + if added == 0 { + newpg := Page{number, text} + *pgs = append(*pgs, newpg) + } +} + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: eeboxmltohocr in.xml outbase\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() < 2 { + flag.Usage() + os.Exit(1) + } + + f, err := os.Open(flag.Arg(0)) + defer f.Close() + if err != nil { + log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) + } + scanner := bufio.NewScanner(f) + + scanner.Split(splitByPb) + + var pgs []Page + + for scanner.Scan() { + t := scanner.Text() + r := regexp.MustCompile(`pb [^>]*facs="tcp:.*?:(.*?)"`).FindStringSubmatch(t) + if len(r) <= 1 { + continue + } + pgnum, err := strconv.Atoi(r[1]) + if err != nil { + continue + } + + content := t[strings.Index(t, ">")+1:] + ungap := regexp.MustCompile(`(?s)].+?`).ReplaceAllString(content, "") + unxml := regexp.MustCompile(`<.+?>`).ReplaceAllString(ungap, "") + + finaltxt := strings.TrimLeft(unxml, " \n") + if len(finaltxt) == 0 { + continue + } + + addPage(&pgs, pgnum, finaltxt) + } + + for _, pg := range pgs { + fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number - 1) + f, err := os.Create(fn) + if err != nil { + log.Fatalf("Could not create file %s: %v\n", fn, err) + } + defer f.Close() + + _, err = io.WriteString(f, hocrHeader + pg.text + hocrFooter) + if err != nil { + log.Fatalf("Could not write file %s: %v\n", fn, err) + } + } +} + +const hocrHeader = ` + + + + + + + + + +
+
+

+ + ` + +const hocrFooter = ` + +

+
+
+ +` diff --git a/cmd/fonttobytes/main.go b/cmd/fonttobytes/main.go new file mode 100644 index 0000000..8310e0f --- /dev/null +++ b/cmd/fonttobytes/main.go @@ -0,0 +1,49 @@ +package main + +import ( + "bytes" + "compress/zlib" + "flag" + "fmt" + "io/ioutil" + "log" + "os" +) + +func main() { + flag.Usage = func() { + fmt.Fprintln(flag.CommandLine.Output(), "Usage: fonttobytes font.ttf") + flag.PrintDefaults() + } + flag.Parse() + + if flag.NArg() != 1 { + flag.Usage() + return + } + + f, err := os.Open(flag.Arg(0)) + if err != nil { + log.Fatalln("Failed to open file", flag.Arg(0), err) + } + fontbytes, err := ioutil.ReadAll(f) + if err != nil { + log.Fatalln("Failed to read file", flag.Arg(0), err) + } + + var compressed bytes.Buffer + w := zlib.NewWriter(&compressed) + w.Write(fontbytes) + w.Close() + + // This could be done with %+v in printf, but using the decimal rather than + // hex output saves quite a few bytes, so we do that instead. + fmt.Printf("[]byte{") + for i, b := range compressed.Bytes() { + if i > 0 { + fmt.Printf(", ") + } + fmt.Printf("%d", b) + } + fmt.Printf("}\n") +} diff --git a/cmd/hocrtotxt/main.go b/cmd/hocrtotxt/main.go new file mode 100644 index 0000000..6716a9e --- /dev/null +++ b/cmd/hocrtotxt/main.go @@ -0,0 +1,30 @@ +package main + +import ( + "flag" + "fmt" + "log" + "os" + + "rescribe.xyz/utils/pkg/hocr" +) + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: hocrtotxt hocrfile\n") + fmt.Fprintf(os.Stderr, "Prints the text from a hocr file.\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() != 1 { + flag.Usage() + os.Exit(1) + } + + text, err := hocr.GetText(flag.Arg(0)) + if err != nil { + log.Fatal(err) + } + + fmt.Printf("%s\n", text) +} diff --git a/cmd/pare-gt/main.go b/cmd/pare-gt/main.go new file mode 100644 index 0000000..a4d9600 --- /dev/null +++ b/cmd/pare-gt/main.go @@ -0,0 +1,147 @@ +package main + +import ( + "flag" + "fmt" + "log" + "math/rand" + "os" + "path" + "path/filepath" + "sort" + "strings" +) + +const usage = `Usage: pare-gt [-n num] gtdir movedir + +Moves some of the ground truth from gt-dir into movedir, +ensuring that the same proportions of each ground truth +source are represented in the moved section. Proportion of +ground truth source is calculated by taking the prefix of +the filename up to the first '-' character. +` + +// Prefixes is a map of the prefix string to a list of filenames +type Prefixes = map[string][]string + +// walker adds any .txt path to prefixes map, under the appropriate +// prefix (blank if no '-' separator was found) +func walker(prefixes *Prefixes) filepath.WalkFunc { + return func(fpath string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if info.IsDir() { + return nil + } + ext := path.Ext(fpath) + if ext != ".txt" { + return nil + } + base := path.Base(fpath) + idx := strings.Index(base, "-") + var prefix string + if idx > -1 { + prefix = base[0:idx] + } + noext := strings.TrimSuffix(fpath, ext) + (*prefixes)[prefix] = append((*prefixes)[prefix], noext) + return nil + } +} + +// inStrSlice checks whether a given string is part of a slice of +// strings +func inStrSlice(sl []string, s string) bool { + for _, v := range sl { + if s == v { + return true + } + } + return false +} + +// samplePrefixes selects random samples for each prefix, proportional +// to the amount of that prefix there are in the whole set, so that a +// total of perctosample% are sampled. +func samplePrefixes(perctosample int, prefixes Prefixes) (filestomove []string) { + var total, sample int + var keys []string + for i, v := range prefixes { + total += len(v) + // needed for determinism + sort.Strings(prefixes[i]) + keys = append(keys, i) + } + + sample = (total * perctosample) / 100 + + // This ensures the map is looped over deterministically + sort.Strings(keys) + for _, key := range keys { + prefix := prefixes[key] + len := len(prefix) + if len == 1 { + continue + } + numtoget := int(float64(sample) / float64(total) * float64(len)) + if numtoget >= len { + numtoget = len - 1 + } + if numtoget < 1 { + numtoget = 1 + } + for i := 0; i < numtoget; i++ { + var selected string + selected = prefix[rand.Int()%len] + // pick a different random selection if the first one is + // already in the filestomove slice + for inStrSlice(filestomove, selected) { + selected = prefix[rand.Int()%len] + } + filestomove = append(filestomove, selected) + } + } + + return +} + +func main() { + flag.Usage = func() { + fmt.Fprintf(flag.CommandLine.Output(), usage) + flag.PrintDefaults() + } + numtopare := flag.Int("n", 10, "Percentage of the ground truth to pare away.") + flag.Parse() + if flag.NArg() != 2 { + flag.Usage() + os.Exit(1) + } + + for _, d := range flag.Args() { + info, err := os.Stat(d) + if err != nil || !info.IsDir() { + log.Fatalln("Error accessing directory", flag.Arg(0), err) + } + } + + var prefixes Prefixes + prefixes = make(Prefixes) + err := filepath.Walk(flag.Arg(0), walker(&prefixes)) + if err != nil { + log.Fatalln("Failed to walk", flag.Arg(0), err) + } + + filestomove := samplePrefixes(*numtopare, prefixes) + + for _, f := range filestomove { + fmt.Println("Moving ground truth", f) + b := path.Base(f) + for _, ext := range []string{".txt", ".png"} { + err = os.Rename(f+ext, path.Join(flag.Arg(1), b+ext)) + if err != nil { + log.Fatalln("Error moving file", f+ext, err) + } + } + } +} diff --git a/cmd/pare-gt/main_test.go b/cmd/pare-gt/main_test.go new file mode 100644 index 0000000..c381a86 --- /dev/null +++ b/cmd/pare-gt/main_test.go @@ -0,0 +1,78 @@ +package main + +import ( + "fmt" + "testing" +) + +func TestSamplePrefixes(t *testing.T) { + prefixes := Prefixes{ + "1471-Orthographia": { + "1471-Orthographia-Tortellius_00001.txt", + "1471-Orthographia-Tortellius_00002.txt", + "1471-Orthographia-Tortellius_00003.txt", + "1471-Orthographia-Tortellius_00004.txt", + "1471-Orthographia-Tortellius_00005.txt", + "1471-Orthographia-Tortellius_00006.txt", + "1471-Orthographia-Tortellius_00007.txt", + "1471-Orthographia-Tortellius_00008.txt", + "1471-Orthographia-Tortellius_00009.txt", + "1471-Orthographia-Tortellius_000010.txt", + "1471-Orthographia-Tortellius_000011.txt", + "1471-Orthographia-Tortellius_000012.txt", + "1471-Orthographia-Tortellius_000013.txt", + "1471-Orthographia-Tortellius_000014.txt", + "1471-Orthographia-Tortellius_000015.txt", + "1471-Orthographia-Tortellius_000016.txt", + "1471-Orthographia-Tortellius_000017.txt", + "1471-Orthographia-Tortellius_000018.txt", + "1471-Orthographia-Tortellius_000019.txt", + "1471-Orthographia-Tortellius_000020.txt", + }, + "Kallimachos_1509": { + "Kallimachos_1509-ShipOfFools-Barclay_00121.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00123.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00124.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00125.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", + }, + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4": { + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_12_49.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_1_415.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_14_6628571428571429.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_16_865.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_17_62.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_18_6366666666666666.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt", + }, + } + + cases := []struct { + perc int + expected []string + }{ + {1, []string{"1471-Orthographia-Tortellius_000019.txt", "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt"}}, + {10, []string{"1471-Orthographia-Tortellius_00002.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt"}}, + {20, []string{"1471-Orthographia-Tortellius_00008.txt", "1471-Orthographia-Tortellius_000017.txt", "1471-Orthographia-Tortellius_00006.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt"}}, + } + + for _, c := range cases { + t.Run(fmt.Sprintf("%d%%", c.perc), func(t *testing.T) { + actual := samplePrefixes(c.perc, prefixes) + if len(c.expected) != len(actual) { + t.Fatalf("Number of files picked (%d) differs from expected (%d):\nExpected: %s\nActual: %s\n", len(actual), len(c.expected), c.expected, actual) + return + } + for i, v := range c.expected { + if actual[i] != v { + t.Fatalf("Difference in expected and actual files (first difference is in index %d of actual):\n\nExpected:\n%s\n\nActual:\n%s\n", i, c.expected, actual) + } + } + }) + } +} diff --git a/cmd/pgconf/main.go b/cmd/pgconf/main.go new file mode 100644 index 0000000..dbc6af8 --- /dev/null +++ b/cmd/pgconf/main.go @@ -0,0 +1,30 @@ +package main + +import ( + "flag" + "fmt" + "log" + "os" + + "rescribe.xyz/utils/pkg/hocr" +) + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: pgconf hocr\n") + fmt.Fprintf(os.Stderr, "Prints the total confidence for a page, as an average of the confidence of each word.\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() != 1 { + flag.Usage() + os.Exit(1) + } + + avg, err := hocr.GetAvgConf(flag.Arg(0)) + if err != nil { + log.Fatalf("Error retreiving confidence for %s: %v\n", flag.Arg(0), err) + } + + fmt.Printf("%0.0f\n", avg) +} diff --git a/dehyphenate/main.go b/dehyphenate/main.go deleted file mode 100644 index b2bd6f9..0000000 --- a/dehyphenate/main.go +++ /dev/null @@ -1,63 +0,0 @@ -package main - -import ( - "encoding/xml" - "flag" - "fmt" - "io/ioutil" - "log" - "os" - - "rescribe.xyz/utils/pkg/hocr" -) - -// BUGS: -// - loses all elements not captured in hocr structure such as html headings -// might be best to copy the header and footer separately and put the hocr in between, but would still need to ensure all elements are captured -// - loses any formatting; doesn't need to be identical, but e.g. linebreaks after elements would be handy -// - need to handle OcrChar - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: dehyphenate hocrin hocrout\n") - fmt.Fprintf(os.Stderr, "Dehyphenates a hocr file.\n") - flag.PrintDefaults() - } - flag.Parse() - if flag.NArg() != 2 { - flag.Usage() - os.Exit(1) - } - - in, err := ioutil.ReadFile(flag.Arg(0)) - if err != nil { - log.Fatalf("Error reading %s: %v", flag.Arg(1), err) - } - h, err := hocr.Parse(in) - if err != nil { - log.Fatal(err) - } - - for i, l := range h.Lines { - w := l.Words[len(l.Words)-1] - if len(w.Chars) == 0 { - if len(w.Text) > 0 && w.Text[len(w.Text) - 1] == '-' { - h.Lines[i].Words[len(l.Words)-1].Text = w.Text[0:len(w.Text)-1] + h.Lines[i+1].Words[0].Text - h.Lines[i+1].Words[0].Text = "" - } - } else { - log.Printf("TODO: handle OcrChar") - } - } - - f, err := os.Create(flag.Arg(1)) - if err != nil { - log.Fatalf("Error creating file %s: %v", flag.Arg(1), err) - } - defer f.Close() - e := xml.NewEncoder(f) - err = e.Encode(h) - if err != nil { - log.Fatalf("Error encoding XML: %v", err) - } -} diff --git a/doc.go b/doc.go deleted file mode 100644 index 8063aa8..0000000 --- a/doc.go +++ /dev/null @@ -1,2 +0,0 @@ -// A collection of OCR related utilities and packages -package utils diff --git a/eeboxmltohocr/main.go b/eeboxmltohocr/main.go deleted file mode 100644 index 2761cd9..0000000 --- a/eeboxmltohocr/main.go +++ /dev/null @@ -1,135 +0,0 @@ -package main - -import ( - "bufio" - "flag" - "fmt" - "io" - "log" - "os" - "regexp" - "strconv" - "strings" -) - -// splitByPb is a split function for the scanner that splits by the -// '= 0 { - return i + 1, data[0:i], nil - } - // If we're at EOF, we have a final section, so just return the lot. - if atEOF { - return len(data), data, nil - } - // Request more data. - return 0, nil, nil -} - -type Page struct { - number int - text string -} - -func addPage(pgs *[]Page, number int, text string) { - added := 0 - for i, pg := range *pgs { - if pg.number == number { - (*pgs)[i].text = pg.text + text - added = 1 - } - } - if added == 0 { - newpg := Page{number, text} - *pgs = append(*pgs, newpg) - } -} - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: eeboxmltohocr in.xml outbase\n") - flag.PrintDefaults() - } - flag.Parse() - if flag.NArg() < 2 { - flag.Usage() - os.Exit(1) - } - - f, err := os.Open(flag.Arg(0)) - defer f.Close() - if err != nil { - log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) - } - scanner := bufio.NewScanner(f) - - scanner.Split(splitByPb) - - var pgs []Page - - for scanner.Scan() { - t := scanner.Text() - r := regexp.MustCompile(`pb [^>]*facs="tcp:.*?:(.*?)"`).FindStringSubmatch(t) - if len(r) <= 1 { - continue - } - pgnum, err := strconv.Atoi(r[1]) - if err != nil { - continue - } - - content := t[strings.Index(t, ">")+1:] - ungap := regexp.MustCompile(`(?s)].+?`).ReplaceAllString(content, "") - unxml := regexp.MustCompile(`<.+?>`).ReplaceAllString(ungap, "") - - finaltxt := strings.TrimLeft(unxml, " \n") - if len(finaltxt) == 0 { - continue - } - - addPage(&pgs, pgnum, finaltxt) - } - - for _, pg := range pgs { - fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number - 1) - f, err := os.Create(fn) - if err != nil { - log.Fatalf("Could not create file %s: %v\n", fn, err) - } - defer f.Close() - - _, err = io.WriteString(f, hocrHeader + pg.text + hocrFooter) - if err != nil { - log.Fatalf("Could not write file %s: %v\n", fn, err) - } - } -} - -const hocrHeader = ` - - - - - - - - - -
-
-

- - ` - -const hocrFooter = ` - -

-
-
- -` diff --git a/fonttobytes/main.go b/fonttobytes/main.go deleted file mode 100644 index 8310e0f..0000000 --- a/fonttobytes/main.go +++ /dev/null @@ -1,49 +0,0 @@ -package main - -import ( - "bytes" - "compress/zlib" - "flag" - "fmt" - "io/ioutil" - "log" - "os" -) - -func main() { - flag.Usage = func() { - fmt.Fprintln(flag.CommandLine.Output(), "Usage: fonttobytes font.ttf") - flag.PrintDefaults() - } - flag.Parse() - - if flag.NArg() != 1 { - flag.Usage() - return - } - - f, err := os.Open(flag.Arg(0)) - if err != nil { - log.Fatalln("Failed to open file", flag.Arg(0), err) - } - fontbytes, err := ioutil.ReadAll(f) - if err != nil { - log.Fatalln("Failed to read file", flag.Arg(0), err) - } - - var compressed bytes.Buffer - w := zlib.NewWriter(&compressed) - w.Write(fontbytes) - w.Close() - - // This could be done with %+v in printf, but using the decimal rather than - // hex output saves quite a few bytes, so we do that instead. - fmt.Printf("[]byte{") - for i, b := range compressed.Bytes() { - if i > 0 { - fmt.Printf(", ") - } - fmt.Printf("%d", b) - } - fmt.Printf("}\n") -} diff --git a/hocrtotxt/main.go b/hocrtotxt/main.go deleted file mode 100644 index 6716a9e..0000000 --- a/hocrtotxt/main.go +++ /dev/null @@ -1,30 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "log" - "os" - - "rescribe.xyz/utils/pkg/hocr" -) - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: hocrtotxt hocrfile\n") - fmt.Fprintf(os.Stderr, "Prints the text from a hocr file.\n") - flag.PrintDefaults() - } - flag.Parse() - if flag.NArg() != 1 { - flag.Usage() - os.Exit(1) - } - - text, err := hocr.GetText(flag.Arg(0)) - if err != nil { - log.Fatal(err) - } - - fmt.Printf("%s\n", text) -} diff --git a/pare-gt/main.go b/pare-gt/main.go deleted file mode 100644 index a4d9600..0000000 --- a/pare-gt/main.go +++ /dev/null @@ -1,147 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "log" - "math/rand" - "os" - "path" - "path/filepath" - "sort" - "strings" -) - -const usage = `Usage: pare-gt [-n num] gtdir movedir - -Moves some of the ground truth from gt-dir into movedir, -ensuring that the same proportions of each ground truth -source are represented in the moved section. Proportion of -ground truth source is calculated by taking the prefix of -the filename up to the first '-' character. -` - -// Prefixes is a map of the prefix string to a list of filenames -type Prefixes = map[string][]string - -// walker adds any .txt path to prefixes map, under the appropriate -// prefix (blank if no '-' separator was found) -func walker(prefixes *Prefixes) filepath.WalkFunc { - return func(fpath string, info os.FileInfo, err error) error { - if err != nil { - return err - } - if info.IsDir() { - return nil - } - ext := path.Ext(fpath) - if ext != ".txt" { - return nil - } - base := path.Base(fpath) - idx := strings.Index(base, "-") - var prefix string - if idx > -1 { - prefix = base[0:idx] - } - noext := strings.TrimSuffix(fpath, ext) - (*prefixes)[prefix] = append((*prefixes)[prefix], noext) - return nil - } -} - -// inStrSlice checks whether a given string is part of a slice of -// strings -func inStrSlice(sl []string, s string) bool { - for _, v := range sl { - if s == v { - return true - } - } - return false -} - -// samplePrefixes selects random samples for each prefix, proportional -// to the amount of that prefix there are in the whole set, so that a -// total of perctosample% are sampled. -func samplePrefixes(perctosample int, prefixes Prefixes) (filestomove []string) { - var total, sample int - var keys []string - for i, v := range prefixes { - total += len(v) - // needed for determinism - sort.Strings(prefixes[i]) - keys = append(keys, i) - } - - sample = (total * perctosample) / 100 - - // This ensures the map is looped over deterministically - sort.Strings(keys) - for _, key := range keys { - prefix := prefixes[key] - len := len(prefix) - if len == 1 { - continue - } - numtoget := int(float64(sample) / float64(total) * float64(len)) - if numtoget >= len { - numtoget = len - 1 - } - if numtoget < 1 { - numtoget = 1 - } - for i := 0; i < numtoget; i++ { - var selected string - selected = prefix[rand.Int()%len] - // pick a different random selection if the first one is - // already in the filestomove slice - for inStrSlice(filestomove, selected) { - selected = prefix[rand.Int()%len] - } - filestomove = append(filestomove, selected) - } - } - - return -} - -func main() { - flag.Usage = func() { - fmt.Fprintf(flag.CommandLine.Output(), usage) - flag.PrintDefaults() - } - numtopare := flag.Int("n", 10, "Percentage of the ground truth to pare away.") - flag.Parse() - if flag.NArg() != 2 { - flag.Usage() - os.Exit(1) - } - - for _, d := range flag.Args() { - info, err := os.Stat(d) - if err != nil || !info.IsDir() { - log.Fatalln("Error accessing directory", flag.Arg(0), err) - } - } - - var prefixes Prefixes - prefixes = make(Prefixes) - err := filepath.Walk(flag.Arg(0), walker(&prefixes)) - if err != nil { - log.Fatalln("Failed to walk", flag.Arg(0), err) - } - - filestomove := samplePrefixes(*numtopare, prefixes) - - for _, f := range filestomove { - fmt.Println("Moving ground truth", f) - b := path.Base(f) - for _, ext := range []string{".txt", ".png"} { - err = os.Rename(f+ext, path.Join(flag.Arg(1), b+ext)) - if err != nil { - log.Fatalln("Error moving file", f+ext, err) - } - } - } -} diff --git a/pare-gt/main_test.go b/pare-gt/main_test.go deleted file mode 100644 index c381a86..0000000 --- a/pare-gt/main_test.go +++ /dev/null @@ -1,78 +0,0 @@ -package main - -import ( - "fmt" - "testing" -) - -func TestSamplePrefixes(t *testing.T) { - prefixes := Prefixes{ - "1471-Orthographia": { - "1471-Orthographia-Tortellius_00001.txt", - "1471-Orthographia-Tortellius_00002.txt", - "1471-Orthographia-Tortellius_00003.txt", - "1471-Orthographia-Tortellius_00004.txt", - "1471-Orthographia-Tortellius_00005.txt", - "1471-Orthographia-Tortellius_00006.txt", - "1471-Orthographia-Tortellius_00007.txt", - "1471-Orthographia-Tortellius_00008.txt", - "1471-Orthographia-Tortellius_00009.txt", - "1471-Orthographia-Tortellius_000010.txt", - "1471-Orthographia-Tortellius_000011.txt", - "1471-Orthographia-Tortellius_000012.txt", - "1471-Orthographia-Tortellius_000013.txt", - "1471-Orthographia-Tortellius_000014.txt", - "1471-Orthographia-Tortellius_000015.txt", - "1471-Orthographia-Tortellius_000016.txt", - "1471-Orthographia-Tortellius_000017.txt", - "1471-Orthographia-Tortellius_000018.txt", - "1471-Orthographia-Tortellius_000019.txt", - "1471-Orthographia-Tortellius_000020.txt", - }, - "Kallimachos_1509": { - "Kallimachos_1509-ShipOfFools-Barclay_00121.txt", - "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", - "Kallimachos_1509-ShipOfFools-Barclay_00123.txt", - "Kallimachos_1509-ShipOfFools-Barclay_00124.txt", - "Kallimachos_1509-ShipOfFools-Barclay_00125.txt", - "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", - }, - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4": { - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_12_49.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_1_415.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_14_6628571428571429.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_16_865.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_17_62.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_18_6366666666666666.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt", - }, - } - - cases := []struct { - perc int - expected []string - }{ - {1, []string{"1471-Orthographia-Tortellius_000019.txt", "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt"}}, - {10, []string{"1471-Orthographia-Tortellius_00002.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt"}}, - {20, []string{"1471-Orthographia-Tortellius_00008.txt", "1471-Orthographia-Tortellius_000017.txt", "1471-Orthographia-Tortellius_00006.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt"}}, - } - - for _, c := range cases { - t.Run(fmt.Sprintf("%d%%", c.perc), func(t *testing.T) { - actual := samplePrefixes(c.perc, prefixes) - if len(c.expected) != len(actual) { - t.Fatalf("Number of files picked (%d) differs from expected (%d):\nExpected: %s\nActual: %s\n", len(actual), len(c.expected), c.expected, actual) - return - } - for i, v := range c.expected { - if actual[i] != v { - t.Fatalf("Difference in expected and actual files (first difference is in index %d of actual):\n\nExpected:\n%s\n\nActual:\n%s\n", i, c.expected, actual) - } - } - }) - } -} diff --git a/pgconf/main.go b/pgconf/main.go deleted file mode 100644 index dbc6af8..0000000 --- a/pgconf/main.go +++ /dev/null @@ -1,30 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "log" - "os" - - "rescribe.xyz/utils/pkg/hocr" -) - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: pgconf hocr\n") - fmt.Fprintf(os.Stderr, "Prints the total confidence for a page, as an average of the confidence of each word.\n") - flag.PrintDefaults() - } - flag.Parse() - if flag.NArg() != 1 { - flag.Usage() - os.Exit(1) - } - - avg, err := hocr.GetAvgConf(flag.Arg(0)) - if err != nil { - log.Fatalf("Error retreiving confidence for %s: %v\n", flag.Arg(0), err) - } - - fmt.Printf("%0.0f\n", avg) -} -- cgit v1.2.1-24-ge1ad