summaryrefslogtreecommitdiff
path: root/bucket-lines-hocr
diff options
context:
space:
mode:
authorNick White <git@njw.name>2019-01-24 17:56:14 +0000
committerNick White <git@njw.name>2019-01-24 17:56:14 +0000
commit6e08494a8883a8e304ef02ea352a11776ab2294c (patch)
tree47ca50fefafa5f8381dc2237792aff11a9b2c085 /bucket-lines-hocr
parent010255d2c19bbd06ca7e5ce92d8f98f3c12e0ad3 (diff)
Merge bucket-lines-{prob,hocr} into one tool called bucket-lines, that uses the filename extension to determine how to process the lines
Diffstat (limited to 'bucket-lines-hocr')
-rw-r--r--bucket-lines-hocr/bucket-lines-hocr.go91
1 files changed, 0 insertions, 91 deletions
diff --git a/bucket-lines-hocr/bucket-lines-hocr.go b/bucket-lines-hocr/bucket-lines-hocr.go
deleted file mode 100644
index b35c824..0000000
--- a/bucket-lines-hocr/bucket-lines-hocr.go
+++ /dev/null
@@ -1,91 +0,0 @@
-package main
-
-// TODO: merge with -prob, using filename extension to determine what to do for each file
-
-import (
- "flag"
- "fmt"
- "image/png"
- "io/ioutil"
- "log"
- "os"
- "path/filepath"
- "strings"
-
- "git.rescribe.xyz/testingtools/parse"
- "git.rescribe.xyz/testingtools/parse/hocr"
-)
-
-func detailsFromFile(f string) (parse.LineDetails, error) {
- var newlines parse.LineDetails
-
- file, err := ioutil.ReadFile(f)
- if err != nil {
- return newlines, err
- }
-
- h, err := hocr.Parse(file)
- if err != nil {
- return newlines, err
- }
-
- pngfn := strings.Replace(f, ".hocr", ".png", 1)
- pngf, err := os.Open(pngfn)
- if err != nil {
- return newlines, err
- }
- defer pngf.Close()
- img, err := png.Decode(pngf)
- if err != nil {
- return newlines, err
- }
-
- n := strings.Replace(filepath.Base(f), ".hocr", "", 1)
- return hocr.GetLineDetails(h, img, n)
-}
-
-func main() {
- b := parse.BucketSpecs{
- // minimum confidence, name
- { 0, "bad" },
- { 0.95, "95to98" },
- { 0.98, "98plus" },
- }
-
- flag.Usage = func() {
- fmt.Fprintf(os.Stderr, "Usage: bucket-lines-hocr [-d dir] hocr1 [hocr2] [...]\n")
- fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n")
- fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n")
- fmt.Fprintf(os.Stderr, "This uses the x_wconf data in .hocr files, which it assumes will be.\n")
- fmt.Fprintf(os.Stderr, "in the same directory as the line's image and text files. It can\n")
- fmt.Fprintf(os.Stderr, "handle hocr where each character is tagged separately and hocr where\n")
- fmt.Fprintf(os.Stderr, "only whole words are tagged.\n")
- flag.PrintDefaults()
- }
- dir := flag.String("d", "buckets", "Directory to store the buckets")
- flag.Parse()
- if flag.NArg() < 1 {
- flag.Usage()
- os.Exit(1)
- }
-
- lines := make(parse.LineDetails, 0)
-
- for _, f := range flag.Args() {
- newlines, err := detailsFromFile(f)
- if err != nil {
- log.Fatal(err)
- }
-
- for _, l := range newlines {
- lines = append(lines, l)
- }
- }
-
- stats, err := parse.BucketUp(lines, b, *dir)
- if err != nil {
- log.Fatal(err)
- }
-
- parse.PrintBucketStats(os.Stdout, stats)
-}