diff options
| author | Nick White <git@njw.name> | 2019-10-08 12:52:33 +0100 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2019-10-08 12:52:33 +0100 | 
| commit | 7482157a03ed3e9d7f45e54a126b391001f34948 (patch) | |
| tree | 52f87b9ca159fe4c04a0349de95ea9de82692b3c /bucket-lines | |
| parent | d43c11bf653bfe3c1ad1ed277f1ec08bf155cf98 (diff) | |
Separate out bookpipeline from catch-all go.git repo, and rename to rescribe.xyz/bookpipeline
The dependencies from the go.git repo will follow in due course.
Diffstat (limited to 'bucket-lines')
| -rw-r--r-- | bucket-lines/bucket.go | 131 | ||||
| -rw-r--r-- | bucket-lines/main.go | 87 | 
2 files changed, 0 insertions, 218 deletions
diff --git a/bucket-lines/bucket.go b/bucket-lines/bucket.go deleted file mode 100644 index 9f98887..0000000 --- a/bucket-lines/bucket.go +++ /dev/null @@ -1,131 +0,0 @@ -package main - -import ( -	"fmt" -	"io" -	"os" -	"path/filepath" -	"sort" -	"strconv" - -	"rescribe.xyz/go.git/lib/line" -) - -type BucketSpec struct { -	Min  float64 -	Name string -} -type BucketSpecs []BucketSpec - -func (b BucketSpecs) Len() int           { return len(b) } -func (b BucketSpecs) Swap(i, j int)      { b[i], b[j] = b[j], b[i] } -func (b BucketSpecs) Less(i, j int) bool { return b[i].Min < b[j].Min } - -type BucketStat struct { -	name string -	num  int -} -type BucketStats []BucketStat - -func (b BucketStats) Len() int           { return len(b) } -func (b BucketStats) Swap(i, j int)      { b[i], b[j] = b[j], b[i] } -func (b BucketStats) Less(i, j int) bool { return b[i].num < b[j].num } - -// Copies the image and text for a line into a directory based on -// the line confidence, as defined by the buckets struct -func bucketLine(l line.Detail, buckets BucketSpecs, dirname string) (string, error) { -	var bucket string - -	todir := "" -	for _, b := range buckets { -		if l.Avgconf >= b.Min { -			todir = b.Name -			bucket = b.Name -		} -	} - -	if todir == "" { -		return bucket, nil -	} - -	avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64) -	if len(avgstr) > 2 { -		avgstr = avgstr[2:] -	} - -	base := filepath.Join(dirname, todir, l.OcrName+"_"+l.Name+"_"+avgstr) - -	err := os.MkdirAll(filepath.Join(dirname, todir), 0700) -	if err != nil { -		return bucket, err -	} - -	f, err := os.Create(base + ".png") -	if err != nil { -		return bucket, err -	} -	defer f.Close() - -	err = l.Img.CopyLineTo(f) -	if err != nil { -		return bucket, err -	} - -	f, err = os.Create(base + ".txt") -	if err != nil { -		return bucket, err -	} -	defer f.Close() - -	_, err = io.WriteString(f, l.Text) -	if err != nil { -		return bucket, err -	} - -	return bucket, err -} - -// Copies line images and text into directories based on their -// confidence, as defined by the buckets struct, and returns -// statistics of whire lines went in the process. -func BucketUp(lines line.Details, buckets BucketSpecs, dirname string) (BucketStats, error) { -	var all []string -	var stats BucketStats - -	sort.Sort(lines) -	sort.Sort(buckets) -	for _, l := range lines { -		bname, err := bucketLine(l, buckets, dirname) -		if err != nil { -			return stats, err -		} -		all = append(all, bname) -	} - -	for _, b := range all { -		i := sort.Search(len(stats), func(i int) bool { return stats[i].name == b }) -		if i == len(stats) { -			newstat := BucketStat{b, 0} -			stats = append(stats, newstat) -			i = len(stats) - 1 -		} -		stats[i].num++ -	} - -	return stats, nil -} - -// Prints statistics of where lines went when bucketing -func PrintBucketStats(w io.Writer, stats BucketStats) { -	var total int -	for _, s := range stats { -		total += s.num -	} - -	fmt.Fprintf(w, "Copied %d lines\n", total) -	fmt.Fprintf(w, "---------------------------------\n") -	sort.Sort(stats) -	for _, s := range stats { -		fmt.Fprintf(w, "Lines in %7s: %2d%%\n", s.name, 100*s.num/total) -	} -} diff --git a/bucket-lines/main.go b/bucket-lines/main.go deleted file mode 100644 index 990e84c..0000000 --- a/bucket-lines/main.go +++ /dev/null @@ -1,87 +0,0 @@ -package main - -import ( -	"encoding/json" -	"flag" -	"fmt" -	"io/ioutil" -	"log" -	"os" -	"path/filepath" - -	"rescribe.xyz/go.git/lib/hocr" -	"rescribe.xyz/go.git/lib/line" -	"rescribe.xyz/go.git/lib/prob" -) - -func main() { -	b := BucketSpecs{ -		// minimum confidence, name -		{0, "bad"}, -		{0.95, "95to98"}, -		{0.98, "98plus"}, -	} - -	flag.Usage = func() { -		fmt.Fprintf(os.Stderr, "Usage: bucket-lines [-d dir] [-s specs.json] [hocr1] [prob1] [hocr2] [...]\n") -		fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n") -		fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n") -		fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n") -		fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n") -		fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") -		fmt.Fprintf(os.Stderr, "option.\n") -		fmt.Fprintf(os.Stderr, "The .prob and .hocr files are assumed to be in the same directory\n") -		fmt.Fprintf(os.Stderr, "as the line's image and text files.\n\n") -		flag.PrintDefaults() -		fmt.Fprintf(os.Stderr, "\nAn example specs.json file would be the following:\n") -		fmt.Fprintf(os.Stderr, "[{\"min\": 0, \"name\": \"terrible\"}, {\"min\": 0.80, \"name\": \"ok\"}, {\"min\": 0.98, \"name\": \"great\"}]\n") -	} -	dir := flag.String("d", "buckets", "Directory to store the buckets") -	specs := flag.String("s", "", "JSON file describing specs to bucket into") -	flag.Parse() -	if flag.NArg() < 1 { -		flag.Usage() -		os.Exit(1) -	} - -	if *specs != "" { -		js, err := ioutil.ReadFile(*specs) -		if err != nil { -			log.Fatal(err) -		} -		err = json.Unmarshal(js, &b) -		if err != nil { -			log.Fatal(err) -		} -	} - -	var err error -	lines := make(line.Details, 0) - -	for _, f := range flag.Args() { -		var newlines line.Details -		switch ext := filepath.Ext(f); ext { -		case ".prob": -			newlines, err = prob.GetLineDetails(f) -		case ".hocr": -			newlines, err = hocr.GetLineDetails(f) -		default: -			log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) -			continue -		} -		if err != nil { -			log.Fatal(err) -		} - -		for _, l := range newlines { -			lines = append(lines, l) -		} -	} - -	stats, err := BucketUp(lines, b, *dir) -	if err != nil { -		log.Fatal(err) -	} - -	PrintBucketStats(os.Stdout, stats) -}  | 
