summaryrefslogtreecommitdiff
path: root/cmd/bucket-lines/main.go
blob: af81b44ae1aeecec9ea9bcebf19a947f8a4ba704 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"

	"rescribe.xyz/utils/pkg/hocr"
	"rescribe.xyz/utils/pkg/line"
	"rescribe.xyz/utils/pkg/prob"
)

// main is the entry point of bucket-lines. It parses the command line,
// optionally loads bucket specs from the JSON file named by -s, collects
// line details from every .prob and .hocr argument, and passes the lines
// that have an image to BucketUp before printing the resulting statistics
// to stdout. Arguments with any other extension are skipped with a log
// message. Every failure is fatal and is logged together with the file or
// directory it relates to.
func main() {
	// Default buckets, used unless a specs file is supplied with -s.
	b := BucketSpecs{
		// minimum confidence, name
		{0, "bad"},
		{0.95, "95to98"},
		{0.98, "98plus"},
	}

	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: bucket-lines [-d dir] [-s specs.json] [hocr1] [prob1] [hocr2] [...]\n")
		fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n")
		fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n")
		fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n")
		fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n")
		fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n")
		fmt.Fprintf(os.Stderr, "option.\n")
		fmt.Fprintf(os.Stderr, "The .prob and .hocr files are assumed to be in the same directory\n")
		fmt.Fprintf(os.Stderr, "as the line's image and text files.\n\n")
		flag.PrintDefaults()
		fmt.Fprintf(os.Stderr, "\nAn example specs.json file would be the following:\n")
		fmt.Fprintf(os.Stderr, "[{\"min\": 0, \"name\": \"terrible\"}, {\"min\": 0.80, \"name\": \"ok\"}, {\"min\": 0.98, \"name\": \"great\"}]\n")
	}
	dir := flag.String("d", "buckets", "Directory to store the buckets")
	specs := flag.String("s", "", "JSON file describing specs to bucket into")
	flag.Parse()
	if flag.NArg() < 1 {
		flag.Usage()
		os.Exit(1)
	}

	if *specs != "" {
		js, err := ioutil.ReadFile(*specs)
		if err != nil {
			log.Fatalf("reading specs file '%s': %v", *specs, err)
		}
		// The decoded specs are unmarshalled into b, over the defaults.
		// A JSON error carries no filename of its own, so name it here.
		if err := json.Unmarshal(js, &b); err != nil {
			log.Fatalf("parsing specs file '%s': %v", *specs, err)
		}
	}

	lines := make(line.Details, 0)

	for _, f := range flag.Args() {
		// err is scoped per file so a stale value can never leak
		// from one iteration into the next.
		var newlines line.Details
		var err error
		switch filepath.Ext(f) {
		case ".prob":
			newlines, err = prob.GetLineDetails(f)
		case ".hocr":
			newlines, err = hocr.GetLineDetails(f)
		default:
			log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f)
			continue
		}
		if err != nil {
			log.Fatalf("getting line details from '%s': %v", f, err)
		}

		// Only lines that carry an image are passed on for bucketing.
		for _, l := range newlines {
			if l.Img != nil {
				lines = append(lines, l)
			}
		}
	}

	stats, err := BucketUp(lines, b, *dir)
	if err != nil {
		log.Fatalf("bucketing lines into '%s': %v", *dir, err)
	}

	PrintBucketStats(os.Stdout, stats)
}