summaryrefslogtreecommitdiff
path: root/cmd/bucket-lines/main.go
blob: fddff21b94bf37dfbbb1b26f128749153a67a592 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// Copyright 2019 Nick White.
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.

// bucket-lines copies image-text line pairs into different directories
// according to the average character probability for the line.
package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"

	"rescribe.xyz/utils/pkg/hocr"
	"rescribe.xyz/utils/pkg/line"
	"rescribe.xyz/utils/pkg/prob"
)

// main is the entry point of bucket-lines. It parses the command line,
// optionally loads bucket specifications from a JSON file, collects line
// details from every .prob and .hocr argument, passes the lines that have
// an image to BucketUp, and prints the returned statistics to stdout.
// Any error encountered along the way terminates the program via log.Fatal.
func main() {
	// Built-in specification, used when no -s file is supplied.
	// Each entry is {minimum confidence, name}.
	buckets := BucketSpecs{
		{0, "bad"},
		{0.95, "95to98"},
		{0.98, "98plus"},
	}

	flag.Usage = func() {
		fmt.Fprint(os.Stderr, "Usage: bucket-lines [-d dir] [-s specs.json] [hocr1] [prob1] [hocr2] [...]\n"+
			"Copies image-text line pairs into different directories according\n"+
			"to the average character probability for the line.\n"+
			"Both .hocr and .prob files can be processed.\n"+
			"For .hocr files, the x_wconf data is used to calculate confidence.\n"+
			"The .prob files are generated using ocropy-rpred's --probabilities\n"+
			"option.\n"+
			"The .prob and .hocr files are assumed to be in the same directory\n"+
			"as the line's image and text files.\n\n")
		flag.PrintDefaults()
		fmt.Fprint(os.Stderr, "\nAn example specs.json file would be the following:\n"+
			`[{"min": 0, "name": "terrible"}, {"min": 0.80, "name": "ok"}, {"min": 0.98, "name": "great"}]`+"\n")
	}
	outDir := flag.String("d", "buckets", "Directory to store the buckets")
	specFile := flag.String("s", "", "JSON file describing specs to bucket into")
	flag.Parse()

	// At least one input file is required.
	inputs := flag.Args()
	if len(inputs) == 0 {
		flag.Usage()
		os.Exit(1)
	}

	// A user supplied specification is decoded into the same variable
	// that holds the built-in one.
	if name := *specFile; name != "" {
		raw, err := ioutil.ReadFile(name)
		if err != nil {
			log.Fatal(err)
		}
		if err := json.Unmarshal(raw, &buckets); err != nil {
			log.Fatal(err)
		}
	}

	collected := line.Details{}

	for _, input := range inputs {
		var (
			found line.Details
			err   error
		)

		// The file extension selects which package reads the line details.
		switch filepath.Ext(input) {
		case ".hocr":
			found, err = hocr.GetLineDetails(input)
		case ".prob":
			found, err = prob.GetLineDetails(input)
		default:
			log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", input)
			continue
		}
		if err != nil {
			log.Fatal(err)
		}

		// Only lines that carry an image are kept.
		for _, detail := range found {
			if detail.Img == nil {
				continue
			}
			collected = append(collected, detail)
		}
	}

	stats, err := BucketUp(collected, buckets, *outDir)
	if err != nil {
		log.Fatal(err)
	}

	PrintBucketStats(os.Stdout, stats)
}