summaryrefslogtreecommitdiff
path: root/bucket-lines-hocr/bucket-lines-hocr.go
blob: b35c824127498647f6d6d4494b7f5961e1364edf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
package main

// TODO: merge with -prob, using filename extension to determine what to do for each file

import (
	"flag"
	"fmt"
	"image/png"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"strings"

	"git.rescribe.xyz/testingtools/parse"
	"git.rescribe.xyz/testingtools/parse/hocr"
)

// detailsFromFile reads the hocr file f and the PNG image that accompanies
// it, and returns the result of hocr.GetLineDetails for the pair.
//
// The image path is derived from f by swapping a trailing ".hocr" extension
// for ".png", so the image must sit alongside the hocr file. The base name
// of f, minus that trailing extension, is passed to hocr.GetLineDetails as
// its name argument.
//
// A non-nil error is returned if f cannot be read or parsed, or if the image
// cannot be opened or decoded as PNG.
func detailsFromFile(f string) (parse.LineDetails, error) {
	const hocrExt = ".hocr"

	data, err := ioutil.ReadFile(f)
	if err != nil {
		return nil, err
	}

	h, err := hocr.Parse(data)
	if err != nil {
		return nil, fmt.Errorf("parsing %s: %w", f, err)
	}

	// Only the extension at the very end of the path is swapped. Replacing
	// the first ".hocr" found anywhere would corrupt paths whose directory
	// (or file stem) happens to contain ".hocr" as well.
	pngfn := strings.TrimSuffix(f, hocrExt) + ".png"
	pngf, err := os.Open(pngfn)
	if err != nil {
		return nil, err
	}
	defer pngf.Close()

	img, err := png.Decode(pngf)
	if err != nil {
		return nil, fmt.Errorf("decoding %s: %w", pngfn, err)
	}

	n := strings.TrimSuffix(filepath.Base(f), hocrExt)
	return hocr.GetLineDetails(h, img, n)
}

// main parses the command line, collects line details from every hocr file
// given as an argument, passes them all to parse.BucketUp together with the
// bucket specification and the output directory, and prints the resulting
// statistics to standard output.
//
// The program exits with a non-zero status if no hocr file is given, if any
// file fails to load, or if bucketing fails.
func main() {
	b := parse.BucketSpecs{
		// minimum confidence, name
		{0, "bad"},
		{0.95, "95to98"},
		{0.98, "98plus"},
	}

	flag.Usage = func() {
		fmt.Fprintln(os.Stderr, "Usage: bucket-lines-hocr [-d dir] hocr1 [hocr2] [...]")
		fmt.Fprintln(os.Stderr, "Copies image-text line pairs into different directories according")
		fmt.Fprintln(os.Stderr, "to the average character probability for the line.")
		fmt.Fprintln(os.Stderr, "This uses the x_wconf data in .hocr files, which it assumes will be")
		fmt.Fprintln(os.Stderr, "in the same directory as the line's image and text files. It can")
		fmt.Fprintln(os.Stderr, "handle hocr where each character is tagged separately and hocr where")
		fmt.Fprintln(os.Stderr, "only whole words are tagged.")
		flag.PrintDefaults()
	}
	dir := flag.String("d", "buckets", "Directory to store the buckets")
	flag.Parse()
	if flag.NArg() < 1 {
		flag.Usage()
		os.Exit(1)
	}

	lines := make(parse.LineDetails, 0)

	for _, f := range flag.Args() {
		newlines, err := detailsFromFile(f)
		if err != nil {
			// A single unreadable input aborts the whole run.
			log.Fatal(err)
		}

		lines = append(lines, newlines...)
	}

	stats, err := parse.BucketUp(lines, b, *dir)
	if err != nil {
		log.Fatal(err)
	}

	parse.PrintBucketStats(os.Stdout, stats)
}