summaryrefslogtreecommitdiff
path: root/cmd/analysestats/main.go
blob: 10be46a9b81ed1def7bff1c6f16678f0706d3b90 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
// Copyright 2020 Nick White.
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.

// analysestats analyses a set of 'best', 'conf', and 'hocr' files
// in a directory, outputting results to a .csv file for further
// investigation.
package main

import (
	"bufio"
	"encoding/csv"
	"encoding/xml"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
)

// usage is the help text that main's flag.Usage handler prints ahead
// of the flag defaults.
const usage = `Usage: analysestats statsdir csvfile

analysestats analyses a set of 'best', 'conf', and 'hocr' files
in the 'statsdir' directory, outputting results to the 'csvfile'
file in CSV format for further investigation.
`

// stat holds the figures and metadata gathered for a single book.
type stat struct {
	// mean and stddev are filled in from the book's 'best' and 'conf'
	// files by walker.
	mean, stddev float64
	// training is the value walker obtains from getTrainingUsed for
	// the book's hOCR file.
	training string
	// year is parsed by walker from the start of the file name; it
	// stays 0 when no year could be parsed.
	year int
}

// Bookstats maps a book name to the stats collected for that book.
// The values are pointers, so an entry can be updated in place after
// it has been looked up. It is declared as an alias of the map type,
// not as a distinct named type.
type Bookstats = map[string]*stat

// hocrPar is one <p> element of a hOCR document. Only its lang
// attribute is decoded.
type hocrPar struct {
	Lang string `xml:"lang,attr"`
}

// hocrPars is the decoding target for a hOCR document: it collects
// the <p> elements found at the path body>div>div>p.
type hocrPars struct {
	Par []hocrPar `xml:"body>div>div>p"`
}

// getTrainingUsed reports the training used to create the hOCR file
// hocrfn, which it takes to be the lang attribute of the first <p>
// element found at body>div>div>p.
//
// An error is returned if the file cannot be read, cannot be parsed
// as XML, or contains no such <p> element.
func getTrainingUsed(hocrfn string) (string, error) {
	raw, err := ioutil.ReadFile(hocrfn)
	if err != nil {
		return "", err
	}

	var doc hocrPars
	if err := xml.Unmarshal(raw, &doc); err != nil {
		return "", err
	}

	if len(doc.Par) == 0 {
		return "", fmt.Errorf("No <p> tags found")
	}
	// only the first paragraph is consulted
	first := doc.Par[0]
	return first.Lang, nil
}

// getMeanStddevOfBest calculates the mean and sample standard
// deviation (n-1 denominator) of the confidence values of every page
// listed in bestfn, using the confidences recorded in conffn.
//
// Pages listed in bestfn that have no entry in conffn, or whose
// confidence is zero, are left out of the calculation. The standard
// deviation is 0 when only one page contributes.
//
// An error is returned if either file cannot be opened or read, or if
// no page contributes a confidence, as the mean is undefined then.
func getMeanStddevOfBest(bestfn string, conffn string) (float64, float64, error) {
	confs, err := readPageConfs(conffn)
	if err != nil {
		return 0, 0, err
	}

	bestConfs, err := readBestConfs(bestfn, confs)
	if err != nil {
		return 0, 0, err
	}
	if len(bestConfs) == 0 {
		// dividing by zero below would otherwise yield a NaN mean
		return 0, 0, fmt.Errorf("No pages with a nonzero confidence found in %s using %s", bestfn, conffn)
	}

	var sum int
	for _, v := range bestConfs {
		sum += v
	}
	mean := float64(sum) / float64(len(bestConfs))

	var stddev float64
	if len(bestConfs) > 1 {
		var sqdiffs float64
		for _, v := range bestConfs {
			d := float64(v) - mean
			sqdiffs += d * d
		}
		stddev = math.Sqrt(sqdiffs / float64(len(bestConfs)-1))
	}

	return mean, stddev, nil
}

// readPageConfs reads the conf file conffn into a map from page file
// name (base name only) to confidence. Lines that do not consist of
// exactly two whitespace-separated fields, or whose second field is
// not an integer, are skipped.
func readPageConfs(conffn string) (map[string]int, error) {
	f, err := os.Open(conffn)
	if err != nil {
		return nil, fmt.Errorf("Failed to open %s: %v", conffn, err)
	}
	defer f.Close()

	confs := make(map[string]int)
	s := bufio.NewScanner(f)
	for s.Scan() {
		parts := strings.Fields(s.Text())
		if len(parts) != 2 {
			continue
		}
		c, err := strconv.Atoi(parts[1])
		if err != nil {
			continue
		}
		confs[filepath.Base(parts[0])] = c
	}
	// Scan returns false on a read error as well as at end of file, so
	// the error has to be checked explicitly to avoid using partial data
	if err := s.Err(); err != nil {
		return nil, fmt.Errorf("Failed to read %s: %v", conffn, err)
	}
	return confs, nil
}

// readBestConfs returns the confidence, looked up in confs, of each
// page named on a line of the best file bestfn. Pages missing from
// confs and pages with zero confidence are skipped.
func readBestConfs(bestfn string, confs map[string]int) ([]int, error) {
	f, err := os.Open(bestfn)
	if err != nil {
		return nil, fmt.Errorf("Failed to open %s: %v", bestfn, err)
	}
	defer f.Close()

	var bestConfs []int
	s := bufio.NewScanner(f)
	for s.Scan() {
		c, ok := confs[s.Text()]
		if !ok {
			continue
		}
		// skip zero confidence pages, as they're likely blank pages
		if c == 0 {
			continue
		}
		bestConfs = append(bestConfs, c)
	}
	if err := s.Err(); err != nil {
		return nil, fmt.Errorf("Failed to read %s: %v", bestfn, err)
	}
	return bestConfs, nil
}

// walker returns a filepath.WalkFunc that picks out files whose base
// name ends in "-hocr" or "-best" and uses them to fill the bookstats
// map, creating an entry for a book the first time one of its files
// is seen. A hocr file supplies the training used; a best file
// supplies the mean and standard deviation. The matching conf file
// (the same path with the trailing "-best" swapped for "-conf") is
// read along with the best file, as the two need to be parsed
// together to get those statistics.
//
// A file that cannot be processed is logged as a warning and skipped,
// leaving the affected fields at their zero values; only an error
// passed in by filepath.Walk itself stops the walk.
func walker(bookstats *Bookstats) filepath.WalkFunc {
	return func(fpath string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		b := filepath.Base(fpath)
		parts := strings.Split(b, "-")
		// if no - or name is too short to have a useful prefix, bail
		if len(parts) < 2 || len(b) < 6 {
			return nil
		}
		ext := parts[len(parts)-1]
		if ext != "hocr" && ext != "best" {
			return nil
		}

		// NOTE(review): six bytes are dropped although "-hocr" and
		// "-best" are five bytes long, so the byte before the suffix
		// is lost too. The original comment says this is deliberate
		// ("length of '-hocr' + 1"); confirm against real file names.
		prefix := b[:len(b)-6]

		var year int
		parts2 := strings.Split(b, "_")
		if len(parts2) > 2 {
			// we can ignore an error as a zero year is correct in that case anyway
			year, _ = strconv.Atoi(parts2[0])
		}

		st, ok := (*bookstats)[prefix]
		if !ok {
			st = &stat{year: year}
			(*bookstats)[prefix] = st
		}

		switch ext {
		case "hocr":
			training, err := getTrainingUsed(fpath)
			if err != nil {
				log.Printf("Warning: failed to get training used from %s: %v\n", fpath, err)
				return nil
			}
			st.training = training
		case "best":
			// Swap only the trailing suffix: replacing every "-best" in
			// the path would also rewrite directory names and earlier
			// parts of the file name, pointing at the wrong conf file.
			confpath := strings.TrimSuffix(fpath, "-best") + "-conf"
			mean, stddev, err := getMeanStddevOfBest(fpath, confpath)
			if err != nil {
				log.Printf("Warning: failed to get mean & standard deviation from %s and %s: %v\n", fpath, confpath, err)
				return nil
			}
			st.mean = mean
			st.stddev = stddev
		}

		return nil
	}
}

// main walks the statsdir directory given as the first argument,
// gathers the stats for each book found there, and writes them to the
// CSV file given as the second argument, one row per book below a
// header row. Rows are written in ascending order of book name so
// that the output is the same from run to run.
func main() {
	flag.Usage = func() {
		// Fprint rather than Fprintf, as usage is not a format string
		fmt.Fprint(flag.CommandLine.Output(), usage)
		flag.PrintDefaults()
	}
	flag.Parse()
	if flag.NArg() != 2 {
		flag.Usage()
		os.Exit(1)
	}
	statsdir, csvfn := flag.Arg(0), flag.Arg(1)

	info, err := os.Stat(statsdir)
	if err != nil {
		log.Fatalln("Error accessing directory", statsdir, err)
	}
	if !info.IsDir() {
		log.Fatalf("Error accessing directory %s: not a directory\n", statsdir)
	}

	bookstats := make(Bookstats)
	err = filepath.Walk(statsdir, walker(&bookstats))
	if err != nil {
		log.Fatalln("Failed to walk", statsdir, err)
	}

	// map iteration order is random, so sort the names to keep the
	// order of the rows stable
	names := make([]string, 0, len(bookstats))
	for name := range bookstats {
		names = append(names, name)
	}
	sort.Strings(names)

	f, err := os.Create(csvfn)
	if err != nil {
		log.Fatalf("Failed to create file %s: %v\n", csvfn, err)
	}
	csvw := csv.NewWriter(f)

	err = csvw.Write([]string{"Name", "Year", "Mean", "Standard Deviation", "Training"})
	if err != nil {
		log.Fatalf("Failed to write header to csv: %v\n", err)
	}
	for _, name := range names {
		stats := bookstats[name]
		year := strconv.Itoa(stats.year)
		mean := fmt.Sprintf("%0.1f", stats.mean)
		stddev := fmt.Sprintf("%0.1f", stats.stddev)
		err = csvw.Write([]string{name, year, mean, stddev, stats.training})
		if err != nil {
			log.Fatalf("Failed to write record %s to csv: %v\n", name, err)
		}
	}

	// the csv writer is buffered, so a failed write may only show up
	// once it is flushed
	csvw.Flush()
	if err := csvw.Error(); err != nil {
		log.Fatalf("Failed to write to file %s: %v\n", csvfn, err)
	}
	// closed explicitly rather than deferred, so that a failure to
	// close is reported instead of being silently dropped
	if err := f.Close(); err != nil {
		log.Fatalf("Failed to close file %s: %v\n", csvfn, err)
	}
}