summaryrefslogtreecommitdiff
path: root/cmd/analysestats/main.go
blob: 1d6071d942da88d34712306a8d83f7fe93eab248 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
// Copyright 2020 Nick White.
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.

// analysestats analyses a set of 'best', 'conf', and 'hocr' files
// in a directory, outputting results to a .csv file for further
// investigation.
package main

import (
	"encoding/csv"
	"encoding/xml"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"strings"
	"strconv"
)

// usage is the help text that main's flag.Usage function prints
// before the flag defaults.
const usage = `Usage: analysestats statsdir csvfile

analysestats analyses a set of 'best', 'conf', and 'hocr' files
in the 'statsdir' directory, outputting results to the 'csvfile'
file in CSV format for further investigation.
`

// stat represents key stats / metadata for a book. One stat is kept
// per book in a Bookstats map, and each becomes one row of the CSV
// output written by main.
type stat struct {
	// mean is written to the "Mean" CSV column. Nothing in this file
	// sets it yet (see the TODO in walker), so it is always zero.
	// Presumably the mean confidence for the book - TODO confirm.
	mean     float64
	// stddev is written to the "Standard Deviation" CSV column. Like
	// mean, it is not set anywhere in this file yet.
	stddev   float64
	// training is the value getTrainingUsed read from the book's hocr
	// file; empty if no hocr file was found or it could not be parsed.
	training string
	// year is parsed from the start of the file name by walker; zero
	// if the name held no usable year.
	year     int
}

// Bookstats maps each book's name to the stats gathered for that
// book. It is a type alias, so it is interchangeable with a plain
// map[string]*stat.
type Bookstats = map[string]*stat

// hocrPars is the target that getTrainingUsed unmarshals a hOCR
// file into. Only the paragraph elements at body>div>div>p are
// collected, and only their lang attribute is kept.
type hocrPars struct {
	Par []struct {
		Lang string `xml:"lang,attr"`
	} `xml:"body>div>div>p"`
}

// getTrainingUsed reads the hOCR file named hocrfn and returns the
// lang attribute of the first paragraph that hocrPars picks out of
// it, which is treated as the name of the training used to create
// the file.
//
// An error is returned if the file cannot be read, if it cannot be
// unmarshalled as XML, or if it contains no matching paragraphs.
func getTrainingUsed(hocrfn string) (string, error) {
	data, readErr := ioutil.ReadFile(hocrfn)
	if readErr != nil {
		return "", readErr
	}

	doc := hocrPars{}
	if parseErr := xml.Unmarshal(data, &doc); parseErr != nil {
		return "", parseErr
	}

	if len(doc.Par) == 0 {
		return "", fmt.Errorf("No <p> tags found")
	}

	// Only the first paragraph is consulted.
	first := doc.Par[0]
	return first.Lang, nil
}

// walker returns a filepath.WalkFunc that fills the bookstats map
// from the hocr and best files it is shown.
//
// Only non-directory entries whose base name ends in "-hocr" or
// "-best" are used; anything else is silently skipped. The map key
// is the base name with that suffix removed, so the hocr and best
// files of one book share a single entry. If the base name has more
// than two "_"-separated fields, the first is parsed as the book's
// year; otherwise the year is left as zero. The year is only
// recorded when the book's entry is first created.
//
// For a hocr file the training name is read with getTrainingUsed. A
// failure there is logged as a warning and does not stop the walk,
// though the book keeps its (otherwise empty) entry. A best file
// currently only ensures the entry exists; mean and stddev are not
// yet computed.
//
// An error passed in by filepath.Walk is returned unchanged, which
// stops the walk.
func walker(bookstats *Bookstats) filepath.WalkFunc {
	return func(fpath string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}

		b := filepath.Base(fpath)
		// Split at the last "-": everything before it is the book
		// name, everything after it is the file kind. A missing "-"
		// or an empty book name means there is nothing useful here.
		sep := strings.LastIndex(b, "-")
		if sep < 1 {
			return nil
		}
		// Slicing at the separator removes exactly the "-hocr" or
		// "-best" suffix, so the book name keeps its final character.
		prefix, ext := b[:sep], b[sep+1:]

		if ext != "hocr" && ext != "best" {
			return nil
		}

		var year int
		if fields := strings.Split(b, "_"); len(fields) > 2 {
			// we can ignore an error as a zero year is correct in that case anyway
			year, _ = strconv.Atoi(fields[0])
		}

		st, ok := (*bookstats)[prefix]
		if !ok {
			st = &stat{year: year}
			(*bookstats)[prefix] = st
		}

		switch ext {
		case "hocr":
			training, err := getTrainingUsed(fpath)
			if err != nil {
				log.Printf("Warning: failed to get training used from %s: %v\n", fpath, err)
				return nil
			}
			st.training = training
		case "best":
			// TODO: read conf also and fill in mean and stddev
		}

		return nil
	}
}

// main parses the command line, walks the stats directory to gather
// per-book stats, and writes them to the CSV file.
//
// Exactly two arguments are required: the directory to analyse and
// the CSV file to create. With any other number of arguments the
// usage text is printed and the program exits with status 1. Any
// failure to read the directory or to create, write, flush or close
// the CSV file is fatal.
//
// Rows are written in map iteration order, which is not stable
// between runs.
func main() {
	flag.Usage = func() {
		// Fprint, not Fprintf: usage is plain text, not a format string.
		fmt.Fprint(flag.CommandLine.Output(), usage)
		flag.PrintDefaults()
	}
	flag.Parse()
	if flag.NArg() != 2 {
		flag.Usage()
		os.Exit(1)
	}
	statsdir, csvfile := flag.Arg(0), flag.Arg(1)

	info, err := os.Stat(statsdir)
	if err != nil {
		log.Fatalln("Error accessing directory", statsdir, err)
	}
	if !info.IsDir() {
		// Stat succeeded, so there is no error value worth printing.
		log.Fatalln("Error accessing directory", statsdir, "not a directory")
	}

	bookstats := make(Bookstats)
	if err := filepath.Walk(statsdir, walker(&bookstats)); err != nil {
		log.Fatalln("Failed to walk", statsdir, err)
	}

	f, err := os.Create(csvfile)
	if err != nil {
		log.Fatalf("Failed to create file %s: %v\n", csvfile, err)
	}
	// The file is closed explicitly at the end rather than deferred,
	// so that a failed close (which can mean lost data) is reported.
	csvw := csv.NewWriter(f)

	header := []string{"Name", "Year", "Mean", "Standard Deviation", "Training"}
	if err := csvw.Write(header); err != nil {
		log.Fatalf("Failed to write header to csv: %v\n", err)
	}
	for name, stats := range bookstats {
		year := fmt.Sprintf("%d", stats.year)
		mean := fmt.Sprintf("%0.1f", stats.mean)
		stddev := fmt.Sprintf("%0.1f", stats.stddev)
		if err := csvw.Write([]string{name, year, mean, stddev, stats.training}); err != nil {
			log.Fatalf("Failed to write record %s to csv: %v\n", name, err)
		}
	}

	// csv.Writer buffers its output; errors from the underlying file
	// only surface through Error after Flush.
	csvw.Flush()
	if err := csvw.Error(); err != nil {
		log.Fatalf("Failed to flush csv to %s: %v\n", csvfile, err)
	}
	if err := f.Close(); err != nil {
		log.Fatalf("Failed to close file %s: %v\n", csvfile, err)
	}
}