[analysestats] complete

author: Nick White <git@njw.name> 2020-09-22 13:22:59 +0100
committer: Nick White <git@njw.name> 2020-09-22 13:22:59 +0100
commit: 251c0ea75d6ad28dbea7ad38bf15349a3b108d9e (patch)
tree: 96b0379ff3e1a09b34b0aa85656a585274fca459 /cmd
parent: 9aea616e32dbd94e1670e7cee4d96612ce0bf217 (diff)
1 files changed, 73 insertions, 1 deletions
diff --git a/cmd/analysestats/main.go b/cmd/analysestats/main.go
index 1d6071d..a15f3b5 100644
--- a/cmd/analysestats/main.go
+++ b/cmd/analysestats/main.go
@@ -8,12 +8,14 @@
 package main
 
 import (
+	"bufio"
 	"encoding/csv"
 	"encoding/xml"
 	"flag"
 	"fmt"
 	"io/ioutil"
 	"log"
+	"math"
 	"os"
 	"path/filepath"
 	"strings"
@@ -65,6 +67,69 @@ func getTrainingUsed(hocrfn string) (string, error) {
 	return par.Par[0].Lang, nil
 }
 
+// getMeanStddevOfBest calculates the mean and standard deviation
+// of the confidence values of every page in bestfn, as listed in
+// conffn.
+func getMeanStddevOfBest(bestfn string, conffn string) (float64, float64, error) {
+	f, err := os.Open(conffn)
+	if err != nil {
+		return 0, 0, fmt.Errorf("Failed to open %s: %v", conffn, err)
+	}
+	defer f.Close()
+	s := bufio.NewScanner(f)
+
+	// create a map of confs from the conf file
+	var confs map[string]int
+	confs = make(map[string]int)
+	for s.Scan() {
+		line := s.Text()
+		parts := strings.Fields(line)
+		if len(parts) != 2 {
+			continue
+		}
+		c, err := strconv.Atoi(parts[1])
+		if err != nil {
+			continue
+		}
+		fn := filepath.Base(parts[0])
+		confs[fn] = c
+	}
+
+	f, err = os.Open(bestfn)
+	if err != nil {
+		return 0, 0, fmt.Errorf("Failed to open %s: %v", bestfn, err)
+	}
+	defer f.Close()
+	s = bufio.NewScanner(f)
+
+	var bestConfs []int
+	for s.Scan() {
+		fn := s.Text()
+		c, ok := confs[fn]
+		if !ok {
+			continue
+		}
+		bestConfs = append(bestConfs, c)
+	}
+
+	var sum int
+	for _, v := range bestConfs {
+		sum += v
+	}
+	mean := float64(sum) / float64(len(bestConfs))
+
+	var a, stddev float64
+	if len(bestConfs) > 1 {
+		for _, v := range bestConfs {
+			a += (float64(v) - mean) * (float64(v) - mean)
+		}
+		variance := a / float64(len(bestConfs) - 1)
+		stddev = math.Sqrt(variance)
+	}
+
+	return mean, stddev, nil
+}
+
 // walker returns a walkfunc that checks for hocr and best files,
 // and uses them to fill the bookstats map & structure. Note that
 // the stat file is read when the best file is read, as they need
@@ -112,7 +177,14 @@ func walker(bookstats *Bookstats) filepath.WalkFunc {
 			}
 			(*bookstats)[prefix].training = training
 		case "best":
-			// TODO: read conf also and fill in mean and stddev
+			confpath := strings.Replace(fpath, "-best", "-conf", -1)
+			mean, stddev, err := getMeanStddevOfBest(fpath, confpath)
+			if err != nil {
+				log.Printf("Warning: failed to get mean & standard deviation from %s and %s: %v\n", fpath, confpath, err)
+				return nil
+			}
+			(*bookstats)[prefix].mean = mean
+			(*bookstats)[prefix].stddev = stddev
 		}
 
 		return nil
author	Nick White <git@njw.name>	2020-09-22 13:22:59 +0100
committer	Nick White <git@njw.name>	2020-09-22 13:22:59 +0100
commit	251c0ea75d6ad28dbea7ad38bf15349a3b108d9e (patch)
tree	96b0379ff3e1a09b34b0aa85656a585274fca459 /cmd
parent	9aea616e32dbd94e1670e7cee4d96612ce0bf217 (diff)