summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bookpipeline/main.go190
1 files changed, 106 insertions, 84 deletions
diff --git a/bookpipeline/main.go b/bookpipeline/main.go
index cb02f5e..de38ab4 100644
--- a/bookpipeline/main.go
+++ b/bookpipeline/main.go
@@ -7,6 +7,7 @@ import (
"errors"
"flag"
"fmt"
+ "io"
"log"
"os"
"os/exec"
@@ -40,6 +41,9 @@ one is found this general process is followed:
`
+const maxticks = 20
+const cutoff = 70
+
// null writer to enable non-verbose logging to be discarded
type NullWriter bool
@@ -143,85 +147,14 @@ type GraphConf struct {
pgnum, conf float64
}
-func analyse(toanalyse chan string, up chan string, errc chan error, logger *log.Logger) {
- confs := make(map[string][]*Conf)
- bestconfs := make(map[string]*Conf)
- savedir := ""
-
- for path := range toanalyse {
- if savedir == "" {
- savedir = filepath.Dir(path)
- }
- logger.Println("Calculating confidence for", path)
- avg, err := hocr.GetAvgConf(path)
- if err != nil {
- close(up)
- errc <- errors.New(fmt.Sprintf("Error retreiving confidence for %s: %s", path, err))
- return
- }
- base := filepath.Base(path)
- codestart := strings.Index(base, "_bin")
- name := base[0:codestart]
- var c Conf
- c.path = path
- c.code = base[codestart:]
- c.conf = avg
- confs[name] = append(confs[name], &c)
-
- }
-
- fn := filepath.Join(savedir, "conf")
- logger.Println("Saving confidences in file", fn)
- f, err := os.Create(fn)
- if err != nil {
- close(up)
- errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err))
- return
- }
- defer f.Close()
-
- logger.Println("Finding best confidence for each page, and saving all confidences")
- for base, conf := range confs {
- var best float64
- for _, c := range conf {
- if c.conf > best {
- best = c.conf
- bestconfs[base] = c
- }
- _, err = fmt.Fprintf(f, "%s\t%02.f\n", c.path, c.conf)
- if err != nil {
- close(up)
- errc <- errors.New(fmt.Sprintf("Error writing confidences file: %s", err))
- return
- }
- }
- }
- up <- fn
-
- logger.Println("Creating best file listing the best file for each page")
- fn = filepath.Join(savedir, "best")
- f, err = os.Create(fn)
- if err != nil {
- close(up)
- errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err))
- return
- }
- defer f.Close()
- for _, conf := range bestconfs {
- _, err = fmt.Fprintf(f, "%s\n", filepath.Base(conf.path))
- }
- up <- fn
-
- // TODO: move this graph stuff out into its own file, it's pretty big
- logger.Println("Creating graph")
+func graph(confs map[string]*Conf, bookname string, w io.Writer) (error) {
+ // Organise confs to sort them by page
var graphconf []GraphConf
- // organise bestconfs to sort them by page
- for _, conf := range bestconfs {
+ for _, conf := range confs {
name := filepath.Base(conf.path)
numend := strings.Index(name, "_")
pgnum, err := strconv.ParseFloat(name[0:numend], 64)
if err != nil {
- logger.Printf("Failed to convert %s to float, excluding from graph\n", name[0:numend])
continue
}
var c GraphConf
@@ -230,16 +163,18 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log
graphconf = append(graphconf, c)
}
sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].pgnum < graphconf[j].pgnum })
+
+ // Create main xvalues and yvalues, annotations and ticks
var xvalues, yvalues []float64
var annotations []chart.Value2
var ticks []chart.Tick
i := 0
- tickevery := len(graphconf) / 20
+ tickevery := len(graphconf) / maxticks
for _, c := range graphconf {
i = i + 1
xvalues = append(xvalues, c.pgnum)
yvalues = append(yvalues, c.conf)
- if c.conf < 70 {
+ if c.conf < cutoff {
annotations = append(annotations, chart.Value2{Label: fmt.Sprintf("%.0f", c.pgnum), XValue: c.pgnum, YValue: c.conf})
}
if tickevery % i == 0 {
@@ -251,10 +186,24 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log
YValues: yvalues,
}
- // remove outliers at 10% of max and min confidence to use for dotted lines
+ // Create 70% line
+ yvalues = []float64{}
+ for _, _ = range xvalues {
+ yvalues = append(yvalues, cutoff)
+ }
+ cutoffSeries := chart.ContinuousSeries{
+ XValues: xvalues,
+ YValues: yvalues,
+ Style: chart.Style{
+ Show: true,
+ StrokeColor: chart.ColorAlternateGreen,
+ StrokeDashArray: []float64{10.0, 5.0},
+ },
+ }
+
+ // Create lines marking top and bottom 10% confidence
sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].conf < graphconf[j].conf })
cutoff := int(len(graphconf) / 10)
- logger.Printf("cutoff is %d, from %d\n", cutoff, len(graphconf))
mostconf := graphconf[cutoff:len(graphconf)-cutoff]
sort.Slice(mostconf, func(i, j int) bool { return mostconf[i].pgnum < mostconf[j].pgnum })
xvalues = []float64{}
@@ -284,8 +233,11 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log
InnerSeries: mostSeries,
}
- // TODO: add number of words series using yaxissecondary
graph := chart.Chart{
+ Title: fmt.Sprintf("Confidence of pages from %s", bookname),
+ TitleStyle: chart.StyleShow(),
+ Width: 1920,
+ Height: 1080,
XAxis: chart.XAxis{
Name: "Page number",
NameStyle: chart.StyleShow(),
@@ -304,14 +256,11 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log
Max: 100.0,
},
},
- //YAxisSecondary: chart.YAxis{
- // Name: "Number of words",
- // Style: chart.StyleShow(),
- //},
Series: []chart.Series{
mainSeries,
minSeries,
maxSeries,
+ cutoffSeries,
chart.LastValueAnnotation(minSeries),
chart.LastValueAnnotation(maxSeries),
chart.AnnotationSeries{
@@ -324,6 +273,79 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log
//},
},
}
+ return graph.Render(chart.PNG, w)
+}
+
+func analyse(toanalyse chan string, up chan string, errc chan error, logger *log.Logger) {
+ confs := make(map[string][]*Conf)
+ bestconfs := make(map[string]*Conf)
+ savedir := ""
+
+ for path := range toanalyse {
+ if savedir == "" {
+ savedir = filepath.Dir(path)
+ }
+ logger.Println("Calculating confidence for", path)
+ avg, err := hocr.GetAvgConf(path)
+ if err != nil {
+ close(up)
+ errc <- errors.New(fmt.Sprintf("Error retreiving confidence for %s: %s", path, err))
+ return
+ }
+ base := filepath.Base(path)
+ codestart := strings.Index(base, "_bin")
+ name := base[0:codestart]
+ var c Conf
+ c.path = path
+ c.code = base[codestart:]
+ c.conf = avg
+ confs[name] = append(confs[name], &c)
+
+ }
+
+ fn := filepath.Join(savedir, "conf")
+ logger.Println("Saving confidences in file", fn)
+ f, err := os.Create(fn)
+ if err != nil {
+ close(up)
+ errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err))
+ return
+ }
+ defer f.Close()
+
+ logger.Println("Finding best confidence for each page, and saving all confidences")
+ for base, conf := range confs {
+ var best float64
+ for _, c := range conf {
+ if c.conf > best {
+ best = c.conf
+ bestconfs[base] = c
+ }
+ _, err = fmt.Fprintf(f, "%s\t%02.f\n", c.path, c.conf)
+ if err != nil {
+ close(up)
+ errc <- errors.New(fmt.Sprintf("Error writing confidences file: %s", err))
+ return
+ }
+ }
+ }
+ up <- fn
+
+ logger.Println("Creating best file listing the best file for each page")
+ fn = filepath.Join(savedir, "best")
+ f, err = os.Create(fn)
+ if err != nil {
+ close(up)
+ errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err))
+ return
+ }
+ defer f.Close()
+ for _, conf := range bestconfs {
+ _, err = fmt.Fprintf(f, "%s\n", filepath.Base(conf.path))
+ }
+ up <- fn
+
+ logger.Println("Creating graph")
fn = filepath.Join(savedir, "graph.png")
f, err = os.Create(fn)
if err != nil {
@@ -332,7 +354,7 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log
return
}
defer f.Close()
- err = graph.Render(chart.PNG, f)
+ err = graph(bestconfs, filepath.Base(savedir), f)
if err != nil {
close(up)
errc <- errors.New(fmt.Sprintf("Error rendering graph: %s", err))