diff options
-rw-r--r-- | bookpipeline/main.go | 190 |
1 file changed, 106 insertions, 84 deletions
diff --git a/bookpipeline/main.go b/bookpipeline/main.go index cb02f5e..de38ab4 100644 --- a/bookpipeline/main.go +++ b/bookpipeline/main.go @@ -7,6 +7,7 @@ import ( "errors" "flag" "fmt" + "io" "log" "os" "os/exec" @@ -40,6 +41,9 @@ one is found this general process is followed: ` +const maxticks = 20 +const cutoff = 70 + // null writer to enable non-verbose logging to be discarded type NullWriter bool @@ -143,85 +147,14 @@ type GraphConf struct { pgnum, conf float64 } -func analyse(toanalyse chan string, up chan string, errc chan error, logger *log.Logger) { - confs := make(map[string][]*Conf) - bestconfs := make(map[string]*Conf) - savedir := "" - - for path := range toanalyse { - if savedir == "" { - savedir = filepath.Dir(path) - } - logger.Println("Calculating confidence for", path) - avg, err := hocr.GetAvgConf(path) - if err != nil { - close(up) - errc <- errors.New(fmt.Sprintf("Error retreiving confidence for %s: %s", path, err)) - return - } - base := filepath.Base(path) - codestart := strings.Index(base, "_bin") - name := base[0:codestart] - var c Conf - c.path = path - c.code = base[codestart:] - c.conf = avg - confs[name] = append(confs[name], &c) - - } - - fn := filepath.Join(savedir, "conf") - logger.Println("Saving confidences in file", fn) - f, err := os.Create(fn) - if err != nil { - close(up) - errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) - return - } - defer f.Close() - - logger.Println("Finding best confidence for each page, and saving all confidences") - for base, conf := range confs { - var best float64 - for _, c := range conf { - if c.conf > best { - best = c.conf - bestconfs[base] = c - } - _, err = fmt.Fprintf(f, "%s\t%02.f\n", c.path, c.conf) - if err != nil { - close(up) - errc <- errors.New(fmt.Sprintf("Error writing confidences file: %s", err)) - return - } - } - } - up <- fn - - logger.Println("Creating best file listing the best file for each page") - fn = filepath.Join(savedir, "best") - f, err = 
os.Create(fn) - if err != nil { - close(up) - errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) - return - } - defer f.Close() - for _, conf := range bestconfs { - _, err = fmt.Fprintf(f, "%s\n", filepath.Base(conf.path)) - } - up <- fn - - // TODO: move this graph stuff out into its own file, it's pretty big - logger.Println("Creating graph") +func graph(confs map[string]*Conf, bookname string, w io.Writer) (error) { + // Organise confs to sort them by page var graphconf []GraphConf - // organise bestconfs to sort them by page - for _, conf := range bestconfs { + for _, conf := range confs { name := filepath.Base(conf.path) numend := strings.Index(name, "_") pgnum, err := strconv.ParseFloat(name[0:numend], 64) if err != nil { - logger.Printf("Failed to convert %s to float, excluding from graph\n", name[0:numend]) continue } var c GraphConf @@ -230,16 +163,18 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log graphconf = append(graphconf, c) } sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].pgnum < graphconf[j].pgnum }) + + // Create main xvalues and yvalues, annotations and ticks var xvalues, yvalues []float64 var annotations []chart.Value2 var ticks []chart.Tick i := 0 - tickevery := len(graphconf) / 20 + tickevery := len(graphconf) / maxticks for _, c := range graphconf { i = i + 1 xvalues = append(xvalues, c.pgnum) yvalues = append(yvalues, c.conf) - if c.conf < 70 { + if c.conf < cutoff { annotations = append(annotations, chart.Value2{Label: fmt.Sprintf("%.0f", c.pgnum), XValue: c.pgnum, YValue: c.conf}) } if tickevery % i == 0 { @@ -251,10 +186,24 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log YValues: yvalues, } - // remove outliers at 10% of max and min confidence to use for dotted lines + // Create 70% line + yvalues = []float64{} + for _, _ = range xvalues { + yvalues = append(yvalues, cutoff) + } + cutoffSeries := chart.ContinuousSeries{ + XValues: 
xvalues, + YValues: yvalues, + Style: chart.Style{ + Show: true, + StrokeColor: chart.ColorAlternateGreen, + StrokeDashArray: []float64{10.0, 5.0}, + }, + } + + // Create lines marking top and bottom 10% confidence sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].conf < graphconf[j].conf }) cutoff := int(len(graphconf) / 10) - logger.Printf("cutoff is %d, from %d\n", cutoff, len(graphconf)) mostconf := graphconf[cutoff:len(graphconf)-cutoff] sort.Slice(mostconf, func(i, j int) bool { return mostconf[i].pgnum < mostconf[j].pgnum }) xvalues = []float64{} @@ -284,8 +233,11 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log InnerSeries: mostSeries, } - // TODO: add number of words series using yaxissecondary graph := chart.Chart{ + Title: fmt.Sprintf("Confidence of pages from %s", bookname), + TitleStyle: chart.StyleShow(), + Width: 1920, + Height: 1080, XAxis: chart.XAxis{ Name: "Page number", NameStyle: chart.StyleShow(), @@ -304,14 +256,11 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log Max: 100.0, }, }, - //YAxisSecondary: chart.YAxis{ - // Name: "Number of words", - // Style: chart.StyleShow(), - //}, Series: []chart.Series{ mainSeries, minSeries, maxSeries, + cutoffSeries, chart.LastValueAnnotation(minSeries), chart.LastValueAnnotation(maxSeries), chart.AnnotationSeries{ @@ -324,6 +273,79 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log //}, }, } + return graph.Render(chart.PNG, w) +} + +func analyse(toanalyse chan string, up chan string, errc chan error, logger *log.Logger) { + confs := make(map[string][]*Conf) + bestconfs := make(map[string]*Conf) + savedir := "" + + for path := range toanalyse { + if savedir == "" { + savedir = filepath.Dir(path) + } + logger.Println("Calculating confidence for", path) + avg, err := hocr.GetAvgConf(path) + if err != nil { + close(up) + errc <- errors.New(fmt.Sprintf("Error retreiving confidence for %s: %s", 
path, err)) + return + } + base := filepath.Base(path) + codestart := strings.Index(base, "_bin") + name := base[0:codestart] + var c Conf + c.path = path + c.code = base[codestart:] + c.conf = avg + confs[name] = append(confs[name], &c) + + } + + fn := filepath.Join(savedir, "conf") + logger.Println("Saving confidences in file", fn) + f, err := os.Create(fn) + if err != nil { + close(up) + errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) + return + } + defer f.Close() + + logger.Println("Finding best confidence for each page, and saving all confidences") + for base, conf := range confs { + var best float64 + for _, c := range conf { + if c.conf > best { + best = c.conf + bestconfs[base] = c + } + _, err = fmt.Fprintf(f, "%s\t%02.f\n", c.path, c.conf) + if err != nil { + close(up) + errc <- errors.New(fmt.Sprintf("Error writing confidences file: %s", err)) + return + } + } + } + up <- fn + + logger.Println("Creating best file listing the best file for each page") + fn = filepath.Join(savedir, "best") + f, err = os.Create(fn) + if err != nil { + close(up) + errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) + return + } + defer f.Close() + for _, conf := range bestconfs { + _, err = fmt.Fprintf(f, "%s\n", filepath.Base(conf.path)) + } + up <- fn + + logger.Println("Creating graph") fn = filepath.Join(savedir, "graph.png") f, err = os.Create(fn) if err != nil { @@ -332,7 +354,7 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log return } defer f.Close() - err = graph.Render(chart.PNG, f) + err = graph(bestconfs, filepath.Base(savedir), f) if err != nil { close(up) errc <- errors.New(fmt.Sprintf("Error rendering graph: %s", err)) |