diff options
-rw-r--r-- | bookpipeline/main.go | 145 |
1 files changed, 133 insertions, 12 deletions
diff --git a/bookpipeline/main.go b/bookpipeline/main.go index 4596171..63eb6bc 100644 --- a/bookpipeline/main.go +++ b/bookpipeline/main.go @@ -12,9 +12,13 @@ import ( "os/exec" "path/filepath" "regexp" + "sort" + "strconv" "strings" "time" + "github.com/wcharczuk/go-chart" + "rescribe.xyz/go.git/lib/hocr" "rescribe.xyz/go.git/preproc" ) @@ -135,6 +139,9 @@ type Conf struct { path, code string conf float64 } +type GraphConf struct { + pgnum, conf float64 +} func analyse(toanalyse chan string, up chan string, errc chan error, logger *log.Logger) { confs := make(map[string][]*Conf) @@ -163,12 +170,12 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log } - conffn := filepath.Join(savedir, "conf") - logger.Println("Saving confidences in file", conffn) - f, err := os.Create(conffn) + fn := filepath.Join(savedir, "conf") + logger.Println("Saving confidences in file", fn) + f, err := os.Create(fn) if err != nil { close(up) - errc <- errors.New(fmt.Sprintf("Error creating conf file %s: %s", conffn, err)) + errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) return } defer f.Close() @@ -189,23 +196,138 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log } } } - up <- conffn + up <- fn logger.Println("Creating best file listing the best file for each page") - bestfn := filepath.Join(savedir, "best") - f, err = os.Create(bestfn) + fn = filepath.Join(savedir, "best") + f, err = os.Create(fn) if err != nil { close(up) - errc <- errors.New(fmt.Sprintf("Error creating best file %s: %s", bestfn, err)) + errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) return } defer f.Close() for _, conf := range bestconfs { _, err = fmt.Fprintf(f, "%s\n", filepath.Base(conf.path)) } - up <- bestfn + up <- fn + + // TODO: move this graph stuff out into its own file, it's pretty big + logger.Println("Creating graph") + var graphconf []GraphConf + // organise bestconfs to sort them by page + for _, conf := range bestconfs { + name := filepath.Base(conf.path) + numend := strings.Index(name, "_") + pgnum, err := strconv.ParseFloat(name[0:numend], 64) + if err != nil { + logger.Printf("Failed to convert %s to float, excluding from graph\n", name[0:numend]) + continue + } + var c GraphConf + c.pgnum = pgnum + c.conf = conf.conf + graphconf = append(graphconf, c) + } + sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].pgnum < graphconf[j].pgnum }) + var xvalues, yvalues []float64 + for _, c := range graphconf { + xvalues = append(xvalues, c.pgnum) + yvalues = append(yvalues, c.conf) + } + mainSeries := chart.ContinuousSeries{ + XValues: xvalues, + YValues: yvalues, + } + + // remove outliers at 10% of max and min confidence to use for dotted lines + sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].conf < graphconf[j].conf }) + cutoff := int(len(graphconf) / 10) + logger.Printf("cutoff is %d, from %d\n", cutoff, len(graphconf)) + mostconf := graphconf[cutoff:len(graphconf)-cutoff] + sort.Slice(mostconf, func(i, j int) bool { return mostconf[i].pgnum < mostconf[j].pgnum }) + xvalues = []float64{} + yvalues = []float64{} + for _, c := range mostconf { + xvalues = append(xvalues, c.pgnum) + yvalues = append(yvalues, c.conf) + } + mostSeries := chart.ContinuousSeries{ + XValues: xvalues, + YValues: yvalues, + } + minSeries := &chart.MinSeries{ + Style: chart.Style{ + Show: true, + StrokeColor: chart.ColorAlternateGray, + StrokeDashArray: []float64{5.0, 5.0}, + }, + InnerSeries: mostSeries, + } + maxSeries := &chart.MaxSeries{ + Style: chart.Style{ + Show: true, + StrokeColor: chart.ColorAlternateGray, + StrokeDashArray: []float64{5.0, 5.0}, + }, + InnerSeries: mostSeries, + } + + // TODO: annotate all values below 70%; see + // https://github.com/wcharczuk/go-chart/blob/master/_examples/annotations/main.go + + // TODO: add number of words series using yaxissecondary + graph := chart.Chart{ + XAxis: chart.XAxis{ + Name: "Page number", + NameStyle: chart.StyleShow(), + Style: chart.StyleShow(), + Range: &chart.ContinuousRange{ + Min: 0.0, + }, + }, + YAxis: chart.YAxis{ + Name: "Confidence", + NameStyle: chart.StyleShow(), + Style: chart.StyleShow(), + Range: &chart.ContinuousRange{ + Min: 0.0, + Max: 100.0, + }, + }, + //YAxisSecondary: chart.YAxis{ + // Name: "Number of words", + // Style: chart.StyleShow(), + //}, + Series: []chart.Series{ + mainSeries, + minSeries, + maxSeries, + chart.LastValueAnnotation(minSeries), + chart.LastValueAnnotation(maxSeries), + //chart.ContinuousSeries{ + // YAxis: chart.YAxisSecondary, + // XValues: xvalues, + // YValues: yvalues, + //}, + }, + } + fn = filepath.Join(savedir, "graph.png") + f, err = os.Create(fn) + if err != nil { + close(up) + errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) + return + } + defer f.Close() + err = graph.Render(chart.PNG, f) + if err != nil { + close(up) + errc <- errors.New(fmt.Sprintf("Error rendering graph: %s", err)) + return + } + up <- fn - // TODO: plot a graph with the confs, using https://github.com/gonum/plot, send to up // TODO: generate a general report.txt with statistics etc for the book, send to up close(up) @@ -308,8 +430,7 @@ func main() { verboselog = log.New(n, "", log.LstdFlags) } - // TODO: match jpg too - origPattern := regexp.MustCompile(`[0-9]{4}.jpg$`) // TODO: match other file naming + origPattern := regexp.MustCompile(`[0-9]{4}.jpg$`) // TODO: match alternative file naming preprocessedPattern := regexp.MustCompile(`_bin[0-9].[0-9].png$`) ocredPattern := regexp.MustCompile(`.hocr$`) |