summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- bookpipeline/main.go 145
1 files changed, 133 insertions, 12 deletions
diff --git a/bookpipeline/main.go b/bookpipeline/main.go
index 4596171..63eb6bc 100644
--- a/bookpipeline/main.go
+++ b/bookpipeline/main.go
@@ -12,9 +12,13 @@ import (
"os/exec"
"path/filepath"
"regexp"
+ "sort"
+ "strconv"
"strings"
"time"
+ "github.com/wcharczuk/go-chart"
+
"rescribe.xyz/go.git/lib/hocr"
"rescribe.xyz/go.git/preproc"
)
@@ -135,6 +139,9 @@ type Conf struct {
path, code string
conf float64
}
+type GraphConf struct { // GraphConf is one point on the confidence graph: a page number paired with a confidence.
+ pgnum, conf float64 // pgnum: number parsed from the file's base name before the first "_" (graph X value); conf: copied from Conf.conf (graph Y value)
+}
func analyse(toanalyse chan string, up chan string, errc chan error, logger *log.Logger) {
confs := make(map[string][]*Conf)
@@ -163,12 +170,12 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log
}
- conffn := filepath.Join(savedir, "conf")
- logger.Println("Saving confidences in file", conffn)
- f, err := os.Create(conffn)
+ fn := filepath.Join(savedir, "conf")
+ logger.Println("Saving confidences in file", fn)
+ f, err := os.Create(fn)
if err != nil {
close(up)
- errc <- errors.New(fmt.Sprintf("Error creating conf file %s: %s", conffn, err))
+ errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err))
return
}
defer f.Close()
@@ -189,23 +196,138 @@ func analyse(toanalyse chan string, up chan string, errc chan error, logger *log
}
}
}
- up <- conffn
+ up <- fn
logger.Println("Creating best file listing the best file for each page")
- bestfn := filepath.Join(savedir, "best")
- f, err = os.Create(bestfn)
+ fn = filepath.Join(savedir, "best")
+ f, err = os.Create(fn)
if err != nil {
close(up)
- errc <- errors.New(fmt.Sprintf("Error creating best file %s: %s", bestfn, err))
+ errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err))
return
}
defer f.Close()
for _, conf := range bestconfs {
_, err = fmt.Fprintf(f, "%s\n", filepath.Base(conf.path))
}
- up <- bestfn
+ up <- fn
+
+ // TODO: move this graph stuff out into its own file, it's pretty big
+ logger.Println("Creating graph")
+ var graphconf []GraphConf
+ // organise bestconfs to sort them by page
+ for _, conf := range bestconfs {
+ name := filepath.Base(conf.path)
+ numend := strings.Index(name, "_")
+ pgnum, err := strconv.ParseFloat(name[0:numend], 64)
+ if err != nil {
+ logger.Printf("Failed to convert %s to float, excluding from graph\n", name[0:numend])
+ continue
+ }
+ var c GraphConf
+ c.pgnum = pgnum
+ c.conf = conf.conf
+ graphconf = append(graphconf, c)
+ }
+ sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].pgnum < graphconf[j].pgnum })
+ var xvalues, yvalues []float64
+ for _, c := range graphconf {
+ xvalues = append(xvalues, c.pgnum)
+ yvalues = append(yvalues, c.conf)
+ }
+ mainSeries := chart.ContinuousSeries{
+ XValues: xvalues,
+ YValues: yvalues,
+ }
+
+ // remove outliers at 10% of max and min confidence to use for dotted lines
+ sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].conf < graphconf[j].conf })
+ cutoff := int(len(graphconf) / 10)
+ logger.Printf("cutoff is %d, from %d\n", cutoff, len(graphconf))
+ mostconf := graphconf[cutoff:len(graphconf)-cutoff]
+ sort.Slice(mostconf, func(i, j int) bool { return mostconf[i].pgnum < mostconf[j].pgnum })
+ xvalues = []float64{}
+ yvalues = []float64{}
+ for _, c := range mostconf {
+ xvalues = append(xvalues, c.pgnum)
+ yvalues = append(yvalues, c.conf)
+ }
+ mostSeries := chart.ContinuousSeries{
+ XValues: xvalues,
+ YValues: yvalues,
+ }
+ minSeries := &chart.MinSeries{
+ Style: chart.Style{
+ Show: true,
+ StrokeColor: chart.ColorAlternateGray,
+ StrokeDashArray: []float64{5.0, 5.0},
+ },
+ InnerSeries: mostSeries,
+ }
+ maxSeries := &chart.MaxSeries{
+ Style: chart.Style{
+ Show: true,
+ StrokeColor: chart.ColorAlternateGray,
+ StrokeDashArray: []float64{5.0, 5.0},
+ },
+ InnerSeries: mostSeries,
+ }
+
+ // TODO: annotate all values below 70%; see
+ // https://github.com/wcharczuk/go-chart/blob/master/_examples/annotations/main.go
+
+ // TODO: add number of words series using yaxissecondary
+ graph := chart.Chart{
+ XAxis: chart.XAxis{
+ Name: "Page number",
+ NameStyle: chart.StyleShow(),
+ Style: chart.StyleShow(),
+ Range: &chart.ContinuousRange{
+ Min: 0.0,
+ },
+ },
+ YAxis: chart.YAxis{
+ Name: "Confidence",
+ NameStyle: chart.StyleShow(),
+ Style: chart.StyleShow(),
+ Range: &chart.ContinuousRange{
+ Min: 0.0,
+ Max: 100.0,
+ },
+ },
+ //YAxisSecondary: chart.YAxis{
+ // Name: "Number of words",
+ // Style: chart.StyleShow(),
+ //},
+ Series: []chart.Series{
+ mainSeries,
+ minSeries,
+ maxSeries,
+ chart.LastValueAnnotation(minSeries),
+ chart.LastValueAnnotation(maxSeries),
+ //chart.ContinuousSeries{
+ // YAxis: chart.YAxisSecondary,
+ // XValues: xvalues,
+ // YValues: yvalues,
+ //},
+ },
+ }
+ fn = filepath.Join(savedir, "graph.png")
+ f, err = os.Create(fn)
+ if err != nil {
+ close(up)
+ errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err))
+ return
+ }
+ defer f.Close()
+ err = graph.Render(chart.PNG, f)
+ if err != nil {
+ close(up)
+ errc <- errors.New(fmt.Sprintf("Error rendering graph: %s", err))
+ return
+ }
+ up <- fn
- // TODO: plot a graph with the confs, using https://github.com/gonum/plot, send to up
// TODO: generate a general report.txt with statistics etc for the book, send to up
close(up)
@@ -308,8 +430,7 @@ func main() {
verboselog = log.New(n, "", log.LstdFlags)
}
- // TODO: match jpg too
- origPattern := regexp.MustCompile(`[0-9]{4}.jpg$`) // TODO: match other file naming
+ origPattern := regexp.MustCompile(`[0-9]{4}.jpg$`) // TODO: match alternative file naming
preprocessedPattern := regexp.MustCompile(`_bin[0-9].[0-9].png$`)
ocredPattern := regexp.MustCompile(`.hocr$`)