From f72c788235f4f0a3bb88cfcdb0911564e67259d8 Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.name>
Date: Tue, 27 Aug 2019 15:24:04 +0100
Subject: Add basic analyse step, working but incomplete

---
 bookpipeline/main.go | 114 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 107 insertions(+), 7 deletions(-)

(limited to 'bookpipeline')

diff --git a/bookpipeline/main.go b/bookpipeline/main.go
index 22bf7f5..4596171 100644
--- a/bookpipeline/main.go
+++ b/bookpipeline/main.go
@@ -15,6 +15,7 @@ import (
 	"strings"
 	"time"
 
+	"rescribe.xyz/go.git/lib/hocr"
 	"rescribe.xyz/go.git/preproc"
 )
 
@@ -130,6 +131,86 @@ func ocr(training string) func(chan string, chan string, chan error, *log.Logger
 	}
 }
 
+type Conf struct { // Conf records the confidence measured for one hocr file.
+	path, code string // path: the file analysed; code: its filename from "_bin" onward
+	conf float64 // average confidence returned by hocr.GetAvgConf for path
+}
+
+// analyse reads hocr paths from toanalyse, gets each one's average confidence,
+// and writes "conf" (every path with its confidence) and "best" (the top file
+// for each page) in the first path's directory, sending each filename to up.
+// up is closed before returning; on failure an error is then sent to errc.
+func analyse(toanalyse chan string, up chan string, errc chan error, logger *log.Logger) {
+	// fail closes up and then reports err, the order every error path uses.
+	fail := func(err error) { close(up); errc <- err }
+	confs := make(map[string][]*Conf)
+	bestconfs := make(map[string]*Conf)
+	savedir := ""
+	for path := range toanalyse {
+		if savedir == "" {
+			savedir = filepath.Dir(path)
+		}
+		logger.Println("Calculating confidence for", path)
+		avg, err := hocr.GetAvgConf(path)
+		if err != nil {
+			fail(fmt.Errorf("Error retreiving confidence for %s: %s", path, err))
+			return
+		}
+		base := filepath.Base(path)
+		codestart := strings.Index(base, "_bin")
+		if codestart < 0 {
+			// No "_bin" marker: use the whole name as the page, not base[0:-1] (a panic).
+			codestart = len(base)
+		}
+		name := base[:codestart]
+		confs[name] = append(confs[name], &Conf{path: path, code: base[codestart:], conf: avg})
+	}
+
+	conffn := filepath.Join(savedir, "conf")
+	logger.Println("Saving confidences in file", conffn)
+	f, err := os.Create(conffn)
+	if err != nil {
+		fail(fmt.Errorf("Error creating conf file %s: %s", conffn, err))
+		return
+	}
+	defer f.Close()
+
+	logger.Println("Finding best confidence for each page, and saving all confidences")
+	for base, conf := range confs {
+		for _, c := range conf {
+			// Compare with the page's current best, not zero, so all-zero pages get an entry.
+			if b, ok := bestconfs[base]; !ok || c.conf > b.conf {
+				bestconfs[base] = c
+			}
+			if _, err = fmt.Fprintf(f, "%s\t%02.f\n", c.path, c.conf); err != nil {
+				fail(fmt.Errorf("Error writing confidences file: %s", err))
+				return
+			}
+		}
+	}
+	up <- conffn
+
+	logger.Println("Creating best file listing the best file for each page")
+	bestfn := filepath.Join(savedir, "best")
+	f, err = os.Create(bestfn)
+	if err != nil {
+		fail(fmt.Errorf("Error creating best file %s: %s", bestfn, err))
+		return
+	}
+	defer f.Close()
+	for _, conf := range bestconfs {
+		if _, err = fmt.Fprintf(f, "%s\n", filepath.Base(conf.path)); err != nil {
+			fail(fmt.Errorf("Error writing best file: %s", err))
+			return
+		}
+	}
+	up <- bestfn
+
+	// TODO: plot a graph with the confs, using https://github.com/gonum/plot, send to up
+	// TODO: generate a general report.txt with statistics etc for the book, send to up
+	close(up)
+}
+
 func processBook(msg Qmsg, conn Pipeliner, process func(chan string, chan string, chan error, *log.Logger), match *regexp.Regexp, fromQueue string, toQueue string) error {
 	bookname := msg.Body
 
@@ -183,12 +264,14 @@ func processBook(msg Qmsg, conn Pipeliner, process func(chan string, chan string
 	case <-done:
 	}
 
-	conn.Logger().Println("Sending", bookname, "to queue")
-	err = conn.AddToQueue(toQueue, bookname)
-	if err != nil {
-		t.Stop()
-		_ = os.RemoveAll(d)
-		return errors.New(fmt.Sprintf("Error adding to queue %s: %s", bookname, err))
+	if toQueue != "" {
+		conn.Logger().Println("Sending", bookname, "to queue")
+		err = conn.AddToQueue(toQueue, bookname)
+		if err != nil {
+			t.Stop()
+			_ = os.RemoveAll(d)
+			return errors.New(fmt.Sprintf("Error adding to queue %s: %s", bookname, err))
+		}
 	}
 
 	t.Stop()
@@ -228,7 +311,7 @@ func main() {
 	// TODO: match jpg too
 	origPattern := regexp.MustCompile(`[0-9]{4}.jpg$`) // TODO: match other file naming
 	preprocessedPattern := regexp.MustCompile(`_bin[0-9].[0-9].png$`)
-	//ocredPattern := regexp.MustCompile(`.hocr$`)
+	ocredPattern := regexp.MustCompile(`.hocr$`)
 
 	var conn Pipeliner
 	conn = &awsConn{region: "eu-west-2", logger: verboselog}
@@ -242,8 +325,10 @@ func main() {
 
 	var checkPreQueue <-chan time.Time
 	var checkOCRQueue <-chan time.Time
+	var checkAnalyseQueue <-chan time.Time
 	checkPreQueue = time.After(0)
 	checkOCRQueue = time.After(0)
+	checkAnalyseQueue = time.After(0)
 
 	for {
 		select {
@@ -277,6 +362,21 @@ func main() {
 			if err != nil {
 				log.Println("Error during OCR process", err)
 			}
+		case <-checkAnalyseQueue:
+			msg, err := conn.CheckQueue(conn.AnalyseQueueId())
+			checkAnalyseQueue = time.After(PauseBetweenChecks)
+			if err != nil {
+				log.Println("Error checking analyse queue", err)
+				continue
+			}
+			if msg.Handle == "" {
+				verboselog.Println("No message received on analyse queue, sleeping")
+				continue
+			}
+			err = processBook(msg, conn, analyse, ocredPattern, conn.AnalyseQueueId(), "")
+			if err != nil {
+				log.Println("Error during analysis", err)
+			}
 		}
 	}
 }
-- 
cgit v1.2.1-24-ge1ad