From 8d47a1baed954cf46cd57afcb5b4ef54c774bff7 Mon Sep 17 00:00:00 2001 From: Nick White Date: Thu, 22 Aug 2019 17:00:24 +0100 Subject: Generalise preprocessing and ocring to reuse common code --- bookpipeline/main.go | 112 ++++++++++----------------------------------------- 1 file changed, 21 insertions(+), 91 deletions(-) diff --git a/bookpipeline/main.go b/bookpipeline/main.go index fe9012b..6a68c57 100644 --- a/bookpipeline/main.go +++ b/bookpipeline/main.go @@ -104,7 +104,7 @@ func up(c chan string, done chan bool, conn Pipeliner, bookname string, errc cha done <- true } -func preprocess(pre chan string, up chan string, logger *log.Logger, errc chan error) { +func preprocess(pre chan string, up chan string, errc chan error, logger *log.Logger) { for path := range pre { logger.Println("Preprocessing", path) done, err := preproc.PreProcMulti(path, []float64{0.1, 0.2, 0.4, 0.5}, "binary", 0, true, 5, 30) @@ -120,95 +120,25 @@ func preprocess(pre chan string, up chan string, logger *log.Logger, errc chan e close(up) } -// TODO: use Tesseract API rather than calling the executable -func ocr(toocr chan string, up chan string, logger *log.Logger, training string, errc chan error) { - for path := range toocr { - logger.Println("OCRing", path) - name := strings.Replace(path, ".png", "", 1) // TODO: handle any file extension - cmd := exec.Command("tesseract", "-l", training, path, name, "hocr") - err := cmd.Run() - if err != nil { - close(up) - errc <- errors.New(fmt.Sprintf("Error ocring %s: %s", path, err)) - return +func ocr(training string) func(chan string, chan string, chan error, *log.Logger) { + return func (toocr chan string, up chan string, errc chan error, logger *log.Logger) { + for path := range toocr { + logger.Println("OCRing", path) + name := strings.Replace(path, ".png", "", 1) // TODO: handle any file extension + cmd := exec.Command("tesseract", "-l", training, path, name, "hocr") + err := cmd.Run() + if err != nil { + close(up) + errc <- errors.New(fmt.Sprintf("Error ocring %s: %s", path, err)) + return + } + up <- name + ".hocr" } - up <- name + ".hocr" - } - close(up) -} - -func preprocBook(msg Qmsg, conn Pipeliner) error { - bookname := msg.Body - - t := time.NewTicker(HeartbeatTime * time.Second) - go conn.PreQueueHeartbeat(t, msg.Handle) - - d := filepath.Join(os.TempDir(), bookname) - err := os.MkdirAll(d, 0755) - if err != nil { - t.Stop() - return errors.New(fmt.Sprintf("Failed to create directory %s: %s", d, err)) - } - - dl := make(chan string) - pre := make(chan string) - upc := make(chan string) - done := make(chan bool) - errc := make(chan error) - - // these functions will do their jobs when their channels have data - go download(dl, pre, conn, d, errc) - go preprocess(pre, upc, conn.Logger(), errc) - go up(upc, done, conn, bookname, errc) - - conn.Logger().Println("Getting list of objects to download") - todl, err := conn.ListToPreprocess(bookname) - if err != nil { - t.Stop() - _ = os.RemoveAll(d) - return errors.New(fmt.Sprintf("Failed to get list of files for book %s: %s", bookname, err)) - } - for _, d := range todl { - dl <- d - } - close(dl) - - // wait for either the done or errc channel to be sent to - select { - case err = <-errc: - t.Stop() - _ = os.RemoveAll(d) - return err - case <-done: + close(up) } - - conn.Logger().Println("Sending", bookname, "to OCR queue") - err = conn.AddToOCRQueue(bookname) - if err != nil { - t.Stop() - _ = os.RemoveAll(d) - return errors.New(fmt.Sprintf("Error adding to ocr queue %s: %s", bookname, err)) - } - - t.Stop() - - conn.Logger().Println("Deleting original message from preprocessing queue") - err = conn.DelFromPreQueue(msg.Handle) - if err != nil { - _ = os.RemoveAll(d) - return errors.New(fmt.Sprintf("Error deleting message from preprocessing queue: %s", err)) - } - - err = os.RemoveAll(d) - if err != nil { - return errors.New(fmt.Sprintf("Failed to remove directory %s: %s", d, err)) - } - - return nil } -// TODO: this is very similar to preprocBook; try to at least mostly merge them -func ocrBook(msg Qmsg, conn Pipeliner, training string) error { +func processBook(msg Qmsg, conn Pipeliner, process func(chan string, chan string, chan error, *log.Logger)) error { bookname := msg.Body t := time.NewTicker(HeartbeatTime * time.Second) @@ -222,14 +152,14 @@ func ocrBook(msg Qmsg, conn Pipeliner, training string) error { } dl := make(chan string) - ocrc := make(chan string) + processc := make(chan string) upc := make(chan string) done := make(chan bool) errc := make(chan error) // these functions will do their jobs when their channels have data - go download(dl, ocrc, conn, d, errc) - go ocr(ocrc, upc, conn.Logger(), training, errc) + go download(dl, processc, conn, d, errc) + go process(processc, upc, errc, conn.Logger()) go up(upc, done, conn, bookname, errc) conn.Logger().Println("Getting list of objects to download") @@ -323,7 +253,7 @@ func main() { verboselog.Println("No message received on preprocess queue, sleeping") continue } - err = preprocBook(msg, conn) + err = processBook(msg, conn, preprocess) if err != nil { log.Println("Error during preprocess", err) } @@ -338,7 +268,7 @@ func main() { verboselog.Println("No message received on OCR queue, sleeping") continue } - err = ocrBook(msg, conn, *training) + err = processBook(msg, conn, ocr(*training)) if err != nil { log.Println("Error during OCR process", err) } -- cgit v1.2.1-24-ge1ad