From 2c040f73ce7bbba480c441a0433fc8b4d6449254 Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.name>
Date: Mon, 9 Nov 2020 18:57:21 +0000
Subject: Add a couple of things that should not be forgotten

---
 cmd/booktopipeline/main.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cmd/booktopipeline/main.go b/cmd/booktopipeline/main.go
index 60d1f81..96a6f6c 100644
--- a/cmd/booktopipeline/main.go
+++ b/cmd/booktopipeline/main.go
@@ -19,6 +19,8 @@ import (
 	"rescribe.xyz/bookpipeline"
 )
 
+// TODO: use internal/pipeline/get.go functions
+
 const usage = `Usage: booktopipeline [-c conn] [-t training] [-prebinarised] [-notbinarised] [-v] bookdir [bookname]
 
 Uploads the book in bookdir to the S3 'inprogress' bucket and adds it
-- 
cgit v1.2.1-24-ge1ad


From c48630f590fe2c877e899948f1bf88458d3fd813 Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.name>
Date: Tue, 10 Nov 2020 10:05:16 +0000
Subject: Switch booktopipeline to use internal pipeline functions

---
 cmd/booktopipeline/main.go | 85 ++++++----------------------------------------
 1 file changed, 11 insertions(+), 74 deletions(-)

diff --git a/cmd/booktopipeline/main.go b/cmd/booktopipeline/main.go
index 96a6f6c..7254d78 100644
--- a/cmd/booktopipeline/main.go
+++ b/cmd/booktopipeline/main.go
@@ -9,17 +9,14 @@ package main
 import (
 	"flag"
 	"fmt"
-	"image"
-	_ "image/png"
-	_ "image/jpeg"
 	"log"
 	"os"
 	"path/filepath"
 
 	"rescribe.xyz/bookpipeline"
-)
 
-// TODO: use internal/pipeline/get.go functions
+	"rescribe.xyz/bookpipeline/internal/pipeline"
+)
 
 const usage = `Usage: booktopipeline [-c conn] [-t training] [-prebinarised] [-notbinarised] [-v] bookdir [bookname]
 
@@ -34,15 +31,6 @@ using the flags -prebinarised (for the wipeonly queue) or
 If bookname is omitted the last part of the bookdir is used.
 `
 
-type Pipeliner interface {
-	Init() error
-	PreQueueId() string
-	WipeQueueId() string
-	WIPStorageId() string
-	AddToQueue(url string, msg string) error
-	Upload(bucket string, key string, path string) error
-}
-
 // null writer to enable non-verbose logging to be discarded
 type NullWriter bool
 
@@ -52,18 +40,6 @@ func (w NullWriter) Write(p []byte) (n int, err error) {
 
 var verboselog *log.Logger
 
-type fileWalk chan string
-
-func (f fileWalk) Walk(path string, info os.FileInfo, err error) error {
-	if err != nil {
-		return err
-	}
-	if !info.IsDir() {
-		f <- path
-	}
-	return nil
-}
-
 func main() {
 	verbose := flag.Bool("v", false, "Verbose")
 	conntype := flag.String("c", "aws", "connection type ('aws' or 'local')")
@@ -96,7 +72,7 @@ func main() {
 		verboselog = log.New(n, "", log.LstdFlags)
 	}
 
-	var conn Pipeliner
+	var conn pipeline.Pipeliner
 	switch *conntype {
 	case "aws":
 		conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog}
@@ -110,18 +86,7 @@ func main() {
 		log.Fatalln("Failed to set up cloud connection:", err)
 	}
 
-	qid := conn.PreQueueId()
-
-	// Auto detect type of queue to send to based on file extension
-	pngdirs, _ := filepath.Glob(bookdir + "/*.png")
-	jpgdirs, _ := filepath.Glob(bookdir + "/*.jpg")
-	pngcount := len(pngdirs)
-	jpgcount := len(jpgdirs)
-	if pngcount > jpgcount {
-		qid = conn.WipeQueueId()
-	} else {
-		qid = conn.PreQueueId()
-	}
+	qid := pipeline.DetectQueueType(bookdir, conn)
 
 	// Flags set override the queue selection
 	if *wipeonly {
@@ -132,43 +97,15 @@ func main() {
 	}
 
 	verboselog.Println("Checking that all images are valid in", bookdir)
-	checker := make(fileWalk)
-	go func() {
-		err = filepath.Walk(bookdir, checker.Walk)
-		if err != nil {
-			log.Fatalln("Filesystem walk failed:", err)
-		}
-		close(checker)
-	}()
-
-	for path := range checker {
-		f, err := os.Open(path)
-		if err != nil {
-			log.Fatalln("Opening image %s failed, bailing: %v", path, err)
-		}
-		_, _, err = image.Decode(f)
-		if err != nil {
-			log.Fatalf("Decoding image %s failed, bailing: %v", path, err)
-		}
+	err = pipeline.CheckImages(bookdir)
+	if err != nil {
+		log.Fatalln(err)
 	}
 
-	verboselog.Println("Walking", bookdir)
-	walker := make(fileWalk)
-	go func() {
-		err = filepath.Walk(bookdir, walker.Walk)
-		if err != nil {
-			log.Fatalln("Filesystem walk failed:", err)
-		}
-		close(walker)
-	}()
-
-	for path := range walker {
-		verboselog.Println("Uploading", path)
-		name := filepath.Base(path)
-		err = conn.Upload(conn.WIPStorageId(), filepath.Join(bookname, name), path)
-		if err != nil {
-			log.Fatalln("Failed to upload", path, err)
-		}
+	verboselog.Println("Uploading all images in", bookdir)
+	err = pipeline.UploadImages(bookdir, bookname, conn)
+	if err != nil {
+		log.Fatalln(err)
 	}
 
 	if *training != "" {
-- 
cgit v1.2.1-24-ge1ad
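
The internal/pipeline package itself is outside this patch range, so its
helpers only appear above as call sites. A minimal sketch of what
DetectQueueType might look like, assuming it simply reuses the png/jpg
counting logic that this patch removes from main.go; the Queuer
interface name is a guess, though PreQueueId and WipeQueueId are taken
from the removed code:

package pipeline

import "path/filepath"

// Queuer covers just the queue id getters that booktopipeline needs;
// the real interface in internal/pipeline may be wider.
type Queuer interface {
	PreQueueId() string
	WipeQueueId() string
}

// DetectQueueType picks the wipeonly queue for books that are mostly
// PNGs (taken to be prebinarised) and the preprocess queue otherwise,
// mirroring the inline detection removed from main.go above.
func DetectQueueType(dir string, conn Queuer) string {
	pngs, _ := filepath.Glob(filepath.Join(dir, "*.png"))
	jpgs, _ := filepath.Glob(filepath.Join(dir, "*.jpg"))
	if len(pngs) > len(jpgs) {
		return conn.WipeQueueId()
	}
	return conn.PreQueueId()
}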


From 0d914a5de3f8169d41df4fcff1ee4aea6d01afbe Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.name>
Date: Tue, 24 Nov 2020 12:40:54 +0000
Subject: [booktopipeline] Add a check to disallow adding a book that already
 exists

This is important, as if a book that has already been processed is
added again, an analyse job will be queued every time a page is
OCRed, clogging up the pipeline with unnecessary work. Also, if a
book is added with the same name but differently named files, or a
different number of pages, the results will almost certainly not be
as intended.

If a book really does need to be added under a particular name,
either the original directory can be removed from S3, or "v2" or
similar can be appended to the book name before calling
booktopipeline.
---
 cmd/booktopipeline/main.go | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cmd/booktopipeline/main.go b/cmd/booktopipeline/main.go
index 7254d78..b4f4d99 100644
--- a/cmd/booktopipeline/main.go
+++ b/cmd/booktopipeline/main.go
@@ -102,6 +102,15 @@ func main() {
 		log.Fatalln(err)
 	}
 
+	verboselog.Println("Checking that a book hasn't already been uploaded with that name")
+	list, err := conn.ListObjects(conn.WIPStorageId(), bookname)
+	if err != nil {
+		log.Fatalln(err)
+	}
+	if len(list) > 0 {
+		log.Fatalf("Error: There is already a book in S3 named %s", bookname)
+	}
+
 	verboselog.Println("Uploading all images are valid in", bookdir)
 	err = pipeline.UploadImages(bookdir, bookname, conn)
 	if err != nil {
-- 
cgit v1.2.1-24-ge1ad
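
If a book genuinely does need to go up under a name that is already
taken, the commit message above suggests appending "v2" or similar by
hand. A minimal sketch of how cmd/booktopipeline could automate that
choice, using only the ListObjects and WIPStorageId calls shown in the
patch; the Lister interface and uniqueBookName helper are illustrative
and not part of the repository:

package main

import "fmt"

// Lister captures just the connection methods this sketch needs;
// bookpipeline's connections provide both, as used in the patch above.
type Lister interface {
	ListObjects(bucket string, prefix string) ([]string, error)
	WIPStorageId() string
}

// uniqueBookName is a hypothetical helper: it appends "v2", "v3", ...
// to base until no objects with that prefix exist in the WIP bucket,
// automating the workaround suggested in the commit message.
func uniqueBookName(conn Lister, base string) (string, error) {
	name := base
	for i := 2; i < 100; i++ {
		list, err := conn.ListObjects(conn.WIPStorageId(), name)
		if err != nil {
			return "", err
		}
		if len(list) == 0 {
			return name, nil
		}
		name = fmt.Sprintf("%sv%d", base, i)
	}
	return "", fmt.Errorf("no free name found for %s", base)
}

A caller would run bookname, err = uniqueBookName(conn, bookname)
before uploading, in place of the fatal error the patch adds.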