From 2c040f73ce7bbba480c441a0433fc8b4d6449254 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 9 Nov 2020 18:57:21 +0000 Subject: Add a couple of things that should not be forgotten --- cmd/booktopipeline/main.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'cmd/booktopipeline') diff --git a/cmd/booktopipeline/main.go b/cmd/booktopipeline/main.go index 60d1f81..96a6f6c 100644 --- a/cmd/booktopipeline/main.go +++ b/cmd/booktopipeline/main.go @@ -19,6 +19,8 @@ import ( "rescribe.xyz/bookpipeline" ) +// TODO: use internal/pipeline/get.go functions + const usage = `Usage: booktopipeline [-c conn] [-t training] [-prebinarised] [-notbinarised] [-v] bookdir [bookname] Uploads the book in bookdir to the S3 'inprogress' bucket and adds it -- cgit v1.2.1-24-ge1ad From c48630f590fe2c877e899948f1bf88458d3fd813 Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 10 Nov 2020 10:05:16 +0000 Subject: Switch booktopipeline to use internal pipeline functions --- cmd/booktopipeline/main.go | 85 ++++++---------------------------------------- 1 file changed, 11 insertions(+), 74 deletions(-) (limited to 'cmd/booktopipeline') diff --git a/cmd/booktopipeline/main.go b/cmd/booktopipeline/main.go index 96a6f6c..7254d78 100644 --- a/cmd/booktopipeline/main.go +++ b/cmd/booktopipeline/main.go @@ -9,17 +9,14 @@ package main import ( "flag" "fmt" - "image" - _ "image/png" - _ "image/jpeg" "log" "os" "path/filepath" "rescribe.xyz/bookpipeline" -) -// TODO: use internal/pipeline/get.go functions + "rescribe.xyz/bookpipeline/internal/pipeline" +) const usage = `Usage: booktopipeline [-c conn] [-t training] [-prebinarised] [-notbinarised] [-v] bookdir [bookname] @@ -34,15 +31,6 @@ using the flags -prebinarised (for the wipeonly queue) or If bookname is omitted the last part of the bookdir is used. 
` -type Pipeliner interface { - Init() error - PreQueueId() string - WipeQueueId() string - WIPStorageId() string - AddToQueue(url string, msg string) error - Upload(bucket string, key string, path string) error -} - // null writer to enable non-verbose logging to be discarded type NullWriter bool @@ -52,18 +40,6 @@ func (w NullWriter) Write(p []byte) (n int, err error) { var verboselog *log.Logger -type fileWalk chan string - -func (f fileWalk) Walk(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - if !info.IsDir() { - f <- path - } - return nil -} - func main() { verbose := flag.Bool("v", false, "Verbose") conntype := flag.String("c", "aws", "connection type ('aws' or 'local')") @@ -96,7 +72,7 @@ func main() { verboselog = log.New(n, "", log.LstdFlags) } - var conn Pipeliner + var conn pipeline.Pipeliner switch *conntype { case "aws": conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog} @@ -110,18 +86,7 @@ func main() { log.Fatalln("Failed to set up cloud connection:", err) } - qid := conn.PreQueueId() - - // Auto detect type of queue to send to based on file extension - pngdirs, _ := filepath.Glob(bookdir + "/*.png") - jpgdirs, _ := filepath.Glob(bookdir + "/*.jpg") - pngcount := len(pngdirs) - jpgcount := len(jpgdirs) - if pngcount > jpgcount { - qid = conn.WipeQueueId() - } else { - qid = conn.PreQueueId() - } + qid := pipeline.DetectQueueType(bookdir, conn) // Flags set override the queue selection if *wipeonly { @@ -132,43 +97,15 @@ func main() { } verboselog.Println("Checking that all images are valid in", bookdir) - checker := make(fileWalk) - go func() { - err = filepath.Walk(bookdir, checker.Walk) - if err != nil { - log.Fatalln("Filesystem walk failed:", err) - } - close(checker) - }() - - for path := range checker { - f, err := os.Open(path) - if err != nil { - log.Fatalln("Opening image %s failed, bailing: %v", path, err) - } - _, _, err = image.Decode(f) - if err != nil { - log.Fatalf("Decoding 
image %s failed, bailing: %v", path, err) - } + err = pipeline.CheckImages(bookdir) + if err != nil { + log.Fatalln(err) } - verboselog.Println("Walking", bookdir) - walker := make(fileWalk) - go func() { - err = filepath.Walk(bookdir, walker.Walk) - if err != nil { - log.Fatalln("Filesystem walk failed:", err) - } - close(walker) - }() - - for path := range walker { - verboselog.Println("Uploading", path) - name := filepath.Base(path) - err = conn.Upload(conn.WIPStorageId(), filepath.Join(bookname, name), path) - if err != nil { - log.Fatalln("Failed to upload", path, err) - } + verboselog.Println("Uploading all images are valid in", bookdir) + err = pipeline.UploadImages(bookdir, bookname, conn) + if err != nil { + log.Fatalln(err) } if *training != "" { -- cgit v1.2.1-24-ge1ad From 0d914a5de3f8169d41df4fcff1ee4aea6d01afbe Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 24 Nov 2020 12:40:54 +0000 Subject: [booktopipeline] Add a check to disallow adding a book that already exists This is important because if a book that has already been processed is added again, an analyse job will be added every time a page is OCRed, which will clog up the pipeline with unnecessary work. Also, if a book is added with the same name but with differently named files, or a different number of pages, the results will almost certainly not be as intended. If a book really does need to be added under a particular name, either the original directory can be removed from S3, or "v2" or similar can be appended to the book name before calling booktopipeline. 
--- cmd/booktopipeline/main.go | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'cmd/booktopipeline') diff --git a/cmd/booktopipeline/main.go b/cmd/booktopipeline/main.go index 7254d78..b4f4d99 100644 --- a/cmd/booktopipeline/main.go +++ b/cmd/booktopipeline/main.go @@ -102,6 +102,15 @@ func main() { log.Fatalln(err) } + verboselog.Println("Checking that a book hasn't already been uploaded with that name") + list, err := conn.ListObjects(conn.WIPStorageId(), bookname) + if err != nil { + log.Fatalln(err) + } + if len(list) > 0 { + log.Fatalf("Error: There is already a book in S3 named %s", bookname) + } + verboselog.Println("Uploading all images are valid in", bookdir) err = pipeline.UploadImages(bookdir, bookname, conn) if err != nil { -- cgit v1.2.1-24-ge1ad