From 068ad0b666705a49ab22d7b48cd6a7d67b37f234 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 7 Dec 2020 16:53:58 +0000 Subject: [rescribe] Allow saving of results to somewhere other than a directory named after the book being processed --- cmd/getpipelinebook/main.go | 10 ++++----- cmd/rescribe/main.go | 38 ++++++++++++++++++-------------- internal/pipeline/get.go | 53 +++++++++++++++++++++++++-------------------- 3 files changed, 57 insertions(+), 44 deletions(-) diff --git a/cmd/getpipelinebook/main.go b/cmd/getpipelinebook/main.go index 5116414..ccedd72 100644 --- a/cmd/getpipelinebook/main.go +++ b/cmd/getpipelinebook/main.go @@ -87,7 +87,7 @@ func main() { if *all { verboselog.Println("Downloading all files for", bookname) - err = pipeline.DownloadAll(bookname, conn) + err = pipeline.DownloadAll(bookname, bookname, conn) if err != nil { log.Fatalln(err) } @@ -122,7 +122,7 @@ func main() { if *pdf { verboselog.Println("Downloading PDFs") - pipeline.DownloadPdfs(bookname, conn) + pipeline.DownloadPdfs(bookname, bookname, conn) } if *binarisedpdf || *colourpdf || *graph || *pdf { @@ -130,19 +130,19 @@ func main() { } verboselog.Println("Downloading best pages") - err = pipeline.DownloadBestPages(bookname, conn, *png) + err = pipeline.DownloadBestPages(bookname, bookname, conn, *png) if err != nil { log.Fatalln(err) } verboselog.Println("Downloading PDFs") - pipeline.DownloadPdfs(bookname, conn) + pipeline.DownloadPdfs(bookname, bookname, conn) if err != nil { log.Fatalln(err) } verboselog.Println("Downloading analyses") - err = pipeline.DownloadAnalyses(bookname, conn) + err = pipeline.DownloadAnalyses(bookname, bookname, conn) if err != nil { log.Fatalln(err) } diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go index 880bbc2..8414c53 100644 --- a/cmd/rescribe/main.go +++ b/cmd/rescribe/main.go @@ -29,6 +29,9 @@ import ( const usage = `Usage: rescribe [-v] [-t training] bookdir [savedir] Process and OCR a book using the Rescribe pipeline on a local machine. + +OCR results are saved into the bookdir directory unless savedir is +specified. ` const QueueTimeoutSecs = 2 * 60 @@ -93,17 +96,16 @@ func main() { } flag.Parse() - if flag.NArg() < 1 || flag.NArg() > 3 { + if flag.NArg() < 1 || flag.NArg() > 2 { flag.Usage() return } bookdir := flag.Arg(0) - var bookname string + bookname := filepath.Base(bookdir) + savedir := bookdir if flag.NArg() > 1 { - bookname = flag.Arg(1) - } else { - bookname = filepath.Base(bookdir) + savedir = flag.Arg(1) } var verboselog *log.Logger @@ -172,8 +174,12 @@ func main() { log.Fatalln(err) } - fmt.Printf("Saving finished book to %s\n", bookname) - err = downloadbook(bookname, conn) + fmt.Printf("Saving finished book to %s\n", savedir) + err = os.MkdirAll(savedir, 0755) + if err != nil { + log.Fatalf("Error creating save directory %s: %v", savedir, err) + } + err = downloadbook(savedir, bookname, conn) if err != nil { _ = os.RemoveAll(tempdir) log.Fatalln(err) @@ -184,7 +190,7 @@ func main() { log.Fatalf("Error removing temporary directory %s: %v", tempdir, err) } - hocrs, err := filepath.Glob(fmt.Sprintf("%s%s*hocr", bookname, string(filepath.Separator))) + hocrs, err := filepath.Glob(fmt.Sprintf("%s%s*hocr", savedir, string(filepath.Separator))) if err != nil { log.Fatalf("Error looking for .hocr files: %v", err) } @@ -195,20 +201,20 @@ func main() { log.Fatalf("Error creating txt version of %s: %v", v, err) } - err = os.MkdirAll(filepath.Join(bookname, "hocr"), 0755) + err = os.MkdirAll(filepath.Join(savedir, "hocr"), 0755) if err != nil { log.Fatalf("Error creating hocr directory: %v", err) } - err = os.Rename(v, filepath.Join(bookname, "hocr", filepath.Base(v))) + err = os.Rename(v, filepath.Join(savedir, "hocr", filepath.Base(v))) if err != nil { log.Fatalf("Error moving hocr %s to hocr directory: %v", v, err) } } // For simplicity, remove .binarised.pdf and rename .colour.pdf to .pdf - _ = os.Remove(filepath.Join(bookname, bookname + ".binarised.pdf")) - _ = os.Rename(filepath.Join(bookname, bookname + ".colour.pdf"), filepath.Join(bookname, bookname + ".pdf")) + _ = os.Remove(filepath.Join(savedir, bookname + ".binarised.pdf")) + _ = os.Rename(filepath.Join(savedir, bookname + ".colour.pdf"), filepath.Join(savedir, bookname + ".pdf")) } func addTxtVersion(hocrfn string) error { @@ -257,23 +263,23 @@ func uploadbook(dir string, name string, conn Pipeliner) error { return nil } -func downloadbook(name string, conn Pipeliner) error { +func downloadbook(dir string, name string, conn Pipeliner) error { err := os.MkdirAll(name, 0755) if err != nil { log.Fatalln("Failed to create directory", name, err) } - err = pipeline.DownloadBestPages(name, conn, false) + err = pipeline.DownloadBestPages(dir, name, conn, false) if err != nil { return fmt.Errorf("Error downloading best pages: %v", err) } - err = pipeline.DownloadPdfs(name, conn) + err = pipeline.DownloadPdfs(dir, name, conn) if err != nil { return fmt.Errorf("Error downloading PDFs: %v", err) } - err = pipeline.DownloadAnalyses(name, conn) + err = pipeline.DownloadAnalyses(dir, name, conn) if err != nil { return fmt.Errorf("Error downloading analyses: %v", err) } diff --git a/internal/pipeline/get.go b/internal/pipeline/get.go index 6949062..6c5b92c 100644 --- a/internal/pipeline/get.go +++ b/internal/pipeline/get.go @@ -12,9 +12,10 @@ import ( "strings" ) -func DownloadBestPages(name string, conn Pipeliner, pluspngs bool) error { - fn := filepath.Join(name, "best") - err := conn.Download(conn.WIPStorageId(), fn, fn) +func DownloadBestPages(dir string, name string, conn Pipeliner, pluspngs bool) error { + key := filepath.Join(name, "best") + fn := filepath.Join(dir, "best") + err := conn.Download(conn.WIPStorageId(), key, fn) if err != nil { return fmt.Errorf("Failed to download 'best' file: %v", err) } @@ -26,11 +27,12 @@ func DownloadBestPages(name string, conn Pipeliner, pluspngs bool) error { s := bufio.NewScanner(f) for s.Scan() { - fn = filepath.Join(name, s.Text()) - conn.Log("Downloading file", fn) - err = conn.Download(conn.WIPStorageId(), fn, fn) + key = filepath.Join(name, s.Text()) + fn = filepath.Join(dir, s.Text()) + conn.Log("Downloading file", key) + err = conn.Download(conn.WIPStorageId(), key, fn) if err != nil { - return fmt.Errorf("Failed to download file %s: %v", fn, err) + return fmt.Errorf("Failed to download file %s: %v", key, err) } } @@ -40,49 +42,54 @@ func DownloadBestPages(name string, conn Pipeliner, pluspngs bool) error { s = bufio.NewScanner(f) for s.Scan() { - txtfn := filepath.Join(name, s.Text()) - fn = strings.Replace(txtfn, ".hocr", ".png", 1) - conn.Log("Downloading file", fn) - err = conn.Download(conn.WIPStorageId(), fn, fn) + imgname := strings.Replace(s.Text(), ".hocr", ".png", 1) + key = filepath.Join(name, imgname) + fn = filepath.Join(dir, imgname) + conn.Log("Downloading file", key) + err = conn.Download(conn.WIPStorageId(), key, fn) if err != nil { - return fmt.Errorf("Failed to download file", fn, err) + return fmt.Errorf("Failed to download file %s: %v", key, err) } } return nil } -func DownloadPdfs(name string, conn Pipeliner) error { +func DownloadPdfs(dir string, name string, conn Pipeliner) error { for _, suffix := range []string{".colour.pdf", ".binarised.pdf"} { - fn := filepath.Join(name, name+suffix) - err := conn.Download(conn.WIPStorageId(), fn, fn) + key := filepath.Join(name, name+suffix) + fn := filepath.Join(dir, name+suffix) + err := conn.Download(conn.WIPStorageId(), key, fn) if err != nil { - return fmt.Errorf("Failed to download PDF %s: %v", fn, err) + return fmt.Errorf("Failed to download PDF %s: %v", key, err) } } return nil } -func DownloadAnalyses(name string, conn Pipeliner) error { +func DownloadAnalyses(dir string, name string, conn Pipeliner) error { for _, a := range []string{"conf", "graph.png"} { - fn := filepath.Join(name, a) - err := conn.Download(conn.WIPStorageId(), fn, fn) + key := filepath.Join(name, a) + fn := filepath.Join(dir, a) + err := conn.Download(conn.WIPStorageId(), key, fn) if err != nil { - return fmt.Errorf("Failed to download analysis file %s: %v", fn, err) + return fmt.Errorf("Failed to download analysis file %s: %v", key, err) } } return nil } -func DownloadAll(name string, conn Pipeliner) error { +func DownloadAll(dir string, name string, conn Pipeliner) error { objs, err := conn.ListObjects(conn.WIPStorageId(), name) if err != nil { return fmt.Errorf("Failed to get list of files for book", name, err) } for _, i := range objs { + base := filepath.Base(i) + fn := filepath.Join(dir, base) conn.Log("Downloading", i) - err = conn.Download(conn.WIPStorageId(), i, i) + err = conn.Download(conn.WIPStorageId(), i, fn) if err != nil { - return fmt.Errorf("Failed to download file", i, err) + return fmt.Errorf("Failed to download file %s: %v", i, err) } } return nil -- cgit v1.2.1-24-ge1ad