summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--cmd/getpipelinebook/main.go100
-rw-r--r--cmd/rescribe/main.go2
-rw-r--r--internal/pipeline/get.go33
-rw-r--r--internal/pipeline/pipeline.go5
4 files changed, 56 insertions, 84 deletions
diff --git a/cmd/getpipelinebook/main.go b/cmd/getpipelinebook/main.go
index ef13db5..5116414 100644
--- a/cmd/getpipelinebook/main.go
+++ b/cmd/getpipelinebook/main.go
@@ -6,18 +6,16 @@
package main
import (
- "bufio"
"flag"
"fmt"
"log"
"os"
"path/filepath"
- "strings"
"rescribe.xyz/bookpipeline"
-)
-// TODO: use internal/pipeline/get.go functions
+ "rescribe.xyz/bookpipeline/internal/pipeline"
+)
const usage = `Usage: getpipelinebook [-c conn] [-a] [-graph] [-pdf] [-png] [-v] bookname
@@ -35,28 +33,6 @@ func (w NullWriter) Write(p []byte) (n int, err error) {
return len(p), nil
}
-type Pipeliner interface {
- MinimalInit() error
- ListObjects(bucket string, prefix string) ([]string, error)
- Download(bucket string, key string, fn string) error
- Upload(bucket string, key string, path string) error
- CheckQueue(url string, timeout int64) (bookpipeline.Qmsg, error)
- AddToQueue(url string, msg string) error
- DelFromQueue(url string, handle string) error
- WIPStorageId() string
-}
-
-func getpdfs(conn Pipeliner, l *log.Logger, bookname string) {
- for _, suffix := range []string{".colour.pdf", ".binarised.pdf"} {
- fn := filepath.Join(bookname, bookname+suffix)
- l.Println("Downloading PDF", fn)
- err := conn.Download(conn.WIPStorageId(), fn, fn)
- if err != nil {
- log.Printf("Failed to download %s: %s\n", fn, err)
- }
- }
-}
-
func main() {
all := flag.Bool("a", false, "Get all files for book")
conntype := flag.String("c", "aws", "connection type ('aws' or 'local')")
@@ -85,7 +61,7 @@ func main() {
verboselog = log.New(n, "", log.LstdFlags)
}
- var conn Pipeliner
+ var conn pipeline.MinPipeliner
switch *conntype {
case "aws":
conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog}
@@ -111,18 +87,10 @@ func main() {
if *all {
verboselog.Println("Downloading all files for", bookname)
- objs, err := conn.ListObjects(conn.WIPStorageId(), bookname)
+ err = pipeline.DownloadAll(bookname, conn)
if err != nil {
- log.Fatalln("Failed to get list of files for book", bookname, err)
- }
- for _, i := range objs {
- verboselog.Println("Downloading", i)
- err = conn.Download(conn.WIPStorageId(), i, i)
- if err != nil {
- log.Fatalln("Failed to download file", i, err)
- }
+ log.Fatalln(err)
}
- return
}
if *binarisedpdf {
@@ -153,61 +121,29 @@ func main() {
}
if *pdf {
- getpdfs(conn, verboselog, bookname)
+ verboselog.Println("Downloading PDFs")
+ pipeline.DownloadPdfs(bookname, conn)
}
if *binarisedpdf || *colourpdf || *graph || *pdf {
return
}
- verboselog.Println("Downloading best file")
- fn := filepath.Join(bookname, "best")
- err = conn.Download(conn.WIPStorageId(), fn, fn)
+ verboselog.Println("Downloading best pages")
+ err = pipeline.DownloadBestPages(bookname, conn, *png)
if err != nil {
- log.Fatalln("Failed to download 'best' file", err)
- }
- f, err := os.Open(fn)
- if err != nil {
- log.Fatalln("Failed to open best file", err)
- }
- defer f.Close()
-
- if *png {
- verboselog.Println("Downloading png files")
- s := bufio.NewScanner(f)
- for s.Scan() {
- txtfn := filepath.Join(bookname, s.Text())
- fn = strings.Replace(txtfn, ".hocr", ".png", 1)
- verboselog.Println("Downloading file", fn)
- err = conn.Download(conn.WIPStorageId(), fn, fn)
- if err != nil {
- log.Fatalln("Failed to download file", fn, err)
- }
- }
- return
+ log.Fatalln(err)
}
- verboselog.Println("Downloading HOCR files")
- s := bufio.NewScanner(f)
- for s.Scan() {
- fn = filepath.Join(bookname, s.Text())
- verboselog.Println("Downloading file", fn)
- err = conn.Download(conn.WIPStorageId(), fn, fn)
- if err != nil {
- log.Fatalln("Failed to download file", fn, err)
- }
+ verboselog.Println("Downloading PDFs")
+ pipeline.DownloadPdfs(bookname, conn)
+ if err != nil {
+ log.Fatalln(err)
}
- verboselog.Println("Downloading PDF files")
- getpdfs(conn, verboselog, bookname)
-
- verboselog.Println("Downloading analysis files")
- for _, a := range []string{"conf", "graph.png"} {
- fn = filepath.Join(bookname, a)
- verboselog.Println("Downloading file", fn)
- err = conn.Download(conn.WIPStorageId(), fn, fn)
- if err != nil {
- log.Fatalln("Failed to download file", fn, err)
- }
+ verboselog.Println("Downloading analyses")
+ err = pipeline.DownloadAnalyses(bookname, conn)
+ if err != nil {
+ log.Fatalln(err)
}
}
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index 1a3dcff..8e2fe69 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -168,7 +168,7 @@ func downloadbook(name string, conn Pipeliner) error {
log.Fatalln("Failed to create directory", name, err)
}
- err = pipeline.DownloadBestPages(name, conn)
+ err = pipeline.DownloadBestPages(name, conn, false)
if err != nil {
return fmt.Errorf("Error downloading best pages: %v", err)
}
diff --git a/internal/pipeline/get.go b/internal/pipeline/get.go
index 8492d99..6949062 100644
--- a/internal/pipeline/get.go
+++ b/internal/pipeline/get.go
@@ -9,9 +9,10 @@ import (
"fmt"
"os"
"path/filepath"
+ "strings"
)
-func DownloadBestPages(name string, conn Pipeliner) error {
+func DownloadBestPages(name string, conn Pipeliner, pluspngs bool) error {
fn := filepath.Join(name, "best")
err := conn.Download(conn.WIPStorageId(), fn, fn)
if err != nil {
@@ -26,12 +27,27 @@ func DownloadBestPages(name string, conn Pipeliner) error {
s := bufio.NewScanner(f)
for s.Scan() {
fn = filepath.Join(name, s.Text())
+ conn.Log("Downloading file", fn)
err = conn.Download(conn.WIPStorageId(), fn, fn)
if err != nil {
return fmt.Errorf("Failed to download file %s: %v", fn, err)
}
}
+ if !pluspngs {
+ return nil
+ }
+
+ s = bufio.NewScanner(f)
+ for s.Scan() {
+ txtfn := filepath.Join(name, s.Text())
+ fn = strings.Replace(txtfn, ".hocr", ".png", 1)
+ conn.Log("Downloading file", fn)
+ err = conn.Download(conn.WIPStorageId(), fn, fn)
+ if err != nil {
+ return fmt.Errorf("Failed to download file", fn, err)
+ }
+ }
return nil
}
@@ -56,3 +72,18 @@ func DownloadAnalyses(name string, conn Pipeliner) error {
}
return nil
}
+
+func DownloadAll(name string, conn Pipeliner) error {
+ objs, err := conn.ListObjects(conn.WIPStorageId(), name)
+ if err != nil {
+ return fmt.Errorf("Failed to get list of files for book", name, err)
+ }
+ for _, i := range objs {
+ conn.Log("Downloading", i)
+ err = conn.Download(conn.WIPStorageId(), i, i)
+ if err != nil {
+ return fmt.Errorf("Failed to download file", i, err)
+ }
+ }
+ return nil
+}
diff --git a/internal/pipeline/pipeline.go b/internal/pipeline/pipeline.go
index cce5c19..c0accdb 100644
--- a/internal/pipeline/pipeline.go
+++ b/internal/pipeline/pipeline.go
@@ -52,6 +52,11 @@ type Pipeliner interface {
Log(v ...interface{})
}
+type MinPipeliner interface {
+ Pipeliner
+ MinimalInit() error
+}
+
type pageimg struct {
hocr, img string
}