summary refs log tree commit diff
path: root/cmd/getsamplepages
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2020-08-19 14:15:37 +0100
committer Nick White <git@njw.name> 2020-08-19 14:15:37 +0100
commit 029752e39023c4a7118f5e244d05faa75e1486d0 (patch)
tree 8642acd37d84410b588636c1400fa587bd21ea52 /cmd/getsamplepages
parent 8d8d599e1d08f5514e5761ee2ce5fd640203ae12 (diff)
Add getsamplepages
Diffstat (limited to 'cmd/getsamplepages')
-rw-r--r-- cmd/getsamplepages/main.go 88
1 files changed, 88 insertions, 0 deletions
diff --git a/cmd/getsamplepages/main.go b/cmd/getsamplepages/main.go
new file mode 100644
index 0000000..55cb8b2
--- /dev/null
+++ b/cmd/getsamplepages/main.go
@@ -0,0 +1,88 @@
+// Copyright 2020 Nick White.
+// Use of this source code is governed by the GPLv3
+// license that can be found in the LICENSE file.
+
+// getsamplepages downloads sample pages from each book in a
+// set of OCRed books.
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "strings"
+
+ "rescribe.xyz/bookpipeline"
+)
+
+const (
+	// usage is the help text printed ahead of the flag defaults when
+	// the command's usage is requested.
+	usage = `Usage: getsamplepages
+
+Downloads a sample page hocr and image from each book in a set
+of OCRed books. These can then be used for various testing,
+statistics, and so on.
+`
+
+	// pgnum is the leading component of every file name requested
+	// from a book (presumably the number of the page being sampled).
+	pgnum = "0100"
+)
+
+// NullWriter is a writer that throws away everything written to it.
+// It lets a logger be constructed whose (verbose) output is discarded.
+type NullWriter bool
+
+// Write reports the whole of p as written without storing any of it.
+// It never returns an error.
+func (NullWriter) Write(p []byte) (int, error) {
+	return len(p), nil
+}
+
+// Pipeliner is the set of storage operations this command requires
+// from a cloud connection.
+type Pipeliner interface {
+	// Init must succeed before the connection is used.
+	Init() error
+
+	// WIPStorageId returns the identifier passed as the bucket
+	// argument to the listing and download methods.
+	WIPStorageId() string
+
+	// ListObjectPrefixes returns the prefixes found in bucket; main
+	// treats each one as a book.
+	ListObjectPrefixes(bucket string) ([]string, error)
+
+	// ListObjects returns the objects in bucket matching prefix
+	// (presumably their keys; confirm against the implementation).
+	ListObjects(bucket, prefix string) ([]string, error)
+
+	// Download fetches the object key from bucket into fn
+	// (presumably a local file path; confirm against the
+	// implementation).
+	Download(bucket, key, fn string) error
+}
+
+// main connects to cloud storage, lists every book prefix in the WIP
+// storage bucket and, for each book, tries to download the hocr and
+// png of page pgnum for each known binarisation variant. Variants
+// that do not exist for a book are skipped; any other failure is
+// fatal.
+func main() {
+	flag.Usage = func() {
+		// usage is plain text rather than a format string, so it is
+		// printed verbatim.
+		fmt.Fprint(flag.CommandLine.Output(), usage)
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+
+	// The connection's verbose log output is discarded.
+	var discard NullWriter
+	verboselog := log.New(discard, "", log.LstdFlags)
+
+	var conn Pipeliner = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog}
+
+	if err := conn.Init(); err != nil {
+		log.Fatalln("Error setting up cloud connection:", err)
+	}
+
+	log.Println("Getting list of all books")
+	prefixes, err := conn.ListObjectPrefixes(conn.WIPStorageId())
+	if err != nil {
+		log.Fatalln("Failed to get list of books", err)
+	}
+
+	for _, p := range prefixes {
+		// The book name is everything before the first "/" of the prefix.
+		name := strings.Split(p, "/")[0]
+		log.Printf("Downloading a page from %s\n", name)
+
+		for _, variant := range []string{"_bin0.0", "_bin0.2", "_bin0.4", "_bin0.5"} {
+			// The hocr is fetched first; the png is only attempted
+			// once the hocr for the same variant has been found.
+			for _, ext := range []string{".hocr", ".png"} {
+				fn := pgnum + variant + ext
+				err = conn.Download(conn.WIPStorageId(), p+fn, name+fn)
+				if isNoSuchKey(err) {
+					// Nothing stored under this key: give up on this
+					// variant and move on to the next one.
+					break
+				}
+				if err != nil {
+					log.Fatalf("Download of %s failed: %v\n", p+fn, err)
+				}
+			}
+		}
+	}
+}
+
+// isNoSuchKey reports whether err is a "key does not exist" error
+// from the storage backend, recognised by its "NoSuchKey:" message
+// prefix. A nil err is never a missing-key error.
+func isNoSuchKey(err error) bool {
+	return err != nil && strings.HasPrefix(err.Error(), "NoSuchKey:")
+}