summaryrefslogtreecommitdiff
path: root/cmd/getstats/main.go
diff options
context:
space:
mode:
authorNick White <git@njw.name>2020-11-30 19:14:47 +0000
committerNick White <git@njw.name>2020-11-30 19:14:47 +0000
commitdf364cbf93e4ab5b4db8b924b8396f4fb9caa149 (patch)
tree79e0e82d95eb73b9a7f50c23048e819203f7881b /cmd/getstats/main.go
parent0d914a5de3f8169d41df4fcff1ee4aea6d01afbe (diff)
parent190e095b04ce61041d16eb5d0109f5073b83f624 (diff)
Merge branch 'master' of ssh://hammerhead/home/nick/rescribe/src/bookpipeline
Diffstat (limited to 'cmd/getstats/main.go')
-rw-r--r--cmd/getstats/main.go110
1 files changed, 110 insertions, 0 deletions
diff --git a/cmd/getstats/main.go b/cmd/getstats/main.go
new file mode 100644
index 0000000..5ed1414
--- /dev/null
+++ b/cmd/getstats/main.go
@@ -0,0 +1,110 @@
+// Copyright 2020 Nick White.
+// Use of this source code is governed by the GPLv3
+// license that can be found in the LICENSE file.
+
+// getstats gets relevant files for creating statistics from a set
+// of OCRed books stored on cloud infrastructure
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "strings"
+
+ "rescribe.xyz/bookpipeline"
+)
+
+const usage = `Usage: getstats
+
+Downloads every 'conf' and 'best' file, and one hocr file, from a
+set of OCRed books. This is useful for statistics.
+`
+
+// null writer to enable non-verbose logging to be discarded
+type NullWriter bool
+
+func (w NullWriter) Write(p []byte) (n int, err error) {
+ return len(p), nil
+}
+
+type Pipeliner interface {
+ MinimalInit() error
+ ListObjects(bucket string, prefix string) ([]string, error)
+ Download(bucket string, key string, fn string) error
+ WIPStorageId() string
+}
+
+func main() {
+ flag.Usage = func() {
+ fmt.Fprintf(flag.CommandLine.Output(), usage)
+ flag.PrintDefaults()
+ }
+ flag.Parse()
+
+ var n NullWriter
+ verboselog := log.New(n, "", log.LstdFlags)
+
+ var conn Pipeliner
+ conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog}
+
+ err := conn.MinimalInit()
+ if err != nil {
+ log.Fatalln("Error setting up cloud connection:", err)
+ }
+
+ log.Println("Getting list of all available objects to filter through")
+ objs, err := conn.ListObjects(conn.WIPStorageId(), "")
+ if err != nil {
+ log.Fatalln("Failed to get list of files", err)
+ }
+
+ log.Println("Downloading all best and conf files found")
+ for _, i := range objs {
+ parts := strings.Split(i, "/")
+ if parts[len(parts) - 1] == "best" {
+ fmt.Printf("Downloading %s to %s\n", i, parts[0] + "-best")
+ err = conn.Download(conn.WIPStorageId(), i, parts[0] + "-best")
+ if err != nil {
+ log.Fatalln("Failed to download file", i, err)
+ }
+ }
+ if parts[len(parts) - 1] == "conf" {
+ fmt.Printf("Downloading %s to %s\n", i, parts[0] + "-conf")
+ err = conn.Download(conn.WIPStorageId(), i, parts[0] + "-conf")
+ if err != nil {
+ log.Fatalln("Failed to download file", i, err)
+ }
+ }
+ }
+
+ var bookpagesgot []string
+ var found bool
+ for _, i := range objs {
+ parts := strings.Split(i, "/")
+ if len(parts) < 2 {
+ continue
+ }
+
+ // check if we already have a hocr page for this book
+ found = false
+ for _, name := range bookpagesgot {
+ if name == parts[0] {
+ found = true
+ continue
+ }
+ }
+ if found {
+ continue
+ }
+
+ if strings.HasSuffix(parts[1], ".hocr") {
+ fmt.Printf("Downloading %s to %s\n", i, parts[0] + "-hocr")
+ err = conn.Download(conn.WIPStorageId(), i, parts[0] + "-hocr")
+ if err != nil {
+ log.Fatalln("Failed to download file", i, err)
+ }
+ bookpagesgot = append(bookpagesgot, parts[0])
+ }
+ }
+}