From 8aa2d713d511baeaa94472e238e46c6d02ac9332 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 15 Jun 2020 15:14:48 +0100 Subject: Add getallhocrs tool --- cmd/getallhocrs/main.go | 77 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 cmd/getallhocrs/main.go diff --git a/cmd/getallhocrs/main.go b/cmd/getallhocrs/main.go new file mode 100644 index 0000000..136f07e --- /dev/null +++ b/cmd/getallhocrs/main.go @@ -0,0 +1,77 @@ +// Copyright 2020 Nick White. +// Use of this source code is governed by the GPLv3 +// license that can be found in the LICENSE file. + +// getallhocrs downloads every 'best' file from a set of OCRed books +// stored on cloud infrastructure +package main + +import ( + "flag" + "fmt" + "log" + "os" + "strings" + + "rescribe.xyz/bookpipeline" +) + +const usage = `Usage: getallhocrs + +Downloads every 'hocr' file. +` + +type Pipeliner interface { + Init() error + Download(bucket string, key string, fn string) error + ListObjects(bucket string, prefix string) ([]string, error) + ListObjectPrefixes(bucket string) ([]string, error) + Log(v ...interface{}) + WIPStorageId() string +} + +func main() { + flag.Usage = func() { + fmt.Fprintf(flag.CommandLine.Output(), usage) + flag.PrintDefaults() + } + flag.Parse() + + verboselog := log.New(os.Stdout, "", log.LstdFlags) + + var conn Pipeliner + conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog} + + err := conn.Init() + if err != nil { + log.Fatalln("Error setting up cloud connection:", err) + } + + conn.Log("Getting list of all books") + prefixes, err := conn.ListObjectPrefixes(conn.WIPStorageId()) + if err != nil { + log.Fatalln("Failed to get list of prefixes", err) + } + + for _, p := range prefixes { + conn.Log("Getting list of files for book", p) + objs, err := conn.ListObjects(conn.WIPStorageId(), p) + if err != nil { + log.Fatalln("Failed to get list of files", err) + } + err = os.MkdirAll(p, 0755) + if err != nil { + log.Fatalln("Failed to make directory", err) + } + conn.Log("Downloading hocrs from book", p) + for _, o := range objs { + if !strings.HasSuffix(o, ".hocr") { + continue + } + err = conn.Download(conn.WIPStorageId(), o, o) + if err != nil { + log.Fatalln("Failed to download file", o, err) + } + } + } +} -- cgit v1.2.1-24-ge1ad