diff options
| author | Nick White <git@njw.name> | 2020-11-30 19:13:53 +0000 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2020-11-30 19:13:53 +0000 | 
| commit | 190e095b04ce61041d16eb5d0109f5073b83f624 (patch) | |
| tree | 2c817a98163a5857fccfb36dea19c028ebb61301 /cmd | |
| parent | f706e96cb59d5548ed5afbbf1991dc442b8ffc09 (diff) | |
Add getstats tool
Diffstat (limited to 'cmd')
| -rw-r--r-- | cmd/getstats/main.go | 110 | 
1 files changed, 110 insertions, 0 deletions
| diff --git a/cmd/getstats/main.go b/cmd/getstats/main.go new file mode 100644 index 0000000..5ed1414 --- /dev/null +++ b/cmd/getstats/main.go @@ -0,0 +1,110 @@ +// Copyright 2020 Nick White. +// Use of this source code is governed by the GPLv3 +// license that can be found in the LICENSE file. + +// getstats gets relevant files for creating statistics from a set +// of OCRed books stored on cloud infrastructure +package main + +import ( +	"flag" +	"fmt" +	"log" +	"strings" + +	"rescribe.xyz/bookpipeline" +) + +const usage = `Usage: getstats + +Downloads every 'conf' and 'best' file, and one hocr file, from a +set of OCRed books. This is useful for statistics. +` + +// null writer to enable non-verbose logging to be discarded +type NullWriter bool + +func (w NullWriter) Write(p []byte) (n int, err error) { +	return len(p), nil +} + +type Pipeliner interface { +	MinimalInit() error +	ListObjects(bucket string, prefix string) ([]string, error) +	Download(bucket string, key string, fn string) error +	WIPStorageId() string +} + +func main() { +	flag.Usage = func() { +		fmt.Fprintf(flag.CommandLine.Output(), usage) +		flag.PrintDefaults() +	} +	flag.Parse() + +	var n NullWriter +	verboselog := log.New(n, "", log.LstdFlags) + +	var conn Pipeliner +	conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog} + +	err := conn.MinimalInit() +	if err != nil { +		log.Fatalln("Error setting up cloud connection:", err) +	} + +	log.Println("Getting list of all available objects to filter through") +	objs, err := conn.ListObjects(conn.WIPStorageId(), "") +	if err != nil { +		log.Fatalln("Failed to get list of files", err) +	} + +	log.Println("Downloading all best and conf files found") +	for _, i := range objs { +		parts := strings.Split(i, "/") +		if parts[len(parts) - 1] == "best" { +			fmt.Printf("Downloading %s to %s\n", i, parts[0] + "-best") +			err = conn.Download(conn.WIPStorageId(), i, parts[0] + "-best") +			if err != nil { +				log.Fatalln("Failed to download file", i, err) +			} +		} +		if parts[len(parts) - 1] == "conf" { +			fmt.Printf("Downloading %s to %s\n", i, parts[0] + "-conf") +			err = conn.Download(conn.WIPStorageId(), i, parts[0] + "-conf") +			if err != nil { +				log.Fatalln("Failed to download file", i, err) +			} +		} +	} + +	var bookpagesgot []string +	var found bool +	for _, i := range objs { +		parts := strings.Split(i, "/") +		if len(parts) < 2 { +			continue +		} + +		// check if we already have a hocr page for this book +		found = false +		for _, name := range bookpagesgot { +			if name == parts[0] { +				found = true +				continue +			} +		} +		if found { +			continue +		} + +		if strings.HasSuffix(parts[1], ".hocr") { +			fmt.Printf("Downloading %s to %s\n", i, parts[0] + "-hocr") +			err = conn.Download(conn.WIPStorageId(), i, parts[0] + "-hocr") +			if err != nil { +				log.Fatalln("Failed to download file", i, err) +			} +			bookpagesgot = append(bookpagesgot, parts[0]) +		} +	} +} | 
