summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick White <git@njw.name>2021-02-15 17:09:20 +0000
committerNick White <git@njw.name>2021-02-15 17:09:20 +0000
commit44a027984044a55a8a483268ddf0b841e9f33e83 (patch)
treec575a43b60757f297e94287390bc06a18e098f3f
parent11470933e4fd379b4aefa4e2bab33662a72791c2 (diff)
getsamplepages: Add -prefix option, and use 'best' to get random page numbers
The -prefix option is useful to us. Previously only a .jpg for page number 100 was retreived, which failed if the book had fewer (or unusually named) pages, and also didn't provide a corresponding .hocr at all (bug introduced with 48958d2). Using 'best', which is (effectively) randomly sorted, provides a guaranteed to exist page, and a random one at that.
-rw-r--r--cmd/getsamplepages/main.go46
1 files changed, 35 insertions, 11 deletions
diff --git a/cmd/getsamplepages/main.go b/cmd/getsamplepages/main.go
index 36c7a8e..8e35055 100644
--- a/cmd/getsamplepages/main.go
+++ b/cmd/getsamplepages/main.go
@@ -9,21 +9,21 @@ package main
import (
"flag"
"fmt"
+ "io/ioutil"
"log"
+ "os"
"strings"
"rescribe.xyz/bookpipeline"
)
-const usage = `Usage: getsamplepages
+const usage = `Usage: getsamplepages [-prefix prefix]
Downloads a sample page hocr and image from each book in a set
of OCRed books. These can then be used for various testing,
statistics, and so on.
`
-const pgnum = "0100"
-
// null writer to enable non-verbose logging to be discarded
type NullWriter bool
@@ -40,6 +40,7 @@ type Pipeliner interface {
}
func main() {
+ prefix := flag.String("prefix", "", "Only select books with this prefix (e.g. '17' for 18th century books)")
flag.Usage = func() {
fmt.Fprintf(flag.CommandLine.Output(), usage)
flag.PrintDefaults()
@@ -57,23 +58,46 @@ func main() {
log.Fatalln("Error setting up cloud connection:", err)
}
- log.Println("Getting list of all books")
+ fmt.Println("Getting list of all books")
prefixes, err := conn.ListObjectPrefixes(conn.WIPStorageId())
if err != nil {
log.Fatalln("Failed to get list of books", err)
}
for _, p := range prefixes {
+ if *prefix != "" && !strings.HasPrefix(p, *prefix) {
+ continue
+ }
+
name := strings.Split(p, "/")[0]
- log.Printf("Downloading a page from %s\n", name)
- fn := pgnum + ".jpg"
- err = conn.Download(conn.WIPStorageId(), p+fn, name+fn)
- if err != nil && strings.HasPrefix(err.Error(), "NoSuchKey:") {
- log.Printf("Skipping %s as no page %s found\n", p, pgnum)
+ err = conn.Download(conn.WIPStorageId(), p+"best", name+"best")
+ if err != nil {
+ }
+ b, err := ioutil.ReadFile(name+"best")
+ if err != nil {
+ log.Fatalf("Failed to read file %s\n", name+"best")
+ }
+ lines := strings.SplitN(string(b), "\n", 2)
+ if len(lines) == 1 {
+ fmt.Printf("No pages found for %s, skipping\n", name)
continue
- } else if err != nil {
- log.Fatalf("Download of %s%s failed: %v\n", p+fn, err)
+ }
+ pg := strings.TrimSuffix(lines[0], ".hocr")
+
+ err = os.Remove(name+"best")
+ if err != nil {
+ log.Fatalf("Failed to remove temporary best file for %s", name)
+ }
+
+ fmt.Printf("Downloading page %s from %s\n", pg, name)
+
+ for _, suffix := range []string{".png", ".hocr"} {
+ fn := pg + suffix
+ err = conn.Download(conn.WIPStorageId(), p+fn, name+fn)
+ if err != nil {
+ log.Fatalf("Download of %s%s failed: %v\n", p+fn, err)
+ }
}
}
}