From 30486c55b5b226f4a099ae1511dfea38dd9acfa9 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 7 Sep 2020 17:43:05 +0100 Subject: [iiifdownloader] Split out NoPgNums downloading to its own function --- cmd/iiifdownloader/main.go | 160 ++++++++++++++++++++++++--------------------- 1 file changed, 84 insertions(+), 76 deletions(-) diff --git a/cmd/iiifdownloader/main.go b/cmd/iiifdownloader/main.go index 3572dae..9dfc067 100644 --- a/cmd/iiifdownloader/main.go +++ b/cmd/iiifdownloader/main.go @@ -59,6 +59,88 @@ func parseMets(url string) ([]string, error) { return urls, nil } +// dlNoPgNums downloads all pages, starting from zero, until either +// a 404 is returned, or identical files are returned for two subsequent +// pages (the latter being the behaviour of BNF's server). +func dlNoPgNums(bookdir, pgurlStart, pgurlEnd, pgurlAltStart, pgurlAltEnd string) error { + pgnum := 0 + for { + pgnum++ + + fmt.Printf("Downloading page %d\n", pgnum) + + fn := path.Join(bookdir, fmt.Sprintf("%04d.jpg", pgnum)) + _, err := os.Stat(fn) + if err == nil || os.IsExist(err) { + fmt.Printf("Skipping already present page %d\n", pgnum) + continue + } + + u := fmt.Sprintf("%s%d%s", pgurlStart, pgnum, pgurlEnd) + resp, err := http.Get(u) + if err != nil { + return fmt.Errorf("Error downloading page %d, %s: %v\n", pgnum, u, err) + } + defer resp.Body.Close() + switch { + case resp.StatusCode == http.StatusNotFound: + fmt.Printf("Got 404, assuming end of pages, for page %d, %s\n", pgnum, u) + return nil + case resp.StatusCode != http.StatusOK: + fmt.Printf("Error downloading page %d, %s: HTTP Code %s\n", pgnum, u, resp.Status) + + if pgurlAltStart == "" && pgurlAltEnd == "" { + return fmt.Errorf("No alternative URL to try, book failed (or ended, hopefully)") + } + + fmt.Printf("Trying to redownload page %d at lower quality\n", pgnum) + u = fmt.Sprintf("%s%d%s", pgurlAltStart, pgnum, pgurlAltEnd) + resp, err = http.Get(u) + if err != nil { + return fmt.Errorf("Error downloading page %d, %s: %v\n", pgnum, u, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("Error downloading page %d, %s: HTTP Code %s\n", pgnum, u, resp.Status) + } + } + + f, err := os.Create(fn) + defer f.Close() + if err != nil { + return fmt.Errorf("Error creating file %s: %v\n", fn, err) + } + _, err = io.Copy(f, resp.Body) + if err != nil { + return fmt.Errorf("Error writing file %s: %v\n", fn, err) + } + + // Close once finished with, as defer won't trigger until the end of the function + resp.Body.Close() + f.Close() + + // Check that the last two downloaded files aren't identical, as this + // can happen when there are no more pages to download. + if pgnum == 1 { + continue + } + fn2 := path.Join(bookdir, fmt.Sprintf("%04d.jpg", pgnum-1)) + identical, err := filesAreIdentical(fn, fn2) + if err != nil { + return fmt.Errorf("Error checking for files being identical: %v\n", err) + } + if identical { + fmt.Println("Last 2 pages were identical, looks like it's the end of the book") + err = os.Remove(fn) + if err != nil { + return fmt.Errorf("Error removing dupilicate page %d: %v", fn, err) + } + return nil + } + } +} + func main() { flag.Usage = func() { fmt.Fprintf(flag.CommandLine.Output(), usage) @@ -110,6 +192,7 @@ func main() { log.Fatalln("Failed to extract BNF book ID from URL") } bookid := f[0] + bookdir = bookid metsurl := "https://daten.digitale-sammlungen.de/~db/mets/" + bookid + "_mets.xml" pgUrls, err = parseMets(metsurl) @@ -128,81 +211,6 @@ func main() { if len(pgUrls) > 0 { fmt.Printf("I'll do something proper with these urls: %v\n", pgUrls) } else if noPgNums { - pgnum := 0 - for { - pgnum++ - - fmt.Printf("Downloading page %d\n", pgnum) - - fn := path.Join(bookdir, fmt.Sprintf("%04d.jpg", pgnum)) - _, err = os.Stat(fn) - if err == nil || os.IsExist(err) { - fmt.Printf("Skipping already present page %d\n", pgnum) - continue - } - - u := fmt.Sprintf("%s%d%s", pgurlStart, pgnum, pgurlEnd) - resp, err := http.Get(u) - if err != nil { - log.Fatalf("Error downloading page %d, %s: %v\n", pgnum, u, err) - } - defer resp.Body.Close() - switch { - case resp.StatusCode == http.StatusNotFound: - fmt.Printf("Got 404, assuming end of pages, for page %d, %s\n", pgnum, u) - return - case resp.StatusCode != http.StatusOK: - fmt.Printf("Error downloading page %d, %s: HTTP Code %s\n", pgnum, u, resp.Status) - - if pgurlAltStart == "" && pgurlAltEnd == "" { - log.Fatalln("No alternative URL to try, book failed (or ended, hopefully)") - } - - fmt.Printf("Trying to redownload page %d at lower quality\n", pgnum) - u = fmt.Sprintf("%s%d%s", pgurlAltStart, pgnum, pgurlAltEnd) - resp, err = http.Get(u) - if err != nil { - log.Fatalf("Error downloading page %d, %s: %v\n", pgnum, u, err) - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - log.Fatalf("Error downloading page %d, %s: HTTP Code %s\n", pgnum, u, resp.Status) - } - } - - f, err := os.Create(fn) - defer f.Close() - if err != nil { - log.Fatalf("Error creating file %s: %v\n", fn, err) - } - _, err = io.Copy(f, resp.Body) - if err != nil { - log.Fatalf("Error writing file %s: %v\n", fn, err) - } - - // Close once finished with, as defer won't trigger until the end of the function - resp.Body.Close() - f.Close() - - // Check that the last two downloaded files aren't identical, as this - // can happen when there are no more pages to download. - if pgnum == 1 { - continue - } - fn2 := path.Join(bookdir, fmt.Sprintf("%04d.jpg", pgnum-1)) - identical, err := filesAreIdentical(fn, fn2) - if err != nil { - log.Fatalf("Error checking for files being identical: %v\n", err) - } - if identical { - fmt.Println("Last 2 pages were identical, looks like it's the end of the book") - err = os.Remove(fn) - if err != nil { - log.Fatalf("Error removing dupilicate page %d: %v", fn, err) - } - return - } - } + dlNoPgNums(bookdir, pgurlStart, pgurlEnd, pgurlAltStart, pgurlAltEnd) } } -- cgit v1.2.1-24-ge1ad