From 91902623777e693d1aadc525a9e6f453aa763712 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Tue, 8 Sep 2020 14:17:35 +0100
Subject: [iiifdownloader] BSB downloading works now, by parsing METS XML

---
 cmd/iiifdownloader/main.go | 142 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 131 insertions(+), 11 deletions(-)

diff --git a/cmd/iiifdownloader/main.go b/cmd/iiifdownloader/main.go
index 9dfc067..cccc293 100644
--- a/cmd/iiifdownloader/main.go
+++ b/cmd/iiifdownloader/main.go
@@ -1,12 +1,14 @@
 package main
 
 import (
+	"encoding/xml"
 	"flag"
 	"fmt"
 	"io"
 	"io/ioutil"
 	"log"
 	"net/http"
+	"net/url"
 	"os"
 	"path"
 	"strings"
@@ -18,10 +20,12 @@ Downloads all pages from a IIIF server.
 
 Currently supports the following IIIF using services:
 - BNF's Gallica (any book or page URL should work)
+- BSB / MDZ / DFG
 `
 
 const bnfPrefix = `https://gallica.bnf.fr/ark:/`
 const bsbPrefix = `https://reader.digitale-sammlungen.de/de/fs1/object/display/`
+const dfgPrefix = `http://dfg-viewer.de/`
 
 func filesAreIdentical(fn1, fn2 string) (bool, error) {
 	f1, err := os.Open(fn1)
@@ -51,14 +55,90 @@ func filesAreIdentical(fn1, fn2 string) (bool, error) {
 	return true, nil
 }
 
-func parseMets(url string) ([]string, error) {
+func parseMets(u string) ([]string, error) {
 	var urls []string
-	// TODO: download and parse xml;
-	//       https://daten.digitale-sammlungen.de/~db/mets/bsb11274872_mets.xml
-	//       mets:mets -> mets:fileSec -> mets:fileGrp USE="MAX" -> mets:file -> mets:FLocat xlink:href
+
+	// designed to be unmarshalled by encoding/xml's Unmarshal()
+	type metsXML struct {
+		FileGrps []struct {
+			Attr  string `xml:"USE,attr"`
+			Files []struct {
+				Url string `xml:"href,attr"`
+			} `xml:"file>FLocat"`
+			XMLName xml.Name `xml:"fileGrp"`
+		} `xml:"fileSec>fileGrp"`
+	}
+
+	resp, err := http.Get(u)
+	if err != nil {
+		return urls, fmt.Errorf("Error downloading mets %s: %v", u, err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return urls, fmt.Errorf("Error downloading mets %s: %v", u, resp.Status)
+	}
+
+	b, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return urls, fmt.Errorf("Error reading mets XML %s: %v", u, err)
+	}
+
+	v := metsXML{}
+	err = xml.Unmarshal(b, &v)
+	if err != nil {
+		return urls, fmt.Errorf("Error parsing mets XML %s: %v", u, err)
+	}
+
+	for _, grp := range v.FileGrps {
+		if grp.Attr == "MAX" {
+			for _, f := range grp.Files {
+				urls = append(urls, f.Url)
+			}
+		}
+	}
+
 	return urls, nil
 }
 
+func dlPage(bookdir, u string) error {
+	b := path.Base(u)
+	ext := path.Ext(u)
+	if len(ext) == 0 {
+		ext = ".jpg"
+	}
+	field := strings.Split(b, ".")
+	name := field[0][len(field[0])-4:] + ext
+	fn := path.Join(bookdir, name)
+
+	fmt.Printf("Downloading page %s to %s\n", u, fn)
+
+	resp, err := http.Get(u)
+	if err != nil {
+		return fmt.Errorf("Error downloading page %s: %v", u, err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("Error downloading page %s: %v", u, resp.Status)
+	}
+
+	f, err := os.Create(fn)
+	if err != nil {
+		return fmt.Errorf("Error creating file %s: %v\n", fn, err)
+	}
+	defer f.Close()
+	_, err = io.Copy(f, resp.Body)
+	if err != nil {
+		return fmt.Errorf("Error writing file %s: %v\n", fn, err)
+	}
+
+	resp.Body.Close()
+	f.Close()
+
+	return nil
+}
+
 // dlNoPgNums downloads all pages, starting from zero, until either
 // a 404 is returned, or identical files are returned for two subsequent
 // pages (the latter being the behaviour of BNF's server).
@@ -153,7 +233,7 @@ func main() {
 		return
 	}
 
-	url := flag.Arg(0)
+	u := flag.Arg(0)
 
 	var bookdir string
 	var pgurlStart, pgurlEnd string
@@ -163,8 +243,8 @@ func main() {
 	var err error
 
 	switch {
-	case strings.HasPrefix(url, bnfPrefix):
-		f := strings.Split(url[len(bnfPrefix):], "/")
+	case strings.HasPrefix(u, bnfPrefix):
+		f := strings.Split(u[len(bnfPrefix):], "/")
 		if len(f) < 2 {
 			log.Fatalln("Failed to extract BNF book ID from URL")
 		}
@@ -186,8 +266,8 @@ func main() {
 		// the missing ones in less good quality from an alternative URL.
 		pgurlAltStart = "https://gallica.bnf.fr/ark:/" + bookid + "/f"
 		pgurlAltEnd = ".highres"
-	case strings.HasPrefix(url, bsbPrefix):
-		f := strings.Split(url[len(bsbPrefix):], "_")
+	case strings.HasPrefix(u, bsbPrefix):
+		f := strings.Split(u[len(bsbPrefix):], "_")
 		if len(f) < 2 {
 			log.Fatalln("Failed to extract BNF book ID from URL")
 		}
@@ -195,6 +275,39 @@ func main() {
 		bookdir = bookid
 		metsurl := "https://daten.digitale-sammlungen.de/~db/mets/" + bookid + "_mets.xml"
 
+		pgUrls, err = parseMets(metsurl)
+		if err != nil {
+			log.Fatalf("Error parsing mets url %s: %v\n", metsurl, err)
+		}
+	case strings.HasPrefix(u, dfgPrefix):
+		// dfg can have a url encoded mets url in several parts of the viewer url
+		metsNames := []string{"set[mets]", "tx_dlf[id]"}
+		var metsurl string
+		escurl, err := url.QueryUnescape(u)
+		if err != nil {
+			log.Fatalf("Error unescaping url %s: %v\n", u, err)
+		}
+		for _, v := range metsNames {
+			i := strings.Index(escurl, v)
+			if i != -1 {
+				start := i + len(v) + 1 // +1 is to pass the equals sign
+				end := strings.Index(escurl[start:], "&")
+				if end == -1 {
+					end = len(escurl)
+				} else {
+					end += start
+				}
+				metsurl = escurl[start:end]
+			}
+		}
+		if len(metsurl) == 0 {
+			log.Fatalf("No mets url found in %s\n", u)
+		}
+
+		b := path.Base(metsurl)
+		f := strings.Split(b, "_")
+		bookdir = f[0]
+
 		pgUrls, err = parseMets(metsurl)
 		if err != nil {
 			log.Fatalf("Error parsing mets url %s: %v\n", metsurl, err)
@@ -209,8 +322,15 @@ func main() {
 	}
 
 	if len(pgUrls) > 0 {
-		fmt.Printf("I'll do something proper with these urls: %v\n", pgUrls)
+		for _, v := range pgUrls {
+			dlPage(bookdir, v)
+		}
 	} else if noPgNums {
-		dlNoPgNums(bookdir, pgurlStart, pgurlEnd, pgurlAltStart, pgurlAltEnd)
+		err = dlNoPgNums(bookdir, pgurlStart, pgurlEnd, pgurlAltStart, pgurlAltEnd)
+		if err != nil {
+			log.Fatalf("Error downloading pages: %v\n", err)
+		}
+	} else {
+		log.Fatalf("Failed to find any pages\n")
 	}
 }
--
cgit v1.2.1-24-ge1ad
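
A note on the METS parsing: the metsXML struct relies on encoding/xml matching elements and attributes by their local names, so mets:fileGrp and xlink:href are picked up by the plain "fileGrp" and "href" tags. The sketch below shows that behaviour in isolation. The sample XML is a hand-written stand-in for the real files served from daten.digitale-sammlungen.de, trimmed to just the elements parseMets reads, and the example.org urls are invented.

package main

import (
	"encoding/xml"
	"fmt"
	"log"
)

// Hand-written stand-in for a METS document, reduced to the elements
// that parseMets actually reads.
const sample = `<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xlink="http://www.w3.org/1999/xlink">
  <mets:fileSec>
    <mets:fileGrp USE="THUMBS">
      <mets:file><mets:FLocat xlink:href="https://example.org/thumb/00001.jpg"/></mets:file>
    </mets:fileGrp>
    <mets:fileGrp USE="MAX">
      <mets:file><mets:FLocat xlink:href="https://example.org/max/00001.jpg"/></mets:file>
      <mets:file><mets:FLocat xlink:href="https://example.org/max/00002.jpg"/></mets:file>
    </mets:fileGrp>
  </mets:fileSec>
</mets:mets>`

// Same shape as the struct declared inside parseMets.
type metsXML struct {
	FileGrps []struct {
		Attr  string `xml:"USE,attr"`
		Files []struct {
			Url string `xml:"href,attr"`
		} `xml:"file>FLocat"`
		XMLName xml.Name `xml:"fileGrp"`
	} `xml:"fileSec>fileGrp"`
}

func main() {
	var v metsXML
	if err := xml.Unmarshal([]byte(sample), &v); err != nil {
		log.Fatal(err)
	}
	// Only the USE="MAX" group holds the full-resolution page images;
	// the THUMBS group above is skipped.
	for _, grp := range v.FileGrps {
		if grp.Attr != "MAX" {
			continue
		}
		for _, f := range grp.Files {
			fmt.Println(f.Url)
		}
	}
}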
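
dlPage names each downloaded file after the last four characters of the url basename's stem (the page number, on BSB servers) plus the original extension, falling back to ".jpg" for extensionless urls. Here is a small sketch of that derivation; pageName is a hypothetical helper and the url is made up.

package main

import (
	"fmt"
	"path"
	"strings"
)

// pageName mirrors the naming logic in dlPage: take the last four
// characters of the basename's stem and reattach the extension.
// It assumes the stem is at least four characters long, as BSB
// basenames are.
func pageName(u string) string {
	ext := path.Ext(u) // includes the leading dot, e.g. ".jpg"
	if ext == "" {
		ext = ".jpg"
	}
	stem := strings.Split(path.Base(u), ".")[0]
	return stem[len(stem)-4:] + ext
}

func main() {
	// Made-up url in the style of the BSB full-resolution image links.
	fmt.Println(pageName("https://example.org/max/bsb00001234_00042.jpg")) // 0042.jpg
}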
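
The dfg branch finds the METS url by scanning the query-unescaped viewer url by hand. An alternative worth considering is to let net/url parse the query string, which decodes bracketed keys like set%5Bmets%5D automatically. This is a sketch, not what the patch does; metsFromViewer is a hypothetical helper and the viewer url is invented.

package main

import (
	"fmt"
	"log"
	"net/url"
)

// metsFromViewer looks up the mets url under the parameter names the
// dfg viewer is known to use.
func metsFromViewer(viewer string) (string, error) {
	u, err := url.Parse(viewer)
	if err != nil {
		return "", err
	}
	q := u.Query() // decodes keys and values, so set%5Bmets%5D becomes set[mets]
	for _, key := range []string{"set[mets]", "tx_dlf[id]"} {
		if v := q.Get(key); v != "" {
			return v, nil
		}
	}
	return "", fmt.Errorf("no mets url found in %s", viewer)
}

func main() {
	// Invented example in the shape of a dfg-viewer link.
	v := "http://dfg-viewer.de/show/?set%5Bmets%5D=https%3A%2F%2Fexample.org%2Fobj%2Fbsb00001234_mets.xml"
	m, err := metsFromViewer(v)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(m) // https://example.org/obj/bsb00001234_mets.xml
}

One behavioural difference to be aware of: the manual scan in the patch lets the last matching parameter name win, whereas this version returns the first name that yields a value.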