author    | Nick White <git@njw.name> | 2019-05-08 19:39:04 +0100
committer | Nick White <git@njw.name> | 2019-05-08 19:39:04 +0100
commit    | 2d204b513319e3af13ea6ba8b79f2af024efe48b (patch)
tree      | 3b638b1edb505fe8dd72de368783659f1beea0c7 /scrape-bsb.sh
parent    | 4b43563f58f82c35399d4da9729839e48cd192f0 (diff)
Make scrapers more robust, and have them scrape into a directory per book
Diffstat (limited to 'scrape-bsb.sh')
-rw-r--r-- | scrape-bsb.sh | 31
1 file changed, 26 insertions, 5 deletions
diff --git a/scrape-bsb.sh b/scrape-bsb.sh
index 6a7c049..9e8f825 100644
--- a/scrape-bsb.sh
+++ b/scrape-bsb.sh
@@ -1,20 +1,41 @@
 #!/bin/sh
-usage="Usage: $0 bsburl"
+usage="Usage: $0 bsburl
+
+bsburl: The book index page, e.g.
+        https://reader.digitale-sammlungen.de/de/fs1/object/display/bsb11274872_00005.html
+
+Downloads all pages from a book on digitale-sammlungen.de
+
+Note that some books may initially fail in downloading the book index
+page. The reasons for this are not exactly clear, but try again in a
+few minutes and it may well work. It is probably to do with some
+strange caching issue."
 
 test $# -ne 1 && echo "$usage" && exit 1
 
-# This probably isn't very robust; presumes we'll see URLs of the form:
-# https://reader.digitale-sammlungen.de/de/fs1/object/display/bsb11274872_00005.html
 bookid=`echo "$1" | awk -F / '{printf("%s\n", $8)}'|sed 's/_.*//g'`
 
-xml=`curl -s "http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml"`
+xml=`curl -f -s "http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml"`
+
+if test $? -ne 0 ; then
+	echo "Error downloading book index page: http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml"
+	exit 1
+fi
 
 urls=`echo "$xml" | grep mets:FLocat | grep 'images/150' | awk -F '"' '{print $2}'`
 
 mkdir -p "$bookid"
+if test $? -ne 0 ; then
+	echo "Failed to mkdir $bookid"
+	exit 1
+fi
 
 echo "$urls" | while read i; do
 	pgnum=`echo "$i" | awk -F '_' '{print $2}'`
 	echo "Downloading page $pgnum to $bookid/$pgnum.jpg"
-	curl -s "$i" > "${bookid}/${pgnum}.jpg"
+	curl -f -s "$i" > "${bookid}/${pgnum}.jpg"
+	if test $? -ne 0 ; then
+		echo "Download failed for page ${pgnum}: $i"
+		rm -f "${bookid}/${pgnum}.jpg"
+	fi
 done
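For reference, a minimal standalone sketch (not part of the commit) of the URL parsing the script relies on: splitting the example index page URL on "/" and taking field 8, then stripping everything from the first "_", yields the book id that is used both for the METS index URL and for the per-book download directory. The example URL and the awk/sed pipeline are taken from the script above; the wrapper around them is purely illustrative.

    #!/bin/sh
    # Illustrative only: derive the book id and METS index URL from a reader URL,
    # using the same awk/sed pipeline as scrape-bsb.sh above.
    url="https://reader.digitale-sammlungen.de/de/fs1/object/display/bsb11274872_00005.html"

    # Field 8 of the "/"-separated URL is "bsb11274872_00005.html";
    # removing everything from the first "_" leaves "bsb11274872".
    bookid=`echo "$url" | awk -F / '{printf("%s\n", $8)}' | sed 's/_.*//g'`

    echo "book id:  $bookid"
    echo "METS URL: http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml"

Running this prints bsb11274872 and the corresponding _mets.xml URL, which is the index page the diff now fetches with curl -f so that an HTTP error status makes the script exit early instead of trying to parse an error page.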