summary | refs | log | tree | commit | diff
path: root/scrape-bsb.sh
diff options
context:
space:
mode:
Diffstat (limited to 'scrape-bsb.sh')
-rw-r--r-- scrape-bsb.sh 31
1 files changed, 26 insertions, 5 deletions
diff --git a/scrape-bsb.sh b/scrape-bsb.sh
index 6a7c049..9e8f825 100644
--- a/scrape-bsb.sh
+++ b/scrape-bsb.sh
@@ -1,20 +1,41 @@
#!/bin/sh
-usage="Usage: $0 bsburl"
+usage="Usage: $0 bsburl
+
+bsburl: The book index page, e.g.
+ https://reader.digitale-sammlungen.de/de/fs1/object/display/bsb11274872_00005.html
+
+Downloads all pages from a book on digitale-sammlungen.de
+
+Note that some books may initially fail in downloading the book index
+page. The reasons for this are not exactly clear, but try again in a
+few minutes and it may well work. It is probably to do with some
+strange caching issue."
test $# -ne 1 && echo "$usage" && exit 1
-# This probably isn't very robust; presumes we'll see URLs of the form:
-# https://reader.digitale-sammlungen.de/de/fs1/object/display/bsb11274872_00005.html
bookid=`echo "$1" | awk -F / '{printf("%s\n", $8)}'|sed 's/_.*//g'`
-xml=`curl -s "http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml"`
+xml=`curl -f -s "http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml"`
+
+if test $? -ne 0 ; then
+ echo "Error downloading book index page: http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml"
+ exit 1
+fi
urls=`echo "$xml" | grep mets:FLocat | grep 'images/150' | awk -F '"' '{print $2}'`
mkdir -p "$bookid"
+if test $? -ne 0 ; then  # exit status of the mkdir -p on the line above
+ echo "Failed to mkdir $bookid"
+ exit 1
+fi
echo "$urls" | while read i; do
pgnum=`echo "$i" | awk -F '_' '{print $2}'`
echo "Downloading page $pgnum to $bookid/$pgnum.jpg"
- curl -s "$i" > "${bookid}/${pgnum}.jpg"
+ curl -f -s "$i" > "${bookid}/${pgnum}.jpg"
+ if test $? -ne 0 ; then
+ echo "Download failed for page ${pgnum}: $i"
+ rm -f "${bookid}/${pgnum}.jpg"
+ fi
done