diff options
Diffstat (limited to 'scrape-bsb.sh')
-rw-r--r-- | scrape-bsb.sh | 20 |
1 files changed, 20 insertions, 0 deletions
#!/bin/sh
# scrape-bsb.sh - download the page images of one book from
# digitale-sammlungen.de into a directory named after the book id.
#
# Usage: scrape-bsb.sh bsburl
#
# Steps:
#   1. Derive the book id from the reader URL given as the only argument.
#   2. Fetch the book's METS XML description.
#   3. Collect the URLs found on mets:FLocat lines that contain "images/150"
#      (presumably one image size variant - TODO confirm against the METS).
#   4. Save each URL as <bookid>/<pagenumber>.jpg.
#
# Exit status: 0 when every page was downloaded, 1 on a usage error, a
# setup failure, or when at least one page could not be downloaded.

set -u

usage="Usage: $0 bsburl"

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [ "$#" -ne 1 ]; then
  printf '%s\n' "$usage" >&2
  exit 1
fi

# This probably isn't very robust; presumes we'll see URLs of the form:
# https://reader.digitale-sammlungen.de/de/fs1/object/display/bsb11274872_00005.html
# i.e. the 8th "/"-separated field is "<bookid>_<page>.html".
bookid=$(printf '%s\n' "$1" | awk -F / '{printf("%s\n", $8)}' | sed 's/_.*//g')

# The id is spliced into a URL and used as a directory name, so refuse
# anything that is empty or not purely alphanumeric. An empty id would
# otherwise make the output path start at "/".
case $bookid in
  '' | *[!A-Za-z0-9]*)
    die "$0: could not derive a book id from '$1'"
    ;;
esac

# -f makes curl fail on HTTP errors instead of handing back an error page;
# -sS keeps it quiet but still reports the reason for a failure.
xml=$(curl -fsS "http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml") ||
  die "$0: failed to fetch the METS description for $bookid"

# The URL is the second double-quote-delimited field of each matching line.
urls=$(printf '%s\n' "$xml" |
  grep mets:FLocat |
  grep 'images/150' |
  awk -F '"' '{print $2}')

[ -n "$urls" ] || die "$0: no page image URLs found for $bookid"

mkdir -p -- "$bookid" || die "$0: cannot create directory $bookid"

# Read from a here-document rather than a pipe so the loop runs in the
# current shell and the failure flag survives it.
failed=0
while IFS= read -r url; do
  [ -n "$url" ] || continue

  # Page number: second "_"-separated field of the image URL.
  pgnum=$(printf '%s\n' "$url" | awk -F '_' '{print $2}')

  # Skip URLs that yield no page number, or one that would point outside
  # the book directory.
  case $pgnum in
    '' | */*)
      printf '%s\n' "$0: skipping unexpected image URL: $url" >&2
      failed=1
      continue
      ;;
  esac

  out="${bookid}/${pgnum}.jpg"
  echo "Downloading page $pgnum to $bookid/$pgnum.jpg"
  if ! curl -fsS "$url" > "$out"; then
    printf '%s\n' "$0: download failed: $url" >&2
    # Do not leave an empty or partial file behind.
    rm -f -- "$out"
    failed=1
  fi
done <<EOF
$urls
EOF

exit "$failed"