#!/bin/sh
# Download every page image of a book hosted on digitale-sammlungen.de.
#
# Usage: <script> bsburl
#   bsburl  URL of the book index page; the book id is taken from the 8th
#           '/'-separated field of this URL, up to the first '_'.
#
# Fetches the METS XML for the book id, collects the page image URLs it
# lists, and saves each one as <bookid>/<pagenumber>.jpg.
#
# Exit status: 1 on bad arguments, an unparsable URL, a failed METS download,
# an empty page list, or a failed mkdir. A failure to download an individual
# page is reported on stderr but does not abort the run.

usage="Usage: $0 bsburl

bsburl: The book index page, e.g.
        https://reader.digitale-sammlungen.de/de/fs1/object/display/bsb11274872_00005.html

Downloads all pages from a book on digitale-sammlungen.de

Note that some books may initially fail in downloading the book
index page. The reasons for this are not exactly clear, but try
again in a few minutes and it may well work. It is probably to do
with some strange caching issue."

if test $# -ne 1; then
  printf '%s\n' "$usage" >&2
  exit 1
fi

# Field 8 of the URL split on '/' is the page file name (e.g.
# bsb11274872_00005.html); dropping everything from the first '_' onwards
# leaves the book id.
bookid=$(printf '%s\n' "$1" | awk -F / '{printf("%s\n", $8)}' | sed 's/_.*//')
if test -z "$bookid"; then
  printf 'Could not extract a book id from: %s\n' "$1" >&2
  printf '%s\n' "$usage" >&2
  exit 1
fi

metsurl="http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml"

# -f makes curl return non-zero on HTTP errors; -s silences progress output.
if ! xml=$(curl -f -s "$metsurl"); then
  printf 'Error downloading book index page: %s\n' "$metsurl" >&2
  exit 1
fi

# Keep the mets:FLocat lines that mention 'images/150' and take the text
# between the first pair of double quotes on each (presumably the image href
# - confirm against a real METS file).
urls=$(printf '%s\n' "$xml" \
  | grep mets:FLocat \
  | grep 'images/150' \
  | awk -F '"' '{print $2}')
if test -z "$urls"; then
  printf 'No page image URLs found in: %s\n' "$metsurl" >&2
  exit 1
fi

if ! mkdir -p -- "$bookid"; then
  printf 'Failed to mkdir %s\n' "$bookid" >&2
  exit 1
fi

printf '%s\n' "$urls" | while read -r pageurl; do
  # The page number is the second '_'-separated field of the image URL.
  pgnum=$(printf '%s\n' "$pageurl" | awk -F '_' '{print $2}')
  if test -z "$pgnum"; then
    printf 'Skipping URL without a page number: %s\n' "$pageurl" >&2
    continue
  fi

  outfile="${bookid}/${pgnum}.jpg"
  echo "Downloading page $pgnum to $bookid/$pgnum.jpg"
  if ! curl -f -s "$pageurl" > "$outfile"; then
    printf 'Download failed for page %s: %s\n' "$pgnum" "$pageurl" >&2
    # Remove the empty or partial file the redirection left behind.
    rm -f -- "$outfile"
  fi
done