summaryrefslogtreecommitdiff
path: root/scrape-bsb.sh
blob: 6a7c049a32d25ce36cd2a6ff6521c977523cfecf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#!/bin/sh
# Download the page images of a book hosted on digitale-sammlungen.de.
#
# Given the URL of one page in the online reader, this script:
#   1. takes the book id from the URL,
#   2. fetches the book's METS XML description,
#   3. saves every image URL under 'images/150' listed there as
#      <bookid>/<pagenumber>.jpg in the current directory.
#
# Arguments: $1 - reader URL of any page of the book
# Exit status: 1 on a usage error, or when the book id, the METS data or
#              the page list cannot be obtained. Individual page download
#              failures are reported on stderr and skipped.
usage="Usage: $0 bsburl"

if test $# -ne 1; then
	echo "$usage" >&2
	exit 1
fi

# This probably isn't very robust; presumes we'll see URLs of the form:
# https://reader.digitale-sammlungen.de/de/fs1/object/display/bsb11274872_00005.html
# Split on '/', the 8th field is the file name; the book id is whatever
# precedes its first '_'.
bookid=$(printf '%s\n' "$1" | awk -F / '{printf("%s\n", $8)}' | sed 's/_.*//')

# Without an id we would request "_mets.xml" and try to mkdir "".
if test -z "$bookid"; then
	echo "$0: could not extract a book id from '$1'" >&2
	exit 1
fi

# -f makes curl fail on HTTP errors instead of handing us an error page;
# -S keeps error messages visible despite -s.
xml=$(curl -fsS "http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml") || {
	echo "$0: failed to fetch METS data for $bookid" >&2
	exit 1
}

# The image URL is the first double-quoted value on each matching line.
urls=$(printf '%s\n' "$xml" | grep mets:FLocat | grep 'images/150' | awk -F '"' '{print $2}')

# An empty list would otherwise drive the loop once with an empty URL.
if test -z "$urls"; then
	echo "$0: no page images found for $bookid" >&2
	exit 1
fi

mkdir -p -- "$bookid" || exit 1

# IFS= and -r keep each URL exactly as listed (no trimming, no backslash
# processing).
printf '%s\n' "$urls" | while IFS= read -r i; do
	test -n "$i" || continue
	# The page number is taken as the second '_'-separated field of the URL.
	pgnum=$(printf '%s\n' "$i" | awk -F '_' '{print $2}')
	echo "Downloading page $pgnum to $bookid/$pgnum.jpg"
	curl -fsS "$i" > "${bookid}/${pgnum}.jpg" || {
		echo "$0: failed to download $i" >&2
		# The redirection created the file even though the download
		# failed; don't leave an empty or partial image behind.
		rm -f -- "${bookid}/${pgnum}.jpg"
	}
done