summaryrefslogtreecommitdiff
path: root/scrape-bsb.sh
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-03-11 18:00:48 +0000
committer Nick White <git@njw.name> 2019-03-11 18:00:48 +0000
commit 7f3b848dce01a7543dc584c0f42bb80859025fa9 (patch)
tree c303db604ee904cd416d388593ba52f11d76f94c /scrape-bsb.sh
parent 5d46aba466e9448aec21182a65054739b1a7ea27 (diff)
Add basic bsb scraper
Diffstat (limited to 'scrape-bsb.sh')
-rw-r--r-- scrape-bsb.sh 20
1 files changed, 20 insertions, 0 deletions
diff --git a/scrape-bsb.sh b/scrape-bsb.sh
new file mode 100644
index 0000000..6a7c049
--- /dev/null
+++ b/scrape-bsb.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+# Download every page image that a BSB book's METS file lists under
+# 'images/150' into ./<bookid>/<pagenum>.jpg. Exits 1 on bad usage or failure.
+usage="Usage: $0 bsburl"
+test $# -ne 1 && echo "$usage" >&2 && exit 1
+
+# This isn't very robust; the book id is taken from field 8 of URLs like:
+# https://reader.digitale-sammlungen.de/de/fs1/object/display/bsb11274872_00005.html
+bookid=$(printf '%s\n' "$1" | awk -F / '{printf("%s\n", $8)}' | sed 's/_.*//g')
+test -z "$bookid" && echo "Could not find a book id in $1" >&2 && exit 1
+# -f: fail on HTTP errors rather than capturing the error page; -S: say why.
+xml=$(curl -sSf "http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml") || exit 1
+urls=$(printf '%s\n' "$xml" | grep mets:FLocat | grep 'images/150' | awk -F '"' '{print $2}')
+test -z "$urls" && echo "No page images found for $bookid" >&2 && exit 1
+mkdir -p "$bookid" || exit 1
+printf '%s\n' "$urls" | while IFS= read -r i; do
+ pgnum=$(printf '%s\n' "$i" | awk -F '_' '{print $2}')
+ echo "Downloading page $pgnum to $bookid/$pgnum.jpg"
+ curl -sSf "$i" > "${bookid}/${pgnum}.jpg" || rm -f "${bookid}/${pgnum}.jpg"
+done