summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-05-08 13:11:36 +0100
committer Nick White <git@njw.name> 2019-05-08 13:11:36 +0100
commit 710ff20cdb4fd435f95e0f0fd6cacb8838aaf3c9 (patch)
tree f917075f68bcbb4a9a1376fd529128ff043fb619
parent e50bd37655e55bf47eb0973e860f79441b0e7c9f (diff)
Add scrape-erara.sh script (not fully tested)
-rw-r--r--scrape-erara.sh43
1 files changed, 43 insertions, 0 deletions
diff --git a/scrape-erara.sh b/scrape-erara.sh
new file mode 100644
index 0000000..11754d6
--- /dev/null
+++ b/scrape-erara.sh
@@ -0,0 +1,75 @@
+#!/bin/sh
+# scrape-erara.sh - download every page image of a book hosted on e-rara.ch.
+#
+# Usage: scrape-erara.sh eraraurl
+#
+# Fetches the book index page given as the only argument, extracts the IIIF
+# manifest path from it, downloads the manifest, and saves every page id
+# listed there as <bookid>_<pageid>.jpg in the current directory.
+#
+# Exit status: 0 if every page was saved; 1 on a usage error, when the index
+# page or manifest cannot be fetched or parsed, or when any page download fails.
+# Progress goes to stdout, diagnostics to stderr. Requires curl.
+
+usage="Usage: $0 eraraurl
+
+eraraurl: The book index page, e.g.
+ https://www.e-rara.ch/zut/doi/10.3931/e-rara-10416
+
+Downloads all pages from a book on e-rara.ch, saving them to the
+current directory. "
+
+if test $# -ne 1; then
+  printf '%s\n' "$usage" >&2
+  exit 1
+fi
+
+if ! bookindex=$(curl -s -f "$1"); then
+  printf 'Error downloading book index page: %s\n' "$1" >&2
+  exit 1
+fi
+
+# Put every tag on its own line, then take the 4th double-quote-delimited
+# field of the one mentioning "iiif-manifest". tr is used for the split
+# because "\n" in a sed replacement is not portable across sed versions.
+iiifpath=$(printf '%s\n' "$bookindex" | tr '<' '\n' | awk -F '"' '/iiif-manifest/ {print $4}')
+# The book id is the 4th slash-separated field of the manifest path.
+bookid=$(printf '%s\n' "$iiifpath" | awk -F '/' '{print $4}')
+if test -z "$iiifpath" || test -z "$bookid"; then
+  printf 'Error finding IIIF manifest path in book index page: %s\n' "$1" >&2
+  exit 1
+fi
+iiifurl="https://www.e-rara.ch${iiifpath}"
+
+if ! iiifmanifest=$(curl -s -f "$iiifurl"); then
+  printf 'Error downloading IIIF manifest: %s\n' "$iiifurl" >&2
+  exit 1
+fi
+
+# Just grab all page ids that are listed anywhere in the manifest
+# Note that this loses page numbering.
+pgids=$(printf '%s\n' "$iiifmanifest" | tr '"' '\n' | awk -F '/' '/i3f/ {print $7}' | sort -u)
+if test -z "$pgids"; then
+  printf 'Error finding page ids in IIIF manifest: %s\n' "$iiifurl" >&2
+  exit 1
+fi
+
+status=0
+# $pgids is deliberately unquoted: one id per whitespace-separated word.
+for i in $pgids; do
+  # Skip the book id, which is not a real page id. Compared as a string so
+  # that a non-numeric id cannot make test(1) fail.
+  test "$i" = "$bookid" && continue
+
+  pgname="${bookid}_${i}"
+  echo "Downloading page $i to ${pgname}.jpg"
+  pgurl="https://www.e-rara.ch/zut/i3f/v21/${i}/full/full/0/native.jpg"
+  if ! curl -s -f "$pgurl" > "${pgname}.jpg"; then
+    printf 'Error downloading page %s: %s\n' "$i" "$pgurl" >&2
+    # Do not leave an empty or partial image behind.
+    rm -f -- "${pgname}.jpg"
+    status=1
+  fi
+done
+
+exit "$status"