1 files changed, 76 insertions, 0 deletions
diff --git a/scrape-erara.sh b/scrape-erara.sh
new file mode 100644
index 0000000..11754d6
--- /dev/null
+++ b/scrape-erara.sh
@@ -0,0 +1,76 @@
+#!/bin/sh
+# Download every page image of a book hosted on e-rara.ch.
+#
+# The book index page is fetched to locate the book's IIIF manifest, the
+# manifest is scanned for page ids, and each page is saved to the current
+# directory as <bookid>_<pageid>.jpg. Error messages go to stderr.
+#
+# Exit status: 0 if every page was saved; 1 on a usage error, when the
+# index page or manifest cannot be fetched or parsed, or when any page
+# download failed.
+usage="Usage: $0 eraraurl
+
+eraraurl: The book index page, e.g.
+          https://www.e-rara.ch/zut/doi/10.3931/e-rara-10416
+
+Downloads all pages from a book on e-rara.ch, saving them to the
+current directory. "
+
+if test $# -ne 1; then
+	printf '%s\n' "$usage" >&2
+	exit 1
+fi
+
+if ! bookindex=$(curl -s -f "$1"); then
+	echo "Error downloading book index page: $1" >&2
+	exit 1
+fi
+
+# Put every tag on its own line, then take the 4th quote-delimited field of
+# the first line mentioning iiif-manifest. tr is used because a "\n" in a
+# sed replacement is not portable across sed implementations.
+iiifpath=$(printf '%s\n' "$bookindex" | tr '<' '\n' |
+	awk -F '"' '/iiif-manifest/ {print $4; exit}')
+if test -z "$iiifpath"; then
+	echo "Error: no IIIF manifest link found in book index page: $1" >&2
+	exit 1
+fi
+# The book id is the 4th slash-separated field of the manifest path.
+bookid=$(printf '%s\n' "$iiifpath" | awk -F '/' '{print $4}')
+iiifurl="https://www.e-rara.ch${iiifpath}"
+
+if ! iiifmanifest=$(curl -s -f "$iiifurl"); then
+	echo "Error downloading IIIF manifest: $iiifurl" >&2
+	exit 1
+fi
+
+# Just grab all page ids that are listed anywhere in the manifest
+# Note that this loses page numbering.
+pgids=$(printf '%s\n' "$iiifmanifest" | tr '"' '\n' |
+	awk -F '/' '/i3f/ {print $7}' | sort -u)
+if test -z "$pgids"; then
+	echo "Error: no page ids found in IIIF manifest: $iiifurl" >&2
+	exit 1
+fi
+
+failed=0
+# $pgids is deliberately unquoted so it splits into one id per word; set -f
+# keeps ids taken from remote data from being expanded as glob patterns.
+set -f
+for i in $pgids; do
+	# skip book id, which is not a real page id; compare as strings so an
+	# empty or non-numeric id cannot make test fail with a syntax error
+	test "$i" = "$bookid" && continue
+
+	pgname="${bookid}_${i}"
+	echo "Downloading page $i to ${pgname}.jpg"
+	pgurl="https://www.e-rara.ch/zut/i3f/v21/${i}/full/full/0/native.jpg"
+	if ! curl -s -f "$pgurl" > "${pgname}.jpg"; then
+		echo "Error downloading page $i: $pgurl" >&2
+		rm -f -- "${pgname}.jpg"   # do not leave a partial file behind
+		failed=1
+	fi
+done
+
+# Keep going past failed pages, but report them through the exit status.
+exit "$failed"