diff options
-rw-r--r-- | scrape-erara.sh | 43 |
1 files changed, 43 insertions, 0 deletions
#!/bin/sh
# scrape-erara.sh - download every page image of a book hosted on e-rara.ch.
#
# Given the URL of a book index page, this script:
#   1. fetches the index page and extracts the IIIF manifest path from it,
#   2. fetches the manifest and collects every page id mentioned in it,
#   3. downloads each page as <bookid>_<pageid>.jpg into the current directory.
#
# Requires: curl, awk, tr, sort.
#
# Exit status:
#   0  all pages downloaded
#   1  usage error, index page / manifest could not be fetched or parsed,
#      or at least one page failed to download

usage="Usage: $0 eraraurl

eraraurl: The book index page, e.g.
          https://www.e-rara.ch/zut/doi/10.3931/e-rara-10416

Downloads all pages from a book on e-rara.ch, saving them to the
current directory.
"

# Print the given message on stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [ "$#" -ne 1 ]; then
  printf '%s\n' "$usage" >&2
  exit 1
fi

bookindex=$(curl -s -f "$1") || die "Error downloading book index page: $1"

# Put every tag on its own line, then take the 4th double-quote-delimited
# field of the first line mentioning "iiif-manifest" (presumably the href
# of the manifest link -- verify against the live page markup).
# tr is used instead of sed 's/</\n/g' because "\n" in a sed replacement is
# not portable outside GNU sed.
iiifpath=$(printf '%s\n' "$bookindex" \
  | tr '<' '\n' \
  | awk -F '"' '/iiif-manifest/ {print $4; exit}')
[ -n "$iiifpath" ] || die "Error: no IIIF manifest link found in book index page: $1"

# The book id is taken as the 4th "/"-separated field of the manifest path.
bookid=$(printf '%s\n' "$iiifpath" | awk -F '/' '{print $4}')
iiifurl="https://www.e-rara.ch${iiifpath}"

iiifmanifest=$(curl -s -f "$iiifurl") || die "Error downloading IIIF manifest: $iiifurl"

# Just grab all page ids that are listed anywhere in the manifest:
# split on double quotes so each string sits on its own line, then take the
# 7th "/"-separated field of every line containing "i3f".
# Note that this loses page numbering (ids are sorted and de-duplicated).
pgids=$(printf '%s\n' "$iiifmanifest" \
  | tr '"' '\n' \
  | awk -F '/' '/i3f/ {print $7}' \
  | sort -u)
[ -n "$pgids" ] || die "Error: no page ids found in IIIF manifest: $iiifurl"

# Remember whether any page failed so the exit status reflects it, while
# still attempting every remaining page.
failed=0

# Read ids line by line from a here-document rather than an unquoted
# "for i in $pgids", so ids are never subject to pathname expansion and the
# loop runs in the current shell (keeping $failed visible afterwards).
while IFS= read -r pgid; do
  [ -n "$pgid" ] || continue              # awk may emit empty fields
  [ "$pgid" = "$bookid" ] && continue     # skip book id, which is not a real page id

  pgname="${bookid}_${pgid}"
  echo "Downloading page $pgid to ${pgname}.jpg"
  pgurl="https://www.e-rara.ch/zut/i3f/v21/${pgid}/full/full/0/native.jpg"

  if ! curl -s -f "$pgurl" > "${pgname}.jpg"; then
    printf '%s\n' "Error downloading page $pgid: $pgurl" >&2
    # Do not leave an empty or partial file behind.
    rm -f -- "${pgname}.jpg"
    failed=1
  fi
done <<EOF
$pgids
EOF

exit "$failed"