scrape-erara.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

#!/bin/sh
# scrape-erara.sh - download every page image of a book hosted on e-rara.ch.
#
# The book index page given on the command line is fetched to locate the
# path of its IIIF manifest. Every page id referenced in that manifest is
# then downloaded as a JPEG into a directory named after the book id
# (relative to the current directory), as 0001.jpg, 0002.jpg, ...
#
# Exit status: 1 on usage error or when the index page, the manifest, or the
# output directory cannot be obtained. Failures of individual page downloads
# are reported on stderr but do not abort the run.
#
# Requires: curl, tr, awk, sort.
usage="Usage: $0 eraraurl

eraraurl: The book index page, e.g.
          https://www.e-rara.ch/zut/doi/10.3931/e-rara-10416

Downloads all pages from a book on e-rara.ch"

# Print the given message on stderr and abort with status 1.
die() {
	printf '%s\n' "$*" >&2
	exit 1
}

test $# -eq 1 || die "$usage"

bookindex=$(curl -s -f "$1") || die "Error downloading book index page: $1"

# Break the HTML into one tag per line and take the 4th double-quote
# delimited field of the first tag mentioning "iiif-manifest".
# printf is used instead of echo so backslashes in the page survive intact,
# and tr instead of sed because "\n" in a sed replacement is not portable.
iiifpath=$(printf '%s\n' "$bookindex" \
	| tr '<' '\n' \
	| awk -F '"' '/iiif-manifest/ && !seen++ {print $4}')
test -n "$iiifpath" || die "Could not find an IIIF manifest link in: $1"

# The book id is the 4th slash-separated field of the manifest path
# (the field after the leading empty one and two path components).
bookid=$(printf '%s\n' "$iiifpath" | awk -F '/' '{print $4}')
test -n "$bookid" || die "Could not determine book id from manifest path: $iiifpath"

iiifurl="https://www.e-rara.ch${iiifpath}"

iiifmanifest=$(curl -s -f "$iiifurl") || die "Error downloading IIIF manifest: $iiifurl"

# Just grab all page ids that are listed anywhere in the manifest:
# split on double quotes, keep strings containing "i3f", and take the 7th
# slash-separated field of each.
# Note that this loses page numbering; pages are numbered in sorted id order.
pgids=$(printf '%s\n' "$iiifmanifest" \
	| tr '"' '\n' \
	| awk -F '/' '/i3f/ {print $7}' \
	| sort -u)
test -n "$pgids" || die "No page ids found in IIIF manifest: $iiifurl"

mkdir -p -- "$bookid" || die "Failed to mkdir $bookid"

pgnum=0
# $pgids is deliberately unquoted: it is a newline-separated list and word
# splitting yields one id per iteration (blank entries are dropped).
for i in $pgids; do
	# Skip the book id, which is not a real page id. String comparison is
	# used so an unexpected non-numeric id cannot make test itself fail.
	test "$i" = "$bookid" && continue

	pgnum=$((pgnum + 1))
	pgname=$(printf '%04d' "$pgnum")
	pgfile="$bookid/$pgname.jpg"
	echo "Downloading page id $i to $pgfile"
	# NOTE(review): the "zut" collection prefix is hard-coded here regardless
	# of the collection in the URL passed by the user - confirm this is intended.
	pgurl="https://www.e-rara.ch/zut/i3f/v21/${i}/full/full/0/native.jpg"
	if ! curl -s -f "$pgurl" > "$pgfile"; then
		echo "Error downloading page id $i (number ${pgnum}): $pgurl" >&2
		# Remove the empty/partial file left behind by the redirection.
		rm -f -- "$pgfile"
	fi
done