blob: 2d0fb6f73a5023c608a940531c1604aeaf460947 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
#!/bin/sh
# Download all page images of a book hosted on www.e-rara.ch.
#
# Usage: <script> eraraurl
#   eraraurl  URL of the book index page, e.g.
#             https://www.e-rara.ch/zut/doi/10.3931/e-rara-10416
#
# Steps:
#   1. Fetch the book index page and read the IIIF manifest path from it.
#   2. Fetch the manifest and collect every page id referenced in it.
#   3. Save each page as <bookid>/<NNNN>.jpg under the current directory,
#      where NNNN is a counter over the sorted page ids.
#
# Exit status: 0 when every page was saved; 1 on a usage error, when the
# index page or manifest cannot be fetched or parsed, when the output
# directory cannot be created, when no page is found, or when at least one
# page download failed.
#
# Requires: curl, awk, tr, sort.

usage="Usage: $0 eraraurl
eraraurl: The book index page, e.g.
https://www.e-rara.ch/zut/doi/10.3931/e-rara-10416
Downloads all pages from a book on e-rara.ch"

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

test $# -eq 1 || die "$usage"

bookindex=$(curl -s -f "$1") || die "Error downloading book index page: $1"

# Put every tag on its own line, then take the 4th double-quote-delimited
# field of the first line mentioning iiif-manifest.
# tr is used for the split because "\n" in a sed replacement is unspecified
# by POSIX, and printf because sh's echo may interpret backslashes in the
# downloaded data.
iiifpath=$(printf '%s\n' "$bookindex" | tr '<' '\n' \
  | awk -F '"' '/iiif-manifest/ {print $4; exit}')
test -n "$iiifpath" || die "No IIIF manifest link found in book index page: $1"

# The book id is the 4th '/'-separated field of the manifest path.
bookid=$(printf '%s\n' "$iiifpath" | awk -F '/' '{print $4}')
test -n "$bookid" || die "Could not determine book id from IIIF path: $iiifpath"

iiifurl="https://www.e-rara.ch${iiifpath}"
iiifmanifest=$(curl -s -f "$iiifurl") \
  || die "Error downloading IIIF manifest: $iiifurl"

# Just grab all page ids that are listed anywhere in the manifest.
# Note that this loses page numbering: ids are sorted as strings and
# numbered in that order.
pgids=$(printf '%s\n' "$iiifmanifest" | tr '"' '\n' \
  | awk -F '/' '/i3f/ {print $7}' | sort -u)

mkdir -p -- "$bookid" || die "Failed to mkdir $bookid"

pgnum=0
failed=0
# $pgids is expanded unquoted on purpose (one word per id); disable pathname
# expansion so an unexpected id containing glob characters stays literal.
set -f
for i in $pgids; do
  # Skip the book id, which is not a real page id.
  test "$i" = "$bookid" && continue
  pgnum=$((pgnum + 1))
  pgname=$(printf '%04d' "$pgnum")
  printf '%s\n' "Downloading page id $i to $bookid/$pgname.jpg"
  pgurl="https://www.e-rara.ch/zut/i3f/v21/${i}/full/full/0/native.jpg"
  if ! curl -s -f "$pgurl" > "$bookid/$pgname.jpg"; then
    printf '%s\n' "Error downloading page id $i (number ${pgnum}): $pgurl" >&2
    # Do not leave an empty or partial image behind.
    rm -f -- "$bookid/$pgname.jpg"
    failed=$((failed + 1))
  fi
done
set +f

test "$pgnum" -gt 0 || die "No page ids found in IIIF manifest: $iiifurl"
test "$failed" -eq 0 || die "$failed of $pgnum pages failed to download"
|