summaryrefslogtreecommitdiff
path: root/scrape-bnf.sh
blob: a0c49aed8f5c8f4ead4e8007fc99859356b64c85 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/bin/sh
# Downloads all page images of a book hosted on gallica.bnf.fr.
# Takes exactly one argument: the URL of the book's index page.

# Help text printed when the script is called with the wrong number of
# arguments.
usage="Usage: $0 bnfurl

bnfurl: The book index page, e.g.
        https://gallica.bnf.fr/ark:/12148/bpt6k6468158v

Downloads all pages from a book on bnf.fr"

# Exactly one argument is required. The exit is a separate statement so that
# it still happens when printing the usage text fails (for example because
# stdout has been closed); chained with '&&' it would be skipped in that case
# and the script would carry on without a valid argument.
if [ "$#" -ne 1 ]; then
	echo "$usage"
	exit 1
fi

# extract_bookid URL
#   Print the "<field5>/<field6>" part of URL, where the fields are obtained
#   by splitting URL on '/'. For the URL shape shown in the usage text
#   (https://gallica.bnf.fr/ark:/12148/bpt6k6468158v) that is
#   "12148/bpt6k6468158v".
#   Anything from the first '.', '?' or '#' onwards is dropped, so suffixes
#   such as ".r=...", "?rk=..." or "#..." do not end up in the identifier.
extract_bookid() {
	# printf instead of echo: echo may treat a value starting with '-' or
	# containing backslashes specially, depending on the shell.
	printf '%s\n' "$1" |
		awk -F / '{printf("%s/%s\n", $5, $6)}' |
		sed 's/[.?#].*//'
}

# Identifier used to build the download URLs.
bookid=$(extract_bookid "$1")
# Output directory name: the identifier with its first '/' replaced by '_'.
bookname=$(printf '%s\n' "$bookid" | sed 's/\//_/')

# Fetch the book's index page into $html; the page count is extracted from it
# further down. curl's -f turns HTTP error responses into a non-zero exit
# status and -s suppresses the progress meter.
index_url="https://gallica.bnf.fr/ark:/${bookid}"
if ! html=$(curl -f -s "$index_url"); then
	echo "Error: Failed to download book index: $index_url"
	exit 1
fi

# parse_pagenum
#   Read the index page on stdin and print the number that follows the text
#   nbTotalVues\": (a literal backslash before the quote), or print nothing
#   when that text is not present.
#   NOTE(review): presumably this is a JSON-escaped field embedded in the
#   page markup; the exact page format cannot be confirmed from this script.
#   -n plus the 'p' flag prints only the line that matched, so the other
#   lines of a multi-line page never leak into the result; only the digits
#   are captured, and head keeps the first match if several lines match.
parse_pagenum() {
	sed -n 's/.*nbTotalVues\\": *\([0-9][0-9]*\).*/\1/p' | head -n 1
}

# Total number of pages of the book, taken from the downloaded index page.
# printf is used instead of echo because some sh implementations expand
# backslash sequences in echo arguments, which would alter the page text.
pagenum=$(printf '%s\n' "$html" | parse_pagenum)

# Create the output directory for the page images. With -p an already
# existing directory is not treated as an error.
if ! mkdir -p "$bookname"; then
	echo "Failed to mkdir $bookname"
	exit 1
fi

# Stop if the page count is empty or not a plain decimal number. Without this
# check the loop below would have nothing valid to count to and the script
# would finish "successfully" without downloading anything.
case "$pagenum" in
	'' | *[!0-9]*)
		echo "Error: Could not determine the number of pages from the book index"
		exit 1
		;;
esac

# Download pages 1..pagenum, one image per page, into
# $bookname/NNNN.jpg (page number zero-padded to four digits).
# A plain counter loop is used because seq is not a POSIX utility.
i=1
while [ "$i" -le "$pagenum" ]; do
	pgname=$(printf '%04d' "$i")
	pgurl="https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg"
	echo "Downloading page $i of $pagenum to $bookname/$pgname.jpg"
	if ! curl -f -s "$pgurl" > "$bookname/$pgname.jpg"; then
		# Best effort: report the failed page, remove the incomplete
		# file created by the redirection, and continue with the next page.
		echo "Failed to download page ${pgname}: $pgurl"
		rm -f "$bookname/$pgname.jpg"
	fi
	i=$((i + 1))
done