diff options
author | Nick White <git@njw.name> | 2019-05-08 19:01:48 +0100 |
---|---|---|
committer | Nick White <git@njw.name> | 2019-05-08 19:01:48 +0100 |
commit | 4b43563f58f82c35399d4da9729839e48cd192f0 (patch) | |
tree | 41e2bc7e550750d103da553ca6e9265db289d76d /scrape-bnf.sh | |
parent | 44dfb6c33649efb0b45c72a3ab9ae7d629aa3172 (diff) |
Make BNF scraper much more robust
Diffstat (limited to 'scrape-bnf.sh')
-rw-r--r-- | scrape-bnf.sh | 22 |
1 files changed, 18 insertions, 4 deletions
```diff
diff --git a/scrape-bnf.sh b/scrape-bnf.sh
index 677e4d4..f939832 100644
--- a/scrape-bnf.sh
+++ b/scrape-bnf.sh
@@ -1,17 +1,31 @@
 #!/bin/sh
-usage="Usage: $0 bnfurl"
+usage="Usage: $0 bnfurl
+
+bnfurl: The book index page, e.g.
+        https://gallica.bnf.fr/ark:/12148/bpt6k6468158v
+
+Downloads all pages from a book on e-rara.com, saving them to the
+current directory."
 
 test $# -ne 1 && echo "$usage" && exit 1
 
-bookid=`echo "$1" |awk -F / '{printf("%s/%s\n", $5, $6)}'`
+bookid=`echo "$1" |awk -F / '{printf("%s/%s\n", $5, $6)}'|sed 's/\..*//g'`
 bookid_name=`echo "$bookid" | sed 's/\//_/'`
 
-html=`curl -s "https://gallica.bnf.fr/ark:/${bookid}"`
+html=`curl -f -s "https://gallica.bnf.fr/ark:/${bookid}"`
+if test $? -ne 0 ; then
+	echo "Error: Failed to download book index: https://gallica.bnf.fr/ark:/${bookid}"
+	exit 1
+fi
 
 pagenum=`echo "$html" | sed 's/.*nbTotalVues\\\"://g' | sed 's/,.*//'`
 
 for i in `seq "$pagenum"`; do
 	pgname=`printf "%s_%03d" "${bookid_name}" "${i}"`
 	echo "Downloading page $i of $pagenum to ${pgname}.jpg"
-	curl -s "https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg" > "${pgname}.jpg"
+	curl -f -s "https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg" > "${pgname}.jpg"
+	if test $? -ne 0 ; then
+		echo "Failed to download page ${pgname}: https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg"
+		rm -f "${pgname}.jpg"
+	fi
 done
```