summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-05-08 19:01:48 +0100
committer Nick White <git@njw.name> 2019-05-08 19:01:48 +0100
commit 4b43563f58f82c35399d4da9729839e48cd192f0 (patch)
tree 41e2bc7e550750d103da553ca6e9265db289d76d
parent 44dfb6c33649efb0b45c72a3ab9ae7d629aa3172 (diff)
Make BNF scraper much more robust
-rw-r--r-- scrape-bnf.sh 22
-rwxr-xr-x scrape-erara.sh 2
2 files changed, 19 insertions, 5 deletions
diff --git a/scrape-bnf.sh b/scrape-bnf.sh
index 677e4d4..f939832 100644
--- a/scrape-bnf.sh
+++ b/scrape-bnf.sh
@@ -1,17 +1,31 @@
#!/bin/sh
-usage="Usage: $0 bnfurl"
+usage="Usage: $0 bnfurl
+
+bnfurl: The book index page, e.g.
+ https://gallica.bnf.fr/ark:/12148/bpt6k6468158v
+
+Downloads all pages from a book on e-rara.com, saving them to the
+current directory."
test $# -ne 1 && echo "$usage" && exit 1
-bookid=`echo "$1" |awk -F / '{printf("%s/%s\n", $5, $6)}'`
+bookid=`echo "$1" |awk -F / '{printf("%s/%s\n", $5, $6)}'|sed 's/\..*//g'`
bookid_name=`echo "$bookid" | sed 's/\//_/'`
-html=`curl -s "https://gallica.bnf.fr/ark:/${bookid}"`
+html=`curl -f -s "https://gallica.bnf.fr/ark:/${bookid}"`
+if test $? -ne 0 ; then
+ echo "Error: Failed to download book index: https://gallica.bnf.fr/ark:/${bookid}"
+ exit 1
+fi
pagenum=`echo "$html" | sed 's/.*nbTotalVues\\\"://g' | sed 's/,.*//'`
for i in `seq "$pagenum"`; do
pgname=`printf "%s_%03d" "${bookid_name}" "${i}"`
echo "Downloading page $i of $pagenum to ${pgname}.jpg"
- curl -s "https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg" > "${pgname}.jpg"
+ curl -f -s "https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg" > "${pgname}.jpg"
+ if test $? -ne 0 ; then
+ echo "Failed to download page ${pgname}: https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg"
+ rm -f "${pgname}.jpg"
+ fi
done
diff --git a/scrape-erara.sh b/scrape-erara.sh
index c2da6f2..c66a73b 100755
--- a/scrape-erara.sh
+++ b/scrape-erara.sh
@@ -5,7 +5,7 @@ eraraurl: The book index page, e.g.
https://www.e-rara.ch/zut/doi/10.3931/e-rara-10416
Downloads all pages from a book on e-rara.com, saving them to the
-current directory. "
+current directory."
test $# -ne 1 && echo "$usage" && exit 1