From 2d204b513319e3af13ea6ba8b79f2af024efe48b Mon Sep 17 00:00:00 2001
From: Nick White
Date: Wed, 8 May 2019 19:39:04 +0100
Subject: Make scrapers more robust, and have them scrape into a directory per
 book

---
 scrape-bnf.sh | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/scrape-bnf.sh b/scrape-bnf.sh
index f939832..a0c49ae 100644
--- a/scrape-bnf.sh
+++ b/scrape-bnf.sh
@@ -4,13 +4,12 @@ usage="Usage: $0 bnfurl
 
 bnfurl: The book index page, e.g. https://gallica.bnf.fr/ark:/12148/bpt6k6468158v
 
-Downloads all pages from a book on e-rara.com, saving them to the
-current directory."
+Downloads all pages from a book on bnf.fr"
 
 test $# -ne 1 && echo "$usage" && exit 1
 
 bookid=`echo "$1" |awk -F / '{printf("%s/%s\n", $5, $6)}'|sed 's/\..*//g'`
-bookid_name=`echo "$bookid" | sed 's/\//_/'`
+bookname=`echo "$bookid" | sed 's/\//_/'`
 
 html=`curl -f -s "https://gallica.bnf.fr/ark:/${bookid}"`
 if test $? -ne 0 ; then
@@ -20,12 +19,18 @@ fi
 
 pagenum=`echo "$html" | sed 's/.*nbTotalVues\\\"://g' | sed 's/,.*//'`
 
+mkdir -p "$bookname"
+if test $? -ne 0 ; then
+	echo "Failed to mkdir $bookname"
+	exit 1
+fi
+
 for i in `seq "$pagenum"`; do
-	pgname=`printf "%s_%03d" "${bookid_name}" "${i}"`
-	echo "Downloading page $i of $pagenum to ${pgname}.jpg"
-	curl -f -s "https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg" > "${pgname}.jpg"
+	pgname=`printf '%04d' "${i}"`
+	echo "Downloading page $i of $pagenum to $bookname/$pgname.jpg"
+	curl -f -s "https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg" > "$bookname/$pgname.jpg"
 	if test $? -ne 0 ; then
 		echo "Failed to download page ${pgname}: https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg"
-		rm -f "${pgname}.jpg"
+		rm -f "$bookname/$pgname.jpg"
 	fi
 done
-- 
cgit v1.2.1-24-ge1ad
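
For context, a minimal sketch (not part of the patch above) of how the revised script might be run; the URL is the example from the script's own usage text, and the total page count shown is illustrative only:

	$ sh scrape-bnf.sh https://gallica.bnf.fr/ark:/12148/bpt6k6468158v
	Downloading page 1 of 480 to 12148_bpt6k6468158v/0001.jpg
	Downloading page 2 of 480 to 12148_bpt6k6468158v/0002.jpg
	...

With this change the pages land in a per-book directory derived from the ark path (here 12148_bpt6k6468158v/), with zero-padded names such as 0001.jpg, instead of being written to the current directory as 12148_bpt6k6468158v_001.jpg and so on.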