Make scrapers more robust, and have them scrape into a directory per book

author: Nick White <git@njw.name> 2019-05-08 19:39:04 +0100
committer: Nick White <git@njw.name> 2019-05-08 19:39:04 +0100
commit: 2d204b513319e3af13ea6ba8b79f2af024efe48b (patch)
tree: 3b638b1edb505fe8dd72de368783659f1beea0c7
parent: 4b43563f58f82c35399d4da9729839e48cd192f0 (diff)
3 files changed, 49 insertions, 18 deletions
diff --git a/scrape-bnf.sh b/scrape-bnf.sh
index f939832..a0c49ae 100644
--- a/scrape-bnf.sh
+++ b/scrape-bnf.sh
@@ -4,13 +4,12 @@ usage="Usage: $0 bnfurl
 bnfurl: The book index page, e.g.
         https://gallica.bnf.fr/ark:/12148/bpt6k6468158v
 
-Downloads all pages from a book on e-rara.com, saving them to the
-current directory."
+Downloads all pages from a book on bnf.fr"
 
 test $# -ne 1 && echo "$usage" && exit 1
 
 bookid=`echo "$1" |awk -F / '{printf("%s/%s\n", $5, $6)}'|sed 's/\..*//g'`
-bookid_name=`echo "$bookid" | sed 's/\//_/'`
+bookname=`echo "$bookid" | sed 's/\//_/'`
 
 html=`curl -f -s "https://gallica.bnf.fr/ark:/${bookid}"`
 if test $? -ne 0 ; then
@@ -20,12 +19,18 @@ fi
 
 pagenum=`echo "$html" | sed 's/.*nbTotalVues\\\"://g' | sed 's/,.*//'`
 
+mkdir -p "$bookname"
+if test $? -ne 0 ; then
+	echo "Failed to mkdir $bookname"
+	exit 1
+fi
+
 for i in `seq "$pagenum"`; do
-	pgname=`printf "%s_%03d" "${bookid_name}" "${i}"`
-	echo "Downloading page $i of $pagenum to ${pgname}.jpg"
-	curl -f -s "https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg" > "${pgname}.jpg"
+	pgname=`printf '%04d' "${i}"`
+	echo "Downloading page $i of $pagenum to $bookname/$pgname.jpg"
+	curl -f -s "https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg" > "$bookname/$pgname.jpg"
 	if test $? -ne 0 ; then
 		echo "Failed to download page ${pgname}: https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg"
-		rm -f "${pgname}.jpg"
+		rm -f "$bookname/$pgname.jpg"
 	fi
 done
diff --git a/scrape-bsb.sh b/scrape-bsb.sh
index 6a7c049..9e8f825 100644
--- a/scrape-bsb.sh
+++ b/scrape-bsb.sh
@@ -1,20 +1,41 @@
 #!/bin/sh
-usage="Usage: $0 bsburl"
+usage="Usage: $0 bsburl
+
+bsburl: The book index page, e.g.
+        https://reader.digitale-sammlungen.de/de/fs1/object/display/bsb11274872_00005.html
+
+Downloads all pages from a book on digitale-sammlungen.de
+
+Note that some books may initially fail in downloading the book index
+page. The reasons for this are not exactly clear, but try again in a
+few minutes and it may well work. It is probably to do with some
+strange caching issue."
 
 test $# -ne 1 && echo "$usage" && exit 1
 
-# This probably isn't very robust; presumes we'll see URLs of the form:
-# https://reader.digitale-sammlungen.de/de/fs1/object/display/bsb11274872_00005.html
 bookid=`echo "$1" | awk -F / '{printf("%s\n", $8)}'|sed 's/_.*//g'`
 
-xml=`curl -s "http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml"`
+xml=`curl -f -s "http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml"`
+
+if test $? -ne 0 ; then
+	echo "Error downloading book index page: http://daten.digitale-sammlungen.de/~db/mets/${bookid}_mets.xml"
+	exit 1
+fi
 
 urls=`echo "$xml" | grep mets:FLocat | grep 'images/150' | awk -F '"' '{print $2}'`
 
 mkdir -p "$bookid"
+if test $? -ne 0 ; then
+	echo "Failed to mkdir $bookid"
+	exit 1
+fi
 
 echo "$urls" | while read i; do
 	pgnum=`echo "$i" | awk -F '_' '{print $2}'`
 	echo "Downloading page $pgnum to $bookid/$pgnum.jpg"
-	curl -s "$i" > "${bookid}/${pgnum}.jpg"
+	curl -f -s "$i" > "${bookid}/${pgnum}.jpg"
+	if test $? -ne 0 ; then
+		echo "Download failed for page ${pgnum}: $i"
+		rm -f "${bookid}/${pgnum}.jpg"
+	fi
 done
diff --git a/scrape-erara.sh b/scrape-erara.sh
index c66a73b..2d0fb6f 100755
--- a/scrape-erara.sh
+++ b/scrape-erara.sh
@@ -4,8 +4,7 @@ usage="Usage: $0 eraraurl
 eraraurl: The book index page, e.g.
           https://www.e-rara.ch/zut/doi/10.3931/e-rara-10416
 
-Downloads all pages from a book on e-rara.com, saving them to the
-current directory."
+Downloads all pages from a book on e-rara.ch"
 
 test $# -ne 1 && echo "$usage" && exit 1
 
@@ -29,17 +28,23 @@ fi
 # Note that this loses page numbering.
 pgids=`echo "$iiifmanifest" | sed 's/"/\n/g' | awk -F '/' '/i3f/ {print $7}' | sort | uniq`
 
+mkdir -p "$bookid"
+if test $? -ne 0 ; then
+	echo "Failed to mkdir $bookid"
+	exit 1
+fi
+
 pgnum=0
 for i in $pgids; do
 	test $i -eq $bookid && continue # skip book id, which is not a real page id
 
 	pgnum=`expr $pgnum + 1`
-	pgname=`printf '%s_%04d' "$bookid" "$pgnum"`
-	echo "Downloading page id $i to ${pgname}.jpg"
+	pgname=`printf '%04d' "$pgnum"`
+	echo "Downloading page id $i to $bookid/$pgname.jpg"
 	pgurl="https://www.e-rara.ch/zut/i3f/v21/${i}/full/full/0/native.jpg"
-	curl -s -f "$pgurl" > "${pgname}.jpg"
+	curl -s -f "$pgurl" > "$bookid/$pgname.jpg"
 	if test $? -ne 0; then
 		echo "Error downloading page id $i (number ${pgnum}): $pgurl"
-		rm -f "${pgname}.jpg"
+		rm -f "$bookid/$pgname.jpg"
 	fi
 done
author	Nick White <git@njw.name>	2019-05-08 19:39:04 +0100
committer	Nick White <git@njw.name>	2019-05-08 19:39:04 +0100
commit	2d204b513319e3af13ea6ba8b79f2af024efe48b (patch)
tree	3b638b1edb505fe8dd72de368783659f1beea0c7
parent	4b43563f58f82c35399d4da9729839e48cd192f0 (diff)