From 2d204b513319e3af13ea6ba8b79f2af024efe48b Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.name>
Date: Wed, 8 May 2019 19:39:04 +0100
Subject: Make scrapers more robust, and have them scrape into a directory per
 book

---
 scrape-erara.sh | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'scrape-erara.sh')

diff --git a/scrape-erara.sh b/scrape-erara.sh
index c66a73b..2d0fb6f 100755
--- a/scrape-erara.sh
+++ b/scrape-erara.sh
@@ -4,8 +4,7 @@ usage="Usage: $0 eraraurl
 eraraurl: The book index page, e.g.
           https://www.e-rara.ch/zut/doi/10.3931/e-rara-10416
 
-Downloads all pages from a book on e-rara.com, saving them to the
-current directory."
+Downloads all pages from a book on e-rara.ch"
 
 test $# -ne 1 && echo "$usage" && exit 1
 
@@ -29,17 +28,23 @@ fi
 # Note that this loses page numbering.
 pgids=`echo "$iiifmanifest" | sed 's/"/\n/g' | awk -F '/' '/i3f/ {print $7}' | sort | uniq`
 
+mkdir -p "$bookid"
+if test $? -ne 0 ; then
+	echo "Failed to mkdir $bookid"
+	exit 1
+fi
+
 pgnum=0
 for i in $pgids; do
 	test $i -eq $bookid && continue # skip book id, which is not a real page id
 
 	pgnum=`expr $pgnum + 1`
-	pgname=`printf '%s_%04d' "$bookid" "$pgnum"`
-	echo "Downloading page id $i to ${pgname}.jpg"
+	pgname=`printf '%04d' "$pgnum"`
+	echo "Downloading page id $i to $bookid/$pgname.jpg"
 	pgurl="https://www.e-rara.ch/zut/i3f/v21/${i}/full/full/0/native.jpg"
-	curl -s -f "$pgurl" > "${pgname}.jpg"
+	curl -s -f "$pgurl" > "$bookid/$pgname.jpg"
 	if test $? -ne 0; then
 		echo "Error downloading page id $i (number ${pgnum}): $pgurl"
-		rm -f "${pgname}.jpg"
+		rm -f "$bookid/$pgname.jpg"
 	fi
 done
-- 
cgit v1.2.1-24-ge1ad