diff options
author | Nick White <git@njw.name> | 2019-05-08 19:39:04 +0100 |
---|---|---|
committer | Nick White <git@njw.name> | 2019-05-08 19:39:04 +0100 |
commit | 2d204b513319e3af13ea6ba8b79f2af024efe48b (patch) | |
tree | 3b638b1edb505fe8dd72de368783659f1beea0c7 /scrape-erara.sh | |
parent | 4b43563f58f82c35399d4da9729839e48cd192f0 (diff) |
Make scrapers more robust, and have them scrape into a directory per book
Diffstat (limited to 'scrape-erara.sh')
-rwxr-xr-x | scrape-erara.sh | 17 |
1 files changed, 11 insertions, 6 deletions
```diff
diff --git a/scrape-erara.sh b/scrape-erara.sh
index c66a73b..2d0fb6f 100755
--- a/scrape-erara.sh
+++ b/scrape-erara.sh
@@ -4,8 +4,7 @@ usage="Usage: $0 eraraurl
 eraraurl: The book index page, e.g. https://www.e-rara.ch/zut/doi/10.3931/e-rara-10416
-Downloads all pages from a book on e-rara.com, saving them to the
-current directory."
+Downloads all pages from a book on e-rara.com"
 test $# -ne 1 && echo "$usage" && exit 1
@@ -29,17 +28,23 @@ fi
 # Note that this loses page numbering.
 pgids=`echo "$iiifmanifest" | sed 's/"/\n/g' | awk -F '/' '/i3f/ {print $7}' | sort | uniq`
+mkdir -p "$bookid"
+if test $? -ne 0 ; then
+ echo "Failed to mkdir $bookid"
+ exit 1
+fi
+
 pgnum=0
 for i in $pgids; do
 test $i -eq $bookid && continue # skip book id, which is not a real page id
 pgnum=`expr $pgnum + 1`
- pgname=`printf '%s_%04d' "$bookid" "$pgnum"`
- echo "Downloading page id $i to ${pgname}.jpg"
+ pgname=`printf '%04d' "$pgnum"`
+ echo "Downloading page id $i to $bookid/$pgname.jpg"
 pgurl="https://www.e-rara.ch/zut/i3f/v21/${i}/full/full/0/native.jpg"
- curl -s -f "$pgurl" > "${pgname}.jpg"
+ curl -s -f "$pgurl" > "$bookid/$pgname.jpg"
 if test $? -ne 0; then
 echo "Error downloading page id $i (number ${pgnum}): $pgurl"
- rm -f "${pgname}.jpg"
+ rm -f "$bookid/$pgname.jpg"
 fi
 done
```