Created
January 18, 2018 17:28
-
-
Save nabijaczleweli/1147dfd931be77e776cc386b6d361d35 to your computer and use it in GitHub Desktop.
Chapterwise scraper for archiveofourown, w/image support and normalisation.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Chapter-wise scraper for archiveofourown.org with image support.
# Usage: <script> <work URL or bare numeric work id>
#
# Pick a scratch directory from the environment.  $TEMP and $TMP are
# checked first (preserving the original priority), with POSIX $TMPDIR
# added as a final fallback so the script also works on systems that
# export neither Windows-style variable.
tempdir=""
if [[ "$TEMP" ]] && [[ ! "$tempdir" ]]; then tempdir="$TEMP/archiveofourown-scrapper"; fi
if [[ "$TMP" ]] && [[ ! "$tempdir" ]]; then tempdir="$TMP/archiveofourown-scrapper"; fi
if [[ "$TMPDIR" ]] && [[ ! "$tempdir" ]]; then tempdir="$TMPDIR/archiveofourown-scrapper"; fi
if ! [[ "$tempdir" ]]; then
	# BUG FIX: the original used "> 1", which created a file named "1"
	# in the working directory; diagnostics belong on stderr (">&2").
	# Also fixes the "temprary" typo in the message.
	echo "Couldn't find temporary directory" >&2
	exit 1
fi
mkdir -p "$tempdir"
# Reduce $1 — either a full work URL or a bare numeric id — to the
# numeric work id; everything after the leading digits is discarded.
raw_url="$1"
work_id="$(sed -r 's;((http(s?):\/\/)?archiveofourown.org\/works\/)?([[:digit:]]+).*;\4;' <<<"$raw_url")"
# The work's chapter-index ("navigate") page, its on-disk cache, and the
# file in which rewritten <img> lines are collected for the image pass.
nav_url="https://archiveofourown.org/works/$work_id/navigate"
nav_out="$tempdir/nav-$work_id.html"
image_list="$tempdir/nav-$work_id-images"
# Fetch the chapter index only when no cached copy exists; on a failed
# compressed transfer, retry once without --compressed.
if [[ -f "$nav_out" ]]; then
	echo "Using cached chapter index in $nav_out"
else
	echo "Downloading chapter index from $nav_url to $nav_out"
	curl -SL --compressed "$nav_url" -o "$nav_out" || { echo "Retrying w/o compression" && curl -SL "$nav_url" -o "$nav_out"; }
fi
# Scrape the work's title from the navigation page: it is the link text
# of the <a href="/works/<work_id>"> anchor.  The awk program is built
# from single-quoted chunks with the shell variable spliced in via
# '"$work_id"'.  gensub() with no target argument operates on $0
# (GNU awk extension — requires gawk).
title="$(awk '
/"\/works\/'"$work_id"'"/ {
	print(gensub(/.*'"$work_id"'">([^<]+)<\/a>.*/, "\\1", "g"));
}
' "$nav_out")"
# Title variant with double quotes replaced by U+02B9 (modifier letter
# prime) so it can sit inside the double-quoted ePub-title HTML comment
# emitted per chapter below.
title_noquote="${title//\"/ʹ}"
# The author sits on the same heading line, in <a rel="author">…</a>;
# the same /works/<id> line filter is reused to find it.
author="$(awk '
/"\/works\/'"$work_id"'"/ {
	print(gensub(/.*<a rel="author" [^>]*>([^<]+)<\/a>.*/, "\\1", "g"));
}
' "$nav_out")"
# Reset the per-work image list (to a single blank line); the chapter
# pass below appends every remote <img> line it rewrites.
echo > "$image_list"
chapter_num=1
# Emit one chapter id per line.  Chapter links inside the
# <ol class="chapter index group"> element look like
# /works/<work_id>/chapters/<chapter_id>, so with -F '/' the chapter id
# is the 5th field; gensub() trims the trailing '"…' remainder.  The
# range pattern's boundary lines (the <ol>/<\/ol> tags themselves) are
# filtered back out inside the action.
awk -F '/' '
/<ol class="chapter index group" role="navigation">/,/<\/ol>/ {
	if($0 !~ /<ol class="chapter index group" role="navigation">/ && $0 !~ /<\/ol>/)
		print(gensub(/".*/, "", "g", $5));
}
' "$nav_out" | while read -r chapter_id; do
	# NOTE: this loop runs in a pipeline subshell — $chapter_num updates
	# persist across iterations but are lost after `done`.
	chapter_num_wide="$(printf "%03d" "$chapter_num")"	# zero-padded so filenames sort
	chapter_url="https://archiveofourown.org/works/$work_id/chapters/$chapter_id?view_adult=true"
	chapter_out="$tempdir/ch-$work_id-$chapter_num_wide-$chapter_id.html"
	# Final per-chapter HTML file, written to the current directory.
	# NOTE(review): a title containing '/' would break this path — confirm
	# titles never contain slashes, or sanitise upstream.
	chapter_final_out="$title ch$chapter_num_wide.html"
	# Download the raw chapter page unless already cached; retry without
	# --compressed on failure, mirroring the index download above.
	if [[ ! -f "$chapter_out" ]]; then
		echo "Downloading chapter $chapter_num from $chapter_url to $chapter_out"
		curl -SL --compressed "$chapter_url" -o "$chapter_out" || (echo "Retrying w/o compression" && curl -SL "$chapter_url" -o "$chapter_out")
	else
		echo "Using cached chapter $chapter_num in $chapter_out"
	fi
	# Wrap the extracted chapter body in a minimal standalone HTML document.
	{
		echo '<!DOCTYPE html>'
		echo '<html lang="en">'
		echo ' <head>'
		echo ' <meta charset="utf-8" />'
		echo ' <title>'"$title"' - Chapter '"$chapter_num"'</title>'
		echo ' <meta name="author" content="'"$author"'" />'
		echo ' </head>'
		echo ' <body>'
		echo ' <!-- ePub title: "'"$title_noquote"' - Chapter '"$chapter_num"'" -->'
		echo
		# Extract and normalise the chapter content:
		#  * wrap the first <h2 class="title heading"> in a link back to
		#    the work (opened on the <h2> line, closed on the first </h2>);
		#  * keep only the lines between the "BEGIN work skin" and
		#    "END work skin" comment markers;
		#  * make root-relative hrefs absolute;
		#  * log each line carrying a remote <img> to $image_list (for the
		#    download pass below), then rewrite every such src to its bare
		#    basename so the saved page references local image files.
		awk '
		BEGIN {
			title_open=0;
			title_close=0;
		}
		/<h2 class="title heading">/ {
			if(!title_open) {
				print(" <a href=\"https://archiveofourown.org/works/'"$work_id"'\">");
				title_open=1;
			}
		}
		/<!--.*BEGIN.*work skin.*-->/,/<!--.*END.*work skin.*-->/ {
			result=gensub(/href="\//, "href=\"https://archiveofourown.org/", "g");
			if(result ~ /(<img.*)src="http(s?):\/\/([^"/]+\/)+(.*".*\/>)/)
				print(result) >> "'"$image_list"'";
			while(result ~ /(<img.*)src="http(s?):\/\/([^"/]+\/)+(.*".*\/>)/)
				result = gensub(/(<img.*)src="http(s?):\/\/([^"/]+\/)+(.*".*\/>)/, "\\1 src=\"\\4", "g", result);
			print(result);
		}
		/<\/h2>/ {
			if(title_open && !title_close) {
				print(" </a>");
				title_close=1;
			}
		}
		' "$chapter_out" | html-beautify -qm 3 -te "\n" -n
		# NOTE(review): html-beautify is an external dependency (presumably
		# the node js-beautify CLI) — confirm it is installed.
		echo
		echo ' </body>'
		echo '</html>'
	} > "$chapter_final_out"
	((++chapter_num))
done
# Second pass: fetch every remote image referenced by the rewritten
# chapters.  $image_list holds the original (pre-rewrite) <img> lines;
# splitting on '<' isolates each tag, and the awk program extracts the
# absolute image URL from every img tag's src attribute.  Each image is
# saved into the current directory under its basename — the same name
# the chapter HTML now references.
sed 's;<;\n<;g' "$image_list" | awk '/img/ {print(gensub(/[[:space:] ]/, "", "g", gensub(/<img.*src="(http(s?):\/\/([^"/]+\/)+.*\.[[:alnum:]]*)".*\/>/, "\\1", "g")));}' | while read -r img_url; do
	# Basename of the URL (everything past the final '/'); equivalent to
	# the original scheme-stripping sed for any URL the awk filter emits.
	img_file="${img_url##*/}"
	if [[ -f "$img_file" ]]; then
		echo "Using cached image in $img_file"
	else
		echo "Downloading image from $img_url to $img_file"
		curl -SL --compressed "$img_url" -o "$img_file" || { echo "Retrying w/o compression" && curl -SL "$img_url" -o "$img_file"; }
	fi
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment