Created
January 18, 2018 17:28
-
-
Save nabijaczleweli/1147dfd931be77e776cc386b6d361d35 to your computer and use it in GitHub Desktop.
Chapterwise scraper for archiveofourown, w/image support and normalisation.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Chapter-wise scraper for archiveofourown.org with image support.
# Usage: <script> <work URL or bare numeric work id>
#
# Pick a scratch directory from the environment.  $TEMP and $TMP are
# checked first (preserving the original priority), with POSIX $TMPDIR
# added as a final fallback so the script also works on systems that
# export neither Windows-style variable.
tempdir=""
if [[ "$TEMP" ]] && [[ ! "$tempdir" ]]; then tempdir="$TEMP/archiveofourown-scrapper"; fi
if [[ "$TMP" ]] && [[ ! "$tempdir" ]]; then tempdir="$TMP/archiveofourown-scrapper"; fi
if [[ "$TMPDIR" ]] && [[ ! "$tempdir" ]]; then tempdir="$TMPDIR/archiveofourown-scrapper"; fi
if ! [[ "$tempdir" ]]; then
	# BUG FIX: the original used "> 1", which created a file named "1"
	# in the working directory; diagnostics belong on stderr (">&2").
	# Also fixes the "temprary" typo in the message.
	echo "Couldn't find temporary directory" >&2
	exit 1
fi
mkdir -p "$tempdir"
# Reduce $1 — either a full work URL or a bare numeric id — to the
# numeric work id; everything after the leading digits is discarded.
raw_url="$1"
work_id="$(sed -r 's;((http(s?):\/\/)?archiveofourown.org\/works\/)?([[:digit:]]+).*;\4;' <<<"$raw_url")"
# The work's chapter-index ("navigate") page, its on-disk cache, and the
# file in which rewritten <img> lines are collected for the image pass.
nav_url="https://archiveofourown.org/works/$work_id/navigate"
nav_out="$tempdir/nav-$work_id.html"
image_list="$tempdir/nav-$work_id-images"
# Fetch the chapter index only when no cached copy exists; on a failed
# compressed transfer, retry once without --compressed.
if [[ -f "$nav_out" ]]; then
	echo "Using cached chapter index in $nav_out"
else
	echo "Downloading chapter index from $nav_url to $nav_out"
	curl -SL --compressed "$nav_url" -o "$nav_out" || { echo "Retrying w/o compression" && curl -SL "$nav_url" -o "$nav_out"; }
fi
# Scrape the work's title from the navigation page: it is the link text
# of the <a href="/works/<work_id>"> anchor.  The awk program is built
# from single-quoted chunks with the shell variable spliced in via
# '"$work_id"'.  gensub() with no target argument operates on $0
# (GNU awk extension — requires gawk).
title="$(awk '
/"\/works\/'"$work_id"'"/ {
	print(gensub(/.*'"$work_id"'">([^<]+)<\/a>.*/, "\\1", "g"));
}
' "$nav_out")"
# Title variant with double quotes replaced by U+02B9 (modifier letter
# prime) so it can sit inside the double-quoted ePub-title HTML comment
# emitted per chapter below.
title_noquote="${title//\"/ʹ}"
# The author sits on the same heading line, in <a rel="author">…</a>;
# the same /works/<id> line filter is reused to find it.
author="$(awk '
/"\/works\/'"$work_id"'"/ {
	print(gensub(/.*<a rel="author" [^>]*>([^<]+)<\/a>.*/, "\\1", "g"));
}
' "$nav_out")"
# Reset the per-work image list (to a single blank line); the chapter
# pass below appends every remote <img> line it rewrites.
echo > "$image_list"
chapter_num=1
# Emit one chapter id per line.  Chapter links inside the
# <ol class="chapter index group"> element look like
# /works/<work_id>/chapters/<chapter_id>, so with -F '/' the chapter id
# is the 5th field; gensub() trims the trailing '"…' remainder.  The
# range pattern's boundary lines (the <ol>/<\/ol> tags themselves) are
# filtered back out inside the action.
awk -F '/' '
/<ol class="chapter index group" role="navigation">/,/<\/ol>/ {
	if($0 !~ /<ol class="chapter index group" role="navigation">/ && $0 !~ /<\/ol>/)
		print(gensub(/".*/, "", "g", $5));
}
' "$nav_out" | while read -r chapter_id; do
	# NOTE: this loop runs in a pipeline subshell — $chapter_num updates
	# persist across iterations but are lost after `done`.
	chapter_num_wide="$(printf "%03d" "$chapter_num")"	# zero-padded so filenames sort
	chapter_url="https://archiveofourown.org/works/$work_id/chapters/$chapter_id?view_adult=true"
	chapter_out="$tempdir/ch-$work_id-$chapter_num_wide-$chapter_id.html"
	# Final per-chapter HTML file, written to the current directory.
	# NOTE(review): a title containing '/' would break this path — confirm
	# titles never contain slashes, or sanitise upstream.
	chapter_final_out="$title ch$chapter_num_wide.html"
	# Download the raw chapter page unless already cached; retry without
	# --compressed on failure, mirroring the index download above.
	if [[ ! -f "$chapter_out" ]]; then
		echo "Downloading chapter $chapter_num from $chapter_url to $chapter_out"
		curl -SL --compressed "$chapter_url" -o "$chapter_out" || (echo "Retrying w/o compression" && curl -SL "$chapter_url" -o "$chapter_out")
	else
		echo "Using cached chapter $chapter_num in $chapter_out"
	fi
	# Wrap the extracted chapter body in a minimal standalone HTML document.
	{
		echo '<!DOCTYPE html>'
		echo '<html lang="en">'
		echo ' <head>'
		echo ' <meta charset="utf-8" />'
		echo ' <title>'"$title"' - Chapter '"$chapter_num"'</title>'
		echo ' <meta name="author" content="'"$author"'" />'
		echo ' </head>'
		echo ' <body>'
		echo ' <!-- ePub title: "'"$title_noquote"' - Chapter '"$chapter_num"'" -->'
		echo
		# Extract and normalise the chapter content:
		#  * wrap the first <h2 class="title heading"> in a link back to
		#    the work (opened on the <h2> line, closed on the first </h2>);
		#  * keep only the lines between the "BEGIN work skin" and
		#    "END work skin" comment markers;
		#  * make root-relative hrefs absolute;
		#  * log each line carrying a remote <img> to $image_list (for the
		#    download pass below), then rewrite every such src to its bare
		#    basename so the saved page references local image files.
		awk '
		BEGIN {
			title_open=0;
			title_close=0;
		}
		/<h2 class="title heading">/ {
			if(!title_open) {
				print(" <a href=\"https://archiveofourown.org/works/'"$work_id"'\">");
				title_open=1;
			}
		}
		/<!--.*BEGIN.*work skin.*-->/,/<!--.*END.*work skin.*-->/ {
			result=gensub(/href="\//, "href=\"https://archiveofourown.org/", "g");
			if(result ~ /(<img.*)src="http(s?):\/\/([^"/]+\/)+(.*".*\/>)/)
				print(result) >> "'"$image_list"'";
			while(result ~ /(<img.*)src="http(s?):\/\/([^"/]+\/)+(.*".*\/>)/)
				result = gensub(/(<img.*)src="http(s?):\/\/([^"/]+\/)+(.*".*\/>)/, "\\1 src=\"\\4", "g", result);
			print(result);
		}
		/<\/h2>/ {
			if(title_open && !title_close) {
				print(" </a>");
				title_close=1;
			}
		}
		' "$chapter_out" | html-beautify -qm 3 -te "\n" -n
		# NOTE(review): html-beautify is an external dependency (presumably
		# the node js-beautify CLI) — confirm it is installed.
		echo
		echo ' </body>'
		echo '</html>'
	} > "$chapter_final_out"
	((++chapter_num))
done
# Second pass: fetch every remote image referenced by the rewritten
# chapters.  $image_list holds the original (pre-rewrite) <img> lines;
# splitting on '<' isolates each tag, and the awk program extracts the
# absolute image URL from every img tag's src attribute.  Each image is
# saved into the current directory under its basename — the same name
# the chapter HTML now references.
sed 's;<;\n<;g' "$image_list" | awk '/img/ {print(gensub(/[[:space:] ]/, "", "g", gensub(/<img.*src="(http(s?):\/\/([^"/]+\/)+.*\.[[:alnum:]]*)".*\/>/, "\\1", "g")));}' | while read -r img_url; do
	# Basename of the URL (everything past the final '/'); equivalent to
	# the original scheme-stripping sed for any URL the awk filter emits.
	img_file="${img_url##*/}"
	if [[ -f "$img_file" ]]; then
		echo "Using cached image in $img_file"
	else
		echo "Downloading image from $img_url to $img_file"
		curl -SL --compressed "$img_url" -o "$img_file" || { echo "Retrying w/o compression" && curl -SL "$img_url" -o "$img_file"; }
	fi
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment