Last active
August 18, 2017 01:02
-
-
Save nerflad/e8b545b0a42d434715c1c44b42d40421 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from bs4 import BeautifulSoup | |
import os, sys | |
import urllib.request | |
import time | |
# Article pages to scrape.  main() fetches them in this order and writes their
# content to the output file in the same order.
# NOTE(review): the sixth and seventh entries both end in "part-6" (under
# different paths) -- presumably the seventh is part 7 published under a
# mis-numbered slug; confirm against the site.
urls = (
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/columns/david-barsalou/mike-clark-words-of-wisdom?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/columns/david-barsalou/mike-clark-words-of-wisdom-part-2?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/columns/david-barsalou/mike-clark-words-of-wisdom-part-3?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/columns/david-barsalou/mike-clark-words-of-wisdom-part-4?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/columns/david-barsalou/mike-clark-words-of-wisdom-part-5?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/columns/david-barsalou/mike-clark-words-of-wisdom-part-6?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2015/9/15/mike-clark-words-of-wisdom-part-6?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2016/1/22/mike-clark-words-of-wisdom-part8?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2016/4/29/mike-clark-words-of-wisdom-part-9?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2016/8/1/mike-clark-words-of-wisdom-part-10?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2016/11/4/mike-clark-words-of-wisdom-part-11?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2016/12/15/mike-clark-words-of-wisdom-part-12?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2017/3/15/mike-clark-words-of-wisdom-13?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2017/5/16/mike-clark-words-of-wisdom-chapter-14?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2017/6/20/mike-clark-words-of-wisdom-chapter-15?rq=mike%20clark%3A%20words%20of%20wisdom",
)
def bs4_resultset_to_strings(list_):
    """Convert each item of an iterable to a newline-terminated string.

    Args:
        list_: Any iterable.  main() passes the element returned by
            get_blogtext_from_soup(), so iteration walks whatever that
            element yields when iterated.

    Returns:
        A new list containing ``str(item)`` followed by a newline character
        for every item, in iteration order.  An empty iterable gives ``[]``.
    """
    return [str(item) + '\n' for item in list_]
def get_soup_from_url(url):
    """Download *url* and parse the body into a BeautifulSoup tree.

    Args:
        url: Address handed to ``urllib.request.urlopen``.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
        Any exception raised by ``urlopen`` propagates unchanged; main()
        retries on ``urllib.error.HTTPError``.
    """
    # The context manager closes the HTTP response (and its socket) even if
    # read() raises, instead of leaving it for the garbage collector.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    page = raw.decode("utf-8")
    return BeautifulSoup(page, 'html.parser')
def get_blogtext_from_soup(soup):
    """Return the second ``div`` with class ``sqs-block-html`` in *soup*.

    Args:
        soup: Object exposing ``find_all`` (main() passes the tree produced
            by get_soup_from_url()).

    Returns:
        The match at index 1 of the ``find_all`` result.
        NOTE(review): presumably index 0 is a non-article block on these
        pages and index 1 holds the article body -- confirm against the site.

    Raises:
        IndexError: If fewer than two matching blocks are found.
    """
    blocks = soup.find_all('div', class_='sqs-block-html')
    if len(blocks) < 2:
        # Same exception type as plain indexing would raise, but with a
        # message that says what was being looked for.
        raise IndexError(
            "expected at least 2 'sqs-block-html' div blocks, found %d"
            % len(blocks)
        )
    return blocks[1]
def standardify_html(string_list):
    """Wrap a list of HTML fragments in a complete document, in place.

    A fixed doctype/head/body prologue is prepended and the closing
    ``</body>`` and ``</html>`` tags are appended.

    Args:
        string_list: Mutable list of strings; it is modified in place.

    Returns:
        The same ``string_list`` object, now holding the full document.
    """
    header = [
        "<!doctype html>\n",
        "<html>\n",
        "<head>\n",
        " <meta charset=\"utf-8\" />\n",
        " <style>\n",
        " div { margin: auto; width: 60%; padding: 2em; background-color: #EEEEEE; }\n",
        " </style>\n",
        "</head>\n",
        "<body>\n",
        " <h2>Wisdom of Clark</h2>\n",
    ]
    # One slice assignment prepends the whole header in order; repeated
    # insert(0, ...) calls would shift the entire list once per header line.
    string_list[:0] = header
    string_list.extend(("</body>\n", "</html>\n"))
    return string_list
def _fetch_soup_with_retries(url, attempts=3, delay=4):
    """Fetch and parse *url*, retrying when the server returns an HTTP error.

    Args:
        url: Page address passed to get_soup_from_url().
        attempts: Maximum number of tries; must be at least 1.
        delay: Seconds to sleep between tries.

    Returns:
        The parsed page from the first successful attempt.

    Raises:
        ValueError: If ``attempts`` is less than 1.
        urllib.error.HTTPError: If every attempt fails; the error from the
            final attempt is re-raised.
    """
    if attempts < 1:
        raise ValueError("attempts must be at least 1")
    for attempt in range(1, attempts + 1):
        try:
            return get_soup_from_url(url)
        except urllib.error.HTTPError:
            # Give up loudly on the last attempt: continuing without a page
            # would either crash later or reuse the previous page's content.
            if attempt == attempts:
                raise
            print("Retrying URL:", url)
            time.sleep(delay)


def main():
    """Scrape every page in ``urls`` and write one combined HTML file.

    Each page is fetched (up to three tries, four seconds apart), its
    article block is extracted and converted to strings, and the
    concatenated result is wrapped by standardify_html() and written to
    ``wisdom-of-clark.html`` in the current working directory.

    Raises:
        urllib.error.HTTPError: If a page still fails after all retries.
    """
    final_html = []
    for url in urls:
        soup = _fetch_soup_with_retries(url)
        final_html += bs4_resultset_to_strings(get_blogtext_from_soup(soup))
    final_html = standardify_html(final_html)
    # The generated document declares charset=utf-8, so write it as UTF-8
    # rather than in the platform's default encoding.
    with open('wisdom-of-clark.html', 'w', encoding='utf-8') as _file:
        _file.writelines(final_html)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment