A script that scrapes top news from Reddit and extracts the content as Markdown.
#!/usr/bin/env python
import os
import re
import praw
import requests
from datetime import datetime
from bs4 import BeautifulSoup
def get_reddit():
    # Read-only app credentials; register an app at https://www.reddit.com/prefs/apps
    return praw.Reddit(
        client_id='YOUR_CLIENT_ID',
        client_secret='YOUR_SECRET',
        grant_type='client_credentials',
        user_agent='mytestscript/1.0'
    )
def get_top(subreddit_name):
    today = datetime.now().strftime(r'%Y-%m-%d')
    dirname = os.path.join('news-%s' % today, subreddit_name)
    os.makedirs(dirname, exist_ok=True)

    # Get the top 50 submissions from reddit
    reddit = get_reddit()
    top_subs = reddit.subreddit(subreddit_name).top(limit=50)

    # Skip self posts, which have no external article to fetch
    subs = [sub for sub in top_subs if not sub.domain.startswith('self.')]

    # Save up to 10 successfully extracted articles
    count = 10
    while subs and count > 0:
        sub = subs.pop(0)
        article = get_article(sub.url)
        if article:
            text = '\n\n'.join(article['content'])
            filename = re.sub(r'\W+', '_', article['title']) + '.md'
            with open(os.path.join(dirname, filename), 'w') as f:
                f.write(text)
            count -= 1
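# Output layout: get_top() writes each extracted article to
# news-<YYYY-MM-DD>/<subreddit>/<sanitized_title>.md
# (an illustrative path: news-2021-06-11/Python/Some_Article_Title.md).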
def get_article(url):
    print(' - Retrieving %s' % url)
    try:
        res = requests.get(url)
        if (res.status_code == 200 and 'content-type' in res.headers and
                res.headers.get('content-type').startswith('text/html')):
            article = parse_article(res.text)
            if article:
                print(' => done, title = "%s"' % article['title'])
                return article
            print(' x unable to locate article content')
        else:
            print(' x fail or not html')
    except Exception:
        # Network or parsing errors: skip this URL
        pass
def parse_article(text):
    soup = BeautifulSoup(text, 'html.parser')

    # Find the article title
    h1 = soup.body.find('h1')
    if h1 is None:
        return None

    # Find the common parent of the <h1> and at least five <p>s
    root = h1
    while root.name != 'body' and len(root.find_all('p')) < 5:
        root = root.parent
    if len(root.find_all('p')) < 5:
        return None

    # Collect the content elements in document order
    ps = root.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre'])
    ps.insert(0, h1)
    content = [tag2md(p) for p in ps]

    return {'title': h1.text, 'content': content}
def tag2md(tag):
    if tag.name == 'p':
        return tag.text
    elif tag.name == 'h1':
        return f'{tag.text}\n{"=" * len(tag.text)}'
    elif tag.name == 'h2':
        return f'{tag.text}\n{"-" * len(tag.text)}'
    elif tag.name in ['h3', 'h4', 'h5', 'h6']:
        return f'{"#" * int(tag.name[1:])} {tag.text}'
    elif tag.name == 'pre':
        return f'```\n{tag.text}\n```'
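# Illustrative conversions (hypothetical inputs, not from the gist):
#   <h2>Details</h2>   ->  "Details\n-------"   (setext underline)
#   <h4>Notes</h4>     ->  "#### Notes"         (ATX heading)
#   <pre>x = 1</pre>   ->  "```\nx = 1\n```"    (fenced code block)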
def main():
    subreddits = ['javascript', 'Python', 'news']
    for sr in subreddits:
        print('Scraping /r/%s...' % sr)
        get_top(sr)

if __name__ == '__main__':
    main()
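A quick way to sanity-check the extraction heuristic is to run parse_article on a small, self-contained HTML string. The sample below is hypothetical and not part of the gist; it just satisfies the "one h1 plus at least five p tags under a common parent" rule:

sample = """
<html><body><article>
<h1>Example Title</h1>
<p>First.</p><p>Second.</p><p>Third.</p><p>Fourth.</p><p>Fifth.</p>
<h2>Details</h2>
<pre>print('hello')</pre>
</article></body></html>
"""
article = parse_article(sample)
print(article['title'])                 # Example Title
print('\n\n'.join(article['content']))  # the page rendered as Markdown

Running the full script additionally requires the praw, requests, and beautifulsoup4 packages, plus valid Reddit API credentials in get_reddit().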