@charlee
Last active June 11, 2021 02:15
A script that scrapes the top news from Reddit and extracts the content as Markdown.
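To run it you will need the packages it imports (praw, requests and beautifulsoup4, e.g. pip install praw requests beautifulsoup4) plus the client id and secret of a registered Reddit API app for get_reddit() below. Extracted articles are written as .md files under news-<date>/<subreddit>/.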
#!/usr/bin/env python
import os
import re
import praw
import requests
from datetime import datetime
from bs4 import BeautifulSoup
def get_reddit():
    # Build a Reddit client; replace the placeholders with your own app credentials.
    return praw.Reddit(
        client_id='YOUR_CLIENT_ID',
        client_secret='YOUR_SECRET',
        grant_type='client_credentials',
        user_agent='mytestscript/1.0'
    )
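# Note: with only a client id and secret (no username or password), PRAW
# returns a read-only instance, which is enough here since the script only
# reads top submissions.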
def get_top(subreddit_name):
    today = datetime.now().strftime(r'%Y-%m-%d')
    dirname = os.path.join('news-%s' % today, subreddit_name)
    os.makedirs(dirname, exist_ok=True)

    # Get the top 50 submissions from reddit
    reddit = get_reddit()
    top_subs = reddit.subreddit(subreddit_name).top(limit=50)

    # Skip self posts, whose domain starts with 'self.' and which link back to reddit
    subs = [sub for sub in top_subs if not sub.domain.startswith('self.')]

    # Save at most 10 successfully extracted articles
    count = 10
    while subs and count > 0:
        sub = subs.pop(0)
        article = get_article(sub.url)
        if article:
            text = '\n\n'.join(article['content'])
            # Sanitize the title into a safe filename
            filename = re.sub(r'\W+', '_', article['title']) + '.md'
            with open(os.path.join(dirname, filename), 'w') as f:
                f.write(text)
            count -= 1
def get_article(url):
    print(' - Retrieving %s' % url)
    try:
        res = requests.get(url)
        if (res.status_code == 200 and 'content-type' in res.headers and
                res.headers.get('content-type').startswith('text/html')):
            article = parse_article(res.text)
            if article:
                print(' => done, title = "%s"' % article['title'])
                return article
        print(' x fail or not html')
    except Exception:
        pass
    return None
def parse_article(text):
    soup = BeautifulSoup(text, 'html.parser')

    # Find the article title
    h1 = soup.body.find('h1')
    if h1 is None:
        return None

    # Walk up from <h1> to the common ancestor of the title and the body copy,
    # i.e. the first ancestor that contains at least five <p> tags.
    root = h1
    while root.name != 'body' and len(root.find_all('p')) < 5:
        root = root.parent
    if len(root.find_all('p')) < 5:
        return None

    # Collect the content elements in document order
    ps = root.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre'])
    ps.insert(0, h1)
    content = [tag2md(p) for p in ps]

    return {'title': h1.text, 'content': content}
def tag2md(tag):
    if tag.name == 'p':
        return tag.text
    elif tag.name == 'h1':
        return f'{tag.text}\n{"=" * len(tag.text)}'
    elif tag.name == 'h2':
        return f'{tag.text}\n{"-" * len(tag.text)}'
    elif tag.name in ['h3', 'h4', 'h5', 'h6']:
        return f'{"#" * int(tag.name[1:])} {tag.text}'
    elif tag.name == 'pre':
        return f'```\n{tag.text}\n```'
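# For reference, the Markdown that tag2md emits:
#   <p>       -> plain paragraph text
#   <h1>      -> setext heading underlined with '='
#   <h2>      -> setext heading underlined with '-'
#   <h3>-<h6> -> ATX headings ('###' to '######')
#   <pre>     -> fenced code block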
def main():
    subreddits = ['javascript', 'Python', 'news']
    for sr in subreddits:
        print('Scraping /r/%s...' % sr)
        get_top(sr)

if __name__ == '__main__':
    main()
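For illustration, a minimal sketch of what parse_article produces on a toy page (the HTML below is made up for this example, and it assumes the functions above are in scope):

html = '''<html><body>
<h1>Example Title</h1>
<div>
  <p>one</p><p>two</p><p>three</p><p>four</p><p>five</p>
  <h2>Section</h2>
  <pre>print("hi")</pre>
</div>
</body></html>'''

article = parse_article(html)
print(article['title'])       # Example Title
print(article['content'][0])  # 'Example Title' underlined with '='
print(article['content'][6])  # 'Section' underlined with '-'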