Skip to content

Instantly share code, notes, and snippets.

@mike-anderson
Last active December 30, 2015 15:58
Show Gist options
  • Save mike-anderson/7851233 to your computer and use it in GitHub Desktop.
Save mike-anderson/7851233 to your computer and use it in GitHub Desktop.
subreddit comment crawler
import sys
import requests
import json
# Command line: <script> <subreddit_name> <output_file>
if len(sys.argv) < 3:
    # Fail with a readable usage message instead of a bare IndexError.
    sys.exit('usage: ' + sys.argv[0] + ' <subreddit_name> <output_file>')
subreddit_name = sys.argv[1]
subreddit_address = 'https://www.reddit.com/r/' + subreddit_name + '.json'
# No sort or paging parameters are sent: default is front page, hot.
# Reddit's API rules ask for a descriptive User-Agent; the stock library
# agent is heavily rate limited.
subreddit_response = requests.get(
    subreddit_address,
    headers={'User-Agent': 'python:subreddit-comment-crawler:v1.0'},
    timeout=30,
)
# Surface HTTP errors (404, 429, ...) here rather than as a confusing
# KeyError when the error payload is indexed later.
subreddit_response.raise_for_status()
subreddit = subreddit_response.json()
# Open (and truncate) the output file only once the listing was fetched, so
# a failed request does not leave an empty file behind.
output_file = open(sys.argv[2], 'w')
# Maps submission id -> flat list of records for that thread.
subreddit_output = {}
def parseRoot(node):
    """Convert a submission listing entry into a one-element record list.

    Args:
        node: A listing child whose ``'data'`` dict holds the submission
            fields read below (``title``, ``selftext``, ``name``,
            ``author``, ``ups``, ``downs``, ``score``, ``created_utc``).

    Returns:
        A list containing a single flattened record, so the caller can
        concatenate it with comment records. ``parent_name`` is ``None``
        because the submission is the root of its thread.

    Raises:
        KeyError: If a required field other than ``selftext`` is missing.
    """
    data = node['data']
    body_text = data['title']
    # Append the self text when there is any; a missing or None value is
    # treated the same as the empty string.
    selftext = data.get('selftext')
    if selftext:
        body_text = body_text + ': ' + selftext
    return [{
        'name': data['name'],
        'author': data['author'],
        'parent_name': None,
        'ups': data['ups'],
        'downs': data['downs'],
        'score': data['score'],
        'created': data['created_utc'],
        'body': body_text,
    }]
def getChildren(node):
    """Flatten a comment and all of its nested replies into one list.

    Args:
        node: A listing child with ``'kind'`` and ``'data'`` keys.

    Returns:
        A list of records with the comment itself first, followed by its
        replies in depth-first order; or ``None`` when ``node`` is not a
        ``'t1'`` entry (callers skip ``None`` results).

    Raises:
        KeyError: If a required comment field is missing.
    """
    if node['kind'] != 't1':
        return None
    data = node['data']
    node_list = [{
        'name': data['name'],
        'author': data['author'],
        'parent_name': data['parent_id'],
        'ups': data['ups'],
        'downs': data['downs'],
        'score_hidden': data['score_hidden'],
        'created': data['created_utc'],
        'body': data['body'],
    }]
    # 'replies' is an empty string when there are none; a missing or None
    # value is treated the same way.
    replies = data.get('replies')
    if replies:
        for child in replies['data']['children']:
            new_children_list = getChildren(child)
            if new_children_list is not None:
                # extend() appends in place rather than copying the whole
                # accumulated list for every reply.
                node_list.extend(new_children_list)
    return node_list
# Fetch the comment thread of every listed submission that has comments and
# store the submission record plus its flattened comments under the
# submission id.
for child in subreddit['data']['children']:
    t3 = child['data']
    if t3['num_comments'] <= 0:
        continue
    # Single parenthesised argument: valid as both a Python 2 print
    # statement and a Python 3 print call.
    print('getting ' + t3['id'] + ': ' + t3['title'])
    thread_response = requests.get(
        'https://www.reddit.com' + t3['permalink'] + '.json',
        params={'sort': 'random'},
        # Reddit's API rules ask for a descriptive User-Agent; the stock
        # library agent is heavily rate limited.
        headers={'User-Agent': 'python:subreddit-comment-crawler:v1.0'},
        timeout=30,
    )
    # Stop on an HTTP error instead of failing later while indexing an
    # error payload.
    thread_response.raise_for_status()
    comment_thread = thread_response.json()
    comment_list = []
    # Element 0 is the listing holding the submission itself. Distinct loop
    # names keep the outer loop variable from being overwritten.
    for root_entry in comment_thread[0]['data']['children']:
        if root_entry['kind'] == 't3':
            # this is the root node
            comment_list.extend(parseRoot(root_entry))
    # Element 1 is the listing holding the top-level comments.
    for top_level_entry in comment_thread[1]['data']['children']:
        new_children_list = getChildren(top_level_entry)
        if new_children_list is not None:
            comment_list.extend(new_children_list)
    subreddit_output[t3['id']] = comment_list

# Close the output file once written so the JSON is flushed to disk
# deterministically.
with output_file:
    json.dump(subreddit_output, output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment