Last active
March 26, 2018 18:35
-
-
Save mshock/a70e3bd234ff9dc4d11c208e96aa0940 to your computer and use it in GitHub Desktop.
scrape user flair to sqlite db from posts within a subreddit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[reddit] | |
client_id = XXX | |
client_secret = XXX | |
unique_key = XXX |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! python | |
import praw | |
import sqlite3 | |
import time | |
import configparser | |
import sys | |
from colors import green, red, yellow | |
config = configparser.ConfigParser() | |
config.read('flair_parse.cfg') | |
print('configs parsed!') | |
subreddit = 'overwatch' | |
scan_type = sys.argv[1] if len(sys.argv) > 1 else 'new' | |
after = '' | |
page_limit = 10 | |
process_dups = True | |
user_auth = False | |
drop_tables = False | |
conn = sqlite3.connect(subreddit + '.db') | |
print("db connection successful") | |
if (drop_tables): | |
conn.execute("drop table if exists user") | |
conn.execute("drop table if exists flair;") | |
conn.execute("drop table if exists page;") | |
conn.execute("drop table if exists change;") | |
conn.execute("drop table if exists ticker;") | |
conn.execute('''create table if not exists user | |
(id integer primary key autoincrement, | |
name text not null, | |
flair_id int, | |
foreign key(flair_id) references flair(id), | |
unique(name) | |
) | |
''') | |
conn.execute('''create table if not exists flair | |
(id integer primary key autoincrement, | |
name text not null, | |
unique(name) | |
) | |
''') | |
conn.execute('''create table if not exists page | |
(id integer primary key autoincrement, | |
name text not null, | |
unique(name) | |
) | |
''') | |
conn.execute('''create table if not exists change | |
(id integer primary key autoincrement, | |
from_flair int not null, | |
to_flair int not null, | |
count int default 0, | |
unique (from_flair, to_flair) | |
) | |
''') | |
conn.execute('''create table if not exists ticker | |
(id integer primary key autoincrement, | |
flair_id int not null, | |
prev_id int | |
) | |
''') | |
print('db tables initialized') | |
r = praw.Reddit("flair parser by u/mschock") | |
sub = r.get_subreddit(subreddit) | |
if (user_auth): | |
r.set_oauth_app_info(client_id=config.get('reddit', 'client_id'), client_secret=config.get('reddit', 'client_secret'), redirect_uri='http://127.0.0.1:65010/authorize_callback') | |
url = r.get_authorize_url(config.get('reddit', 'unique_key'), 'read', True) | |
import webbrowser | |
webbrowser.open(url) | |
access_key = input("enter access key: ") | |
access_information = r.get_access_information(access_key.rstrip()) | |
r.set_access_credentials(**access_information) | |
print("user identity set!") | |
# access_information = r.get_access_information('leL3zVShuR2K-Juv68KTtZ4JNtA') | |
# r.set_access_credentials(**access_information) | |
# r.refresh_access_information(access_information['refresh_token']) | |
# authenticated_user = r.get_me() | |
# print(authenticated_user.name, authenticated_user.link_karma) | |
else: | |
print("using anonymous identity") | |
p = {'after': after} | |
num_users = conn.execute("select count(*) from user").fetchone()[0] | |
running = True | |
first = True | |
prev_after = '' | |
current_get_type = [] | |
current_get_name= ['new','top','rising','hot'] | |
while True: | |
try: | |
while running: | |
start_time = time.time() | |
print("retrieving {0} submissions...".format(str(page_limit))) | |
print("{0} @ {1}.{2}".format(subreddit, ('start' if after == '' else after), (scan_type + ' : ' + current_get_name[0] if scan_type == 'auto' else scan_type))) | |
if first: | |
current_get_type = [sub.get_new, sub.get_hot, sub.get_rising, sub.get_top] | |
get_current = current_get_type[0] | |
if scan_type == 'new': | |
content = sub.get_new(limit=page_limit,params=p) | |
elif scan_type == 'top': | |
content = sub.get_top(limit=page_limit,params=p) | |
elif scan_type == 'rising': | |
content = sub.get_rising(limit=page_limit,params=p) | |
elif scan_type == 'hot': | |
content = sub.get_hot(limit=page_limit,params=p) | |
elif scan_type == 'auto': | |
content = get_current(limit=page_limit, params=p) | |
else: | |
print("unknown scan type arg: " + scan_type) | |
sys.exit(1) | |
print("loaded!\n") | |
users_updated = 0 | |
num_subs = 0 | |
for submission in content: | |
num_subs += 1 | |
after = submission.name | |
# pull save user to function so that don't have to repeat updated users code etc. | |
prev_after = after | |
p['after'] = after | |
process_prefix = 'NEW' | |
if (conn.execute("select count(*) from page where name = '{0}'".format(after)).fetchone()[0] > 0): | |
if (process_dups): | |
process_prefix = "DUP" | |
else: | |
print("skipping duplicate: {0}".format(after)) | |
continue | |
conn.execute("insert or ignore into page(name) values ('{0}')".format(after)) | |
comments = praw.helpers.flatten_tree(submission.comments) | |
print("[{0}] parsing page: {1} - ({2}) <{3}> {4}".format(process_prefix, after, submission.score, len(comments), submission.title[:70].encode('ascii', 'replace'))) | |
if (hasattr(submission,'author_flair_css_class') and submission.author_flair_css_class is not None): | |
conn.execute("insert or ignore into flair(name) values ('{0}')".format(submission.author_flair_css_class)) | |
(flair_id, flair_name) = conn.execute("select id, name from flair where name = '{0}'".format(submission.author_flair_css_class)).fetchone() | |
flair_prev = conn.execute("select flair_id from user where name = '{0}'".format(submission.author)).fetchone() | |
flair_change = False | |
recently_flaired = False | |
if flair_prev is not None: | |
flair_prev = flair_prev[0] | |
if flair_prev != flair_id: | |
users_updated += 1 | |
flair_change = True | |
conn.execute("insert or ignore into change (from_flair, to_flair) values ({0}, {1})".format(flair_prev, flair_id)) | |
conn.execute("update change set count = count + 1 where from_flair = {0} and to_flair = {1};".format(flair_prev, flair_id)) | |
else: | |
flair_prev = 0 | |
else: | |
flair_prev = 0 | |
recently_flaired = True | |
conn.execute("insert or replace into user(name, flair_id) values ('{0}', {1})".format(submission.author, flair_id)) | |
conn.commit() | |
if recently_flaired or flair_change: | |
conn.execute("insert into ticker(flair_id, prev_id) values ({}, {})".format(flair_id, flair_prev)) | |
conn.commit() | |
for comment in comments: | |
if (hasattr(comment,'author_flair_css_class') and comment.author_flair_css_class is not None): | |
conn.execute("insert or ignore into flair(name) values ('{0}')".format(comment.author_flair_css_class)) | |
(flair_id, flair_name) = conn.execute("select id, name from flair where name = '{0}'".format(comment.author_flair_css_class)).fetchone() | |
flair_prev = conn.execute("select flair_id from user where name = '{0}'".format(comment.author)).fetchone() | |
flair_change = False | |
if flair_prev is not None: | |
flair_prev = flair_prev[0] | |
if flair_prev != flair_id: | |
users_updated += 1 | |
flair_change = True | |
conn.execute("insert or ignore into change (from_flair, to_flair) values ({0}, {1})".format(flair_prev, flair_id)) | |
conn.execute("update change set count = count + 1 where from_flair = {0} and to_flair = {1};".format(flair_prev, flair_id)) | |
else: | |
flair_prev = 0 | |
else: | |
flair_prev = 0 | |
recently_flaired = True | |
conn.execute("insert or replace into user(name, flair_id) values ('{0}', {1})".format(comment.author, flair_id)) | |
conn.commit() | |
if recently_flaired or flair_change: | |
conn.execute("insert into ticker(flair_id, prev_id) values ({}, {})".format(flair_id, flair_prev)) | |
conn.commit() | |
if num_subs == 0: | |
print('post limit reached, switching sort type and restarting...') | |
current_get_type = current_get_type[-1:] + current_get_type[:-1] | |
current_get_name = current_get_name[-1:] + current_get_name[:-1] | |
get_current = current_get_type[0] | |
after = '' | |
p['after'] = '' | |
continue | |
sleep_time = time.time() - start_time | |
sleep_time = 30 - sleep_time | |
num_users_new = conn.execute("select count(*) from user").fetchone()[0] | |
user_diff = num_users_new - num_users | |
num_users = num_users_new | |
print(green("\n(+{}) users added".format(user_diff))) | |
print(yellow("(+{}) users updated".format(users_updated))) | |
print(red("[{}] total users\n".format(num_users))) | |
if(sleep_time > 0): | |
print("sleeping: %f\n" % (sleep_time)) | |
time.sleep(sleep_time) | |
first = False | |
except HTTPError as e: | |
print("HTTP Error: " + str(e.code)) | |
print("sleeping 30 seconds...") | |
time.sleep(30) | |
continue | |
break | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! python | |
import sqlite3 | |
import sys | |
import time | |
import os | |
clear = lambda: os.system('cls') if os.name == 'nt' else os.system('clear') | |
from colors import red, green, yellow, magenta, cyan | |
conn = sqlite3.connect('overwatch.db') | |
print("\ndb connection successful\n") | |
first = True | |
hero_ranks = {} | |
hero_change = {} | |
hero_change2 = {} | |
hero_counts = {} | |
while True: | |
clear() | |
print("\n\n") | |
counts = conn.execute('select b.name, count(a.flair_id) from user a, flair b on a.flair_id = b.id group by b.name').fetchall() | |
heroes = dict(zip('Bastion DVa Genji Hanzo Junkrat Lucio Mccree Mei Mercy Pharah Reaper Reinhardt Roadhog Soldier76 Symmetra Torbjorn Tracer Widowmaker Winston Zarya Zenyatta'.split(), [0] * 21)) | |
css_lookup = dict(zip('DVa Symmetra Mercy Mei Lucio Winston Junkrat Roadhog Zarya Reaper Soldier76 Tracer Pharah Genji Reinhardt Mccree Widowmaker Bastion Zenyatta Torbjorn Hanzo'.split(), | |
'R19 R10 R13 R18 R15 R06 R17 R16 R08 R05 R02 R14 R07 R20 R09 R04 R01 R11 R00 R03 R12'.split())) | |
total_flaired = 0 | |
total = 0 | |
for hero in heroes: | |
for flair_css, count in counts: | |
if hero in flair_css or css_lookup[hero] in flair_css: | |
heroes[hero] += count | |
total_flaired += count | |
for i, (hero, hero_count) in enumerate(sorted(heroes.items(), key=lambda tup: tup[1], reverse=True)): | |
if first: | |
hero_ranks[hero] = i | |
hero_change[hero] = '' | |
hero_change2[hero] = '' | |
hero_counts[hero] = hero_count | |
if hero_counts[hero] < hero_count: | |
hero_change2[hero] = '(+{})'.format(hero_count - hero_counts[hero]) | |
elif hero_counts[hero] > hero_count: | |
hero_change2[hero] = '(-{})'.format(hero_counts[hero] - hero_count) | |
if hero_ranks[hero] < i: | |
hero_change[hero] = '---' | |
elif hero_ranks[hero] > i: | |
hero_change[hero] = '+++' | |
for hero_check in heroes: | |
if hero == hero_check: | |
continue | |
if heroes[hero] == heroes[hero_check]: | |
hero_change[hero] = '' | |
color = red | |
if i <= 4: | |
color = green | |
elif i < 10: | |
color = yellow | |
elif i < 15: | |
color = magenta | |
print(color("\t[{0}]\t{1:<10} : {2:<5} - {3:>5.2f}% {4:<5} {5:<3}".format( | |
i+1, | |
hero, | |
hero_count, | |
(hero_count / float(total_flaired)) * 100, | |
hero_change2[hero], | |
hero_change[hero] | |
))) | |
hero_ranks[hero] = i | |
total_flaired = conn.execute('select count(1) from user where flair_id != 1').fetchone()[0]; | |
total_users = conn.execute('select count(1) from user').fetchone()[0]; | |
print(cyan("\n\t({0} / {1}) - {2:.2f}% users flaired".format(total_flaired, total_users, (total_flaired / float(total_users)) * 100))) | |
first = False | |
time.sleep(15) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! python | |
import sqlite3 | |
from colors import red, green, yellow, magenta, cyan | |
import time | |
import os | |
clear = lambda: os.system('cls') if os.name == 'nt' else os.system('clear') | |
sleep_dur = 5 | |
num_rows = 25 | |
def get_name(css): | |
for hero in heroes: | |
if hero in css or css_lookup[hero] in css: | |
return hero | |
return "default" | |
heroes = dict(zip('Bastion DVa Genji Hanzo Junkrat Lucio Mccree Mei Mercy Pharah Reaper Reinhardt Roadhog Soldier76 Symmetra Torbjorn Tracer Widowmaker Winston Zarya Zenyatta'.split(), [0] * 21)) | |
css_lookup = dict(zip('DVa Symmetra Mercy Mei Lucio Winston Junkrat Roadhog Zarya Reaper Soldier76 Tracer Pharah Genji Reinhardt Mccree Widowmaker Bastion Zenyatta Torbjorn Hanzo'.split(), | |
'R19 R10 R13 R18 R15 R06 R17 R16 R08 R05 R02 R14 R07 R20 R09 R04 R01 R11 R00 R03 R12'.split())) | |
conn = sqlite3.connect('overwatch.db') | |
while True: | |
clear() | |
print("\n") | |
ticker = conn.execute("select * from ticker order by id desc limit {};".format(num_rows)).fetchall() | |
for (id, to_flair, from_flair) in ticker: | |
to_name = conn.execute("select name from flair where id = {}".format(to_flair)).fetchone()[0] | |
to_name = get_name(to_name) | |
if from_flair != 0: | |
from_name = conn.execute("select name from flair where id = {}".format(from_flair)).fetchone()[0] | |
from_name = get_name(from_name) | |
if to_flair == 1: | |
print(yellow(" [UPD]\t{0:<10} -> {1:<10}".format(to_name, from_name))) | |
else: | |
print(red(" [UPD]\t{0:<10} -> {1:<10}".format(to_name, from_name))) | |
else: | |
if to_flair == 1: | |
print(magenta(" [NEW]\t{0:<10}".format(to_name))) | |
else: | |
print(green(" [NEW]\t{0:<10}".format(to_name))) | |
time.sleep(sleep_dur) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment