Created
December 7, 2019 00:32
-
-
Save onionhoney/7c3704f88c1755badd7b07b2c3af87ed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from pandas.errors import EmptyDataError | |
from collections import defaultdict | |
import random | |
import time | |
import datetime | |
import math | |
import requests as R | |
import json | |
from dateutil import parser as dateparser | |
import hashlib | |
import argparse | |
# Widen pandas console output so debugging prints of wide API frames are readable.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)
SLEEP = 2  # sleep time between api calls (seconds); also the unit for retry backoff
### Get all Attendees ###
# Get all attendees for the (group_urlname, event_id) tuples
def get_attendees_for_event(group_urlname, event_id):
    """Fetch the RSVP list for one Meetup event as a flat DataFrame.

    Parameters
    ----------
    group_urlname : str
        The group's URL name (first path segment of the Meetup API URL).
    event_id : str or int
        Meetup event identifier.

    Returns
    -------
    pandas.DataFrame
        Flattened JSON of the /rsvps response, or an empty DataFrame
        after 3 failed retries.
    """
    # NOTE(review): "capped at 200" per the original comment — confirm against
    # the Meetup API docs; no pagination is handled here.
    url = "https://api.meetup.com/{}/events/{}/rsvps".format(
        group_urlname, event_id)
    params = {}
    params["fields"] = ",".join(["attendance_status", ])
    retry = 0
    while True:
        try:
            r = R.get(url, params=params)
            res = r.json()
            # pd.json_normalize supersedes the deprecated pd.io.json.json_normalize
            return pd.json_normalize(res)
        except Exception:  # was a bare except: don't swallow KeyboardInterrupt/SystemExit
            if retry >= 3:
                print("failed request ", url, params)
                return pd.DataFrame()
            retry += 1
            time.sleep(SLEEP * retry)  # linear backoff: SLEEP, 2*SLEEP, 3*SLEEP
def get_member(member_id):
    """Fetch one Meetup member profile as a flat DataFrame.

    Parameters
    ----------
    member_id : int or str
        Meetup member identifier.

    Returns
    -------
    pandas.DataFrame
        Flattened JSON of the /members/{id} response, or an empty
        DataFrame after 3 failed retries.
    """
    url = "https://api.meetup.com/members/{}".format(member_id)
    params = {}
    params["fields"] = ",".join([
        "bio", "birthday", "lat", "lon", "memberships", "name", "stats", "topics", "privacy"
    ])
    retry = 0
    while True:
        try:
            r = R.get(url, params=params)
            res = r.json()
            # pd.json_normalize supersedes the deprecated pd.io.json.json_normalize
            return pd.json_normalize(res)
        except Exception:  # was a bare except: don't swallow KeyboardInterrupt/SystemExit
            if retry >= 3:
                print("failed request ", url, params)
                return pd.DataFrame()
            retry += 1
            time.sleep(SLEEP * retry)  # linear backoff: SLEEP, 2*SLEEP, 3*SLEEP
def crawl_members(member_ids, handle, batch=50):
    """Crawl member profiles for every id in `member_ids`, resuming from `handle`.

    Ids already present in the CSV at `handle` (column 'member_id') are
    skipped, so an interrupted run restarts where it left off. Results are
    flushed to `handle` after every batch of `batch` members.

    Parameters
    ----------
    member_ids : sequence of int
        All member ids that should end up in the CSV.
    handle : str
        Path to the checkpoint/output CSV file.
    batch : int, optional
        Number of members to fetch between flushes to disk (default 50).
    """
    try:
        df = pd.read_csv(handle)
        finished = set(df["member_id"].values)
    except (EmptyDataError, FileNotFoundError):
        df = pd.DataFrame()
        finished = set()
    n_tot = len(member_ids)
    todo = np.array([m for m in member_ids if m not in finished])
    if len(todo) == 0:
        print("{}/{} events processed. nothing to do".format(n_tot, n_tot))
        return
    # Integer ceil-division; the original passed a float to np.array_split
    # and relied on its implicit truncation.
    n_batches = (len(todo) + batch - 1) // batch
    batches = np.array_split(todo, n_batches)
    n_processed = n_tot - len(todo)
    print("attendee processed: {} / {}".format(n_processed, n_tot))
    # Renamed loop variable: the original shadowed the `batch` parameter.
    for chunk in batches:
        for member_id in chunk:
            print("crawl member", member_id)
            df_curr = get_member(member_id)
            # pd.concat replaces DataFrame.append, removed in pandas 2.0
            df = pd.concat([df, df_curr], sort=True)
            time.sleep(SLEEP)
        df.to_csv(handle, index=False)  # flush checkpoint after each batch
        n_processed += len(chunk)
        print("members processed: {} / {}".format(n_processed, n_tot))
############# App Logic #################### | |
## We will be given the rep_groups, and a 'mod' number, and nothing more. | |
if __name__ == '__main__':
    # CLI: shard the member-id list across `n_channels` parallel workers and
    # crawl only the shard selected by `which_channel`.
    parser = argparse.ArgumentParser(
        description="Crawl all members, for given memberIDs, listed in a file")
    parser.add_argument('filename')
    parser.add_argument('n_channels', type=int)
    parser.add_argument('which_channel', type=int)
    args = parser.parse_args()

    members = pd.read_csv(args.filename)
    member_ids = members["member_id"].values.astype("int")
    which_channel, n_channels = args.which_channel, args.n_channels
    # Explicit validation instead of assert: asserts are stripped under `python -O`.
    if not (0 <= which_channel < n_channels):
        parser.error("which_channel must be in [0, n_channels)")

    # Stable 6-hex-digit fingerprint of the input id list; used to name the
    # output file so different inputs never share a checkpoint.
    digest = hashlib.sha256()
    for s in member_ids:
        digest.update(str(s).encode())
    str_id = digest.hexdigest()[-6:]
    print("digest for file {} = {}".format(args.filename, str_id))
    print("running {} out of {} channels".format(which_channel, n_channels))

    # Shard by hashing each id with sha1: deterministic across runs and
    # processes, unlike Python's salted built-in hash().
    member_hashed = np.array(
        [int(hashlib.sha1(str(s).encode()).hexdigest()[-5:], 16) for s in member_ids])
    member_ids_selected = member_ids[member_hashed % n_channels == which_channel]
    print("processing members = {}".format(member_ids_selected[:10]))
    crawl_members(member_ids_selected,
                  "crawl/members_{}_{}_{}.csv".format(str_id, which_channel, n_channels),
                  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment