Skip to content

Instantly share code, notes, and snippets.

@onionhoney
Created December 7, 2019 00:32
Show Gist options
  • Save onionhoney/7c3704f88c1755badd7b07b2c3af87ed to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
from pandas.errors import EmptyDataError
from collections import defaultdict
import random
import time
import datetime
import math
import requests as R
import json
from dateutil import parser as dateparser
import hashlib
import argparse
# Widen pandas display limits so wide API responses print readably during debugging.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)
SLEEP = 2 # sleep time between api calls
### Get all Attendees ###
# Get all attendees for the (group_urlname, event_id) tuples
def get_attendees_for_event(group_urlname, event_id):
    """Fetch all RSVPs for a single Meetup event.

    Parameters
    ----------
    group_urlname : str
        URL name of the Meetup group (the slug in the group's URL).
    event_id : str or int
        Identifier of the event within that group.

    Returns
    -------
    pandas.DataFrame
        Flattened RSVP records, or an empty DataFrame after 3 failed retries.
    """
    # assumption: the API caps the response at 200 RSVPs per call
    url = "https://api.meetup.com/{}/events/{}/rsvps".format(
        group_urlname, event_id)
    params = {}
    params["fields"] = ",".join(["attendance_status", ])
    retry = 0
    while True:
        try:
            r = R.get(url, params=params)
            res = r.json()
            return pd.io.json.json_normalize(res)
        # Catch only request failures and JSON-decode errors (r.json() raises
        # a ValueError subclass). The original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit, making the loop impossible to stop.
        except (R.RequestException, ValueError):
            if retry >= 3:
                print("failed request ", url, params)
                return pd.DataFrame()
            retry += 1
            time.sleep(SLEEP * retry)  # linear backoff between retries
def get_member(member_id):
    """Fetch one member's public profile from the Meetup API.

    Parameters
    ----------
    member_id : str or int
        Meetup member identifier.

    Returns
    -------
    pandas.DataFrame
        Flattened profile record, or an empty DataFrame after 3 failed retries.
    """
    # assumption: the API caps the response at 200 fields per call
    url = "https://api.meetup.com/members/{}".format(member_id)
    params = {}
    params["fields"] = ",".join( [
        "bio", "birthday", "lat", "lon", "memberships", "name", "stats", "topics", "privacy"
    ] )
    retry = 0
    while True:
        try:
            r = R.get(url, params=params)
            res = r.json()
            return pd.io.json.json_normalize(res)
        # Catch only request failures and JSON-decode errors (r.json() raises
        # a ValueError subclass). The original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit, making the loop impossible to stop.
        except (R.RequestException, ValueError):
            if retry >= 3:
                print("failed request ", url, params)
                return pd.DataFrame()
            retry += 1
            time.sleep(SLEEP * retry)  # linear backoff between retries
def crawl_members(member_ids, handle, batch=50):
    """Crawl member profiles in batches, checkpointing progress to a CSV.

    Resumable: any member_id already present in the CSV at `handle` is
    skipped, so a crashed run can be restarted and picks up where it left off.

    Parameters
    ----------
    member_ids : sequence of int
        Member ids to fetch.
    handle : str
        Path of the CSV checkpoint/output file.
    batch : int, optional
        Number of members fetched between checkpoint writes (default 50).
    """
    # Load previous progress; a missing or empty file means a fresh start.
    try:
        with open(handle, "r") as f:
            df = pd.read_csv(f)
        finished = set(df["member_id"].values)
    except (EmptyDataError, FileNotFoundError):
        df = pd.DataFrame()
        finished = set()
    n_tot = len(member_ids)
    if n_tot == 0:
        # Guard: an empty id list would otherwise produce a float-dtype mask
        # and crash the fancy-indexing below.
        print("{}/{} events processed. nothing to do".format(n_tot, n_tot))
        return
    mask = np.array([x not in finished for x in member_ids])
    todo = np.array(member_ids)[mask]
    if len(todo) == 0:
        print("{}/{} events processed. nothing to do".format(n_tot, n_tot))
        return
    # Ceil division (// keeps it an int — np.array_split wants an integer
    # section count; `/` would hand it a float under Python 3).
    batches = np.array_split(todo, (len(todo) + batch - 1) // batch)
    n_processed = n_tot - len(todo)
    print("attendee processed: {} / {}".format(n_processed, n_tot))
    # Loop variable renamed: the original `for batch in batches` shadowed
    # the `batch` parameter.
    for chunk in batches:
        for member_id in chunk:
            print("crawl member", member_id)
            # pd.concat replaces DataFrame.append, which was removed in
            # pandas 2.0; sort=True matches append(..., sort=True).
            df = pd.concat([df, get_member(member_id)], sort=True)
            time.sleep(SLEEP)
        # Checkpoint after every batch so a crash loses at most one batch.
        with open(handle, "w+") as f:
            df.to_csv(f, index=False)
        n_processed += len(chunk)
        print("members processed: {} / {}".format(n_processed, n_tot))
    #print(df.tail(1))
############# App Logic ####################
## We will be given the rep_groups, and a 'mod' number, and nothing more.
if __name__ == '__main__':
    # CLI: shard the member-id list across `n_channels` independent workers;
    # this process handles the shard selected by `which_channel`.
    parser = argparse.ArgumentParser(description="Crawl all members, for given memberIDs, listed in a file")
    parser.add_argument('filename')
    parser.add_argument('n_channels', type=int)
    parser.add_argument('which_channel', type=int)
    args = parser.parse_args()
    members = pd.read_csv(args.filename)
    member_ids = members["member_id"].values.astype("int")
    which_channel, n_channels = args.which_channel, args.n_channels
    # Explicit validation instead of `assert` — asserts are stripped under -O.
    if not (0 <= which_channel < n_channels):
        parser.error("which_channel must satisfy 0 <= which_channel < n_channels")
    # Fingerprint the input id list so output files from different inputs
    # never collide.  (An unused second sha256 object was removed here.)
    digest = hashlib.sha256()
    for s in member_ids:
        digest.update( str(s).encode() )
    str_id = digest.hexdigest()[-6:]
    print("digest for file {} = {}".format(args.filename, str_id) )
    print("running {} out of {} channels".format(which_channel, n_channels))
    # Deterministically assign each member to a channel by hashing its id,
    # so every worker selects a disjoint shard of the same input file.
    member_hashed = np.array([ int(hashlib.sha1(str(s).encode() ).hexdigest()[-5:], 16) for s in member_ids])
    member_ids_selected = member_ids[member_hashed % n_channels == which_channel]
    print("processing members = {}".format(member_ids_selected[:10]))
    crawl_members(member_ids_selected,
        "crawl/members_{}_{}_{}.csv".format(str_id, which_channel, n_channels),
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment