# Harvest API - Twitter Harvest
# GitHub Gist by @coreyhermanson (coreyhermanson/4ebe7e258b3cc0a263cdaea45c685463)
# Last active March 22, 2017 19:02
import requests
# Input file read by the script below, one entry per line (full path, raw string)
infile = r'C:\Users\Account\PythonFiles\generic_infile.txt'

# --- Harvest event settings ---
api_key = "123abc"  # str: one API key per Harvest API schema
searchable_items_per_event = 100  # int: max queries OR max screenNames per created event
name_of_event = "NewYork_Politics"  # str: the script prepends "TW_" and appends "_<n>"
filterQuery = None  # str or None, e.g. "nuclear AND (war OR energy)"
event_tags = ["source_Politics", "New York"]  # list of tag strings

# --- Schedule settings ---
is_scheduled = "ONCE"  # "ONCE" or "RECURRING"
interval_between_recurring_harvests = None  # hours
hours_to_first_harvest = 8  # hours; becomes the "delay" parameter
created_harvest_interval = 0.5  # hours; staggers created harvests so they don't run simultaneously

# --- Twitter settings ---
twitter_harvest_type = "screenNames"  # "screenNames" or "keyword"
maxTweets = 100

# --- Seldom-used Twitter settings (None leaves them out of the request) ---
sinceDate = None
tweetRadius = None
tweetRadiusUnits = None
longitude = None
latitude = None
##############################################################################################################
# Functions
def hour_to_milli(hour):
    """
    Convert a duration in hours to milliseconds.

    :param hour: number of hours (int or float, e.g. ``0.5`` for 30 minutes)
    :return: the duration in milliseconds, rounded to the nearest whole number
    """
    # 60 min * 60 s * 1000 ms
    ms_per_hour = 3600000
    return round(hour * ms_per_hour)
def key_checker(json_dict):
    """
    Remove "unset" entries from a request dict, in place.

    An entry is removed when its value is ``None`` or an empty string/container.
    Numeric zero and ``False`` are kept: they are real values (a longitude or
    latitude of 0, a delay of 0) and must not be silently dropped.

    :param json_dict: dict to clean; it is mutated
    :return: the same dict object, for call chaining
    """
    empty_types = (str, bytes, list, tuple, dict, set, frozenset)
    # Iterate over a snapshot of the keys because entries are deleted during the loop
    for key in list(json_dict):
        value = json_dict[key]
        if value is None or (isinstance(value, empty_types) and not value):
            del json_dict[key]
    return json_dict
# Harvest API class
class HarvestAPI(object):
    """
    Small client that creates harvest events through the BrightPlanet Harvest API.

    :param auth: API key; it is appended to the endpoint URL as ``api_key``
    :param verify: passed to ``requests.post(verify=...)``. Defaults to ``False``
        to preserve this script's existing behaviour.
    """

    def __init__(self, auth, verify=False):
        self.auth = auth
        self.harvest_base = 'https://harvestapi.brightplanet.com/harvestapi/api/harvests?api_key={}'.format(self.auth)
        self.headers = {"Content-Type": "application/json"}
        # NOTE(review): verify=False turns off TLS certificate checking while the
        # API key travels in the URL. Pass verify=True unless the endpoint's
        # certificate genuinely cannot be validated -- confirm and flip the default.
        self.verify = verify

    def twitter(self, delay, query_load, since_date, max_tweets, radius, radius_units, lon, lat, event_name,
                source_tags, filter_query, max_docs, max_doc_size=-1, scheduled="ONCE", sked_interval=None,
                harvest_type=None):
        """
        Triggers a Twitter harvest through the BrightPlanet Harvest API

        Unset values are stripped from the request by ``key_checker`` before
        it is sent.

        :param delay: sent as ``delay``; the script in this file passes milliseconds
        :param query_load: list of queries or screen names, sent under the key named by ``harvest_type``
        :param since_date: sent as ``sinceDate``
        :param max_tweets: sent as ``maxTweets``
        :param radius: sent as ``tweetRadius``
        :param radius_units: sent as ``tweetRadiusUnits``
        :param lon: sent as ``tweetLongitude``
        :param lat: sent as ``tweetLatitude``
        :param event_name: sent as ``name`` and echoed in the confirmation message
        :param source_tags: sent as ``tags``
        :param filter_query: sent as ``filterQuery``
        :param max_docs: sent as ``maxDocCount``
        :param max_doc_size: sent as ``maxDocSize`` (default -1; meaning is defined by the API -- TODO confirm)
        :param scheduled: sent as ``scheduleType``; "ONCE" or "RECURRING"
        :param sked_interval: sent as ``interval`` (units defined by the API -- TODO confirm)
        :param harvest_type: "screenNames" or "keyword"; when omitted, the module-level
            ``twitter_harvest_type`` setting is used, as before
        :return: None
        :raises requests.HTTPError: if the API answers with an error status
        """
        if harvest_type is None:
            # Backward compatible: fall back to the script-wide setting
            harvest_type = twitter_harvest_type

        body = {
            "harvestEventType": "TWITTER",
            "scheduleType": scheduled,
            "name": event_name,
            "delay": delay,
            "interval": sked_interval,
            "twitterHarvestParameters":
                {
                    "{}".format(harvest_type): query_load,
                    "sinceDate": since_date,
                    "maxTweets": max_tweets,
                    "tweetRadius": radius,
                    "tweetRadiusUnits": radius_units,
                    "tweetLongitude": lon,
                    "tweetLatitude": lat,
                },
            "filterQuery": filter_query,
            "tags": source_tags,
            "maxDocCount": max_docs,
            "maxDocSize": max_doc_size
        }
        # Remove unset values from the request, top level first and then the nested dict
        body = key_checker(body)
        body['twitterHarvestParameters'] = key_checker(body['twitterHarvestParameters'])

        r = requests.post(self.harvest_base, json=body, headers=self.headers, verify=self.verify)
        r.raise_for_status()
        # Report the name that was actually sent (the parameter), not a script global
        print("Event: {0} created. Status: {1}".format(event_name, r.status_code))
######################################################################################################
# Loop through and create harvests
bp = HarvestAPI(api_key)

# 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM, which some Windows
# editors add and which would otherwise corrupt the first entry.
with open(infile, 'r', encoding='utf-8-sig') as f:
    # Skip blank lines so empty strings are never submitted as queries/screen names
    queries_list = [query for query in (line.strip() for line in f) if query]
num_queries = len(queries_list)

if searchable_items_per_event < 1:
    raise ValueError("searchable_items_per_event must be at least 1")

# Delay before the first harvest, and the stagger added for each further event (ms)
time_delay = hour_to_milli(hours_to_first_harvest)
stagger = hour_to_milli(created_harvest_interval)

# One event per chunk of `searchable_items_per_event` entries; the stepped range
# covers a final partial chunk and yields no events for an empty input file.
# enum_name is the 1-based number appended to the harvest name.
for enum_name, start in enumerate(range(0, num_queries, searchable_items_per_event), 1):
    eventName = "TW_" + name_of_event + '_' + str(enum_name)
    load = queries_list[start:start + searchable_items_per_event]
    print("Name: {0} || Delay: {1} || filterQuery: {2} || tags: {3} ||"
          " maxDocs: {4} || first: {5}".format(eventName, time_delay, filterQuery, event_tags, maxTweets, load[0]))
    # maxTweets is passed twice on purpose: once as max_tweets and once as max_docs
    bp.twitter(time_delay, load, sinceDate, maxTweets, tweetRadius, tweetRadiusUnits, longitude, latitude,
               eventName, event_tags, filterQuery, maxTweets, -1, is_scheduled, interval_between_recurring_harvests)
    time_delay += stagger
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment