Last active
March 22, 2017 19:02
-
-
Save coreyhermanson/4ebe7e258b3cc0a263cdaea45c685463 to your computer and use it in GitHub Desktop.
Harvest API - Twitter Harvest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
# ---------------------------------------------------------------------------
# Input
# ---------------------------------------------------------------------------
# Full path of the text file to read; every line becomes one search item.
infile = r'C:\Users\Account\PythonFiles\generic_infile.txt'

# ---------------------------------------------------------------------------
# Harvest event settings
# ---------------------------------------------------------------------------
# STRING - the key placed in the request URL (1 API key per Harvest API schema).
api_key = "123abc"
# INT - how many lines of the input file go into a single event
# (max queries OR max screenNames).
searchable_items_per_event = 100
# STRING - base event name; the script prefixes "TW_" and appends "_<number>".
name_of_event = "NewYork_Politics"
# STRING or None - example: "nuclear AND (war OR energy)".
filterQuery = None
# LIST - tags attached to every created event.
event_tags = ["source_Politics", "New York"]

# ---------------------------------------------------------------------------
# Schedule settings
# ---------------------------------------------------------------------------
# "ONCE" or "RECURRING".
is_scheduled = "ONCE"
# Passed through unchanged as the event "interval"; the original author
# notes the unit as hours.
interval_between_recurring_harvests = None
# HOURS - the "delay" before the first created event; converted to
# milliseconds before it is sent.
hours_to_first_harvest = 8
# HOURS - added to the delay after each created event so that the harvests
# do not all execute simultaneously.
created_harvest_interval = 0.5

# ---------------------------------------------------------------------------
# Twitter settings
# ---------------------------------------------------------------------------
# "screenNames" or "keyword" - used as the key under which the items are sent.
twitter_harvest_type = "screenNames"
# Sent as "maxTweets"; the script also passes it as the max document count.
maxTweets = 100

# Seldom-used Twitter settings; values left as None are removed from the
# request before it is sent.
sinceDate = None
tweetRadius = None
tweetRadiusUnits = None
longitude = None
latitude = None
############################################################################################################## | |
# Functions | |
def hour_to_milli(hour):
    """
    Express a duration given in hours as milliseconds.
    :param hour: duration in hours; fractional values (e.g. 0.5) are accepted
    :return: the duration in milliseconds, passed through round()
    """
    millis_per_hour = 60 * 60 * 1000
    return round(hour * millis_per_hour)
def key_checker(json_dict):
    """
    Remove unset entries from a request dictionary, in place.

    An entry counts as unset when its value is None or an empty
    string/bytes/list/tuple/dict/set. Numbers and booleans are always kept,
    so legitimate zero values (for example a longitude or latitude of 0, or
    a delay of 0) are not stripped from the request.

    :param json_dict: dictionary to clean; it is modified in place
    :return: the same dictionary object, without the unset entries
    """
    empty_capable = (str, bytes, list, tuple, dict, set, frozenset)
    # Iterate over a snapshot of the keys because entries are deleted
    # while looping.
    for key in list(json_dict):
        value = json_dict[key]
        is_empty = isinstance(value, empty_capable) and not value
        if value is None or is_empty:
            del json_dict[key]
    return json_dict
# Harvest API class | |
class HarvestAPI(object):
    """
    Small client that creates harvest events through the BrightPlanet
    Harvest API.

    :param auth: API key; it is placed in the endpoint URL as the
        ``api_key`` query parameter
    :param verify: forwarded to ``requests.post`` as ``verify``. Defaults to
        True so the server's TLS certificate is checked (the API key travels
        in the URL). Pass False only if certificate checking must be skipped.
    """

    def __init__(self, auth, verify=True):
        self.auth = auth
        self.verify = verify
        self.harvest_base = 'https://harvestapi.brightplanet.com/harvestapi/api/harvests?api_key={}'.format(self.auth)
        self.headers = {"Content-Type": "application/json"}

    def twitter(self, delay, query_load, since_date, max_tweets, radius, radius_units, lon, lat, event_name,
                source_tags, filter_query, max_docs, max_doc_size=-1, scheduled="ONCE", sked_interval=None,
                harvest_type=None):
        """
        Triggers a Twitter harvest through the BrightPlanet Harvest API
        :param delay: sent as "delay" (the script in this file supplies milliseconds)
        :param query_load: list of items sent under the harvest-type key
        :param since_date: sent as "sinceDate"
        :param max_tweets: sent as "maxTweets"
        :param radius: sent as "tweetRadius"
        :param radius_units: sent as "tweetRadiusUnits"
        :param lon: sent as "tweetLongitude"
        :param lat: sent as "tweetLatitude"
        :param event_name: sent as "name" and echoed in the confirmation message
        :param source_tags: sent as "tags"
        :param filter_query: sent as "filterQuery"
        :param max_docs: sent as "maxDocCount"
        :param max_doc_size: sent as "maxDocSize"
        :param scheduled: sent as "scheduleType" ("ONCE" by default)
        :param sked_interval: sent as "interval"
        :param harvest_type: key under which query_load is sent (e.g.
            "screenNames" or "keyword"); when None, the module-level
            ``twitter_harvest_type`` setting is used
        :return: the ``requests`` response object of the POST
        :raises requests.HTTPError: when the API answers with an error status
        """
        if harvest_type is None:
            # Backward-compatible default: fall back to the module-level setting.
            harvest_type = twitter_harvest_type

        body = {
            "harvestEventType": "TWITTER",
            "scheduleType": scheduled,
            "name": event_name,
            "delay": delay,
            "interval": sked_interval,
            "twitterHarvestParameters":
                {
                    str(harvest_type): query_load,
                    "sinceDate": since_date,
                    "maxTweets": max_tweets,
                    "tweetRadius": radius,
                    "tweetRadiusUnits": radius_units,
                    "tweetLongitude": lon,
                    "tweetLatitude": lat,
                },
            "filterQuery": filter_query,
            "tags": source_tags,
            "maxDocCount": max_docs,
            "maxDocSize": max_doc_size
        }

        # Strip unset entries from the request: first the top level, then
        # the nested Twitter parameters.
        body = key_checker(body)
        body['twitterHarvestParameters'] = key_checker(body['twitterHarvestParameters'])

        r = requests.post(self.harvest_base, json=body, headers=self.headers, verify=self.verify)
        r.raise_for_status()
        # Report the name that was actually submitted (the parameter), not a
        # global that only exists when the surrounding script is running.
        print("Event: {0} created. Status: {1}".format(event_name, r.status_code))
        return r
###################################################################################################### | |
# Loop through and create harvests | |
bp = HarvestAPI(api_key)

# Read one search item per line. Blank lines are skipped so that empty
# strings are never submitted as queries / screen names.
with open(infile, 'r', encoding='utf-8') as f:
    queries_list = [item for item in (line.strip() for line in f) if item]
num_queries = len(queries_list)

# Number of events needed to cover every item (ceiling division).
events = -(-num_queries // searchable_items_per_event)

# Delay of the first event, and the spacing added after each created event,
# both in milliseconds. The spacing is constant, so it is computed once.
time_delay = hour_to_milli(hours_to_first_harvest)
delay_step = hour_to_milli(created_harvest_interval)

# enum_name is the 1-based number appended to the end of the harvest name.
for enum_name in range(1, events + 1):
    start = (enum_name - 1) * searchable_items_per_event
    end = start + searchable_items_per_event
    eventName = "TW_" + name_of_event + '_' + str(enum_name)
    load = queries_list[start:end]
    print("Name: {0} || Delay: {1} || filterQuery: {2} || tags: {3} ||"
          " maxDocs: {4} || first: {5}".format(eventName, time_delay, filterQuery, event_tags, maxTweets, load[0]))
    bp.twitter(time_delay, load, sinceDate, maxTweets, tweetRadius, tweetRadiusUnits, longitude, latitude,
               eventName, event_tags, filterQuery, maxTweets, -1, is_scheduled, interval_between_recurring_harvests)
    # Space out the created harvests so they do not execute simultaneously.
    time_delay += delay_step
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment