Skip to content

Instantly share code, notes, and snippets.

@coreyhermanson
Created November 27, 2017 17:01
Show Gist options
  • Save coreyhermanson/85defceac4e5cd6548aef7e32ed89584 to your computer and use it in GitHub Desktop.
Save coreyhermanson/85defceac4e5cd6548aef7e32ed89584 to your computer and use it in GitHub Desktop.
BrightPlanet Harvest API: Create RSS harvests from a spreadsheet of sources
import requests
import csv
# Path to the input CSV. main() hands it to get_event_input(), which reads each
# row positionally: column 0 = feed URL, column 1 = region, column 2 = country.
input_file = r'YOUR_FULL_FILEPATH_HERE'
# Sent as "scheduleType" in the harvest request body.
var_scheduled = "RECURRING"
# Hours until the first harvest; converted to milliseconds and sent as "delay".
var_initial_delay = 1.0 # float, hours
# Hours between scheduled runs; converted to milliseconds and sent as "interval".
var_time_between_scheduled_events = 12.0 # float, hours
# Sent as "levelsInternal" inside "rssHarvestParameters".
var_max_depth = 1
# Sent as "levelsExternal" inside "rssHarvestParameters".
var_depth_external = 0
# Sent as "maxDocSize". NOTE(review): -1 presumably means "no limit" - confirm
# against the Harvest API documentation.
var_max_docsize = -1
# Sent as "maxDocCount".
var_max_docs = 50
# The following five are sent as "filterQuery", "inclusionTerms", "exclusionTerms",
# "inclusionDomains" and "exclusionDomains"; None is serialized as JSON null.
var_filterquery = None
var_inclusion_terms = None
var_exclusion_terms = None
var_inclusion_domains = None
var_exclusion_domains = None
# Sent as "xpaths". Every entry is written as "<name>:<XPath expression>".
# The names (dateNode, dateAttrib, dateMeta, dateText) suggest they locate a
# page's publication date - presumably consumed by the Harvest API; confirm.
# NOTE(review): dateText only matches text containing '201' or '200', so a year
# such as 2020 matches neither alternative.
# NOTE(review): the dateAttrib and dateMeta strings end with a trailing space
# inside the quotes; it is part of the value that gets sent.
var_xpaths = [
"dateNode:((//*[contains(local-name(), 'time')]|//*[contains(local-name(), 'Time')]|//*[contains(local-name(), 'date')]|//*[contains(local-name(), 'Date')])/text()[normalize-space()])[1]",
"dateAttrib:((//*[(contains(@itemprop|@class|@id, 'Date') or contains(@itemprop|@class|@id, 'Time') or contains(@itemprop|@class|@id, 'Posted') or contains(@itemprop|@class|@id, 'Publish') or contains(@itemprop|@class|@id, 'date') or contains(@itemprop|@class|@id, 'time') or contains(@itemprop|@class|@id, 'posted') or contains(@itemprop|@class|@id, 'publish')) and not(contains(@itemprop|@class|@id, 'Timeline')) and not(contains(@itemprop|@class|@id, 'timeline'))])/text()[normalize-space()])[1] ",
"dateMeta:(//meta[contains(@class|@itemprop|@id, 'date') or contains(@class|@itemprop|@id, 'time') or contains(@class|@itemprop|@id, 'Date') or contains(@class|@itemprop|@id, 'Time')])[1]/@content ",
"dateText:(//*[text()[contains(.,'201') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]]|//*[text()[contains(.,'200') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]])[position() < 4]"
]
def main():
    """
    Read the source spreadsheet and create one RSS harvest per region/country.
    """
    # Nested {region: {country: [url, ...]}} mapping built from the CSV
    sources_by_region = get_event_input(input_file)
    # API client holding the key used for every request
    client = HarvestAPI('YOUR_API_KEY')
    # One harvest is submitted for each country found in the spreadsheet
    client.auto_rss(
        sources_by_region,
        between_events=var_time_between_scheduled_events,
        time_until_event=var_initial_delay,
    )
# Helper functions
def hours_to_milliseconds(hour):
    """
    Convert a duration expressed in hours into milliseconds.

    :param hour: duration in hours (int or float)
    :return: the duration in milliseconds, passed through the built-in round()
    """
    ms_per_hour = 3600000
    return round(hour * ms_per_hour)
def get_event_input(infile):
    """
    Load the source spreadsheet into a nested region -> country -> URLs mapping.

    Rows are read positionally: column 0 is the feed URL, column 1 the region
    and column 2 the country. No header row is skipped. Rows that are blank, or
    whose cells are all empty, are ignored.

    :param infile: path to a UTF-8 encoded CSV file
    :return: dict shaped like {region: {country: [url, ...]}}
    :raises IndexError: if a non-blank row has fewer than three columns
    """
    region_dict = {}
    # newline='' hands raw line endings to the csv module, as its docs require
    with open(infile, 'r', encoding='utf8', newline='') as inf:
        for row in csv.reader(inf):
            # csv.reader yields [] for blank lines (typical at the end of a
            # spreadsheet export); indexing such a row would raise IndexError.
            if not any(cell.strip() for cell in row):
                continue
            url, region, country = row[0], row[1], row[2]
            # Create the region and country levels on first sight, then append
            region_dict.setdefault(region, {}).setdefault(country, []).append(url)
    return region_dict
def generate_topic(region, country):
    """
    Build the harvest name for a region/country pair.

    The first two characters of each argument are dropped before the values
    are placed into the ``RSS_<region>_<country>_daily`` pattern.

    :param region: region label as read from the spreadsheet
    :param country: country label as read from the spreadsheet
    :return: the harvest name string
    """
    trimmed_region, trimmed_country = region[2:], country[2:]
    return f'RSS_{trimmed_region}_{trimmed_country}_daily'
# Classes
class HarvestAPI(object):
    """
    Small client that creates RSS harvest events through the BrightPlanet Harvest API.

    :param auth: API key; it is embedded in the request URL as ``api_key``
    :param verify: forwarded to ``requests.post`` as ``verify``. Defaults to True so
        the server certificate is checked, because the API key travels in the URL.
        Pass False only for an endpoint whose certificate cannot be validated.
    :param timeout: seconds forwarded to ``requests.post`` as ``timeout`` so that an
        unresponsive server cannot block the script forever
    """

    def __init__(self, auth, verify=True, timeout=60):
        self.auth = auth
        self.harvest_base = 'https://harvestapi.brightplanet.com/harvestapi/api/harvests?api_key={}'.format(
            self.auth)
        self.headers = {"Content-Type": "application/json"}
        self.verify = verify
        self.timeout = timeout

    def rss_harvest(self, delay, starting_urls, topic, source_tags, filter_query, max_docs=25, max_depth=0,
                    depth_external=0, inclusion_terms=None, exclusion_terms=None, inclusion_domains=None,
                    exclusion_domains=None, xpaths=None, max_doc_size=-1, scheduled="ONCE", sked_interval=None):
        """
        POST a single RSS harvest event and print a one-line summary of it.

        :param delay: sent as "delay" (auto_rss passes milliseconds)
        :param starting_urls: list of feed URLs, sent as "initialUrls"
        :param topic: harvest name, sent as "name"
        :param source_tags: sent as "tags"
        :param filter_query: sent as "filterQuery"
        :param max_docs: sent as "maxDocCount"
        :param max_depth: sent as "levelsInternal"
        :param depth_external: sent as "levelsExternal"
        :param inclusion_terms: sent as "inclusionTerms"
        :param exclusion_terms: sent as "exclusionTerms"
        :param inclusion_domains: sent as "inclusionDomains"
        :param exclusion_domains: sent as "exclusionDomains"
        :param xpaths: sent as "xpaths"
        :param max_doc_size: sent as "maxDocSize"
        :param scheduled: sent as "scheduleType"
        :param sked_interval: sent as "interval" (auto_rss passes milliseconds)
        :return: None
        :raises requests.HTTPError: if the API answers with a 4xx/5xx status
        :raises requests.RequestException: on connection, TLS or timeout failures
        """
        body = {
            "harvestEventType": "RSS",
            "scheduleType": scheduled,
            "name": topic,
            "tags": source_tags,
            "delay": delay,
            "interval": sked_interval,
            "rssHarvestParameters":
                {
                    "initialUrls": starting_urls,
                    "levelsInternal": max_depth,
                    "levelsExternal": depth_external,
                },
            "filterQuery": filter_query,
            "inclusionTerms": inclusion_terms,
            "exclusionTerms": exclusion_terms,
            "inclusionDomains": inclusion_domains,
            "exclusionDomains": exclusion_domains,
            "xpaths": xpaths,
            "maxDocCount": max_docs,
            "maxDocSize": max_doc_size
        }
        r = requests.post(self.harvest_base, json=body, headers=self.headers,
                          verify=self.verify, timeout=self.timeout)
        r.raise_for_status()
        # The harvest already exists at this point; an empty URL list must not
        # turn the summary line into an IndexError.
        first_url = starting_urls[0] if starting_urls else None
        print(f"Name: {topic} || Delay: {delay} || tags: {source_tags}"
              f" || maxDocs: {max_docs} || length: {len(starting_urls)} || first: {first_url}")

    def auto_rss(self, sources: dict, between_events: float, time_until_event: float):
        """
        Create one RSS harvest per country, each starting a little later than the last.

        Harvest settings other than URLs, name, tags and timing come from the
        module-level ``var_*`` constants.

        :param sources: mapping shaped like {region: {country: [url, ...]}}
        :param between_events: hours between scheduled runs of each harvest
        :param time_until_event: hours until the first harvest starts
        :return: None
        """
        # Identical for every harvest, so convert it once
        ms_between_events = hours_to_milliseconds(between_events)
        for region, country_dict in sources.items():
            for country, url_list in country_dict.items():
                topic = generate_topic(region, country)
                tags = [region, country]
                ms_time_until_event = hours_to_milliseconds(time_until_event)
                self.rss_harvest(ms_time_until_event, url_list, topic, tags, var_filterquery,
                                 max_docs=var_max_docs,
                                 max_depth=var_max_depth,
                                 depth_external=var_depth_external,
                                 inclusion_terms=var_inclusion_terms,
                                 exclusion_terms=var_exclusion_terms,
                                 inclusion_domains=var_inclusion_domains,
                                 exclusion_domains=var_exclusion_domains,
                                 xpaths=var_xpaths,
                                 max_doc_size=var_max_docsize,
                                 scheduled=var_scheduled,
                                 sked_interval=ms_between_events)
                # Push the next harvest back by 0.33 h so the starts are staggered
                time_until_event += .33
# Run the harvest creation only when executed as a script, not on import
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment