Created
November 27, 2017 17:01
-
-
Save coreyhermanson/85defceac4e5cd6548aef7e32ed89584 to your computer and use it in GitHub Desktop.
BrightPlanet Harvest API: Create RSS harvests from a spreadsheet of sources
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv

import requests
# ---------------------------------------------------------------------------
# Script configuration. These module-level values are read by main() and by
# HarvestAPI.auto_rss(), which forwards them to HarvestAPI.rss_harvest().
# ---------------------------------------------------------------------------

# CSV file read by get_event_input(): column 0 = URL, 1 = region, 2 = country.
input_file = r'YOUR_FULL_FILEPATH_HERE'

# Sent as the request's "scheduleType".
var_scheduled = "RECURRING"

# Hours before the first harvest runs; auto_rss() converts it to milliseconds
# and adds 0.33 h for every further harvest it creates.
var_initial_delay = 1.0  # float
# Hours between runs; converted to milliseconds and sent as "interval".
var_time_between_scheduled_events = 12.0  # float

var_max_depth = 1        # sent as "levelsInternal"
var_depth_external = 0   # sent as "levelsExternal"
var_max_docsize = -1     # sent as "maxDocSize" (NOTE(review): -1 presumably means "no limit" - confirm)
var_max_docs = 50        # sent as "maxDocCount"

# Optional filters, sent as-is ("filterQuery", "inclusionTerms", ...).
var_filterquery = None
var_inclusion_terms = None
var_exclusion_terms = None
var_inclusion_domains = None
var_exclusion_domains = None

# Sent as "xpaths". Each entry is "<label>:<XPath expression>"; the four
# expressions look for a publication date in element names, attributes,
# <meta> tags and free text respectively.
# NOTE(review): the dateText expression only matches text containing "201" or
# "200", so dates from 2020 onwards will generally not be picked up by it.
var_xpaths = [
    "dateNode:((//*[contains(local-name(), 'time')]|//*[contains(local-name(), 'Time')]|//*[contains(local-name(), 'date')]|//*[contains(local-name(), 'Date')])/text()[normalize-space()])[1]",
    "dateAttrib:((//*[(contains(@itemprop|@class|@id, 'Date') or contains(@itemprop|@class|@id, 'Time') or contains(@itemprop|@class|@id, 'Posted') or contains(@itemprop|@class|@id, 'Publish') or contains(@itemprop|@class|@id, 'date') or contains(@itemprop|@class|@id, 'time') or contains(@itemprop|@class|@id, 'posted') or contains(@itemprop|@class|@id, 'publish')) and not(contains(@itemprop|@class|@id, 'Timeline')) and not(contains(@itemprop|@class|@id, 'timeline'))])/text()[normalize-space()])[1] ",
    "dateMeta:(//meta[contains(@class|@itemprop|@id, 'date') or contains(@class|@itemprop|@id, 'time') or contains(@class|@itemprop|@id, 'Date') or contains(@class|@itemprop|@id, 'Time')])[1]/@content ",
    "dateText:(//*[text()[contains(.,'201') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]]|//*[text()[contains(.,'200') and (string-length() > 7 and string-length() < 30) and (contains(., 'Jan') or contains(., 'Feb') or contains(., 'Mar') or contains(., 'Apr') or contains(., 'May') or contains(., 'Jun') or contains(., 'Jul') or contains(., 'Aug') or contains(., 'Sep') or contains(., 'Oct') or contains(., 'Nov') or contains(., 'Dec'))]])[position() < 4]"
]
def main():
    """Create one RSS harvest per region/country group found in ``input_file``."""
    # Load CSV into dict: {region: {country: [url, ...]}}
    dict_of_urls_tags = get_event_input(input_file)
    # Create Auth object
    bp = HarvestAPI('YOUR_API_KEY')
    # Create RSS harvest for each country
    bp.auto_rss(dict_of_urls_tags, var_time_between_scheduled_events, var_initial_delay)
# Helper functions
def hours_to_milliseconds(hour):
    """
    Convert a duration in hours to a whole number of milliseconds.

    :param hour: duration in hours (int or float)
    :return: milliseconds, rounded to the nearest integer
    """
    # 3600000 = 60 min * 60 s * 1000 ms
    return round(hour * 3600000)
def get_event_input(infile):
    """
    Read the source CSV and group feed URLs by region and country.

    Rows are read positionally: column 0 is the URL, column 1 the region and
    column 2 the country; any further columns are ignored. Rows whose cells
    are all empty (including completely blank lines) are skipped.

    :param infile: path of the CSV file
    :return: nested dict ``{region: {country: [url, ...]}}`` in file order
    :raises IndexError: if a non-blank row has fewer than three columns
    """
    region_dict = {}
    # 'utf-8-sig' also reads plain UTF-8 but drops a leading BOM, which would
    # otherwise end up glued to the first URL. newline='' is what the csv
    # module asks for so that line breaks inside quoted fields survive.
    with open(infile, 'r', encoding='utf-8-sig', newline='') as inf:
        for row in csv.reader(inf):
            if not any(row):
                continue
            url, region, country = row[0], row[1], row[2]
            # Create the region and country levels on first sight, then append.
            region_dict.setdefault(region, {}).setdefault(country, []).append(url)
    return region_dict
def generate_topic(region, country):
    """
    Build the harvest name for a region/country pair.

    The first two characters of each argument are dropped before they are
    placed in the name.
    NOTE(review): presumably the spreadsheet values carry a two-character
    prefix (e.g. an ordering marker) - confirm against the input data.

    :param region: region value from the CSV
    :param country: country value from the CSV
    :return: ``RSS_<region>_<country>_daily``
    """
    return f'RSS_{region[2:]}_{country[2:]}_daily'
# Classes
class HarvestAPI(object):
    """
    Small client that creates RSS harvest events through the BrightPlanet
    Harvest API.

    The API key travels in the request URL as the ``api_key`` query parameter
    and every request body is sent as JSON.
    """

    def __init__(self, auth, verify=True, timeout=60):
        """
        :param auth: API key; inserted verbatim into the request URL
        :param verify: forwarded to ``requests.post(verify=...)``. Defaults to
            True so the server certificate is checked before the API key is
            sent; pass False (or a CA bundle path) only if you must.
        :param timeout: seconds to wait for the server before giving up;
            without it a stalled connection would block forever
        """
        self.auth = auth
        self.verify = verify
        self.timeout = timeout
        self.harvest_base = 'https://harvestapi.brightplanet.com/harvestapi/api/harvests?api_key={}'.format(
            self.auth)
        self.headers = {"Content-Type": "application/json"}

    @staticmethod
    def _build_rss_body(delay, starting_urls, topic, source_tags, filter_query, max_docs, max_depth,
                        depth_external, inclusion_terms, exclusion_terms, inclusion_domains,
                        exclusion_domains, xpaths, max_doc_size, scheduled, sked_interval):
        """Assemble the JSON payload for one RSS harvest request."""
        return {
            "harvestEventType": "RSS",
            "scheduleType": scheduled,
            "name": topic,
            "tags": source_tags,
            "delay": delay,
            "interval": sked_interval,
            "rssHarvestParameters":
                {
                    "initialUrls": starting_urls,
                    "levelsInternal": max_depth,
                    "levelsExternal": depth_external,
                },
            "filterQuery": filter_query,
            "inclusionTerms": inclusion_terms,
            "exclusionTerms": exclusion_terms,
            "inclusionDomains": inclusion_domains,
            "exclusionDomains": exclusion_domains,
            "xpaths": xpaths,
            "maxDocCount": max_docs,
            "maxDocSize": max_doc_size,
        }

    def rss_harvest(self, delay, starting_urls, topic, source_tags, filter_query, max_docs=25, max_depth=0,
                    depth_external=0, inclusion_terms=None, exclusion_terms=None, inclusion_domains=None,
                    exclusion_domains=None, xpaths=None, max_doc_size=-1, scheduled="ONCE", sked_interval=None):
        """
        POST a single RSS harvest definition and print a one-line summary.

        :param delay: value for the "delay" field (auto_rss passes milliseconds)
        :param starting_urls: list of feed URLs ("initialUrls")
        :param topic: harvest name ("name")
        :param source_tags: list of tags ("tags")
        :param filter_query: value for "filterQuery"
        :param max_docs: value for "maxDocCount"
        :param max_depth: value for "levelsInternal"
        :param depth_external: value for "levelsExternal"
        :param inclusion_terms: value for "inclusionTerms"
        :param exclusion_terms: value for "exclusionTerms"
        :param inclusion_domains: value for "inclusionDomains"
        :param exclusion_domains: value for "exclusionDomains"
        :param xpaths: value for "xpaths"
        :param max_doc_size: value for "maxDocSize"
        :param scheduled: value for "scheduleType"
        :param sked_interval: value for "interval" (auto_rss passes milliseconds)
        :raises requests.HTTPError: if the API answers with a 4xx/5xx status
        """
        body = self._build_rss_body(
            delay=delay, starting_urls=starting_urls, topic=topic, source_tags=source_tags,
            filter_query=filter_query, max_docs=max_docs, max_depth=max_depth,
            depth_external=depth_external, inclusion_terms=inclusion_terms,
            exclusion_terms=exclusion_terms, inclusion_domains=inclusion_domains,
            exclusion_domains=exclusion_domains, xpaths=xpaths, max_doc_size=max_doc_size,
            scheduled=scheduled, sked_interval=sked_interval)
        r = requests.post(self.harvest_base, json=body, headers=self.headers,
                          verify=self.verify, timeout=self.timeout)
        r.raise_for_status()
        # Guard the log line: an empty URL list must not raise after the
        # harvest has already been created on the server.
        first_url = starting_urls[0] if starting_urls else None
        print(f"Name: {topic} || Delay: {delay} || tags: {source_tags}"
              f" || maxDocs: {max_docs} || length: {len(starting_urls)} || first: {first_url}")

    def auto_rss(self, sources: dict, between_events: float, time_until_event: float):
        """
        Create one recurring-style harvest per (region, country) entry.

        :param sources: ``{region: {country: [url, ...]}}`` as returned by
            get_event_input()
        :param between_events: hours between runs of each harvest
        :param time_until_event: hours until the first harvest starts; each
            following harvest is scheduled 0.33 h later than the previous one
            so they do not all start at once
        """
        # Same interval for every harvest, so convert it only once.
        ms_between_events = hours_to_milliseconds(between_events)
        for region, country_dict in sources.items():
            for country, url_list in country_dict.items():
                topic = generate_topic(region, country)
                tags = [region, country]
                ms_time_until_event = hours_to_milliseconds(time_until_event)
                self.rss_harvest(ms_time_until_event, url_list, topic, tags, var_filterquery, var_max_docs,
                                 var_max_depth, var_depth_external, var_inclusion_terms, var_exclusion_terms,
                                 var_inclusion_domains, var_exclusion_domains, var_xpaths, var_max_docsize,
                                 var_scheduled, ms_between_events)
                time_until_event += .33
# Run the harvest creation only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment