Skip to content

Instantly share code, notes, and snippets.

@coreyhermanson
Last active December 13, 2017 20:13
Show Gist options
  • Save coreyhermanson/177ea08c7489dcb7a24941ef8e8ae09f to your computer and use it in GitHub Desktop.
Save coreyhermanson/177ea08c7489dcb7a24941ef8e8ae09f to your computer and use it in GitHub Desktop.
BrightPlanet API: Return docs and Sentiment Measures w/ VADER for Tweets
#!/usr/bin/env python
"""
This script returns documents from the BrightPlanet REST API. Input is a text file with a list of queries.
Output is a CSV file with your desired fields for each document. Default time period is everything until present.
Requires the 'requests' and 'vaderSentiment' modules. To install via cmd, enter: python -m pip install requests vaderSentiment
"""
import requests
import csv
from pprint import pprint
import re
import time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# API variables
# Path of the text file of queries; the script reads it line by line and sends
# each stripped line, wrapped in double quotes, as one query.
query_file = r'FILEPATH'
# Path of the CSV file the script writes (opened in 'w' mode, so an existing
# file is overwritten).
output_file = r'FILEPATH'
# Sent as the 'api_key' request parameter on every API call.
# NOTE(review): placeholder value; avoid committing a real key to source control.
api_key_value = 'APIKEY'
# Sent as the 'dataFeed' request parameter on every API call.
data_feed = 'hoc'
# Wall-clock start time, used for the elapsed-time message at the end of the run.
start = time.time()
# Functions
# Captures whatever sits between a leading "text=" and the first following
# "userName=" marker. DOTALL lets the capture span line breaks, and the
# non-greedy quantifier stops at the first marker rather than the last one.
_TWEET_TEXT_RE = re.compile(r'text=(.+?)\n?userName=', re.DOTALL)


def clean_twitter_text(tweet):
    """Extract the tweet body from an enriched-document string.

    The body is the text between a leading ``text=`` and the first
    ``userName=`` marker. Newlines inside the body are replaced with spaces
    so the value stays on a single line.

    Previously the pattern could not cross a newline, so any tweet whose
    body spanned several lines fell through to the fallback and came back
    with the ``text=`` prefix and all trailing metadata still attached.

    Args:
        tweet: The raw document string.

    Returns:
        The extracted body with newlines flattened, or, when the markers are
        not found, the whole input with newlines replaced by spaces.
    """
    found = _TWEET_TEXT_RE.match(tweet)
    if found is None:
        # Unknown layout: keep everything, only flatten the line breaks.
        return tweet.replace('\n', ' ')
    return found.group(1).replace('\n', ' ')
# Shared analyzer instance, created on first use. Building a new
# SentimentIntensityAnalyzer for every tweet repeats the same setup work.
_vader_analyzer = None


def get_vadersentiment(short_text):
    """Return the VADER 'compound' polarity score for a piece of text.

    The analyzer is constructed once, on the first call, and reused for all
    later calls instead of being rebuilt per call.

    Args:
        short_text: The text to score.

    Returns:
        The value stored under the 'compound' key of the analyzer's
        ``polarity_scores`` result.
    """
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    return _vader_analyzer.polarity_scores(short_text)['compound']
def list_of_entity_type(facets_dict, entity_name):
    """Return a new list of the values stored under ``entity_name``.

    Args:
        facets_dict: Mapping from entity names to iterables of values.
        entity_name: The key to look up.

    Returns:
        A fresh list containing each item of ``facets_dict[entity_name]``.

    Raises:
        KeyError: If ``entity_name`` is not present in ``facets_dict``.
    """
    return list(facets_dict[entity_name])
def api_call(payload, timeout=60):
    """Send one GET request to the BrightPlanet document export endpoint.

    Args:
        payload: Dict of query-string parameters for the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely and a stalled connection
            would hang the whole script.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status (raised by ``raise_for_status``).
        requests.exceptions.Timeout: If no response arrives within
            ``timeout`` seconds.
    """
    response = requests.get(
        "https://documentapi.brightplanet.com/documentapi/docs/export",
        params=payload,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
# Above this many hits the quoted query is considered too broad and is
# re-run restricted to person entities.
_MAX_BROAD_RESULTS = 50


def _fetch_page(query, cursor):
    """Request one page of results for ``query`` starting at ``cursor``."""
    return api_call({'api_key': api_key_value, 'dataFeed': data_feed,
                     'query': query, 'cursorMark': cursor})


def _iter_documents(query):
    """Yield every document returned for ``query``, following the cursor.

    The query to paginate is chosen once, from the first response: if the
    plain query reports more than ``_MAX_BROAD_RESULTS`` hits it is replaced
    by ``otherEntity_person:<query>``. All later pages are requested for
    that same query with the cursor the previous page returned, so pages are
    neither repeated nor mixed between the two queries.
    """
    active_query = query
    page = _fetch_page(active_query, None)
    if page['totalCount'] > _MAX_BROAD_RESULTS:
        active_query = f'otherEntity_person:{query}'
        page = _fetch_page(active_query, None)

    cursor = None
    fetched = 0
    while page['totalCount'] != 0:
        documents = page['documents']
        yield from documents
        fetched += len(documents)
        # Stop on the document count rather than on page arithmetic, so the
        # loop does not depend on the server's page size.
        if not documents or fetched >= page['totalCount']:
            return
        next_cursor = page['nextCursorMark']
        # Defensive guard: a missing or unchanged cursor would make the next
        # request return a page already seen and loop forever.
        # NOTE(review): assumes the cursor advances on every new page - confirm
        # against the API documentation.
        if next_cursor is None or next_cursor == cursor:
            return
        cursor = next_cursor
        page = _fetch_page(active_query, cursor)


def _keywords_or_none(result, entity_name):
    """Return the facet values for ``entity_name``, or None if the document has none."""
    try:
        return list_of_entity_type(result['facets'], entity_name)
    except KeyError:
        return None


def _build_row(result, query_label):
    """Build the CSV row for one document, in the same order as the header row."""
    if result['finalUrlDomain'] == 'twitter.com':
        # Only tweets get their text extracted and scored with VADER.
        text = clean_twitter_text(result['enrichedDoc'])
        twitter_sentiment = get_vadersentiment(text)
    else:
        text = ""
        twitter_sentiment = None
    return [
        result['docMasterId'],
        query_label,
        result['finalUrl'],
        text,
        _keywords_or_none(result, 'otherEntity_hocPositiveLong'),
        _keywords_or_none(result, 'otherEntity_hocPositiveShort'),
        result['sentimentMeasure'],
        twitter_sentiment,
        result['polarity'],
        result['aspect'],
        result['mood'],
        result['intensity'],
    ]


# Opens the input file (list of queries) and the output file (CSV)
with open(query_file, 'r') as inf, open(output_file, 'w', newline='', encoding='utf8') as outf:
    writer = csv.writer(outf)
    headers = ["ID", "Name", "URL", "Twitter Text", "Long Sentiment Keywords", "Short Sentiment Keywords",
               "Total Sentiment", "VADER Sentiment", "Polarity", "Aspect", "Mood", "Intensity"]
    writer.writerow(headers)
    printed_results = 0
    # One query per non-blank line of the input file
    for line in inf:
        term = line.strip()
        if not term:
            # A blank line would otherwise be sent as the empty query "".
            continue
        query = '"' + term + '"'
        query_label = query.replace('"', '')
        for result in _iter_documents(query):
            writer.writerow(_build_row(result, query_label))
            printed_results += 1
            # Feedback
            if printed_results % 50 == 0:
                print(f"Results thus far: {printed_results}")
        # Running total once this query is exhausted
        print(f"Results thus far: {printed_results}")
duration = time.time() - start
print(f"Finished in {duration} seconds")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment