Last active
December 13, 2017 20:13
-
-
Save coreyhermanson/177ea08c7489dcb7a24941ef8e8ae09f to your computer and use it in GitHub Desktop.
BrightPlanet API: Return docs and Sentiment Measures w/ VADER for Tweets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
This script returns documents from the BrightPlanet REST API. Input is a text file with a list of queries. | |
Output is a CSV file with your desired fields for each document. Default time period is everything until present. | |
Requires the 'requests' and 'vaderSentiment' modules. To install via cmd, enter: python -m pip install requests vaderSentiment
""" | |
import requests | |
import csv | |
from pprint import pprint | |
import re | |
import time | |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer | |
# --- Run configuration ------------------------------------------------------
# BrightPlanet API key and the data feed that queries are run against.
api_key_value = 'APIKEY'
data_feed = 'hoc'

# Input: text file holding one query per line.  Output: CSV file to write.
query_file = r'FILEPATH'
output_file = r'FILEPATH'

# Wall-clock reference point used to report the total run time.
start = time.time()
# Functions | |
# Captures everything between a leading "text=" and the first following
# "userName=" marker (optionally preceded by one newline).  DOTALL lets the
# capture span line breaks inside the tweet body; the lazy quantifier stops
# at the first marker instead of running on to the last one.
_TWEET_TEXT_RE = re.compile(r'text=(.+?)\n?userName=', re.DOTALL)


def clean_twitter_text(tweet):
    """Return the tweet body from an enriched-document string on one line.

    When *tweet* starts with ``text=`` and later contains ``userName=``,
    only the text between the two markers is returned.  Otherwise the whole
    input is returned.  In both cases newlines are replaced with spaces.

    Args:
        tweet: Enriched document text (str).

    Returns:
        The extracted (or unmodified) text with every newline replaced by
        a single space.
    """
    # ``match`` (not ``search``) keeps the requirement that the document
    # begins with "text=", so documents in another layout still fall back
    # to being returned whole.
    found = _TWEET_TEXT_RE.match(tweet)
    extracted = tweet if found is None else found.group(1)
    return extracted.replace('\n', " ")
# Shared analyzer instance, created on first use.  Building one per call
# repeats the analyzer's setup work for every tweet scored.
_VADER_ANALYZER = None


def get_vadersentiment(short_text):
    """Return the VADER ``compound`` sentiment score for *short_text*.

    Args:
        short_text: Text to score (str).

    Returns:
        The value stored under ``'compound'`` in the dictionary returned by
        ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER.polarity_scores(short_text)['compound']
def list_of_entity_type(facets_dict, entity_name):
    """Return the items stored under *entity_name* in *facets_dict* as a new list.

    Raises:
        KeyError: if *entity_name* is not a key of *facets_dict*.
    """
    return list(facets_dict[entity_name])
def api_call(payload, timeout=60):
    """Send one GET request to the BrightPlanet document export endpoint.

    Args:
        payload: Dictionary of query-string parameters.  Per the Requests
            documentation, keys whose value is ``None`` are left out of the
            URL.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout, Requests waits indefinitely and a stalled connection
            would hang the whole run.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: on a 4xx/5xx response status.
        requests.exceptions.Timeout: if the server does not respond within
            *timeout* seconds.
    """
    response = requests.get(
        "https://documentapi.brightplanet.com/documentapi/docs/export",
        params=payload,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
# Queries matching more than this many documents are re-run restricted to the
# ``otherEntity_person`` field to narrow the result set.
MAX_UNSCOPED_RESULTS = 50


def _fetch_page(query, cursor):
    """Request one page of results for *query*, starting at *cursor*."""
    return api_call({'api_key': api_key_value, 'dataFeed': data_feed,
                     'query': query, 'cursorMark': cursor})


def _iter_documents(query):
    """Yield every document returned for *query*, following the result cursor.

    The query is scoped to ``otherEntity_person`` once, up front, when the
    unscoped query reports more than MAX_UNSCOPED_RESULTS hits.  Every later
    page is requested with that same (possibly scoped) query and the cursor
    taken from the previous page.
    """
    page = _fetch_page(query, None)
    if page['totalCount'] > MAX_UNSCOPED_RESULTS:
        query = f'otherEntity_person:{query}'
        page = _fetch_page(query, None)

    cursor = None
    fetched = 0
    while page['totalCount'] != 0:
        documents = page['documents']
        yield from documents
        fetched += len(documents)
        next_cursor = page['nextCursorMark']
        # Stop when everything reported has been seen.  The empty-page and
        # non-advancing-cursor checks guard against looping forever if the
        # reported total is never reached.
        # NOTE(review): assumes the API returns a new nextCursorMark for each
        # further page -- confirm against the BrightPlanet API docs.
        if not documents or fetched >= page['totalCount'] or next_cursor == cursor:
            break
        cursor = next_cursor
        page = _fetch_page(query, cursor)


def _facet_or_none(result, facet_name):
    """Return the named facet of *result* as a list, or None when absent."""
    try:
        return list_of_entity_type(result['facets'], facet_name)
    except KeyError:
        return None


def _build_row(result, query_label):
    """Build one CSV row (same column order as the header) for a document."""
    if result['finalUrlDomain'] == 'twitter.com':
        # VADER scoring is only applied to tweets.
        text = clean_twitter_text(result['enrichedDoc'])
        twitter_sentiment = get_vadersentiment(text)
    else:
        text = ""
        twitter_sentiment = None
    return [
        result['docMasterId'],
        query_label,
        result['finalUrl'],
        text,
        _facet_or_none(result, 'otherEntity_hocPositiveLong'),
        _facet_or_none(result, 'otherEntity_hocPositiveShort'),
        result['sentimentMeasure'],
        twitter_sentiment,
        result['polarity'],
        result['aspect'],
        result['mood'],
        result['intensity'],
    ]


# Opens the input file (list of queries) and the output file (CSV).
with open(query_file, 'r') as inf, open(output_file, 'w', newline='', encoding='utf8') as outf:
    writer = csv.writer(outf)
    headers = ["ID", "Name", "URL", "Twitter Text", "Long Sentiment Keywords", "Short Sentiment Keywords",
               "Total Sentiment", "VADER Sentiment", "Polarity", "Aspect", "Mood", "Intensity"]
    writer.writerow(headers)
    printed_results = 0

    # One query per input line, sent to the API as a quoted phrase.
    for line in inf:
        stripped = line.strip()
        if not stripped:
            # A blank line would otherwise be sent as the empty query "".
            continue
        query = '"' + stripped + '"'
        query_label = query.replace('"', '')
        rows_before = printed_results

        for result in _iter_documents(query):
            writer.writerow(_build_row(result, query_label))
            printed_results += 1
            # Feedback every 50 rows.
            if printed_results % 50 == 0:
                print(f"Results thus far: {printed_results}")

        # Running total after each query that produced rows.
        if printed_results > rows_before:
            print(f"Results thus far: {printed_results}")

duration = time.time() - start
print(f"Finished in {duration} seconds")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment