coreyhermanson · December 13, 2017 20:13
diff --git a/bp_twitterSentiment.py b/bp_twitterSentiment.py
 #!/usr/bin/env python

 """
 This script returns documents from the BrightPlanet REST API. Input is a text file with a list of queries.
 Output is a CSV file with your desired fields for each document. Default time period is everything until present.
 Requires the 'requests' and 'vaderSentiment' modules. To install via cmd, enter: python -m pip install requests vaderSentiment
 """

 import requests
 import csv
 from pprint import pprint
 import re
 import time
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

 # API variables
 # NOTE(review): 'FILEPATH' and 'APIKEY' look like placeholders to fill in before running -- confirm.
 # As written, query_file and output_file hold the same value.
 query_file = r'FILEPATH'  # text file read line by line; each line becomes one quoted query
 output_file = r'FILEPATH'  # CSV file opened for writing; receives one row per returned document
 api_key_value = 'APIKEY'  # sent as the 'api_key' request parameter
 data_feed = 'hoc'  # sent as the 'dataFeed' request parameter
 start = time.time()  # start timestamp; used for the final "Finished in ... seconds" message


 # Functions
 def clean_twitter_text(tweet):
    """Extract the tweet body from an enriched-document string.

    When *tweet* starts with ``text=`` and later contains ``userName=``, the
    text between the two markers is returned (one newline directly before
    ``userName=`` is dropped). Otherwise the whole input is returned. In both
    cases newlines are replaced with spaces, so a tweet whose body spans
    several lines is extracted instead of falling through to the
    whole-document fallback.

    Args:
        tweet: Raw document text.

    Returns:
        The extracted tweet text, or the full input when the markers are not
        found, with every newline converted to a space.
    """
    # DOTALL lets the body span lines; the lazy quantifier stops at the first
    # 'userName=' marker so later fields are not swallowed.
    match = re.match(r'text=(.+?)\n?userName=', tweet, flags=re.DOTALL)
    body = match.group(1) if match else tweet
    return body.replace('\n', ' ')


 def get_vadersentiment(short_text):
    """Return VADER's compound sentiment score for *short_text*.

    The analyzer is created on first use and cached on the function object,
    so scoring many texts does not construct a new
    SentimentIntensityAnalyzer for every call.

    Args:
        short_text: Text to score.

    Returns:
        The value stored under the 'compound' key of the analyzer's
        polarity_scores() result.
    """
    analyzer = getattr(get_vadersentiment, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        get_vadersentiment._analyzer = analyzer
    return analyzer.polarity_scores(short_text)['compound']


 def list_of_entity_type(facets_dict, entity_name):
    """Return the entities stored under *entity_name* as a new list.

    Args:
        facets_dict: Mapping from entity-type name to an iterable of entities.
        entity_name: Key to look up in facets_dict.

    Returns:
        A new list holding the items of facets_dict[entity_name] in
        iteration order.

    Raises:
        KeyError: If entity_name is not a key of facets_dict.
    """
    return list(facets_dict[entity_name])


 def api_call(payload, timeout=60):
    """Send a GET request to the BrightPlanet document export endpoint.

    Args:
        payload: Dict of query-string parameters for the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive
            server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response status is a 4xx or 5xx code.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    response = requests.get(
        "https://documentapi.brightplanet.com/documentapi/docs/export",
        params=payload,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()


 # Opens the query list and the output CSV, then writes one CSV row per returned document.

 # A quoted query with more hits than this is narrowed to person entities.
 PERSON_FILTER_THRESHOLD = 50
 # A progress line is printed each time this many more rows have been written.
 PROGRESS_INTERVAL = 50


 def _build_payload(query, cursor):
    """Return the request parameters for one page of *query*."""
    return {'api_key': api_key_value, 'dataFeed': data_feed, 'query': query, 'cursorMark': cursor}


 def _iter_documents(query):
    """Yield every document returned for *query*, following the cursor page by page.

    If the first response reports more than PERSON_FILTER_THRESHOLD hits, the
    query is narrowed once to ``otherEntity_person:<query>`` and paging
    continues on that narrowed query. Paging stops when the number of yielded
    documents reaches the reported totalCount, or when a page comes back
    empty, so the loop cannot spin forever and does not assume a page size.
    """
    results = api_call(_build_payload(query, None))
    if results['totalCount'] > PERSON_FILTER_THRESHOLD:
        # Decide once, up front: later pages must keep using the narrowed
        # query together with the cursor that belongs to it.
        query = f'otherEntity_person:{query}'
        results = api_call(_build_payload(query, None))

    total_count = results['totalCount']
    fetched = 0
    while total_count > 0:
        documents = results['documents']
        if not documents:
            # Nothing more came back; stop instead of requesting again.
            break
        yield from documents
        fetched += len(documents)
        if fetched >= total_count:
            break
        results = api_call(_build_payload(query, results['nextCursorMark']))


 def _sentiment_keywords(document, entity_name):
    """Return the document's facet entities for *entity_name*, or None when absent."""
    try:
        return list_of_entity_type(document['facets'], entity_name)
    except KeyError:
        return None


 def _document_row(document, name):
    """Build the CSV row, in header order, for one API document."""
    if document['finalUrlDomain'] == 'twitter.com':
        text = clean_twitter_text(document['enrichedDoc'])
        twitter_sentiment = get_vadersentiment(text)
    else:
        # Only twitter.com documents get a text column and a VADER score.
        text = ""
        twitter_sentiment = None

    return [
        document['docMasterId'],
        name,
        document['finalUrl'],
        text,
        _sentiment_keywords(document, 'otherEntity_hocPositiveLong'),
        _sentiment_keywords(document, 'otherEntity_hocPositiveShort'),
        document['sentimentMeasure'],
        twitter_sentiment,
        document['polarity'],
        document['aspect'],
        document['mood'],
        document['intensity'],
    ]


 with open(query_file, 'r') as inf, open(output_file, 'w', newline='', encoding='utf8') as outf:
    writer = csv.writer(outf)
    headers = ["ID", "Name", "URL", "Twitter Text", "Long Sentiment Keywords", "Short Sentiment Keywords",
               "Total Sentiment", "VADER Sentiment", "Polarity", "Aspect", "Mood", "Intensity"]
    writer.writerow(headers)
    printed_results = 0

    # One query per non-blank line of the input file.
    for line in inf:
        term = line.strip()
        if not term:
            # A blank line would otherwise be sent as an empty quoted query.
            continue
        query = '"' + term + '"'
        name = query.replace('"', '')
        rows_for_query = 0

        for document in _iter_documents(query):
            writer.writerow(_document_row(document, name))
            printed_results += 1
            rows_for_query += 1

            # Feedback
            if printed_results % PROGRESS_INTERVAL == 0:
                print(f"Results thus far: {printed_results}")

        # Report the running total after each query that produced rows,
        # unless the same number was just printed by the interval check.
        if rows_for_query and printed_results % PROGRESS_INTERVAL != 0:
            print(f"Results thus far: {printed_results}")

 duration = time.time() - start
 print(f"Finished in {duration} seconds")
	#!/usr/bin/env python

	"""
	This script returns documents from the BrightPlanet REST API. Input is a text file with a list of queries.
	Output is a CSV file with your desired fields for each document. Default time period is everything until present.
	Requires the 'requests' and 'vaderSentiment' modules. To install via cmd, enter: python -m pip install requests vaderSentiment
	"""

	import requests
	import csv
	from pprint import pprint
	import re
	import time
	from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

	# API variables
	# NOTE(review): 'FILEPATH' and 'APIKEY' look like placeholders to fill in before running -- confirm.
	# As written, query_file and output_file hold the same value.
	query_file = r'FILEPATH'  # text file read line by line; each line becomes one quoted query
	output_file = r'FILEPATH'  # CSV file opened for writing; receives one row per returned document
	api_key_value = 'APIKEY'  # sent as the 'api_key' request parameter
	data_feed = 'hoc'  # sent as the 'dataFeed' request parameter
	start = time.time()  # start timestamp; used for the final "Finished in ... seconds" message


	# Functions
	def clean_twitter_text(tweet):
	    """Extract the tweet body from an enriched-document string.

	    When *tweet* starts with ``text=`` and later contains ``userName=``, the
	    text between the two markers is returned (one newline directly before
	    ``userName=`` is dropped). Otherwise the whole input is returned. In both
	    cases newlines are replaced with spaces, so a tweet whose body spans
	    several lines is extracted instead of falling through to the
	    whole-document fallback.

	    Args:
	        tweet: Raw document text.

	    Returns:
	        The extracted tweet text, or the full input when the markers are not
	        found, with every newline converted to a space.
	    """
	    # DOTALL lets the body span lines; the lazy quantifier stops at the first
	    # 'userName=' marker so later fields are not swallowed.
	    match = re.match(r'text=(.+?)\n?userName=', tweet, flags=re.DOTALL)
	    body = match.group(1) if match else tweet
	    return body.replace('\n', ' ')


	def get_vadersentiment(short_text):
	    """Return VADER's compound sentiment score for *short_text*.

	    The analyzer is created on first use and cached on the function object,
	    so scoring many texts does not construct a new
	    SentimentIntensityAnalyzer for every call.

	    Args:
	        short_text: Text to score.

	    Returns:
	        The value stored under the 'compound' key of the analyzer's
	        polarity_scores() result.
	    """
	    analyzer = getattr(get_vadersentiment, '_analyzer', None)
	    if analyzer is None:
	        analyzer = SentimentIntensityAnalyzer()
	        get_vadersentiment._analyzer = analyzer
	    return analyzer.polarity_scores(short_text)['compound']


	def list_of_entity_type(facets_dict, entity_name):
	    """Return the entities stored under *entity_name* as a new list.

	    Args:
	        facets_dict: Mapping from entity-type name to an iterable of entities.
	        entity_name: Key to look up in facets_dict.

	    Returns:
	        A new list holding the items of facets_dict[entity_name] in
	        iteration order.

	    Raises:
	        KeyError: If entity_name is not a key of facets_dict.
	    """
	    return list(facets_dict[entity_name])


	def api_call(payload, timeout=60):
	    """Send a GET request to the BrightPlanet document export endpoint.

	    Args:
	        payload: Dict of query-string parameters for the request.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout, requests can block indefinitely on an unresponsive
	            server.

	    Returns:
	        The decoded JSON body of the response.

	    Raises:
	        requests.HTTPError: If the response status is a 4xx or 5xx code.
	        requests.Timeout: If the server does not answer within *timeout*.
	    """
	    response = requests.get(
	        "https://documentapi.brightplanet.com/documentapi/docs/export",
	        params=payload,
	        timeout=timeout,
	    )
	    response.raise_for_status()
	    return response.json()


	# Opens the query list and the output CSV, then writes one CSV row per returned document.

	# A quoted query with more hits than this is narrowed to person entities.
	PERSON_FILTER_THRESHOLD = 50
	# A progress line is printed each time this many more rows have been written.
	PROGRESS_INTERVAL = 50


	def _build_payload(query, cursor):
	    """Return the request parameters for one page of *query*."""
	    return {'api_key': api_key_value, 'dataFeed': data_feed, 'query': query, 'cursorMark': cursor}


	def _iter_documents(query):
	    """Yield every document returned for *query*, following the cursor page by page.

	    If the first response reports more than PERSON_FILTER_THRESHOLD hits, the
	    query is narrowed once to ``otherEntity_person:<query>`` and paging
	    continues on that narrowed query. Paging stops when the number of yielded
	    documents reaches the reported totalCount, or when a page comes back
	    empty, so the loop cannot spin forever and does not assume a page size.
	    """
	    results = api_call(_build_payload(query, None))
	    if results['totalCount'] > PERSON_FILTER_THRESHOLD:
	        # Decide once, up front: later pages must keep using the narrowed
	        # query together with the cursor that belongs to it.
	        query = f'otherEntity_person:{query}'
	        results = api_call(_build_payload(query, None))

	    total_count = results['totalCount']
	    fetched = 0
	    while total_count > 0:
	        documents = results['documents']
	        if not documents:
	            # Nothing more came back; stop instead of requesting again.
	            break
	        yield from documents
	        fetched += len(documents)
	        if fetched >= total_count:
	            break
	        results = api_call(_build_payload(query, results['nextCursorMark']))


	def _sentiment_keywords(document, entity_name):
	    """Return the document's facet entities for *entity_name*, or None when absent."""
	    try:
	        return list_of_entity_type(document['facets'], entity_name)
	    except KeyError:
	        return None


	def _document_row(document, name):
	    """Build the CSV row, in header order, for one API document."""
	    if document['finalUrlDomain'] == 'twitter.com':
	        text = clean_twitter_text(document['enrichedDoc'])
	        twitter_sentiment = get_vadersentiment(text)
	    else:
	        # Only twitter.com documents get a text column and a VADER score.
	        text = ""
	        twitter_sentiment = None

	    return [
	        document['docMasterId'],
	        name,
	        document['finalUrl'],
	        text,
	        _sentiment_keywords(document, 'otherEntity_hocPositiveLong'),
	        _sentiment_keywords(document, 'otherEntity_hocPositiveShort'),
	        document['sentimentMeasure'],
	        twitter_sentiment,
	        document['polarity'],
	        document['aspect'],
	        document['mood'],
	        document['intensity'],
	    ]


	with open(query_file, 'r') as inf, open(output_file, 'w', newline='', encoding='utf8') as outf:
	    writer = csv.writer(outf)
	    headers = ["ID", "Name", "URL", "Twitter Text", "Long Sentiment Keywords", "Short Sentiment Keywords",
	               "Total Sentiment", "VADER Sentiment", "Polarity", "Aspect", "Mood", "Intensity"]
	    writer.writerow(headers)
	    printed_results = 0

	    # One query per non-blank line of the input file.
	    for line in inf:
	        term = line.strip()
	        if not term:
	            # A blank line would otherwise be sent as an empty quoted query.
	            continue
	        query = '"' + term + '"'
	        name = query.replace('"', '')
	        rows_for_query = 0

	        for document in _iter_documents(query):
	            writer.writerow(_document_row(document, name))
	            printed_results += 1
	            rows_for_query += 1

	            # Feedback
	            if printed_results % PROGRESS_INTERVAL == 0:
	                print(f"Results thus far: {printed_results}")

	        # Report the running total after each query that produced rows,
	        # unless the same number was just printed by the interval check.
	        if rows_for_query and printed_results % PROGRESS_INTERVAL != 0:
	            print(f"Results thus far: {printed_results}")

	duration = time.time() - start
	print(f"Finished in {duration} seconds")