Simple script to download, scrub and classify tweets by polarity and emotion using a naive Bayes classifier
# Simple script for doing some data analysis of tweets, looking at
# "sentiment" and "emotion" using the sentiment package. See
# https://sites.google.com/site/miningtwitter/questions/sentiment/sentiment
# for background.
# SETTINGS
# =============================================================================
authenticated = TRUE # If TRUE will load credential from file.
tweets.from.file = TRUE # If TRUE will load tweets from file rather than query.
no.tweets = 1500 # Number of tweets to fetch in every search; <= 1,500.
# Define the list of terms to query Twitter about.
tweet.terms = c(
  "deutschebank",
  "goldmansachs",
  "jpmorgan",
  "ubs",
  "creditsuisse",
  "wellsfargo",
  "hsbc",
  "pimco",
  "moodysratings",
  "fitchratings",
  "aiginsurance",
  "fanniemae"
)
language = "en" # Define the language you want tweets in.
# DEPENDENCIES (Packages and source files)
# =============================================================================
# Load your Twitter API keys; needed for authentication. Must define two variables:
# "consumer.key" and "consumer.secret".
source("twitter_api_keys.R")
InstallArchives = function() {
  # Installs the "sentiment" package and its "Rstem" dependency.
  # Neither package is on current CRAN, so download both from the archives.
  # URL of the CRAN archive repo.
  repo.url = "http://cran.r-project.org/src/contrib/Archive/"
  # URL tails of the packages we want to install (Rstem first, since
  # sentiment depends on it).
  pack.urls = c(
    "Rstem/Rstem_0.4-1.tar.gz",
    "sentiment/sentiment_0.2.tar.gz"
  )
  # Install the packages from source.
  lapply(pack.urls, function(pack.url) {
    install.packages(paste0(repo.url, pack.url), repos=NULL, type="source")
  })
}
# Install the Rstem and sentiment packages if not already installed.
if(!("sentiment" %in% installed.packages()[ , "Package"])) {InstallArchives()}
# Define the dependency packages we need.
required.packs = c(
  "twitteR",      # Twitter API client.
  "ROAuth",       # OAuth authentication (provides OAuthFactory).
  "sentiment",    # Sentiment analysis.
  "tm",           # Text mining.
  "plyr",         # Splitting, applying and combining data.
  "ggplot2",      # Plotting.
  "wordcloud",    # Create wordclouds.
  "data.table",   # Data tables.
  "RColorBrewer"  # Palettes for visualisation.
)
# Install the required packages if missing, then load them.
sapply(required.packs, function(pack) {
  if(!(pack %in% installed.packages()[ , "Package"])) {install.packages(pack)}
  require(pack, character.only=TRUE)
})
print("Dependencies met [OK]")
# AUTHENTICATE
# =============================================================================
TwitterAuth = function() {
  # Authenticates with the Twitter API and saves the credential to file.
  # URLs needed for authentication.
  request.url = "https://api.twitter.com/oauth/request_token"
  access.url  = "https://api.twitter.com/oauth/access_token"
  auth.url    = "https://api.twitter.com/oauth/authorize"
  # Create a Twitter OAuth credential and perform the handshake.
  twit.cred = OAuthFactory$new(
    consumerKey    = consumer.key,
    consumerSecret = consumer.secret,
    requestURL     = request.url,
    accessURL      = access.url,
    authURL        = auth.url
  )
  twit.cred$handshake()
  save(twit.cred, file="twit_cred.Rdat")
  twit.cred
}
# If you have already authenticated before, just load the saved credential.
if(authenticated) {
  load("twit_cred.Rdat")
} else {
  twit.cred = TwitterAuth()
}
registerTwitterOAuth(twit.cred)
print("Authenticated with Twitter for use of API [OK]")
# HARVEST TWEETS
# =============================================================================
HarvestTweets = function(tweet.terms) {
  # Fetches up to "no.tweets" tweets (in the language set above) per term.
  # Arguments: "tweet.terms" the terms to search for (character vector)
  # Returns: a two-column character matrix: "txt" (tweet text), "term"
  tweets = lapply(tweet.terms, function(i) {
    print(paste("Getting tweets for", i))
    x = tryCatch(searchTwitter(i, n=no.tweets, lang=language),
                 error=function(e) NULL)
    if(length(x) == 0) return(NULL) # failed or empty search
    cbind(txt = sapply(x, function(s) s$getText()), term = i)
  })
  # Bind all the tweets into one character matrix and purge duplicates.
  unique(Reduce(rbind, Filter(Negate(is.null), tweets)))
}
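# Illustration of the result shape (values are made up):
#   txt                                term
#   "ubs shares rally after ..."       "ubs"
#   "hsbc fined over ..."              "hsbc"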
# If tweets have already been saved, just load them from file.
if(tweets.from.file) {
  load("data/tweets.Rdat")
  print("Tweets loaded from file [OK]")
} else {
  tweets = HarvestTweets(tweet.terms)
  save(tweets, file="data/tweets.Rdat")
  print("Tweets harvested and saved [OK]")
}
# SCRUB TWEETS
# =============================================================================
ScrubTweets = function(txt) {
  # Scrubs tweets for NLP analysis.
  # Arguments: "txt" the texts of the tweets (character vector)
  # Returns: the scrubbed tweet texts (character vector)
  x = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", txt) # purge re-tweet markers
  x = gsub("@\\w+", "", x)                         # purge @mentions
  x = gsub("http[^[:space:]]+", "", x)             # purge http(s) links
  x = tolower(x)                                   # make lower case
  x = removeNumbers(x)                             # tm: strip digits
  x = removePunctuation(x)                         # tm: strip punctuation
  stripWhitespace(x)                               # tm: collapse whitespace
}
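# A quick illustration (made-up tweet; output is approximate):
#   ScrubTweets("RT @bank: Profits up 20% http://t.co/abc")
#   # -> " profits up "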
tweets[ , "txt"] = ScrubTweets(tweets[ , "txt"])
print("Tweets scrubbed [OK]")
# CLASSIFY TEXT BASED ON EMOTION AND POLARITY
# =============================================================================
ClassifyEmoPol = function(txt) {
  # Classifies a character vector both in terms of emotion categories and
  # also polarity, using the sentiment package's naive Bayes classifiers.
  # Arguments: "txt" a vector with the texts to classify (character)
  # Returns: a two-column character matrix: emotion, polarity
  cbind(
    emotion  = classify_emotion(txt, algorithm="bayes", prior=1.0)[ , "BEST_FIT"],
    polarity = classify_polarity(txt, algorithm="bayes")[ , "BEST_FIT"]
  )
}
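# Note: classify_emotion() returns NA in BEST_FIT when no emotion category
# wins clearly; downstream code may want to replace those NAs, e.g.:
#   emo = tweets[ , "emotion"]; emo[is.na(emo)] = "unknown"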
# Column-bind classifications of the tweet texts to the tweets matrix.
tweets = cbind(tweets, ClassifyEmoPol(tweets[ , "txt"]))
print("Tweets classified for emotion and polarity [OK]")