Skip to content

Instantly share code, notes, and snippets.

@Inpirical-Coder
Created November 26, 2014 00:45
Show Gist options
  • Save Inpirical-Coder/3a529e47f677248d2862 to your computer and use it in GitHub Desktop.
Save Inpirical-Coder/3a529e47f677248d2862 to your computer and use it in GitHub Desktop.
Benchmarking sentiment scoring algorithms for twitter using precision, recall, F-measure
# Short scripts for testing three different sentiment classifiers on tweets,
# acquiring the tweets used for testing,
# calculating systems' precision, recall and F-measures.
require(RCurl) # For downloading file from a given URL.
require(twitteR) # Used for the 'twitter' class.
require(sentiment) # For bayes and voter classifiers.
source("sent140.R") # Used for the Sentiment 140 API. Can be downloaded from here:
# https://github.com/okugami79/sentiment140/blob/master/R/sentiment.r
# Restore previously saved objects from twit_cred.Rdat and register the
# credentials with registerTwitterOAuth().
# NOTE(review): assumes the .Rdat file provides an object named `twit.cred`;
# it is not created anywhere else in this script — confirm.
load("twit_cred.Rdat")
registerTwitterOAuth(twit.cred)
GetSandersCorpus <- function() {
  # Download the Sanders Twitter corpus archive, extract its index file
  # (corpus.csv), parse it and save the result to "data/san_dat.Rdat".
  #
  # Takes no arguments.
  # Side effects: writes the zip archive and the extracted CSV below the
  # working directory, creates the "data" directory when it is missing, and
  # saves an object named `san.dat` (columns: term, clas, id) into it.
  # Returns: the parsed data frame, invisibly.
  zip.name <- "sanders_twitter-0.2.zip"
  file.name <- "sanders-twitter-0.2/corpus.csv"
  out.file <- file.path("data", "san_dat.Rdat")

  download.file(
    url = "http://www.sananalytics.com/lab/twitter-sentiment/sanders-twitter-0.2.zip",
    destfile = zip.name,
    mode = "wb" # the archive is binary; never let it be written in text mode
  )
  unzip(zipfile = zip.name, files = c(file.name))

  # Read every column as character. If read.csv were left to guess the types,
  # the long tweet ids would be parsed as doubles, and converting them back
  # with as.character() afterwards cannot restore digits lost in that step.
  san.dat <- read.csv(
    file = file.name,
    header = FALSE,
    colClasses = "character",
    stringsAsFactors = FALSE
  )
  colnames(san.dat) <- c("term", "clas", "id")

  # save() does not create missing directories, so make sure the target exists.
  dir.create(dirname(out.file), showWarnings = FALSE, recursive = TRUE)
  save(san.dat, file = out.file)
  invisible(san.dat)
}
not.found.message <- "Error: Not Found" # Message used to identify tweets that are no longer available.
# GetSandersCorpus() only writes `san.dat` to disk, so unless the object is
# already in the workspace, load the saved copy (fetching the corpus first
# when it has not been downloaded yet).
if (!exists("san.dat")) {
  if (!file.exists("data/san_dat.Rdat")) {
    GetSandersCorpus()
  }
  load("data/san_dat.Rdat")
}
# Purge "irrelevant" tweets
san.dat <- san.dat[san.dat$clas != "irrelevant", ]
FillSanTweets <- function() {
  # Pulls tweets using the API based on the Twitter ids.
  #
  # For every id in the global `san.dat$id` that is not yet a name of the
  # global list `san.tweets`, requests the status with showStatus() and stores
  # the result in `san.tweets` under that id. When the request fails with the
  # message held in the global `not.found.message`, NA is stored instead so the
  # id is not requested again; any other failure leaves the id unfilled so a
  # later call retries it.
  #
  # Takes no arguments. Returns NULL invisibly; called for its side effects.
  pending <- setdiff(san.dat$id, names(san.tweets))
  for (tid in pending) {
    tryCatch({
      # `[[<-` stores the returned status as a single list element.
      san.tweets[[tid]] <<- showStatus(tid)
      print(paste("Successfully added tweet no", tid))
    },
    error = function(e) {
      # Inspect the condition that was actually caught: geterrmessage() is not
      # updated for errors handled by tryCatch, so it would return a stale
      # message from some earlier error.
      msg <- conditionMessage(e)
      if (grepl(not.found.message, msg, fixed = TRUE)) {
        print(paste("Tweet no", tid, "longer available"))
        san.tweets[[tid]] <<- NA
      } else {
        print(paste("FAILED to fetch Tweet no.", tid))
      }
    })
    # Delay the next request to stay within the API rate limit.
    # NOTE(review): a 15 s pause allows up to 240 requests per hour, more than
    # the 180 per hour the original comment mentions — confirm the real limit.
    Sys.sleep(15)
  }
  invisible(NULL)
}
# Begin with an empty list that will hold the fetched tweets.
san.tweets <- list()
# Fetch every corpus tweet that is not in the list yet.
FillSanTweets()
# Score the corpus text with the three classifiers and attach the results to
# `dat` as extra columns.
# NOTE(review): the two lines that would source twitter.R and read `dat` are
# commented out, so `dat` (and ScrubTweets(), which this file does not define)
# must already exist in the workspace when this section runs.
#source("twitter.R")
#dat = read.csv("data/full-corpus.csv", stringsAsFactors=FALSE)
colnames(dat) = c("term", "sent", "tid", "date", "txt")
# Keep the tweet ids as strings.
dat$tid = as.character(dat$tid)
# "Purge" the irrelevant entries
dat = dat[dat$sent!="irrelevant",]
# Clean the tweet text before classification (ScrubTweets is defined outside
# this file — presumably in twitter.R; TODO confirm).
dat$txt = ScrubTweets(dat$txt)
# Bind the classifier outputs onto the five existing columns. The renames
# below assume each classify_polarity() call contributes four columns (6-9 for
# "voter", 10-13 for "bayes") and that the second column of sentiment()'s
# result lands in position 14.
dat = cbind(stringsAsFactors=FALSE,
dat,
classify_polarity(dat$txt, algorithm="voter"),
classify_polarity(dat$txt, algorithm="bayes"),
sentiment(dat$txt)[ , 2]
)
# v.* = voter classifier, b.* = bayes classifier, sent140 = Sentiment 140 API.
colnames(dat)[6:9] = c("v.pos", "v.neg", "v.ratio", "v.best")
colnames(dat)[10:13] = c("b.pos", "b.neg", "b.ratio", "b.best")
colnames(dat)[14] = "sent140"
# Convert the ratio columns to numeric.
dat$v.ratio = as.numeric(dat$v.ratio)
dat$b.ratio = as.numeric(dat$b.ratio)
Distribution <- function() {
  # Percentage of tweets falling into each sentiment category, computed
  # separately for the manual labels and for each of the three classifiers.
  # Reads the global data frame `dat`; percentages are relative to nrow(dat)
  # and rounded to two decimals.
  label.sets <- list(
    "manual" = dat$sent,
    "voter" = dat$v.best,
    "bayes" = dat$b.best,
    "sent140" = dat$sent140
  )
  total <- nrow(dat)
  PercentShare <- function(x) {
    counts <- table(x)
    round(counts / total * 100, digits = 2)
  }
  sapply(label.sets, PercentShare)
}
ConfusionMatrices <- function() {
  # Calculate the confusion matrices for the three different algorithms we
  # are benchmarking.
  #
  # Reads the global data frame `dat`. Returns a named list ("voter", "bayes",
  # "sent140") of two-way tables: rows are the manual labels (dat$sent),
  # columns are the labels assigned by the respective classifier.
  #
  # The tables are collected with list() rather than c(): c() drops the dim
  # attribute and would flatten all three tables into one long named vector.
  list(
    "voter" = table(actual = dat$sent, predicted = dat$v.best),
    "bayes" = table(actual = dat$sent, predicted = dat$b.best),
    "sent140" = table(actual = dat$sent, predicted = dat$sent140)
  )
}
PrecisionRate <- function() {
  # Precision of each of the three classifiers for each sentiment category:
  # among the tweets a classifier assigned to a category, the share whose
  # manual label (dat$sent) is that same category.
  # Reads the global data frame `dat`. Returns a matrix with one row per
  # classifier and one column per category.
  categories <- c("negative", "neutral", "positive")
  HitRate <- function(predicted, clas) {
    assigned <- predicted == clas
    mean(dat$sent[assigned] == clas)
  }
  sapply(categories, function(clas) {
    c(
      "voter" = HitRate(dat$v.best, clas),
      "bayes" = HitRate(dat$b.best, clas),
      "sent140" = HitRate(dat$sent140, clas)
    )
  })
}
RecallRate <- function() {
  # Recall of each of the three classifiers for each sentiment category:
  # among the tweets whose manual label (dat$sent) is a category, the share
  # the classifier also assigned to that category.
  # Reads the global data frame `dat`. Returns a matrix with one row per
  # classifier and one column per category.
  categories <- c("negative", "neutral", "positive")
  sapply(categories, function(bin) {
    actual <- dat$sent == bin
    c(
      "voter" = mean(dat$v.best[actual] == bin),
      "bayes" = mean(dat$b.best[actual] == bin),
      "sent140" = mean(dat$sent140[actual] == bin)
    )
  })
}
FMeasures <- function() {
  # F-measure for every classifier / class combination: the harmonic mean of
  # the values returned by PrecisionRate() and RecallRate(), element by element.
  precision <- PrecisionRate()
  recall <- RecallRate()
  numerator <- 2 * precision * recall
  numerator / (precision + recall)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment