Inpirical-Coder · November 26, 2014 00:45
diff --git a/twttr_sentiment_bench.R b/twttr_sentiment_bench.R
 # Short scripts for testing three different sentiment classifiers on tweets,
 # acquiring the tweets used for testing,
 # calculating systems' precision, recall and F-measures.


 # Attach the packages the script depends on. library() stops with an error
 # when a package is missing, whereas require() only warns and returns FALSE,
 # which would let the script run on and fail later with a less obvious error.
 library(RCurl)     # For downloading file from a given URL.
 library(twitteR)   # Used for the 'twitter' class.
 library(sentiment) # For bayes and voter classifiers.
 source("sent140.R")  # Used for the Sentiment 140 API. Can be downloaded from here:
 # https://github.com/okugami79/sentiment140/blob/master/R/sentiment.r

 # Load the saved workspace file (expected to provide twit.cred) and register
 # those OAuth credentials with twitteR.
 load("twit_cred.Rdat")
 registerTwitterOAuth(twit.cred)


 GetSandersCorpus = function() {
   # Downloads the Sanders Twitter corpus archive, extracts corpus.csv from it
   # and saves the parsed table as `san.dat` in "data/san_dat.Rdat".
   #
   # The table has three character columns: "term", "clas" and "id".
   #
   # Returns:
   #   The value of save() (NULL, invisibly). The data frame is only written to
   #   disk; it is not assigned in the caller's workspace.
   zip.name = "sanders_twitter-0.2.zip"
   file.name = "sanders-twitter-0.2/corpus.csv"
   out.file = "data/san_dat.Rdat"

   # A zip archive is binary data, so request binary mode explicitly instead of
   # relying on the platform default.
   download.file(
     url = "http://www.sananalytics.com/lab/twitter-sentiment/sanders-twitter-0.2.zip",
     destfile = zip.name,
     mode = "wb"
   )

   unzip(zipfile = zip.name, files = c(file.name))

   # Read every column as text. Without colClasses, read.csv() converts the id
   # column to numeric first, and tweet ids can have more digits than a double
   # represents exactly, so as.character() would return altered ids.
   san.dat = read.csv(
     file = file.name,
     stringsAsFactors = FALSE,
     header = FALSE,
     colClasses = "character"
   )

   colnames(san.dat) = c("term", "clas", "id")

   san.dat$id = as.character(san.dat$id)

   # save() does not create missing directories.
   dir.create(dirname(out.file), showWarnings = FALSE, recursive = TRUE)

   save(san.dat, file = out.file)
 }


 not.found.message = "Error: Not Found"   # Message used to identify tweets no longer available.

 # GetSandersCorpus() only writes san.dat to disk, so bring it into the
 # workspace from that file unless it is already present.
 if (!exists("san.dat")) {
   load("data/san_dat.Rdat")
 }

 # Purge "irrelevant tweets"
 san.dat = san.dat[san.dat$clas != "irrelevant", ]


 FillSanTweets = function(delay = 15) {
   # Pulls tweets using the API based on the Twitter ids.
   #
   # For every id in san.dat$id that is not yet a name of the global list
   # san.tweets, calls showStatus() and stores the result under that id.
   # Ids whose request fails with not.found.message are stored as NA, so they
   # are skipped on later calls; any other failure is only reported, so a later
   # call tries that id again.
   #
   # Args:
   #   delay: Seconds to pause after each request. Defaults to 15.
   #          NOTE(review): the original comment cited a limit of 180 calls per
   #          hour, but a 15 second pause permits 240 calls per hour; confirm
   #          the actual limit and adjust.
   #
   # Returns:
   #   The sapply() result over the pending ids. The function is called for its
   #   side effect on san.tweets.
   sapply(setdiff(san.dat$id, names(san.tweets)), function(tid) {
     tryCatch({
       # [[<- stores whatever showStatus() returns as one list element.
       san.tweets[[tid]] <<- showStatus(tid)
       print(paste("Successfully added tweet no", tid))
     },
     error = function(e) {
       # Take the message from the condition that was actually caught.
       # geterrmessage() is not a reliable source inside a tryCatch() handler.
       # NOTE(review): assumes the message raised for a deleted tweet is exactly
       # not.found.message; confirm against the twitteR version in use.
       msg = conditionMessage(e)
       if (identical(msg, not.found.message)) {
         print(paste("Tweet no", tid, "longer available"))
         san.tweets[[tid]] <<- NA
       } else {
         print(paste("FAILED to fetch Tweet no.", tid))
       }
     })

     Sys.sleep(delay)   # Space out requests to stay within the API rate limit.
   })
 }


 # Create a new tweet list (empty).
 san.tweets = list()

 # Start populating the list by calling FillSanTweets.
 FillSanTweets()

 # NOTE(review): ScrubTweets() used below is not defined in the visible code;
 # presumably it comes from the file in this commented-out source() call.
 #source("twitter.R")


 # NOTE(review): with the read.csv() call commented out, `dat` must already
 # exist in the workspace before the following lines run.
 #dat = read.csv("data/full-corpus.csv", stringsAsFactors=FALSE)
 # Name the five corpus columns and keep the tweet id as text.
 colnames(dat) = c("term", "sent", "tid", "date", "txt")
 dat$tid = as.character(dat$tid)

 # "Purge" the irrelevant entries
 dat = dat[dat$sent!="irrelevant",]

 # Replace the tweet text with the output of ScrubTweets().
 dat$txt = ScrubTweets(dat$txt)

 # Append the output of the two classify_polarity() runs and the second column
 # of the sentiment() result to the corpus columns.
 dat = cbind(stringsAsFactors=FALSE,
  dat,
  classify_polarity(dat$txt, algorithm="voter"),
  classify_polarity(dat$txt, algorithm="bayes"),
  sentiment(dat$txt)[ , 2]
 )

 # Rename the appended columns by position. This assumes each
 # classify_polarity() call contributed exactly four columns (6-9 and 10-13)
 # and the sentiment() column landed in position 14 -- TODO confirm.
 colnames(dat)[6:9] = c("v.pos", "v.neg", "v.ratio", "v.best")
 colnames(dat)[10:13] = c("b.pos", "b.neg", "b.ratio", "b.best")
 colnames(dat)[14] = "sent140"

 # Convert the two ratio columns to numeric.
 dat$v.ratio = as.numeric(dat$v.ratio)
 dat$b.ratio = as.numeric(dat$b.ratio)


 Distribution = function() {
   # Percentage distribution of labels for the manual annotation and for each
   # of the three classifiers.
   #
   # Reads the global data frame `dat` (columns sent, v.best, b.best, sent140).
   #
   # Returns:
   #   When more than one label occurs, a matrix with one row per label and the
   #   columns "manual", "voter", "bayes" and "sent140". Each cell is the share
   #   of rows of dat carrying that label, in percent, rounded to two decimals.
   sources = list(
     "manual"  = dat$sent,
     "voter"   = dat$v.best,
     "bayes"   = dat$b.best,
     "sent140" = dat$sent140
   )

   # Tabulate every source over the same label set. Otherwise a source that
   # never emits some label yields a shorter table and sapply() returns a
   # ragged list instead of a matrix.
   all.labels = sort(unique(unlist(lapply(sources, as.character))))

   sapply(sources, function(x) {
     round(table(factor(x, levels = all.labels)) / nrow(dat) * 100, digits = 2)
   })
 }


 ConfusionMatrices = function() {
   # Calculate the confusion matrices for the three different algorithms we
   # are benchmarking.
   #
   # Reads the global data frame `dat`. In every matrix the rows are the manual
   # labels (dat$sent) and the columns are the labels a classifier assigned.
   #
   # Returns:
   #   A list with the elements "voter", "bayes" and "sent140", each a
   #   two-way contingency table.
   #
   # The tables are collected with list(): c() would flatten them into a single
   # vector and lose the matrix structure.
   list(
     "voter"   = table(dat$sent, dat$v.best),
     "bayes"   = table(dat$sent, dat$b.best),
     "sent140" = table(dat$sent, dat$sent140)
   )
 }


 PrecisionRate = function() {
   # Per-class precision of the three classifiers.
   #
   # For each label, looks at the rows of the global data frame `dat` that a
   # classifier assigned to that label and computes the fraction whose manual
   # label (dat$sent) is the same.
   #
   # Returns:
   #   A matrix with the rows "voter", "bayes", "sent140" and the columns
   #   "negative", "neutral", "positive".
   hit.share <- function(predicted, label) {
     # Manual labels of the rows this classifier put into `label`.
     manual <- dat$sent[predicted == label]
     mean(manual == label)
   }

   labels <- c("negative", "neutral", "positive")

   sapply(labels, function(label) {
     c(
       "voter" = hit.share(dat$v.best, label),
       "bayes" = hit.share(dat$b.best, label),
       "sent140" = hit.share(dat$sent140, label)
     )
   })
 }


 RecallRate = function() {
   # Per-class recall of the three classifiers.
   #
   # For each label, looks at the rows of the global data frame `dat` whose
   # manual label (dat$sent) is that label and computes the fraction to which a
   # classifier assigned the same label.
   #
   # Returns:
   #   A matrix with the rows "voter", "bayes", "sent140" and the columns
   #   "negative", "neutral", "positive".
   found.share <- function(predicted, label) {
     # Predictions for the rows manually labelled as `label`.
     guesses <- predicted[dat$sent == label]
     mean(guesses == label)
   }

   labels <- c("negative", "neutral", "positive")

   sapply(labels, function(label) {
     c(
       "voter" = found.share(dat$v.best, label),
       "bayes" = found.share(dat$b.best, label),
       "sent140" = found.share(dat$sent140, label)
     )
   })
 }


 FMeasures = function() {
   # Calculates the f-measures for every classifier / class combination.
   #
   # Returns:
   #   An object shaped like the results of PrecisionRate() and RecallRate(),
   #   holding the harmonic mean of precision and recall for each cell.
   #   Cells where precision or recall is itself undefined stay NaN.
   p = PrecisionRate()
   r = RecallRate()

   # Harmonic mean of precision and recall.
   f = 2 * p * r / (p + r)

   # When precision and recall are both 0 the formula is 0/0 (NaN); report the
   # F-measure as 0 in that case.
   both.zero = !is.na(p) & !is.na(r) & (p + r) == 0
   f[both.zero] = 0

   f
 }
	# Short scripts for testing three different sentiment classifiers on tweets,
	# acquiring the tweets used for testing,
	# calculating systems' precision, recall and F-measures.


	# Attach the packages the script depends on. library() stops with an error
	# when a package is missing, whereas require() only warns and returns FALSE,
	# which would let the script run on and fail later with a less obvious error.
	library(RCurl) # For downloading file from a given URL.
	library(twitteR) # Used for the 'twitter' class.
	library(sentiment) # For bayes and voter classifiers.
	source("sent140.R") # Used for the Sentiment 140 API. Can be downloaded from here:
	# https://github.com/okugami79/sentiment140/blob/master/R/sentiment.r

	# Load the saved workspace file (expected to provide twit.cred) and register
	# those OAuth credentials with twitteR.
	load("twit_cred.Rdat")
	registerTwitterOAuth(twit.cred)


	GetSandersCorpus = function() {
	  # Downloads the Sanders Twitter corpus archive, extracts corpus.csv from it
	  # and saves the parsed table as `san.dat` in "data/san_dat.Rdat".
	  #
	  # The table has three character columns: "term", "clas" and "id".
	  #
	  # Returns:
	  #   The value of save() (NULL, invisibly). The data frame is only written to
	  #   disk; it is not assigned in the caller's workspace.
	  zip.name = "sanders_twitter-0.2.zip"
	  file.name = "sanders-twitter-0.2/corpus.csv"
	  out.file = "data/san_dat.Rdat"

	  # A zip archive is binary data, so request binary mode explicitly instead of
	  # relying on the platform default.
	  download.file(
	    url = "http://www.sananalytics.com/lab/twitter-sentiment/sanders-twitter-0.2.zip",
	    destfile = zip.name,
	    mode = "wb"
	  )

	  unzip(zipfile = zip.name, files = c(file.name))

	  # Read every column as text. Without colClasses, read.csv() converts the id
	  # column to numeric first, and tweet ids can have more digits than a double
	  # represents exactly, so as.character() would return altered ids.
	  san.dat = read.csv(
	    file = file.name,
	    stringsAsFactors = FALSE,
	    header = FALSE,
	    colClasses = "character"
	  )

	  colnames(san.dat) = c("term", "clas", "id")

	  san.dat$id = as.character(san.dat$id)

	  # save() does not create missing directories.
	  dir.create(dirname(out.file), showWarnings = FALSE, recursive = TRUE)

	  save(san.dat, file = out.file)
	}


	not.found.message = "Error: Not Found" # Message used to identify tweets no longer available.

	# GetSandersCorpus() only writes san.dat to disk, so bring it into the
	# workspace from that file unless it is already present.
	if (!exists("san.dat")) {
	  load("data/san_dat.Rdat")
	}

	# Purge "irrelevant tweets"
	san.dat = san.dat[san.dat$clas != "irrelevant", ]


	FillSanTweets = function(delay = 15) {
	  # Pulls tweets using the API based on the Twitter ids.
	  #
	  # For every id in san.dat$id that is not yet a name of the global list
	  # san.tweets, calls showStatus() and stores the result under that id.
	  # Ids whose request fails with not.found.message are stored as NA, so they
	  # are skipped on later calls; any other failure is only reported, so a later
	  # call tries that id again.
	  #
	  # Args:
	  #   delay: Seconds to pause after each request. Defaults to 15.
	  #          NOTE(review): the original comment cited a limit of 180 calls per
	  #          hour, but a 15 second pause permits 240 calls per hour; confirm
	  #          the actual limit and adjust.
	  #
	  # Returns:
	  #   The sapply() result over the pending ids. The function is called for its
	  #   side effect on san.tweets.
	  sapply(setdiff(san.dat$id, names(san.tweets)), function(tid) {
	    tryCatch({
	      # [[<- stores whatever showStatus() returns as one list element.
	      san.tweets[[tid]] <<- showStatus(tid)
	      print(paste("Successfully added tweet no", tid))
	    },
	    error = function(e) {
	      # Take the message from the condition that was actually caught.
	      # geterrmessage() is not a reliable source inside a tryCatch() handler.
	      # NOTE(review): assumes the message raised for a deleted tweet is exactly
	      # not.found.message; confirm against the twitteR version in use.
	      msg = conditionMessage(e)
	      if (identical(msg, not.found.message)) {
	        print(paste("Tweet no", tid, "longer available"))
	        san.tweets[[tid]] <<- NA
	      } else {
	        print(paste("FAILED to fetch Tweet no.", tid))
	      }
	    })

	    Sys.sleep(delay) # Space out requests to stay within the API rate limit.
	  })
	}


	# Create a new tweet list (empty).
	san.tweets = list()

	# Start populating the list by calling FillSanTweets.
	FillSanTweets()

	# NOTE(review): ScrubTweets() used below is not defined in the visible code;
	# presumably it comes from the file in this commented-out source() call.
	#source("twitter.R")


	# NOTE(review): with the read.csv() call commented out, `dat` must already
	# exist in the workspace before the following lines run.
	#dat = read.csv("data/full-corpus.csv", stringsAsFactors=FALSE)
	# Name the five corpus columns and keep the tweet id as text.
	colnames(dat) = c("term", "sent", "tid", "date", "txt")
	dat$tid = as.character(dat$tid)

	# "Purge" the irrelevant entries
	dat = dat[dat$sent!="irrelevant",]

	# Replace the tweet text with the output of ScrubTweets().
	dat$txt = ScrubTweets(dat$txt)

	# Append the output of the two classify_polarity() runs and the second column
	# of the sentiment() result to the corpus columns.
	dat = cbind(stringsAsFactors=FALSE,
	dat,
	classify_polarity(dat$txt, algorithm="voter"),
	classify_polarity(dat$txt, algorithm="bayes"),
	sentiment(dat$txt)[ , 2]
	)

	# Rename the appended columns by position. This assumes each
	# classify_polarity() call contributed exactly four columns (6-9 and 10-13)
	# and the sentiment() column landed in position 14 -- TODO confirm.
	colnames(dat)[6:9] = c("v.pos", "v.neg", "v.ratio", "v.best")
	colnames(dat)[10:13] = c("b.pos", "b.neg", "b.ratio", "b.best")
	colnames(dat)[14] = "sent140"

	# Convert the two ratio columns to numeric.
	dat$v.ratio = as.numeric(dat$v.ratio)
	dat$b.ratio = as.numeric(dat$b.ratio)


	Distribution = function() {
	  # Percentage distribution of labels for the manual annotation and for each
	  # of the three classifiers.
	  #
	  # Reads the global data frame `dat` (columns sent, v.best, b.best, sent140).
	  #
	  # Returns:
	  #   When more than one label occurs, a matrix with one row per label and the
	  #   columns "manual", "voter", "bayes" and "sent140". Each cell is the share
	  #   of rows of dat carrying that label, in percent, rounded to two decimals.
	  sources = list(
	    "manual" = dat$sent,
	    "voter" = dat$v.best,
	    "bayes" = dat$b.best,
	    "sent140" = dat$sent140
	  )

	  # Tabulate every source over the same label set. Otherwise a source that
	  # never emits some label yields a shorter table and sapply() returns a
	  # ragged list instead of a matrix.
	  all.labels = sort(unique(unlist(lapply(sources, as.character))))

	  sapply(sources, function(x) {
	    round(table(factor(x, levels = all.labels)) / nrow(dat) * 100, digits = 2)
	  })
	}


	ConfusionMatrices = function() {
	  # Calculate the confusion matrices for the three different algorithms we
	  # are benchmarking.
	  #
	  # Reads the global data frame `dat`. In every matrix the rows are the manual
	  # labels (dat$sent) and the columns are the labels a classifier assigned.
	  #
	  # Returns:
	  #   A list with the elements "voter", "bayes" and "sent140", each a
	  #   two-way contingency table.
	  #
	  # The tables are collected with list(): c() would flatten them into a single
	  # vector and lose the matrix structure.
	  list(
	    "voter" = table(dat$sent, dat$v.best),
	    "bayes" = table(dat$sent, dat$b.best),
	    "sent140" = table(dat$sent, dat$sent140)
	  )
	}


	PrecisionRate = function() {
	  # Per-class precision of the three classifiers.
	  #
	  # For each label, looks at the rows of the global data frame `dat` that a
	  # classifier assigned to that label and computes the fraction whose manual
	  # label (dat$sent) is the same.
	  #
	  # Returns:
	  #   A matrix with the rows "voter", "bayes", "sent140" and the columns
	  #   "negative", "neutral", "positive".
	  hit.share <- function(predicted, label) {
	    # Manual labels of the rows this classifier put into `label`.
	    manual <- dat$sent[predicted == label]
	    mean(manual == label)
	  }

	  labels <- c("negative", "neutral", "positive")

	  sapply(labels, function(label) {
	    c(
	      "voter" = hit.share(dat$v.best, label),
	      "bayes" = hit.share(dat$b.best, label),
	      "sent140" = hit.share(dat$sent140, label)
	    )
	  })
	}


	RecallRate = function() {
	  # Per-class recall of the three classifiers.
	  #
	  # For each label, looks at the rows of the global data frame `dat` whose
	  # manual label (dat$sent) is that label and computes the fraction to which a
	  # classifier assigned the same label.
	  #
	  # Returns:
	  #   A matrix with the rows "voter", "bayes", "sent140" and the columns
	  #   "negative", "neutral", "positive".
	  found.share <- function(predicted, label) {
	    # Predictions for the rows manually labelled as `label`.
	    guesses <- predicted[dat$sent == label]
	    mean(guesses == label)
	  }

	  labels <- c("negative", "neutral", "positive")

	  sapply(labels, function(label) {
	    c(
	      "voter" = found.share(dat$v.best, label),
	      "bayes" = found.share(dat$b.best, label),
	      "sent140" = found.share(dat$sent140, label)
	    )
	  })
	}


	FMeasures = function() {
	  # Calculates the f-measures for every classifier / class combination.
	  #
	  # Returns:
	  #   An object shaped like the results of PrecisionRate() and RecallRate(),
	  #   holding the harmonic mean of precision and recall for each cell.
	  #   Cells where precision or recall is itself undefined stay NaN.
	  p = PrecisionRate()
	  r = RecallRate()

	  # Harmonic mean of precision and recall.
	  f = 2 * p * r / (p + r)

	  # When precision and recall are both 0 the formula is 0/0 (NaN); report the
	  # F-measure as 0 in that case.
	  both.zero = !is.na(p) & !is.na(r) & (p + r) == 0
	  f[both.zero] = 0

	  f
	}