statsmaths · July 29, 2015 18:46
diff --git a/gistfile1.txt b/gistfile1.txt
 #' Title: Exploratory data analysis - Agha Shahid Ali
 #' Author: Taylor Arnold (taylor.arnold@acm.org)
 #' Created: 2015-07-29 18:37
 #' Description: Basic exploratory data analysis of the
 #'              poems of Agha Shahid Ali.

 library(syuzhet)

 # Read in the lines and clean some basic characters.
 # scan() with sep="\n" returns one character element per line of the file.
 poems <- scan("~/Desktop/CMl.txt", what="character", sep="\n")
 # Strip double quotes and both parentheses in a single pass; inside a
 # bracket expression these characters are matched literally.
 poems <- gsub("[\"()]", "", poems)

 # For now, we'll just take the first 100 lines as the
 # input has some encoding issues (to be fixed later).
 # head() is used rather than poems[1:100] so that an input with fewer
 # than 100 lines is not padded with NA entries.
 poems <- head(poems, 100)

 # Score every line: how positive or negative is it?
 out <- get_sentiment(poems)
 # Frequency of each distinct score, then the lines scoring exactly -5
 # (NOTE(review): -5 is hard-coded; presumably read off the table above).
 table(out)
 poems[out == -5]

 # Next, the per-category NRC scores (one column per category);
 # list the lines that register in selected categories.
 out <- get_nrc_sentiment(poems)
 poems[out[["anticipation"]] > 0]
 poems[out[["fear"]] > 0]
 poems[(out[["anticipation"]] > 0) & (out[["sadness"]] > 0)]

 # A simple plot of the sentiment of each line
 # (did not turn out to be very helpful, but
 #  could be later)
 #
 # One colour per column of `out`, so the palette always matches the
 # number of categories. A fixed rainbow(8) yields NA colours for any
 # column past the eighth, and those series would not be drawn.
 pal <- rainbow(ncol(out))
 line_idx <- seq_len(nrow(out))
 # Empty canvas sized to the data; type="n" sets up the axes only.
 plot(0, 0, type="n", xlim=c(1, nrow(out)), ylim=c(0, max(out)))
 # seq_len() keeps the loop from running over c(1, 0) if `out` has no columns.
 for (i in seq_len(ncol(out))) {
  lines(line_idx, out[, i], col=pal[i], lwd=2)
 }
 # Anchor the legend to a corner instead of fixed data coordinates, so it
 # stays inside the plot whatever the number of lines or the score range.
 legend("topright", legend=colnames(out), col=pal, pch=19, cex=1)


 # An alternative method -
 # Analysis using coreNLP; package needs to be downloaded via:
 #
 #   install.packages("coreNLP")
 #   coreNLP::downloadCoreNLP()

 # Attach coreNLP and run its initialisation step before annotating.
 library(coreNLP)
 coreNLP::initCoreNLP()

 # Annotate the poem lines, extract the token-level results, and map each
 # token's POS tag through universalTagset().
 anno <- coreNLP::annotateString(poems)
 tok <- coreNLP::getToken(anno)
 ut <- coreNLP::universalTagset(tok[["POS"]])

 # Show the most frequent lemmas (normalized words) by
 # part of speech.
 # The column is spelled out as `lemma` instead of relying on `$` partial
 # matching of `lemm`, and head() replaces [1:n] so that a category with
 # fewer distinct lemmas than requested is not padded with NA entries.
 head(sort(table(tok$lemma[ut == "NOUN"]), decreasing=TRUE), 24)
 head(sort(table(tok$lemma[ut == "VERB"]), decreasing=TRUE), 24)
 head(sort(table(tok$lemma[ut == "ADJ"]), decreasing=TRUE), 5)
	#' Title: Exploratory data analysis - Agha Shahid Ali
	#' Author: Taylor Arnold (taylor.arnold@acm.org)
	#' Created: 2015-07-29 18:37
	#' Description: Basic exploratory data analysis of the
	#' poems of Agha Shahid Ali.

	library(syuzhet)

	# Read in the lines and clean some basic characters.
	# scan() with sep="\n" returns one character element per line of the file.
	poems <- scan("~/Desktop/CMl.txt", what="character", sep="\n")
	# Strip double quotes and both parentheses in a single pass; inside a
	# bracket expression these characters are matched literally.
	poems <- gsub("[\"()]", "", poems)

	# For now, we'll just take the first 100 lines as the
	# input has some encoding issues (to be fixed later).
	# head() is used rather than poems[1:100] so that an input with fewer
	# than 100 lines is not padded with NA entries.
	poems <- head(poems, 100)

	# Score every line: how positive or negative is it?
	out <- get_sentiment(poems)
	# Frequency of each distinct score, then the lines scoring exactly -5
	# (NOTE(review): -5 is hard-coded; presumably read off the table above).
	table(out)
	poems[out == -5]

	# Next, the per-category NRC scores (one column per category);
	# list the lines that register in selected categories.
	out <- get_nrc_sentiment(poems)
	poems[out[["anticipation"]] > 0]
	poems[out[["fear"]] > 0]
	poems[(out[["anticipation"]] > 0) & (out[["sadness"]] > 0)]

	# A simple plot of the sentiment of each line
	# (did not turn out to be very helpful, but
	# could be later)
	#
	# One colour per column of `out`, so the palette always matches the
	# number of categories. A fixed rainbow(8) yields NA colours for any
	# column past the eighth, and those series would not be drawn.
	pal <- rainbow(ncol(out))
	line_idx <- seq_len(nrow(out))
	# Empty canvas sized to the data; type="n" sets up the axes only.
	plot(0, 0, type="n", xlim=c(1, nrow(out)), ylim=c(0, max(out)))
	# seq_len() keeps the loop from running over c(1, 0) if `out` has no columns.
	for (i in seq_len(ncol(out))) {
	  lines(line_idx, out[, i], col=pal[i], lwd=2)
	}
	# Anchor the legend to a corner instead of fixed data coordinates, so it
	# stays inside the plot whatever the number of lines or the score range.
	legend("topright", legend=colnames(out), col=pal, pch=19, cex=1)


	# An alternative method -
	# Analysis using coreNLP; package needs to be downloaded via:
	#
	# install.packages("coreNLP")
	# coreNLP::downloadCoreNLP()

	# Attach coreNLP and run its initialisation step before annotating.
	library(coreNLP)
	coreNLP::initCoreNLP()

	# Annotate the poem lines, extract the token-level results, and map each
	# token's POS tag through universalTagset().
	anno <- coreNLP::annotateString(poems)
	tok <- coreNLP::getToken(anno)
	ut <- coreNLP::universalTagset(tok[["POS"]])

	# Show the most frequent lemmas (normalized words) by
	# part of speech.
	# The column is spelled out as `lemma` instead of relying on `$` partial
	# matching of `lemm`, and head() replaces [1:n] so that a category with
	# fewer distinct lemmas than requested is not padded with NA entries.
	head(sort(table(tok$lemma[ut == "NOUN"]), decreasing=TRUE), 24)
	head(sort(table(tok$lemma[ut == "VERB"]), decreasing=TRUE), 24)
	head(sort(table(tok$lemma[ut == "ADJ"]), decreasing=TRUE), 5)