LittleOrangeC · December 10, 2015 01:28 · LittleOrangeC · Dec 22, 2012
diff --git a/tm_example.R b/tm_example.R
 #
 ## 12/21/12 - This is my fork of DSparks' Denver debate analysis script
 #
 ###########
 # Requirement - run this command to create the "denver.txt" file
 # curl https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt > denver.txt
 # From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html

 # ---- Setup ----
 # The script used to start with rm(list = ls()), which wiped the caller's
 # workspace whenever the file was sourced; that call has been removed.
 doInstall <- TRUE  # Change to FALSE if you don't want packages installed.
 # stemDocument() in tm relies on SnowballC. The "Snowball" package listed
 # here originally appears to be archived on CRAN - TODO confirm if you pin
 # an old tm release.
 toInstall <- c("zoo", "tm", "ggplot2", "SnowballC")
 if (doInstall) {
   # Install only what is missing instead of reinstalling on every run.
   notInstalled <- setdiff(toInstall, rownames(installed.packages()))
   if (length(notInstalled) > 0) {
     install.packages(notInstalled, repos = "http://cran.r-project.org")
   }
 }
 invisible(lapply(toInstall, library, character.only = TRUE))

 # ---- Read the transcript ----
 # From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html
 #Transcript <- readLines("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")
 transcriptFile <- "denver.txt"
 if (!file.exists(transcriptFile)) {
   stop("Cannot find '", transcriptFile, "'; create it with the curl ",
        "command given in the header of this script.", call. = FALSE)
 }
 # Passing the path lets readLines() open and close its own connection, so
 # no connection is leaked. It also avoids the non-blocking mode used before,
 # where an unterminated final line is silently pushed back (i.e. lost).
 Transcript <- readLines(transcriptFile)
 head(Transcript, 20)

 # ---- Attribute every line to a speaker ----
 # Speaker codes: 1 = LEHRER, 2 = OBAMA, 3 = ROMNEY. A line is tagged when it
 # contains the label anywhere; later assignments win if several match.
 Transcript <- data.frame(Words = Transcript, Speaker = NA,
                          stringsAsFactors = FALSE)
 Transcript$Speaker[grepl("LEHRER: ", Transcript$Words, fixed = TRUE)] <- 1
 Transcript$Speaker[grepl("OBAMA: ", Transcript$Words, fixed = TRUE)] <- 2
 Transcript$Speaker[grepl("ROMNEY: ", Transcript$Words, fixed = TRUE)] <- 3
 table(Transcript$Speaker)
 # Carry the latest label forward onto unlabelled continuation lines.
 # na.rm = FALSE keeps the result as long as the data frame even when the
 # file starts with unlabelled lines (the default would drop them and make
 # the assignment fail with a length mismatch).
 Transcript$Speaker <- na.locf(Transcript$Speaker, na.rm = FALSE)

 # Remove moderator (and any leading lines that have no speaker at all):
 keepRow <- !is.na(Transcript$Speaker) & Transcript$Speaker != 1
 Transcript <- Transcript[keepRow, ]

 # ---- Build and clean the corpus ----
 # One document per remaining transcript line, in row order, so document i
 # of the corpus corresponds to row i of Transcript. VectorSource on the text
 # column is used because the column layout DataframeSource expects has
 # changed between tm releases.
 myCorpus <- VCorpus(VectorSource(Transcript$Words))
 inspect(myCorpus)

 # Base functions such as tolower must be wrapped in content_transformer()
 # so that tm_map() keeps returning proper text documents.
 myCorpus <- tm_map(myCorpus, content_transformer(tolower))  # Make lowercase
 myCorpus <- tm_map(myCorpus, removePunctuation,
                    preserve_intra_word_dashes = FALSE)
 myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))  # Stopwords
 myCorpus <- tm_map(myCorpus, removeWords, c("lehrer", "obama", "romney"))
 myCorpus <- tm_map(myCorpus, stemDocument)  # Stem words

 inspect(myCorpus)

 # ---- Term counts ----
 # as.matrix() yields the full dense documents-by-terms count matrix.
 # inspect() is a display helper; recent tm releases appear to return only
 # a small sample from it, so it must not be used as a converter.
 docTermMatrix <- as.matrix(DocumentTermMatrix(myCorpus))
 termTotals <- colSums(docTermMatrix)
 sort(termTotals)
 table(termTotals)

 # cutoffCount is the total of the 15th most frequent term; terms with at
 # least that many occurrences are plotted. Notes on other values of the 15:
 ### so 150 here = items said 7 times
 ### so 100 here = items said 10 times
 ### so  50 here = items said 17 times
 ### so  25 here = items said 24 times
 cutoffCount <- tail(sort(termTotals), 15)[1]

 termCountFrame <- data.frame(Term = colnames(docTermMatrix),
                              stringsAsFactors = FALSE)
 # drop = FALSE keeps a one-row subset a matrix so colSums() still works.
 obamaRows <- docTermMatrix[Transcript$Speaker == 2, , drop = FALSE]
 romneyRows <- docTermMatrix[Transcript$Speaker == 3, , drop = FALSE]
 termCountFrame$Obama <- colSums(obamaRows)
 termCountFrame$Romney <- colSums(romneyRows)
 # termTotals is already in column (= Term) order, so assign it directly
 # rather than looking it up by Term, which indexes by integer code when
 # Term is a factor.
 termCountFrame$Count <- termTotals

 head(termCountFrame)

 # ---- Plot ----
 # Each frequent term is drawn at (times said by Obama, times said by Romney).
 ## - this didn't work
 ##zp1 <- ggplot(termCountFrame[termCountFrame$Count >= cutoffCount, termCountFrame$Obama >= 1, ])
 frequentTerms <- termCountFrame[termCountFrame$Count >= cutoffCount, ]
 zp1 <- ggplot(frequentTerms)
 zp1 <- zp1 + geom_text(aes(x = Obama, y = Romney, label = Term))
 print(zp1)
	#
	## 12/21/12 - This is my fork of DSparks' Denver debate analysis script
	#
	###########
	# Requirement - run this command to create the "denver.txt" file
	# curl https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt > denver.txt
	# From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html

	# ---- Setup ----
	# The script used to start with rm(list = ls()), which wiped the caller's
	# workspace whenever the file was sourced; that call has been removed.
	doInstall <- TRUE # Change to FALSE if you don't want packages installed.
	# stemDocument() in tm relies on SnowballC. The "Snowball" package listed
	# here originally appears to be archived on CRAN - TODO confirm if you pin
	# an old tm release.
	toInstall <- c("zoo", "tm", "ggplot2", "SnowballC")
	if (doInstall) {
	  # Install only what is missing instead of reinstalling on every run.
	  notInstalled <- setdiff(toInstall, rownames(installed.packages()))
	  if (length(notInstalled) > 0) {
	    install.packages(notInstalled, repos = "http://cran.r-project.org")
	  }
	}
	invisible(lapply(toInstall, library, character.only = TRUE))

	# ---- Read the transcript ----
	# From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html
	#Transcript <- readLines("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")
	transcriptFile <- "denver.txt"
	if (!file.exists(transcriptFile)) {
	  stop("Cannot find '", transcriptFile, "'; create it with the curl ",
	       "command given in the header of this script.", call. = FALSE)
	}
	# Passing the path lets readLines() open and close its own connection, so
	# no connection is leaked. It also avoids the non-blocking mode used before,
	# where an unterminated final line is silently pushed back (i.e. lost).
	Transcript <- readLines(transcriptFile)
	head(Transcript, 20)

	# ---- Attribute every line to a speaker ----
	# Speaker codes: 1 = LEHRER, 2 = OBAMA, 3 = ROMNEY. A line is tagged when it
	# contains the label anywhere; later assignments win if several match.
	Transcript <- data.frame(Words = Transcript, Speaker = NA,
	                         stringsAsFactors = FALSE)
	Transcript$Speaker[grepl("LEHRER: ", Transcript$Words, fixed = TRUE)] <- 1
	Transcript$Speaker[grepl("OBAMA: ", Transcript$Words, fixed = TRUE)] <- 2
	Transcript$Speaker[grepl("ROMNEY: ", Transcript$Words, fixed = TRUE)] <- 3
	table(Transcript$Speaker)
	# Carry the latest label forward onto unlabelled continuation lines.
	# na.rm = FALSE keeps the result as long as the data frame even when the
	# file starts with unlabelled lines (the default would drop them and make
	# the assignment fail with a length mismatch).
	Transcript$Speaker <- na.locf(Transcript$Speaker, na.rm = FALSE)

	# Remove moderator (and any leading lines that have no speaker at all):
	keepRow <- !is.na(Transcript$Speaker) & Transcript$Speaker != 1
	Transcript <- Transcript[keepRow, ]

	# ---- Build and clean the corpus ----
	# One document per remaining transcript line, in row order, so document i
	# of the corpus corresponds to row i of Transcript. VectorSource on the text
	# column is used because the column layout DataframeSource expects has
	# changed between tm releases.
	myCorpus <- VCorpus(VectorSource(Transcript$Words))
	inspect(myCorpus)

	# Base functions such as tolower must be wrapped in content_transformer()
	# so that tm_map() keeps returning proper text documents.
	myCorpus <- tm_map(myCorpus, content_transformer(tolower)) # Make lowercase
	myCorpus <- tm_map(myCorpus, removePunctuation,
	                   preserve_intra_word_dashes = FALSE)
	myCorpus <- tm_map(myCorpus, removeWords, stopwords("english")) # Stopwords
	myCorpus <- tm_map(myCorpus, removeWords, c("lehrer", "obama", "romney"))
	myCorpus <- tm_map(myCorpus, stemDocument) # Stem words

	inspect(myCorpus)

	# ---- Term counts ----
	# as.matrix() yields the full dense documents-by-terms count matrix.
	# inspect() is a display helper; recent tm releases appear to return only
	# a small sample from it, so it must not be used as a converter.
	docTermMatrix <- as.matrix(DocumentTermMatrix(myCorpus))
	termTotals <- colSums(docTermMatrix)
	sort(termTotals)
	table(termTotals)

	# cutoffCount is the total of the 15th most frequent term; terms with at
	# least that many occurrences are plotted. Notes on other values of the 15:
	### so 150 here = items said 7 times
	### so 100 here = items said 10 times
	### so 50 here = items said 17 times
	### so 25 here = items said 24 times
	cutoffCount <- tail(sort(termTotals), 15)[1]

	termCountFrame <- data.frame(Term = colnames(docTermMatrix),
	                             stringsAsFactors = FALSE)
	# drop = FALSE keeps a one-row subset a matrix so colSums() still works.
	obamaRows <- docTermMatrix[Transcript$Speaker == 2, , drop = FALSE]
	romneyRows <- docTermMatrix[Transcript$Speaker == 3, , drop = FALSE]
	termCountFrame$Obama <- colSums(obamaRows)
	termCountFrame$Romney <- colSums(romneyRows)
	# termTotals is already in column (= Term) order, so assign it directly
	# rather than looking it up by Term, which indexes by integer code when
	# Term is a factor.
	termCountFrame$Count <- termTotals

	head(termCountFrame)

	# ---- Plot ----
	# Each frequent term is drawn at (times said by Obama, times said by Romney).
	## - this didn't work
	##zp1 <- ggplot(termCountFrame[termCountFrame$Count >= cutoffCount, termCountFrame$Obama >= 1, ])
	frequentTerms <- termCountFrame[termCountFrame$Count >= cutoffCount, ]
	zp1 <- ggplot(frequentTerms)
	zp1 <- zp1 + geom_text(aes(x = Obama, y = Romney, label = Term))
	print(zp1)