Last active
January 18, 2023 18:26
-
-
Save CateGitau/05e6ff80b2a3aaa58236067811cee44e to your computer and use it in GitHub Desktop.
Analyzing twitter data using R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the packages needed for pulling and processing tweets.
# BUG FIX: `toInstall` was previously passed to install.packages() without
# ever being defined, which errors immediately; define the package list
# explicitly.  `base64enc` is also added: it was loaded below but never
# installed.
toInstall <- c("twitteR", "RCurl", "httr", "devtools", "base64enc")
install.packages(toInstall, repos = "http://cran.r-project.org")

# Load the packages used by the rest of the script.
library(devtools)
library(twitteR)    # Twitter API client (userTimeline, twListToDF, OAuth)
library(RCurl)      # HTTP transport used by twitteR
library(base64enc)  # base64 encoding required by the OAuth handshake
# Twitter API credentials.
# Create an app at http://dev.twitter.com/apps/new and paste its keys into
# the placeholder strings below.
# See https://dev.twitter.com/docs/auth/oauth for details on Twitter's
# OAuth implementation.
Access_token        <- ""
Access_token_secret <- ""
consumer_key        <- ""
consumer_secret     <- ""

# Authenticate this R session against the Twitter API.
setup_twitter_oauth(consumer_key, consumer_secret,
                    Access_token, Access_token_secret)
# Pull up to 3200 tweets (the API maximum) from the target account's
# timeline, retweets included.
tweets <- userTimeline("name", n = 3200, maxID = NULL, sinceID = NULL,
                       includeRts = TRUE)

# Flatten the list of status objects into a data frame, one row per tweet.
tweets.df <- twListToDF(tweets)
dim(tweets.df)
library(tm)
library(stringr)

# Build a corpus -- a collection of written texts -- from the tweet bodies.
myCorpus <- Corpus(VectorSource(tweets.df$text))

# Force everything to valid UTF-8 so later tm transformations don't choke
# on stray multibyte characters.
myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))

# Normalise case, then strip punctuation and digits.
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
# Clean one block of tweet text: strip URLs, retweet markers, hashtags,
# mentions, control characters, digits, punctuation, and excess whitespace.
#
# BUG FIX: the original version called gsub() eleven times but discarded
# every intermediate result -- an R function returns only its last
# expression -- so only the final whitespace-collapsing gsub had any
# effect.  Each step now feeds the next via reassignment.
#
# @param x Character vector of raw tweet text (vectorized).
# @return Character vector of the same length, cleaned.
Textprocessing <- function(x) {
  x <- gsub("http[[:alnum:]]*", "", x)   # bare http tokens
  x <- gsub("http\\S+\\s*", "", x)       # remaining URLs
  x <- gsub("\\b+RT", "", x)             # retweet marker
  x <- gsub("#\\S+", "", x)              # hashtags
  x <- gsub("@\\S+", "", x)              # mentions
  x <- gsub("[[:cntrl:]]", "", x)        # control characters
  x <- gsub("\\d", "", x)                # digits
  x <- gsub("[[:punct:]]", "", x)        # punctuation
  x <- gsub("^[[:space:]]*", "", x)      # leading whitespace
  x <- gsub("[[:space:]]*$", "", x)      # trailing whitespace
  gsub(" +", " ", x)                     # collapse internal runs of spaces
}
# Apply the custom cleaner, then squeeze any leftover runs of whitespace.
myCorpus <- tm_map(myCorpus, Textprocessing)
myCorpus <- tm_map(myCorpus, stripWhitespace)

# Stopwords carry little meaning on their own and mostly add noise; drop
# the standard English list plus a few corpus-specific fillers.
mystopwords <- c(stopwords("english"), "rt", "íí", "get", "like", "just",
                 "yes", "know", "will", "good", "day", "people")
myCorpus <- tm_map(myCorpus, removeWords, mystopwords)

# Keep an unstemmed copy so stems can be completed back to full words later.
myCorpus_copy <- myCorpus

# Reduce words to their stems.
myCorpus <- tm_map(myCorpus, stemDocument)
# Spot-check a few cleaned documents: the first two and one near the end.
# ROBUSTNESS FIX: the original indexed document 3163 unconditionally, which
# errors when the corpus has fewer documents; out-of-range ids are now
# skipped.
inspect_ids <- c(1:2, 3163)
for (i in inspect_ids[inspect_ids <= length(myCorpus_copy)]) {
  cat(paste0("[", i, "] "))
  writeLines(strwrap(as.character(myCorpus_copy[[i]]), 60))
}
# A safer stem-completion helper, kept for reference (currently unused):
# stemCompletion2 <- function(x, dictionary){
#   x <- unlist(strsplit(as.character(x), " "))
#   # Unexpectedly, stemCompletion completes an empty string to
#   # a word in dictionary. Remove empty string to avoid above issue.
#   x <- x[x != ""]
#   x <- stemCompletion(x, dictionary=dictionary)
#   x <- paste(x, sep="", collapse=" ")
#   PlainTextDocument(stripWhitespace(x))
# }

# Complete stems back to full words, using the unstemmed copy as the
# dictionary.
# NOTE(review): tm_map() hands whole documents to stemCompletion(), which
# expects a character vector of individual stems -- the commented helper
# above exists precisely to work around that; confirm this call produces
# sensible output on your tm version.
myCorpus <- tm_map(myCorpus, stemCompletion, dictionary = myCorpus_copy)
#myCorpus <- lapply(myCorpus, stemCompletion2, dictionary=myCorpus_copy)

# Rebuild a Corpus from the completed text.
myCorpus <- Corpus(VectorSource(myCorpus))

# Strip any non-ASCII residue.
# NOTE(review): iconv() coerces the Corpus to a character vector, so after
# this line `myCorpus` is no longer a tm Corpus -- verify downstream code
# expects that.
myCorpus <- iconv(x = myCorpus, "latin1", "ASCII", sub = "")
# Term-document matrix built over the unstemmed copy, keeping terms of any
# length.
tdm <- TermDocumentMatrix(myCorpus_copy,
                          control = list(wordlengths = c(1, Inf)))
tdm

# Terms that appear at least 50 times across the corpus.
freq.terms <- findFreqTerms(tdm, lowfreq = 50)
View(freq.terms)

# Per-term totals, filtered down to terms seen 20 or more times.
termFreq <- rowSums(as.matrix(tdm))
termFreq <- subset(termFreq, termFreq >= 20)
df <- data.frame(term = names(termFreq), freq = termFreq)
View(df)
# Visualise the frequent terms as a horizontal bar chart.
# FIXES: ggplot2 was loaded twice; aes() referenced columns as df$term /
# df$freq (an anti-pattern that breaks grouping -- use bare column names);
# and scale_colour_gradientn() was applied although the mapped aesthetic is
# *fill*, so the gradient never rendered -- use scale_fill_gradientn().
library(ggplot2)
ggplot(df, aes(x = reorder(term, +freq), y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  scale_fill_gradientn(colors = terrain.colors(10)) +
  xlab("Terms") +
  ylab("Count") +
  coord_flip()
# Word-cloud rendering.
library(wordcloud)
library(wordcloud2)

# Dense term matrix and a frequency-sorted vector of term counts.
m <- as.matrix(tdm)

# Green palette with the four palest shades dropped.
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:4)]

word.freq <- sort(rowSums(m), decreasing = T)
# NOTE(review): `pal` and `word.freq` are not consumed by the wordcloud2()
# call below -- they look like leftovers for a wordcloud() call; confirm.

# Render the cloud from the >=20-frequency table, shaped by sev.png.
wordcloud2(df, color = "random-dark", backgroundColor = "white",
           figPath = "sev.png")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nice work, very useful code :)