Author: Dr. Maria Y. Rodriguez
I acknowledge the traditional territory of the Haudenosaunee Confederacy and honor the sovereignty of the Six Nations–the Mohawk, the Cayuga, the Onondaga, the Oneida, the Seneca and the Tuscarora. I live and work on their land, here in Buffalo NY, and I offer my thanks to them and their ancestors for the opportunity.
I also give thanks to my ancestors, the Taino and Arawak people, my West African ancestors (predominantly from Benin, Togo and the Congo) who recreated their home as best they could on the island known then as Ayiti, now Hispaniola, after their kidnapping and enslavement, as well as the colonizers who made their home in Ayiti: the Spanish and Portuguese among others. I thank my ancestors for their guidance and protection and ask them to accompany me during this presentation.
- Text Mining w/R: A Tidy Approach by Silge & Robinson
- Work w/Twitter Social Media Data with R by Wasser & Farmer
- Summer Institutes in Computational Social Science (SICSS)
# Setup ----
# Load every package used by the demo. Install academictwitteR from CRAN first
# if it is not available yet.
#install.packages("academictwitteR")
library(academictwitteR)
library(tidyverse)
library(igraph)
library(ggraph)
# library() instead of require(): a missing package should stop the script
# here rather than return FALSE and fail later with a confusing error.
library(tidygraph)
library(tidytext)
library(rtweet) #for function ts_plot (time series plot)
library(kableExtra)
library(wordcloud)
library(RColorBrewer)
# Working directory for the demo; the same folder is passed as data_path to
# the API calls further down.
setwd("~/Desktop/twitch_demo")
# Placeholder: replace with your own Twitter Academic Research bearer token.
bearer_token <- "insert_yours_here"
If for some reason the CRAN build gives you trouble, install directly from GitHub:
# Fallback: install academictwitteR straight from GitHub.
# devtools is only installed when it is missing, so re-running the script does
# not reinstall it every time.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)
devtools::install_github("cjbarrie/academictwitteR", build_vignettes = TRUE)
Note: this returns only the first tweet of any given thread. If you want to build the conversation thread, see the next section.
# Hashtags that make up the search query.
htagquery <- c("#covid", "#covid-19", "#coronavirus", "#covid19")

# Query the full-archive endpoint for 2021-01-01 through 2021-06-21 and store
# the raw JSON responses in the demo folder.
# Filters passed along with the query: tweets with images, no retweets,
# is_reply = TRUE, tweets with hashtags, located in the US, written in English.
# NOTE(review): the intent was "replies and original tweets"; is_reply = TRUE
# may restrict the results to replies only -- confirm against build_query().
tweets <- get_all_tweets(
  query = htagquery,
  start_tweets = "2021-01-01T00:00:00Z",
  end_tweets = "2021-06-21T00:00:00Z",
  bearer_token = bearer_token,
  data_path = "~/Desktop/twitch_demo",
  has_images = TRUE,
  is_retweet = FALSE,
  is_reply = TRUE,
  has_hashtags = TRUE,
  country = "US",
  lang = "en"
)
# Combine the stored JSON files into a single data frame
covid_tweets <- bind_tweet_jsons("~/Desktop/twitch_demo")
# Note: academictwitteR keeps user information in separate files from the
# tweet content. Only the tweet content is used here (this is a live demo!).
# Peek at the first rows (head() shows six by default)
covid_tweets %>% head()
# List the available variables
covid_tweets %>% names()
# Convert to a tibble for tidy workflows
covid_tweets <- covid_tweets %>% as_tibble()
# Daily tweet frequency over the collection period.
# ts_plot() (rtweet) aggregates the tweets by day; trim = 1L is passed through
# to it. The remaining layers only style the resulting ggplot.
covid_tweets %>%
  ts_plot("days", trim = 1L) +
  ggplot2::geom_point() +
  ggplot2::theme_minimal() +
  ggplot2::theme(
    legend.title = ggplot2::element_blank(),
    legend.position = "bottom",
    plot.title = ggplot2::element_text(face = "bold")
  ) +
  ggplot2::labs(
    x = NULL, y = NULL,
    # Title typo fixed: "hastags" -> "hashtags"
    title = "Frequency of tweets containing Covid-19 hashtags (01/01/21 to 06/21/21)",
    subtitle = "Tweets aggregated by day",
    caption = "\nSource: Data collected from Twitter's Academic Research API via academictwitteR"
  )
Note: academictwitteR returns some variables (i.e., columns) as data frames nested within the overall data frame.
# Single most retweeted tweet: sort by the nested retweet count (descending),
# keep the first row and only the columns of interest.
covid_retweets <- covid_tweets %>%
  arrange(desc(public_metrics$retweet_count)) %>%
  slice(1) %>%
  select(created_at, text, public_metrics, conversation_id)
# Render as a styled table in the viewer
kable_styling(kbl(covid_retweets))
# Five most liked tweets: sort by the nested like count (descending), then keep
# the rows ranked in the top five (top_n() keeps ties).
covid_likes <- covid_tweets %>%
  arrange(desc(public_metrics$like_count)) %>%
  top_n(5, public_metrics$like_count) %>%
  select(created_at, text, public_metrics)
# Render as a styled table in the viewer
kable_styling(kbl(covid_likes))
# Word cloud of the most frequent words ----
# library() instead of require() so a missing package fails immediately.
library(wordcloud)
library(RColorBrewer)
# Eight-colour qualitative palette for the cloud
pal2 <- brewer.pal(8, "Dark2")

# Clean the tweet text, split it into words and count them.
# NOTE(review): token = "tweets" was removed in tidytext 0.4.0 -- confirm the
# installed tidytext version still supports it.
words <- covid_tweets %>%
  mutate(
    # Drop HTML entities left in the tweet text. Removing the bare characters
    # "&", "<" and ">" instead would leave "amp", "lt" and "gt" as tokens.
    text = str_remove_all(text, "&amp;|&lt;|&gt;"),
    # Drop URLs
    text = str_remove_all(text, "\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)"),
    # Drop non-ASCII characters
    text = str_remove_all(text, "[^\x01-\x7F]")
  ) %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(
    # Stop words, with and without apostrophes
    !word %in% stop_words$word,
    !word %in% str_remove_all(stop_words$word, "'"),
    # Keep tokens containing at least one letter
    str_detect(word, "[a-z]"),
    # Drop hashtags and @-mentions
    !str_detect(word, "^#"),
    !str_detect(word, "@\\S+")
  ) %>%
  count(word, sort = TRUE)

# Draw the cloud: most frequent words first, at most 250 words
words %>%
  with(wordcloud(word, n, random.order = FALSE, max.words = 250, colors = pal2))
## Top words pairs
# Remove URLs from the tweet text. Only the URL itself is removed: a pattern
# such as "http.*" would also delete every word that follows the first link.
covid_tweets$stripped_text <- gsub("https?://\\S+", "", covid_tweets$text)
library(devtools)
library(widyr)
# Split each tweet into bigrams (unnest_tokens() lowercases the text and
# strips punctuation by default)
covid_tweets_paired_words <- covid_tweets %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(paired_words, stripped_text, token = "ngrams", n = 2)
# Show current word pairs
covid_tweets_paired_words %>%
  count(paired_words, sort = TRUE)
library(tidyr)
# Split every bigram into its two words
covid_tweets_separated_words <- covid_tweets_paired_words %>%
  separate(paired_words, c("word1", "word2"), sep = " ")
# Drop pairs where either word is a stop word. Tweets with fewer than two
# words produce NA bigrams; drop those too so they are not counted as a pair.
covid_tweets_tweets_filtered <- covid_tweets_separated_words %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# new bigram counts:
covid_tweets_words_counts <- covid_tweets_tweets_filtered %>%
  count(word1, word2, sort = TRUE)
head(covid_tweets_words_counts)
## Words most frequently used in tweets.
# Network of word pairs that occur at least 25 times: each word is a node,
# each pair an edge whose opacity and width scale with the pair count.
ggraph(
  graph_from_data_frame(filter(covid_tweets_words_counts, n >= 25)),
  layout = "fr"
) +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  labs(
    title = "Word Network of Covid19 Tweets",
    subtitle = "01/01/2021-06/21/2021 (n=3066)",
    x = "",
    y = ""
  )
# Split the URL-stripped text into single words (unnest_tokens() lowercases
# the text and strips punctuation by default)
covid_tweets_clean <- covid_tweets %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(word, stripped_text)
# Remove stop words
# Load list of stop words - from the tidytext package
# (The former data('stopwords') call was dropped: the name does not match the
# stop_words dataset loaded here.)
data("stop_words")
#There will be more stop words
# Join on the word column explicitly instead of relying on the implicit
# "Joining, by = ..." behaviour.
covid_tweets_clean_words <- covid_tweets_clean %>%
  anti_join(stop_words, by = "word")
# Join the Bing sentiment classification to the tweet words and count how
# often each word occurs per sentiment. The join key is given explicitly;
# count() returns ungrouped data, so no ungroup() is needed afterwards.
bing_word_counts <- covid_tweets_clean_words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

# Ten most frequent words per sentiment (ties kept), shown as horizontal bars
# with one facet per sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  # Rank on the count column explicitly instead of "the last column"
  top_n(10, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  # Title typo fixed: "Codi-19" -> "Covid-19"
  labs(title = "Sentiment of Covid-19 Tweets 04/01/2021 - 06/21/2021 (n=1174)",
       y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
library(stm)
library(stminsights)
# Keep the tweet text plus a handful of metadata columns
covid_tweets_subset <- covid_tweets %>%
  select(
    created_at, lang, text, id, author_id,
    conversation_id, in_reply_to_user_id, source
  )
# Run stm's text preprocessing, carrying the subset along as metadata
processed <- textProcessor(
  documents = covid_tweets_subset$text,
  metadata = covid_tweets_subset
)
# Plot what would be removed at lower thresholds 1, 11, ..., 91
plotRemoved(
  documents = processed$documents,
  lower.thresh = seq(1, 100, by = 10)
)
# Build the final corpus with lower.thresh = 10
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta,
  lower.thresh = 10
)
docs <- out[["documents"]]
vocab <- out[["vocab"]]
meta <- out[["meta"]]
# Inspect the result of the preprocessing
head(docs) #how many words are in what position
head(vocab)
head(meta)
# Fix the random seed before fitting
set.seed(12345)
# Fit with K = 0 and spectral initialisation, which asks stm to choose the
# number of topics itself (see the stm documentation)
covid_noK <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 0,
  data = out$meta,
  init.type = "Spectral"
)
labelTopics(covid_noK)
# Diagnostics for choosing the number of topics:
# evaluate candidate values K = 20, 30, ..., 70
storage <- searchK(
  documents = out$documents,
  vocab = out$vocab,
  K = seq(20, 70, by = 10),
  data = meta
)
plot(storage)
# Re-run with a specific number of topics (K = 25).
# No prevalence formula is set: the metadata prepared above has no state
# column, and a dangling `prevalence =` argument does not parse. Add
# `prevalence = ~ <covariate>` once such a covariate is available in out$meta.
covid_k25 <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 25,
  data = out$meta,
  init.type = "Spectral"
)
plot(covid_k25)
# One idea for a next step: use context annotations, day and time, etc. as
# prevalence variables.
# Summary plot of the topics, each labelled with its 6 FREX words
plot(covid_k25, type = "summary", labeltype = "frex", n = 6)
# Topic correlations
covid_corrs <- topicCorr(
  model = covid_k25,
  method = "simple",
  cutoff = 0.01,
  verbose = TRUE
)
plot(covid_corrs)
# Closer look at the tweets within the estimated topics
library(stminsights)
# Save your .RData file first, including all objects for the stm
run_stminsights() #opens a shiny app in your default browser
Say you notice a particularly interesting tweet and want to collect its conversation thread:
# Collect every tweet in one conversation, searching 2021-06-13 to 2021-06-15.
# The JSON responses are stored in the same demo folder as the hashtag pull.
specific_tweet <- get_all_tweets(
  query = "conversation_id:1404588507562184704",
  start_tweets = "2021-06-13T00:00:00Z",
  end_tweets = "2021-06-15T00:00:00Z",
  bearer_token = bearer_token,
  data_path = "~/Desktop/twitch_demo"
)
# Render as a styled table in the viewer
kable_styling(kbl(specific_tweet))