Exploratory data analysis with the Twitter API v2

Author: Dr. Maria Y. Rodriguez

Acknowledgments

I acknowledge the traditional territory of the Haudenosaunee Confederacy and honor the sovereignty of the Six Nations–the Mohawk, the Cayuga, the Onondaga, the Oneida, the Seneca and the Tuscarora. I live and work on their land, here in Buffalo NY, and I offer my thanks to them and their ancestors for the opportunity.

I also give thanks to my ancestors, the Taino and Arawak people, my West African ancestors (predominantly from Benin, Togo and the Congo) who recreated their home as best they could on the island known then as Ayiti, now Hispaniola, after their kidnapping and enslavement, as well as the colonizers who made their home in Ayiti: the Spanish and Portuguese among others. I thank my ancestors for their guidance and protection and ask them to accompany me during this presentation.

Setup

#install.packages("academictwitteR")
library(academictwitteR)
library(tidyverse)
library(igraph)
library(ggraph)
library(tidygraph)
library(tidytext)
library(rtweet) #for function ts_plot (time series plot)
library(kableExtra)
library(wordcloud)
library(RColorBrewer)

setwd("~/Desktop/twitch_demo")
bearer_token <- "insert_yours_here"
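
Rather than hardcoding the token, you can keep it out of the script entirely. A minimal sketch using base R (the variable name TWITTER_BEARER is just an example):

# Put TWITTER_BEARER=<your token> in ~/.Renviron, then:
bearer_token <- Sys.getenv("TWITTER_BEARER")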

If for some reason the CRAN build gives you trouble, install directly from GitHub:

install.packages("devtools")
library(devtools)
devtools::install_github("cjbarrie/academictwitteR", build_vignettes = TRUE)

Get tweets based on a specific hashtag/set of hashtags

Note: this returns only the first tweet of any given thread; if you want to build out a full conversation thread, see the conversation ID section below.

# Note: Twitter hashtags cannot contain punctuation, so "#covid-19" will
# effectively match "#covid"
htagquery <- c("#covid", "#covid-19", "#coronavirus", "#covid19")

# both replies and tweets, no retweets
tweets <- get_all_tweets(
  query = htagquery,
  start_tweets = "2021-01-01T00:00:00Z",
  end_tweets = "2021-06-21T00:00:00Z",
  bearer_token = bearer_token,
  data_path = "~/Desktop/twitch_demo",
  has_images = TRUE,
  is_retweet = FALSE,
  is_reply = TRUE,
  has_hashtags = TRUE,
  country = "US",
  lang = "en")

# Bind tweets
covid_tweets <- bind_tweet_jsons("~/Desktop/twitch_demo")

# Note: academictwitteR saves user info in a different file from the content info
# We'll just look at tweet text for now (and since this is a live demo!)
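# If you also want the user-level data, the user jsons written alongside the
# tweet jsons can be bound similarly; in academictwitteR versions of this
# vintage:
# covid_users <- bind_user_jsons("~/Desktop/twitch_demo")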
 
# Take a look at the first 6 rows
head(covid_tweets)

# What are the variables we have? 
names(covid_tweets)

# Make it tidy 
covid_tweets <- as_tibble(covid_tweets)

Descriptive analysis

# Frequency of tweets over the time period
# (rtweet's ts_plot expects created_at as a datetime; if it arrives as
# character, convert with as.POSIXct first)
covid_tweets %>%
  ts_plot("days", trim = 1L) +
  ggplot2::geom_point() +
  ggplot2::theme_minimal() +
  ggplot2::theme(
    legend.title = ggplot2::element_blank(),
    legend.position = "bottom",
    plot.title = ggplot2::element_text(face = "bold")) +
  ggplot2::labs(
    x = NULL, y = NULL,
    title = "Frequency of tweets containing Covid-19 hastags (01/01/21 to 06/21/21)",
    subtitle = "Tweets aggregated by day",
    caption = "\nSource: Data collected from Twitter's Academic Research API via academictwitteR"
  )

Note: academictwitteR returns some variables (i.e., columns) as data frames nested within the overall data frame
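
Since public_metrics is itself a data frame column, its fields can be reached with $ or flattened into ordinary columns. A minimal sketch using tidyr (loaded with the tidyverse above):

# Peek at one of the nested metrics
summary(covid_tweets$public_metrics$retweet_count)

# Flatten the nested columns into regular ones
covid_tweets_flat <- covid_tweets %>%
  tidyr::unpack(public_metrics)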

Most retweeted

covid_retweets <- covid_tweets %>% 
  arrange(-public_metrics$retweet_count) %>%
  slice(1) %>% 
  select(created_at, text, public_metrics, conversation_id)

# Pretty print in viewer
covid_retweets %>%
  kbl() %>%
  kable_styling()

# Top 5 most liked tweets
covid_likes <- covid_tweets %>% 
  arrange(-public_metrics$like_count) %>%
  top_n(5, public_metrics$like_count) %>% 
  select(created_at, text, public_metrics)

covid_likes %>%
  kbl() %>%
  kable_styling()

Exploratory word cloud 😬

require(wordcloud)
require(RColorBrewer)
pal2 <- brewer.pal(8,"Dark2")
words <- covid_tweets %>%
  mutate(text = str_remove_all(text, "&amp;|&lt;|&gt;"),
         text = str_remove_all(text, "\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)"),
         text = str_remove_all(text, "[^\x01-\x7F]")) %>% 
  unnest_tokens(word, text, token = "tweets") %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"),
         str_detect(word, "[a-z]"),
         !str_detect(word, "^#"),         
         !str_detect(word, "@\\S+")) %>%
  count(word, sort = TRUE)


words %>% 
  with(wordcloud(word, n, random.order = FALSE, max.words = 250, colors = pal2))

## Top word pairs
# Strip URLs from the tweet text (note: "http.*" removes everything from
# the first URL to the end of the tweet, which also covers https links)
covid_tweets$stripped_text <- gsub("http.*", "", covid_tweets$text)

Network of words

library(widyr)

# Tokenize tweets into bigrams (unnest_tokens lowercases and strips punctuation)
covid_tweets_paired_words <- covid_tweets %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(paired_words, stripped_text, token = "ngrams", n = 2)

# Show current word pairs
covid_tweets_paired_words %>%
  count(paired_words, sort = TRUE)

library(tidyr)
covid_tweets_separated_words <- covid_tweets_paired_words %>%
  separate(paired_words, c("word1", "word2"), sep = " ")

covid_tweets_filtered <- covid_tweets_separated_words %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
covid_tweets_words_counts <- covid_tweets_filtered %>%
  count(word1, word2, sort = TRUE)

head(covid_tweets_words_counts)
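
# widyr (loaded above) can count words co-occurring anywhere in the same
# tweet, as an alternative to adjacent bigrams. A sketch, assuming the id
# column returned by academictwitteR:
covid_cooccur <- covid_tweets %>%
  dplyr::select(id, stripped_text) %>%
  unnest_tokens(word, stripped_text) %>%
  filter(!word %in% stop_words$word) %>%
  pairwise_count(word, id, sort = TRUE)

head(covid_cooccur)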

## Word pairs most frequently used in tweets

# Plot the word network (pairs occurring at least 25 times)
covid_tweets_words_counts %>%
  filter(n >= 25) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  labs(title = "Word Network of Covid19 Tweets",
       subtitle = "01/01/2021-06/21/2021 (n=3066)",
       x = "", y = "")

Superficial Sentiment Analysis of Tweets

# Tokenize tweets into single words (unnest_tokens lowercases and strips punctuation)
covid_tweets_clean <- covid_tweets %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(word, stripped_text)

# Remove stop words, using the stop_words list from the tidytext package
data("stop_words")
covid_tweets_clean_words <- covid_tweets_clean %>%
  anti_join(stop_words, by = "word")

# Join sentiment classification to the tweet words
bing_word_counts <- covid_tweets_clean_words %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(title = "Sentiment of Codi-19 Tweets 04/01/2021 - 06/21/2021 (n=1174)",
       y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
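
As a quick overall check on the balance, you can tally total word occurrences on each side:

# Total positive vs. negative word occurrences
bing_word_counts %>%
  group_by(sentiment) %>%
  summarise(total = sum(n))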

STM analysis of tweets

library(stm)
library(stminsights)

# Just text for now
covid_tweets_subset <- covid_tweets %>%
  select(created_at, lang, text, id, author_id, conversation_id, in_reply_to_user_id, source)

processed <- textProcessor(covid_tweets_subset$text, metadata = covid_tweets_subset)

# Prepare
plotRemoved(processed$documents, lower.thresh = seq(1,100, by = 10))

out <- prepDocuments(processed$documents, processed$vocab, processed$meta, lower.thresh = 10)

docs<- out$documents
vocab<- out$vocab
meta <- out$meta

# Inspect to see how preprocessing went
head(docs)  # word indices and counts per document
head(vocab)
head(meta)

set.seed(12345)

# K = 0 with spectral initialization lets stm choose the number of topics
covid_noK <- stm(documents = out$documents, vocab = out$vocab, K = 0,
                 data = out$meta, init.type = "Spectral")


labelTopics(covid_noK)

# Diagnostics to choose the correct number of topics
# Evaluate K 
storage <- searchK(out$documents, out$vocab, K = c(20,30,40,50,60,70), data = meta)
plot(storage)
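
# The diagnostics behind plot(storage) live in storage$results; a sketch for
# eyeballing the coherence/exclusivity trade-off (columns may need unlist()
# depending on your stm version):
k_results <- as.data.frame(lapply(storage$results, unlist))
ggplot(k_results, aes(semcoh, exclus)) +
  geom_text(aes(label = K)) +
  labs(x = "Semantic coherence", y = "Exclusivity")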

# Re-run with a specific number of topics (a prevalence covariate, e.g.
# state, could be passed via the prevalence argument if present in meta)
covid_k25 <- stm(documents = out$documents, vocab = out$vocab, K = 25,
                 data = out$meta, init.type = "Spectral")
plot(covid_k25)

# One idea for a next step: use context annotations as prevalence variables, day and time, etc.

# Plot the topic summary using FREX labels (6 words per topic)
plot.STM(covid_k25, type = "summary", labeltype = "frex", n = 6)

# Look at topic correlations
covid_corrs <- topicCorr(covid_k25, method = "simple", cutoff = 0.01, verbose = TRUE)
plot(covid_corrs)
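
# Read a few example tweets for a given topic; out$meta$text stays aligned
# with the documents after prepDocuments
findThoughts(covid_k25, texts = out$meta$text, topics = 1, n = 3)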

# More in depth looks at tweets within estimated topics 
library(stminsights)

# Save an .RData file containing all objects for the stm before launching
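# For example (object names from this script):
save(covid_k25, out, file = "covid_stm.RData")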
run_stminsights() #opens a shiny app in your default browser

Get specific thread based on a conversation ID

Say you note a particularly interesting tweet and want to collect the conversation thread
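
For instance, the conversation_id column kept in covid_retweets above gives the thread ID of the most retweeted tweet:

covid_retweets$conversation_id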

specific_tweet <- get_all_tweets(
  "conversation_id:1404588507562184704",
  "2021-06-13T00:00:00Z",
  "2021-06-15T00:00:00Z",
  bearer_token,
  data_path = "~/Desktop/twitch_demo")

# Pretty print in viewer 
specific_tweet %>%
  kbl() %>%
  kable_styling()