Last active
October 31, 2019 07:10
-
-
Save TysonStanley/269abbacf2743e39da18e1d90400a0c1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tidy Tuesday (on Friday) | |
# 25 October 2019 | |
# Tyson S. Barrett | |
# Packages ---- | |
library(tidyverse) # CRAN | |
library(dtplyr) # CRAN | |
library(data.table) # CRAN | |
library(tidyfast) # remotes::install_github("tysonstanley/tidyfast") | |
# Data ---- | |
url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-22/horror_movies.csv" | |
horror_movies <- fread(url) | |
# for plots later on ---- | |
library(ggbeeswarm) | |
theme_set( | |
theme_minimal() + | |
theme(panel.grid.major.x = element_blank()) | |
) | |
# Data Dictionary ---- | |
tibble::tribble( | |
~variable, ~class, ~description, | |
"title", "character", "Title of the movie", | |
"genres", "character", "Movie Generes", | |
"release_date", "character", "Movie release date - day-month-year", | |
"release_country", "character", "Release country", | |
"movie_rating", "character", "MPAA Rating", | |
"review_rating", "double", "Movie rating (0 - 10)", | |
"movie_run_time", "character", "Movie run time (minutes)", | |
"plot", "character", "Short plot description (raw text)", | |
"cast", "character", "Cast", | |
"language", "character", "Language", | |
"filming_locations", "character", "Filming location", | |
"budget", "character", "Budget (US Dollars)" | |
) | |
# dtplyr and tidyfast ---- | |
horror <- | |
horror_movies %>% | |
lazy_dt() %>% | |
select(title, movie_rating, review_rating, | |
movie_run_time, plot, cast, filming_locations, | |
release_country, budget, release_date) %>% | |
mutate(usa = dt_case_when(release_country == "USA" ~ "USA", | |
release_country != "USA" ~ "Other Country")) %>% | |
mutate(movie_rating = fct_lump(movie_rating, 6)) %>% | |
mutate(budget = as.numeric(str_remove_all(budget, "\\$|,")), | |
budget_mil = budget/1e6) %>% | |
mutate(release_date = lubridate::dmy(release_date), | |
release_year = lubridate::year(release_date)) %>% | |
as.data.table() | |
# nested by movie rating ---- | |
# adds column with lin-log regression model | |
# extracts and tidies the model | |
horror_nest <- | |
dt_nest(horror, movie_rating) %>% | |
lazy_dt() %>% | |
mutate(rate_budget = purrr::map(data, ~{ | |
lm(review_rating ~ log(budget) + release_year, data = .x) | |
})) %>% | |
mutate(rate_budget_tidy = purrr::map(rate_budget, ~broom::tidy(.x))) %>% | |
as.data.table() | |
# unnest model estimates ---- | |
# grab just the logged budget estimates | |
horror_coef <- | |
dt_unnest(horror_nest, | |
col = rate_budget_tidy, | |
id = movie_rating) %>% | |
lazy_dt() %>% | |
filter(term == "log(budget)") %>% | |
as_tibble() | |
# visualize model results ---- | |
# estimates and confidence intervals | |
horror_coef %>% | |
ggplot(aes(x = movie_rating, y = estimate, color = movie_rating)) + | |
geom_point() + | |
geom_errorbar(aes(ymin = estimate - 1.96*std.error, | |
ymax = estimate + 1.96*std.error), | |
width = .3) + | |
labs(x = "", | |
y = "Rating Change per Additional\n1% Increase in Budget", | |
title = "") + | |
theme(legend.position = "none") | |
# Result: There does not seem to be a strong relationship | |
# between the amount of money spent on movie and | |
# the movie ratings for most movie ratings. | |
# That is, spending more doesn't consistently | |
# improve the rating of the movie. Only looks like | |
# it improves ratings (modestly) for PG-13 and R rated | |
# movies. | |
# additional visual of the same relationship | |
horror %>% | |
ggplot(aes(x = log(budget), | |
y = review_rating, | |
color = movie_rating)) + | |
geom_point(alpha = .3) + | |
geom_smooth(method = "lm") + | |
facet_wrap(~movie_rating, scales = "free") + | |
theme(legend.position = "none") + | |
labs(y = "Movie Review Rating", | |
x = "Budget (logged units)") | |
# Result: Same conclusions as before, where PG-13 and R | |
# appear to have a positive relationship but this | |
# doesn't hold for other ratings | |
# other explorations ---- | |
# Movie Rating by Release Date ---- | |
ggplot(horror, aes(x = release_date, y = review_rating)) + | |
geom_point(alpha = .7, color = "darkgrey") + | |
geom_smooth(color = "coral3", size = 2) + | |
labs(x = "", | |
y = "Movie Review Rating") | |
# Movie Rating by USA vs Other ---- | |
ggplot(horror) + | |
aes(x = usa, y = review_rating, color = movie_rating) + | |
geom_beeswarm(alpha = .7) + | |
stat_summary(color = "red") + | |
facet_wrap(~movie_rating) + | |
labs(x = "", | |
y = "Movie Review Rating") + | |
theme(legend.position = "none") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Output figures
Figure 1
Figure 2
Figure 3
Figure 4