Created
January 10, 2024 19:47
-
-
Save FrankRuns/cfb88f42778e719cd44de0be543fb321 to your computer and use it in GitHub Desktop.
Script to simulate TV holiday episode data for the "data analysts make mistakes" article
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load Required Libraries | |
if (!require("MASS")) install.packages("MASS") | |
library(MASS) | |
# Define TV Shows
# A character vector of TV show titles, one entry per simulated show.
# Every title must appear exactly once: each element is simulated as an
# independent show, so a repeated title would receive two unrelated season
# counts and base ratings and produce colliding (title, season) rows.
# NOTE: earlier revisions listed "The Crown", "The Handmaid's Tale" and
# "The Marvelous Mrs. Maisel" twice; the repeats were removed, which changes
# the number of shows and therefore the seeded random draws downstream.
tv_shows <- c(
  "Breaking Bad", "Game of Thrones", "The Wire",
  "Stranger Things", "The Crown", "Mad Men",
  "The Sopranos", "Friends", "The Office",
  "Parks and Recreation", "Sherlock", "Doctor Who",
  "Fargo", "The Mandalorian", "Westworld",
  "Better Call Saul", "Black Mirror", "The Simpsons",
  "Futurama", "Rick and Morty", "The Big Bang Theory",
  "Brooklyn Nine-Nine", "The Marvelous Mrs. Maisel", "Succession",
  "Ted Lasso", "The Handmaid's Tale", "The Witcher",
  "Mindhunter", "Killing Eve", "Ozark",
  "Peaky Blinders", "Narcos", "Vikings",
  "The Boys", "The Expanse", "Bridgerton",
  "Money Heist", "The Umbrella Academy", "Daredevil",
  "This Is Us", "Grey's Anatomy", "House of Cards",
  "The Haunting of Hill House", "Lost", "Chernobyl",
  "Seinfeld", "Arrested Development", "Twin Peaks",
  "The Twilight Zone", "South Park", "Archer",
  "It's Always Sunny in Philadelphia", "Community", "Fleabag",
  "Dexter", "Homeland", "Prison Break",
  "House", "The Last Dance", "The Queen's Gambit",
  "True Detective", "The Walking Dead", "How I Met Your Mother",
  "Curb Your Enthusiasm", "BoJack Horseman", "Band of Brothers",
  "The Office (UK)", "Schitt's Creek", "The Americans",
  "Battlestar Galactica", "The X-Files", "Lost in Space",
  "Star Trek", "Firefly", "The Good Place",
  "Atlanta", "Barry",
  "Orange Is the New Black", "Downton Abbey", "The West Wing",
  "Hannibal", "Luther",
  "Modern Family", "Scrubs", "Silicon Valley",
  "Suits", "True Blood", "Vampire Diaries",
  "Gossip Girl", "The Young Pope", "Lupin"
)
# Fail fast if a duplicate title is ever reintroduced.
stopifnot(anyDuplicated(tv_shows) == 0L)
# Simulation parameters ----
# Season counts are drawn from a negative binomial distribution.
mu <- 3 # Mean number of seasons
size <- 1 # Size parameter (dispersion)
# Base ratings are drawn from a normal distribution.
mean_rating <- 7
sd_rating <- 1.5

# Random draws ----
# Seed once, then draw in a fixed order (season counts first, base ratings
# second) so the simulated data is reproducible.
set.seed(42)
season_counts <- rnbinom(length(tv_shows), size = size, mu = mu)
# One base rating per show, clamped to the 0-10 rating scale.
base_ratings <- pmin(
  pmax(rnorm(length(tv_shows), mean = mean_rating, sd = sd_rating), 0),
  10
)
# Function to Generate Season Ratings
#' Simulate per-season ratings scattered around a show's base rating
#'
#' Draws `count` values from a normal distribution centred on `base_rating`
#' and clamps each value to the 0-10 rating scale.
#'
#' @param base_rating Numeric scalar used as the mean of the draw.
#' @param count Number of seasons to generate ratings for.
#' @param sd Standard deviation of the season-to-season noise. Defaults to
#'   0.5, the value that was previously hard-coded.
#' @return A numeric vector of length `count` with every value in [0, 10];
#'   `numeric(0)` when `count` is zero or negative.
generate_season_ratings <- function(base_rating, count, sd = 0.5) {
  # Nothing to simulate (also avoids calling rnorm() with a negative n).
  if (count <= 0) {
    return(numeric(0))
  }
  season_ratings <- rnorm(n = count, mean = base_rating, sd = sd)
  # Ensure ratings are within 0 to 10
  pmin(pmax(season_ratings, 0), 10)
}
# Create a Dataframe of TV Show Seasons and Ratings
# Builds one small data frame per show (one row per season) and stacks them
# with rbind(). Shows that drew zero seasons contribute NULL, which rbind()
# silently drops, so they do not appear in the result.
tv_show_season_df <- do.call("rbind", lapply(seq_along(tv_shows), function(i) {
  n_seasons <- season_counts[i]
  if (n_seasons <= 0) {
    return(NULL)
  }
  # The condition is a scalar, so use plain if/else rather than ifelse().
  group_label <- if (n_seasons > 1) "multiple_seasons" else "one_season"
  data.frame(
    TV_Show = tv_shows[i],
    Season_Number = seq_len(n_seasons),
    Rating = generate_season_ratings(base_ratings[i], n_seasons),
    Group = rep(group_label, n_seasons)
  )
}))
# Clean-up and Display
# Rename columns for clarity. No row filtering is needed: shows with zero
# seasons yield NULL pieces that rbind() already discarded when the data frame
# was assembled. (The former `sapply(df, is.null)` filter tested the four
# columns rather than the rows, and its result was recycled as a row mask.)
names(tv_show_season_df) <- c(
  "parent_primary_title", "season_number", "parent_average_rating", "group"
)
# Display a Formatted Sample of the Dataframe
tv_show_season_df_head <- head(tv_show_season_df)
# formattable is never attached with library(), so every helper must be
# namespace-qualified; a bare color_bar() would not be found.
# NOTE(review): `group` is a character column, while color_bar() is normally
# applied to numeric columns -- confirm this is the intended formatter.
formattable::formattable(tv_show_season_df_head, list(
  parent_average_rating = formattable::color_tile("transparent", "lightpink"),
  group = formattable::color_bar("#80ed99")
))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment