Last active
December 27, 2022 21:48
-
-
Save NickCH-K/a020d4c037c9dcb4b3d82c4389b45d82 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(purrr) | |
predfunc = function(x, xdat) x[1] + x[2]*xdat | |
residsum = function(x, xdat, ydat, pwr) sum(abs((predfunc(x, xdat)-ydat))^pwr) | |
set.seed(1000) | |
exdat = bind_rows( | |
tibble(label = 'Outlier X', | |
x = rnorm(50)) %>% | |
mutate(y = x+rnorm(50, 0, .3)) %>% | |
mutate(x = case_when(row_number() >= 47 ~ x + 4, | |
TRUE ~ x)), | |
tibble(label = 'Positive Slope, Low Noise', | |
x = rnorm(50)) %>% | |
mutate(y = x+rnorm(50, 0, .3)), | |
tibble(label = 'Positive Slope, High Noise', | |
x = rnorm(50)) %>% | |
mutate(y = x + rnorm(50, 0, 2)), | |
tibble(label = 'Nonlinear Relationship', | |
x = rnorm(50,0,.75)) %>% | |
mutate(y = x + x^2 + rnorm(50, 0, .3)), | |
tibble(label = 'Outlier Y', | |
x = rnorm(50)) %>% | |
mutate(y = case_when( | |
row_number() < 47 ~ x + rnorm(50, 0, .3), | |
TRUE ~ x + rnorm(50,4,.3) | |
)), | |
tibble(label = 'Outlier X & Y', | |
x = c(rnorm(47),rnorm(3,4))) %>% | |
mutate(y = x + rnorm(50)) | |
) | |
domin = function(lb, pw) { | |
print(paste('Working on',lb,'to the',pw,'power at',Sys.time())) | |
xdat = exdat %>% | |
filter(label == lb) %>% | |
pull(x) | |
ydat = exdat %>% | |
filter(label == lb) %>% | |
pull(y) | |
res = optim(c(0,0),residsum, xdat=xdat, ydat=ydat, pwr = pw) | |
tibble(label = lb, | |
power = as.character(pw), | |
intercept = res$par[1], | |
slope = res$par[2]) | |
} | |
results = map_dfr(unique(exdat$label), | |
\(x) bind_rows(map_dfr(c(.5, 1, 2, 3, 4, 5, 100), \(y) domin(x, y)), | |
tibble(label = x, | |
power = 'True', | |
intercept = 0, | |
slope = 1))) | |
results = results %>% | |
mutate(power = factor(power, | |
levels = c('0.5','1','2','3','4','5','100','True'))) | |
ggplot(exdat, aes(x = x, y = y, group = label)) + | |
geom_point() + | |
geom_line(aes(x = x, y = y), | |
data = tibble(x = (-200:200)/100, | |
label = 'Nonlinear Relationship') %>% | |
mutate(y = x+x^2), | |
color = 'red') + | |
geom_abline(aes(intercept = intercept, slope = slope, color = power), | |
data = results %>% filter(label != 'Nonlinear Relationship' | power != 'True')) + | |
facet_wrap(~factor(label,levels = c('Positive Slope, Low Noise', | |
'Positive Slope, High Noise', | |
'Nonlinear Relationship', | |
'Outlier X','Outlier Y','Outlier X & Y')), ncol = 2) + | |
scale_color_manual(values = c(paletteer_dynamic('cartography::blue.pal',7),'red')) + | |
theme_classic() + | |
theme(text = element_text(family = 'serif', size = 13), | |
axis.title.y = element_text(hjust = 1, angle = 0)) + | |
labs(color = 'Power', | |
caption = scales::label_wrap(100)('True model always: X is normal, e is normal, Y = X + e or Y = X + X^2 + e. Outliers add on a shift afterwards, and different graphs have different variances.'), | |
title = 'Why the sum of SQUARES?', | |
subtitle = scales::label_wrap(80)('Bivariate regression slopes minimizing the sum of residuals to different powers.'), | |
x = 'X',y = 'Y') | |
# Discrete color | |
ggplot(exdat, aes(x = x, y = y, group = label)) + | |
geom_point() + | |
geom_line(aes(x = x, y = y), | |
data = tibble(x = (-200:200)/100, | |
label = 'Nonlinear Relationship') %>% | |
mutate(y = x+x^2), | |
color = scales::hue_pal()(8)[8]) + | |
geom_abline(aes(intercept = intercept, slope = slope, color = power), | |
data = results %>% filter(label != 'Nonlinear Relationship' | power != 'True')) + | |
facet_wrap(~factor(label,levels = c('Positive Slope, Low Noise', | |
'Positive Slope, High Noise', | |
'Nonlinear Relationship', | |
'Outlier X','Outlier Y','Outlier X & Y')), ncol = 2) + | |
theme_classic() + | |
theme(text = element_text(family = 'serif', size = 13), | |
axis.title.y = element_text(hjust = 1, angle = 0)) + | |
labs(color = 'Power', | |
caption = scales::label_wrap(100)('True model always: X is normal, e is normal, Y = X + e or Y = X + X^2 + e. Outliers add on a shift afterwards, and different graphs have different variances.'), | |
title = 'Why the sum of SQUARES?', | |
subtitle = scales::label_wrap(80)('Bivariate regression slopes minimizing the sum of residuals to different powers.'), | |
x = 'X',y = 'Y') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment