Created
August 17, 2018 18:53
-
-
Save sebp/2b5bf8e9f323cce8454f72d5365abf9f to your computer and use it in GitHub Desktop.
Simulation study demonstrating performance of logistic regression in the presence of non-linear effects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# simulation study related to https://stats.stackexchange.com/questions/362194/data-how-smart-are-models-is-my-dummy-redundant | |
library(ROCR) | |
gen.features <- function(n.samples) { | |
age <- runif(n.samples, min=6, max=89) | |
sex <- factor(rbinom(n.samples, 1, 0.5), c(0, 1), c("male", "female")) | |
data <- data.frame(age, sex) | |
return(data) | |
} | |
gen.outcome <- function(data) { | |
# create model | |
mat <- model.matrix(~ sex + I(age < 18), data=data) | |
col.mean <- apply(mat, 2, mean) | |
# subtract the column means | |
mm <- sweep(mat, 2, col.mean) | |
# coefficients in terms of odds-ratio of survival | |
beta <- log(c(1, 1.5, 4)) | |
names(beta) <- colnames(mm) | |
# add independent noise | |
noise <- rnorm(nrow(data), sd=0.2) | |
# linear model | |
nu <- mm %*% beta + noise | |
# probabilities | |
prob <- 1 / (1. + exp(-nu)) | |
y <- prob > 0.5 | |
# flip labels for about 5% | |
flip <- which(rbinom(nrow(data), 1, 0.05) == 1) | |
y[flip] <- !y[flip] | |
return(factor(y, c(FALSE, TRUE), c("dead", "alive"))) | |
} | |
gen.data <- function(n.samples) { | |
data <- gen.features(n.samples) | |
data$outcome <- gen.outcome(data) | |
return(data) | |
} | |
predict.and.evaluate <- function(fit, newdata) { | |
print(summary(fit)) | |
predictions <- predict(fit, type="response", newdata=newdata) | |
pred <- prediction(predictions, newdata$outcome, label.ordering=c("dead", "alive")) | |
perf <- performance(pred, measure = "auc") | |
auc <- unlist(perf@y.values) | |
cat(paste("ROC AUC", auc, "\n")) | |
} | |
set.seed(62354) | |
# generate training data | |
data <- gen.data(200) | |
cat("\n# Overall distribution of outcome:\n") | |
print(table(data$outcome)) | |
cat("\n# Distribution of outcome among children:\n") | |
print(table(data$outcome[data$age < 18])) | |
cat("\n# Distribution of outcome among adults:\n") | |
print(table(data$outcome[data$age >= 18])) | |
# generate testing data | |
test.data <- gen.data(200) | |
cat("\n\n##################\n") | |
cat("## LINEAR MODEL ##\n") | |
cat("##################\n") | |
fit <- glm(I(outcome == "alive") ~ ., data=data, family="binomial") | |
predict.and.evaluate(fit, test.data) | |
cat("\n\n########################\n") | |
cat("## DICHOTOMISED MODEL ##\n") | |
cat("########################\n") | |
fit.actual <- glm(I(outcome == "alive") ~ I(age < 18) + ., data=data, family="binomial") | |
predict.and.evaluate(fit.actual, test.data) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment