sebp · August 17, 2018 18:53
diff --git a/simulation.r b/simulation.r
 # simulation study related to https://stats.stackexchange.com/questions/362194/data-how-smart-are-models-is-my-dummy-redundant
 library(ROCR)

 gen.features <- function(n.samples) {
    age <- runif(n.samples, min=6, max=89)
    sex <- factor(rbinom(n.samples, 1, 0.5), c(0, 1), c("male", "female"))
    data <- data.frame(age, sex)
    return(data)
 }

 gen.outcome <- function(data) {
    # create model
    mat <- model.matrix(~ sex + I(age < 18), data=data)
    col.mean <- apply(mat, 2, mean)
    # subtract the column means
    mm <- sweep(mat, 2, col.mean)

    # coefficients in terms of odds-ratio of survival
    beta <- log(c(1, 1.5, 4))
    names(beta) <- colnames(mm)
    # add independent noise
    noise <- rnorm(nrow(data), sd=0.2)
    # linear model
    nu <- mm %*% beta + noise
    # probabilities
    prob <- 1 / (1. + exp(-nu))
    y <- prob > 0.5
    # flip labels for about 5%
    flip <- which(rbinom(nrow(data), 1, 0.05) == 1)
    y[flip] <- !y[flip]
    return(factor(y, c(FALSE, TRUE), c("dead", "alive")))
 }

 gen.data <- function(n.samples) {
    data <- gen.features(n.samples)
    data$outcome <- gen.outcome(data)
    return(data)
 }

 predict.and.evaluate <- function(fit, newdata) {
    print(summary(fit))
    predictions <- predict(fit, type="response", newdata=newdata)

    pred <- prediction(predictions, newdata$outcome, label.ordering=c("dead", "alive"))
    perf <- performance(pred, measure = "auc")
    auc <- unlist(perf@y.values)
    cat(paste("ROC AUC", auc, "\n"))
 }

 set.seed(62354)
 # generate training data
 data <- gen.data(200)

 cat("\n# Overall distribution of outcome:\n")
 print(table(data$outcome))

 cat("\n# Distribution of outcome among children:\n")
 print(table(data$outcome[data$age < 18]))

 cat("\n# Distribution of outcome among adults:\n")
 print(table(data$outcome[data$age >= 18]))

 # generate testing data
 test.data <- gen.data(200)

 cat("\n\n##################\n")
 cat("## LINEAR MODEL ##\n")
 cat("##################\n")
 fit <- glm(I(outcome == "alive") ~ ., data=data, family="binomial")
 predict.and.evaluate(fit, test.data)

 cat("\n\n########################\n")
 cat("## DICHOTOMISED MODEL ##\n")
 cat("########################\n")
 fit.actual <- glm(I(outcome == "alive") ~ I(age < 18) + ., data=data, family="binomial")
 predict.and.evaluate(fit.actual, test.data)
	# simulation study related to https://stats.stackexchange.com/questions/362194/data-how-smart-are-models-is-my-dummy-redundant
	library(ROCR)

	gen.features <- function(n.samples) {
	age <- runif(n.samples, min=6, max=89)
	sex <- factor(rbinom(n.samples, 1, 0.5), c(0, 1), c("male", "female"))
	data <- data.frame(age, sex)
	return(data)
	}

	gen.outcome <- function(data) {
	# create model
	mat <- model.matrix(~ sex + I(age < 18), data=data)
	col.mean <- apply(mat, 2, mean)
	# subtract the column means
	mm <- sweep(mat, 2, col.mean)

	# coefficients in terms of odds-ratio of survival
	beta <- log(c(1, 1.5, 4))
	names(beta) <- colnames(mm)
	# add independent noise
	noise <- rnorm(nrow(data), sd=0.2)
	# linear model
	nu <- mm %*% beta + noise
	# probabilities
	prob <- 1 / (1. + exp(-nu))
	y <- prob > 0.5
	# flip labels for about 5%
	flip <- which(rbinom(nrow(data), 1, 0.05) == 1)
	y[flip] <- !y[flip]
	return(factor(y, c(FALSE, TRUE), c("dead", "alive")))
	}

	gen.data <- function(n.samples) {
	data <- gen.features(n.samples)
	data$outcome <- gen.outcome(data)
	return(data)
	}

	predict.and.evaluate <- function(fit, newdata) {
	print(summary(fit))
	predictions <- predict(fit, type="response", newdata=newdata)

	pred <- prediction(predictions, newdata$outcome, label.ordering=c("dead", "alive"))
	perf <- performance(pred, measure = "auc")
	auc <- unlist(perf@y.values)
	cat(paste("ROC AUC", auc, "\n"))
	}

	set.seed(62354)
	# generate training data
	data <- gen.data(200)

	cat("\n# Overall distribution of outcome:\n")
	print(table(data$outcome))

	cat("\n# Distribution of outcome among children:\n")
	print(table(data$outcome[data$age < 18]))

	cat("\n# Distribution of outcome among adults:\n")
	print(table(data$outcome[data$age >= 18]))

	# generate testing data
	test.data <- gen.data(200)

	cat("\n\n##################\n")
	cat("## LINEAR MODEL ##\n")
	cat("##################\n")
	fit <- glm(I(outcome == "alive") ~ ., data=data, family="binomial")
	predict.and.evaluate(fit, test.data)

	cat("\n\n########################\n")
	cat("## DICHOTOMISED MODEL ##\n")
	cat("########################\n")
	fit.actual <- glm(I(outcome == "alive") ~ I(age < 18) + ., data=data, family="binomial")
	predict.and.evaluate(fit.actual, test.data)