Created
January 16, 2019 13:24
-
-
Save m-Py/8ce68201dd0fd498504e00e2e2f38682 to your computer and use it in GitHub Desktop.
How the p value in a t test can be minimized by data removal
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Warning: This code is just for fun / educational purposes; the file contains functions | |
## to find out how severely the p value in a t-test can be minimized by systematic removal of data points. | |
## SIX OUT OF THIRTY - Martin's approach | |
## Based on @juli_tkotz's (https://twitter.com/juli_tkotz/status/1085446224117985281) | |
## idea that removing from the most extreme values is the best apporach. | |
#' Simulate t-tests and store best p values | |
#' | |
#' nsim = How many iterations in the simulation | |
#' | |
#' return: For each run, the best p value that could be obtained by | |
#' removing 6 cases from the sample | |
#' | |
ttest_sim <- function(nsim) { | |
# prepare matrix to be filled with values of t-test | |
p_values <- rep(NA, nsim) | |
# p value, t value, effect size | |
for (i in 1:nsim) { | |
# both groups: 30 values drawn from a normal distribution, mean = 0, SD = 1 | |
dat1 <- rnorm(30) | |
dat2 <- rnorm(30) | |
p_values[i] <- get_best_pvalue(dat1, dat2, remove = 6) | |
} | |
return(p_values) | |
} | |
#' Lowest p value for the comparison of two group means with fixed number | |
#' of observations removed | |
#' | |
#' dat1 = vector of numeric values for group 1 | |
#' dat2 = vector of numeric values for group 2 | |
#' remove = how many observations have to be removed | |
#' | |
#' returns: The best possible p value for the comparison of the two | |
#' groups when cases are removed | |
#' | |
get_best_pvalue <- function(dat1, dat2, remove) { | |
## Initialize best p value: | |
best_p <- Inf | |
## Generate data frame containing all combinations of possible | |
## data removals from the two groups: | |
to_be_removed <- expand.grid(group1 = 0:remove, group2 = 0:remove) | |
to_be_removed <- to_be_removed[rowSums(to_be_removed) == remove, ] | |
# The following line would remove all six in either one group or the other: | |
# to_be_removed <- data.frame(group1 = c(0, 6), group2 = c(6, 0)) | |
for (i in 1:nrow(to_be_removed)) { | |
## Get indices of values that are not removed: | |
indices <- get_indices(to_be_removed, i, dat1, dat2) | |
## Test all combinations of removals of extreme values: | |
for (j in c(TRUE, FALSE)) { | |
## Sort both groups in decreasing and increasing order to capture | |
## all combinations of extreme values: (therefore, j = TRUE; FALSE) | |
tmp1 <- sort(dat1, decreasing = j)[indices[[1]]] | |
tmp2 <- sort(dat2, decreasing = !j)[indices[[2]]] | |
## Assert that the correct number of observations was removed: | |
if (length(tmp1) + length(tmp2) != length(dat1) + length(dat2) - remove) | |
stop("Something went wrong, the incorrect number of cases was removed") | |
ttest <- t.test(tmp1, tmp2, var.equal = TRUE) | |
if (ttest$p.value < best_p) | |
best_p <- ttest$p.value | |
} | |
} | |
return(best_p) | |
} | |
#' Get indices for selection of non-extreme values | |
#' | |
#' to_be_removed = data frame that contains the number of observation | |
#' to be removed by group (created within function `get_best_pvalue`) | |
#' row = which row of `to_be_removed` is targetted | |
#' dat1 = vector of numeric values for group 1 | |
#' dat2 = vector of numeric values for group 2 | |
#' | |
#' returns: A list of two elements containing the indices that are | |
#' selected in each group | |
get_indices <- function(to_be_removed, row, dat1, dat2) { | |
i1 <- to_be_removed[row, "group1"] | |
i2 <- to_be_removed[row, "group2"] | |
select_from1 <- setdiff(1:length(dat1), 0:i1) | |
select_from2 <- setdiff(1:length(dat2), 0:i2) | |
return(list(select_from1, select_from2)) | |
} | |
# Run 10000 simulations | |
sim1 <- ttest_sim(10000) | |
mean(sim1 < .05) | |
# 0.7722 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment