Created
August 13, 2020 20:21
-
-
Save QuentinAndre/7671c324ec6eb4466b64247c740ee439 to your computer and use it in GitHub Desktop.
Simulation of false-positive rates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import scipy.stats as stats | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
def exclude_zscore_outliers(x): | |
""" | |
A small utility function to exclude *extreme outliers**: | |
values that are above or below 2 z-score. | |
""" | |
std = x.std() | |
m = x.mean() | |
high_cutoff = m+2*std | |
low_cutoff = m-2*std | |
return x[(x < high_cutoff) & (x > low_cutoff)] | |
def exclude_zscore_outliers_common(x, y): | |
""" | |
Same function, but applies a common cutoff to two vectors. | |
""" | |
xy = np.concatenate([x, y]) | |
std = xy.std() | |
m = xy.mean() | |
high_cutoff = m+2*std | |
low_cutoff = m-2*std | |
return ( | |
x[(x < high_cutoff) & (x > low_cutoff)], | |
y[(y < high_cutoff) & (y > low_cutoff)], | |
) | |
def compare_pvals(n=50): | |
""" | |
Compare two vectors of data of size n coming | |
from the same distribution, and return the p-values of the | |
comparison (1) before excluding extreme outliers, (2) after | |
excluding outliers using a common cutoff and (3) after | |
excluding outliers using a different cutoff for each vector. | |
""" | |
x = np.exp(np.random.normal(3, 1, n)) | |
y = np.exp(np.random.normal(3, 1, n)) | |
#x = np.random.normal(3, 1, n) # Uncomment for normal data | |
#y = np.random.normal(3, 1, n) # Uncomment for normal data | |
p = stats.ttest_ind(x, y).pvalue | |
x_common, y_common = exclude_zscore_outliers_common(x, y) | |
p_common = stats.ttest_ind(x_common, y_common).pvalue | |
x_diff = exclude_zscore_outliers(x) | |
y_diff = exclude_zscore_outliers(y) | |
p_diff = stats.ttest_ind(x_diff, y_diff).pvalue | |
return p, p_common, p_diff | |
# Let's repeat this experiment 10,000 times: | |
N = 5000 | |
pvals = np.empty(shape=(3, N)) | |
for i in range(N): | |
pvals[:, i] = compare_pvals() | |
pvals_no_excl, pvals_common_cutoff, pvals_diff_cutoffs = pvals | |
# Now let's visualize the p-values and false-positive rates: | |
hist_kws = dict( | |
bins=np.arange(0, 1.025, 0.025), | |
align="mid", density=True, | |
histtype="step", lw=1.5 | |
) | |
alpha_no_excl = (pvals_no_excl < 0.05).mean() | |
alpha_common = (pvals_common_cutoff < 0.05).mean() | |
alpha_diff = (pvals_diff_cutoffs < 0.05).mean() | |
plt.hist( | |
pvals_no_excl, | |
**hist_kws, | |
label=f"No Exclusion ($\\alpha= {alpha_no_excl:.3f}$)" | |
) | |
plt.hist( | |
pvals_common_cutoff, | |
**hist_kws, | |
label=f"Common Cutoff ($\\alpha= {alpha_common:.3f}$)", | |
) | |
plt.hist( | |
pvals_diff_cutoffs, | |
**hist_kws, | |
label=f"Different Cutoff ($\\alpha= {alpha_diff:.3f}$)", | |
) | |
ax = plt.gca() | |
ax.legend(frameon=False) | |
ax.set_xlabel("p-value") | |
ax.set_ylabel("Density") | |
sns.despine() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment