QuentinAndre · August 13, 2020 20:21
diff --git a/simulation.py b/simulation.py
 import numpy as np
 import scipy.stats as stats
 import matplotlib.pyplot as plt
 import seaborn as sns
 def exclude_zscore_outliers(x):
    """
    A small utility function to exclude *extreme outliers**:
    values that are above or below 2 z-score.
    """
    std = x.std()
    m = x.mean()
    high_cutoff = m+2*std
    low_cutoff = m-2*std
    return x[(x < high_cutoff) & (x > low_cutoff)]


 def exclude_zscore_outliers_common(x, y):
    """
    Same function, but applies a common cutoff to two vectors.
    """
    xy = np.concatenate([x, y])
    std = xy.std()
    m = xy.mean()
    high_cutoff = m+2*std
    low_cutoff = m-2*std
    return (
        x[(x < high_cutoff) & (x > low_cutoff)],
        y[(y < high_cutoff) & (y > low_cutoff)],
    )


 def compare_pvals(n=50):
    """
    Compare two vectors of data of size n coming
    from the same distribution, and return the p-values of the 
    comparison (1) before excluding extreme outliers, (2) after
    excluding outliers using a common cutoff and (3) after 
    excluding outliers using a different cutoff for each vector.
    """
    x = np.exp(np.random.normal(3, 1, n))
    y = np.exp(np.random.normal(3, 1, n))
    #x = np.random.normal(3, 1, n) # Uncomment for normal data
    #y = np.random.normal(3, 1, n) # Uncomment for normal data
    
    p = stats.ttest_ind(x, y).pvalue

    x_common, y_common = exclude_zscore_outliers_common(x, y)
    p_common = stats.ttest_ind(x_common, y_common).pvalue

    x_diff = exclude_zscore_outliers(x)
    y_diff = exclude_zscore_outliers(y)
    p_diff = stats.ttest_ind(x_diff, y_diff).pvalue
    return p, p_common, p_diff


 # Let's repeat this experiment 10,000 times:
 N = 5000
 pvals = np.empty(shape=(3, N))
 for i in range(N):
    pvals[:, i] = compare_pvals()
 pvals_no_excl, pvals_common_cutoff, pvals_diff_cutoffs = pvals

 # Now let's visualize the p-values and false-positive rates:
 hist_kws = dict(
    bins=np.arange(0, 1.025, 0.025), 
    align="mid", density=True, 
    histtype="step", lw=1.5
 )
 alpha_no_excl = (pvals_no_excl < 0.05).mean()
 alpha_common = (pvals_common_cutoff < 0.05).mean()
 alpha_diff = (pvals_diff_cutoffs < 0.05).mean()
 plt.hist(
    pvals_no_excl, 
    **hist_kws, 
    label=f"No Exclusion ($\\alpha= {alpha_no_excl:.3f}$)"
 )
 plt.hist(
    pvals_common_cutoff,
    **hist_kws,
    label=f"Common Cutoff ($\\alpha= {alpha_common:.3f}$)",
 )
 plt.hist(
    pvals_diff_cutoffs,
    **hist_kws,
    label=f"Different Cutoff ($\\alpha= {alpha_diff:.3f}$)",
 )
 ax = plt.gca()
 ax.legend(frameon=False)
 ax.set_xlabel("p-value")
 ax.set_ylabel("Density")
 sns.despine()
	import numpy as np
	import scipy.stats as stats
	import matplotlib.pyplot as plt
	import seaborn as sns
	def exclude_zscore_outliers(x):
	"""
	A small utility function to exclude extreme outliers*:
	values that are above or below 2 z-score.
	"""
	std = x.std()
	m = x.mean()
	high_cutoff = m+2*std
	low_cutoff = m-2*std
	return x[(x < high_cutoff) & (x > low_cutoff)]


	def exclude_zscore_outliers_common(x, y):
	"""
	Same function, but applies a common cutoff to two vectors.
	"""
	xy = np.concatenate([x, y])
	std = xy.std()
	m = xy.mean()
	high_cutoff = m+2*std
	low_cutoff = m-2*std
	return (
	x[(x < high_cutoff) & (x > low_cutoff)],
	y[(y < high_cutoff) & (y > low_cutoff)],
	)


	def compare_pvals(n=50):
	"""
	Compare two vectors of data of size n coming
	from the same distribution, and return the p-values of the
	comparison (1) before excluding extreme outliers, (2) after
	excluding outliers using a common cutoff and (3) after
	excluding outliers using a different cutoff for each vector.
	"""
	x = np.exp(np.random.normal(3, 1, n))
	y = np.exp(np.random.normal(3, 1, n))
	#x = np.random.normal(3, 1, n) # Uncomment for normal data
	#y = np.random.normal(3, 1, n) # Uncomment for normal data

	p = stats.ttest_ind(x, y).pvalue

	x_common, y_common = exclude_zscore_outliers_common(x, y)
	p_common = stats.ttest_ind(x_common, y_common).pvalue

	x_diff = exclude_zscore_outliers(x)
	y_diff = exclude_zscore_outliers(y)
	p_diff = stats.ttest_ind(x_diff, y_diff).pvalue
	return p, p_common, p_diff


	# Let's repeat this experiment 10,000 times:
	N = 5000
	pvals = np.empty(shape=(3, N))
	for i in range(N):
	pvals[:, i] = compare_pvals()
	pvals_no_excl, pvals_common_cutoff, pvals_diff_cutoffs = pvals

	# Now let's visualize the p-values and false-positive rates:
	hist_kws = dict(
	bins=np.arange(0, 1.025, 0.025),
	align="mid", density=True,
	histtype="step", lw=1.5
	)
	alpha_no_excl = (pvals_no_excl < 0.05).mean()
	alpha_common = (pvals_common_cutoff < 0.05).mean()
	alpha_diff = (pvals_diff_cutoffs < 0.05).mean()
	plt.hist(
	pvals_no_excl,
	**hist_kws,
	label=f"No Exclusion ($\\alpha= {alpha_no_excl:.3f}$)"
	)
	plt.hist(
	pvals_common_cutoff,
	**hist_kws,
	label=f"Common Cutoff ($\\alpha= {alpha_common:.3f}$)",
	)
	plt.hist(
	pvals_diff_cutoffs,
	**hist_kws,
	label=f"Different Cutoff ($\\alpha= {alpha_diff:.3f}$)",
	)
	ax = plt.gca()
	ax.legend(frameon=False)
	ax.set_xlabel("p-value")
	ax.set_ylabel("Density")
	sns.despine()