raphaelvallat · December 12, 2023 06:50
diff --git a/distcorr.py b/distcorr.py
 import numpy as np
 import multiprocessing
 from joblib import Parallel, delayed
 from scipy.spatial.distance import pdist, squareform

 def _dcorr(y, n2, A, dcov2_xx):
    """Helper function for distance correlation bootstrapping.
    """
    # Pairwise Euclidean distances
    b = squareform(pdist(y, metric='euclidean'))
    # Double centering
    B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
    # Compute squared distance covariances
    dcov2_yy = np.vdot(B, B) / n2
    dcov2_xy = np.vdot(A, B) / n2
    return np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))


 def distcorr(x, y, n_boot=1000, seed=None):
    """Compute the distance correlation between two N-dimensional arrays.
    
    Statistical significance (p-value) is evaluated with a permutation test.
    
    Parameters
    ----------
    x, y : np.ndarray
        Input arrays
    n_boot : int or None
        Number of bootstrap to perform.
        If None, no bootstrapping is performed and the function
        only returns the distance correlation (no p-value).
        Default is 1000 (thus giving a precision of 0.001).
    seed : int or None
        Random state seed.
        
    Returns
    -------
    dcor : float
        Distance correlation (range from 0 to 1).
    pval : float
        P-value.
    
    Notes
    -----
    From Wikipedia:
    
    Distance correlation is a measure of dependence between two paired 
    random vectors of arbitrary, not necessarily equal, dimension. The population 
    distance correlation coefficient is zero if and only if the random vectors 
    are independent. Thus, distance correlation measures both linear and 
    nonlinear association between two random variables or random vectors. 
    This is in contrast to Pearson's correlation, which can only detect 
    linear association between two random variables.
    
    This function uses the Joblib package for parallel bootstrapping.
    
    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Distance_correlation
    .. [2] https://gist.github.com/satra/aa3d19a12b74e9ab7941
    .. [3] https://gist.github.com/wladston/c931b1495184fbb99bec
    .. [4] https://joblib.readthedocs.io/en/latest/
    
    Examples
    --------
    1. With two 1D vectors
    
        >>> a = [1, 2, 3, 4, 5]
        >>> b = [1, 2, 9, 4, 4]
        >>> distcorr(a, b, seed=9)
            (0.7626762424168667, 0.334)
    
    2. With two 2D arrays and no p-value
    
        >>> import numpy as np
        >>> np.random.seed(123)
        >>> a = np.random.random((10, 10))
        >>> b = np.random.random((10, 10))
        >>> distcorr(a, b, n_boot=None)
            0.8799633012275321
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.ndim == 1:
        x = x[:, None]
    if y.ndim == 1:
        y = y[:, None]
    assert x.shape[0] == y.shape[0], 'x and y must have same number of samples'
    
    # Extract number of samples
    n = x.shape[0]
    n2 = n**2
    
    # Process first array to avoid redundancy when performing bootstrap
    a = squareform(pdist(x, metric='euclidean'))
    A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
    dcov2_xx = np.vdot(A, A) / n2
    
    # Process second array and compute final distance correlation
    dcor = _dcorr(y, n2, A, dcov2_xx)

    # Compute p-value using a bootstrap procedure
    if n_boot is not None and n_boot > 1:
        # Define random seed and permutation
        rng = np.random.RandomState(seed)
        bootsam = rng.random_sample((n, n_boot)).argsort(axis=0)
        num_cores = multiprocessing.cpu_count()
        bootstat = Parallel(n_jobs=num_cores)(delayed(_dcorr)(y[bootsam[:, i]], n2, A, dcov2_xx) for i in range(n_boot))
        pval = np.greater_equal(bootstat, dcor).sum() / n_boot
        return dcor, pval
    else:
        return dcor
    
    
 # Examples
 print('Example 1')
 a = [1, 2, 3, 4, 5]
 b = [1, 2, 9, 4, 4]
 print(distcorr(a, b, seed=9))

 print('Example 2')
 np.random.seed(123)
 a = np.random.random((10, 10))
 b = np.random.random((10, 10))
 print(distcorr(a, b, n_boot=None))
	import numpy as np
	import multiprocessing
	from joblib import Parallel, delayed
	from scipy.spatial.distance import pdist, squareform

	def _dcorr(y, n2, A, dcov2_xx):
	"""Helper function for distance correlation bootstrapping.
	"""
	# Pairwise Euclidean distances
	b = squareform(pdist(y, metric='euclidean'))
	# Double centering
	B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
	# Compute squared distance covariances
	dcov2_yy = np.vdot(B, B) / n2
	dcov2_xy = np.vdot(A, B) / n2
	return np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))


	def distcorr(x, y, n_boot=1000, seed=None):
	"""Compute the distance correlation between two N-dimensional arrays.

	Statistical significance (p-value) is evaluated with a permutation test.

	Parameters
	----------
	x, y : np.ndarray
	Input arrays
	n_boot : int or None
	Number of bootstrap to perform.
	If None, no bootstrapping is performed and the function
	only returns the distance correlation (no p-value).
	Default is 1000 (thus giving a precision of 0.001).
	seed : int or None
	Random state seed.

	Returns
	-------
	dcor : float
	Distance correlation (range from 0 to 1).
	pval : float
	P-value.

	Notes
	-----
	From Wikipedia:

	Distance correlation is a measure of dependence between two paired
	random vectors of arbitrary, not necessarily equal, dimension. The population
	distance correlation coefficient is zero if and only if the random vectors
	are independent. Thus, distance correlation measures both linear and
	nonlinear association between two random variables or random vectors.
	This is in contrast to Pearson's correlation, which can only detect
	linear association between two random variables.

	This function uses the Joblib package for parallel bootstrapping.

	References
	----------
	.. [1] https://en.wikipedia.org/wiki/Distance_correlation
	.. [2] https://gist.github.com/satra/aa3d19a12b74e9ab7941
	.. [3] https://gist.github.com/wladston/c931b1495184fbb99bec
	.. [4] https://joblib.readthedocs.io/en/latest/

	Examples
	--------
	1. With two 1D vectors

	>>> a = [1, 2, 3, 4, 5]
	>>> b = [1, 2, 9, 4, 4]
	>>> distcorr(a, b, seed=9)
	(0.7626762424168667, 0.334)

	2. With two 2D arrays and no p-value

	>>> import numpy as np
	>>> np.random.seed(123)
	>>> a = np.random.random((10, 10))
	>>> b = np.random.random((10, 10))
	>>> distcorr(a, b, n_boot=None)
	0.8799633012275321
	"""
	x = np.asarray(x)
	y = np.asarray(y)
	if x.ndim == 1:
	x = x[:, None]
	if y.ndim == 1:
	y = y[:, None]
	assert x.shape[0] == y.shape[0], 'x and y must have same number of samples'

	# Extract number of samples
	n = x.shape[0]
	n2 = n**2

	# Process first array to avoid redundancy when performing bootstrap
	a = squareform(pdist(x, metric='euclidean'))
	A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
	dcov2_xx = np.vdot(A, A) / n2

	# Process second array and compute final distance correlation
	dcor = _dcorr(y, n2, A, dcov2_xx)

	# Compute p-value using a bootstrap procedure
	if n_boot is not None and n_boot > 1:
	# Define random seed and permutation
	rng = np.random.RandomState(seed)
	bootsam = rng.random_sample((n, n_boot)).argsort(axis=0)
	num_cores = multiprocessing.cpu_count()
	bootstat = Parallel(n_jobs=num_cores)(delayed(_dcorr)(y[bootsam[:, i]], n2, A, dcov2_xx) for i in range(n_boot))
	pval = np.greater_equal(bootstat, dcor).sum() / n_boot
	return dcor, pval
	else:
	return dcor


	# Examples
	print('Example 1')
	a = [1, 2, 3, 4, 5]
	b = [1, 2, 9, 4, 4]
	print(distcorr(a, b, seed=9))

	print('Example 2')
	np.random.seed(123)
	a = np.random.random((10, 10))
	b = np.random.random((10, 10))
	print(distcorr(a, b, n_boot=None))