Created
May 2, 2020 11:13
-
-
Save markloyman/5addde66c865b98ad97a44e6327cdbf6 to your computer and use it in GitHub Desktop.
Hubness calculation (index defined by Miloš Radovanović et al., 2010)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from scipy.stats import skew | |
def k_occurrences(nbrs_indices, n_points=None):
    """
    Calculate the k-occurrences of every point.

    Miloš Radovanović et al.,
    Hubs in space: Popular nearest neighbors in high-dimensional data.
    Journal of Machine Learning Research, 11(Sep):2487–2531, 2010

    :param nbrs_indices: array-like of non-negative ints, shape (n_queries, k_neighbors)
        Indices of the nearest points in the population matrix.
        if using sklearn:
            nbrs = sklearn.neighbors.NearestNeighbors(..).fit(..)
            distances, indices = nbrs.kneighbors(..)
            nbrs_indices = indices[:, 1:]
    :param n_points: optional number of points in the population. When given,
        the result has at least ``n_points`` entries, so points that never
        occur as a neighbour are reported with a count of 0. When omitted, the
        result stops at the largest index present in ``nbrs_indices``.
    :return: 1d array where k_occ[i] is the number of times point i appears in nbrs_indices
    """
    flat_indices = np.asarray(nbrs_indices).ravel()
    if flat_indices.size == 0:
        # An empty array-like defaults to a float dtype, which np.bincount
        # refuses to cast; an empty integer array is counted without error.
        flat_indices = flat_indices.astype(np.intp)
    if n_points is None:
        return np.bincount(flat_indices)
    return np.bincount(flat_indices, minlength=n_points)
def hubness(nbrs_indices, k_neighbors=None, nonlinear_rescaling=True):
    """
    hubness: skewness (standardized 3rd moment) of the k-occurrences distribution

    Miloš Radovanović et al.,
    Hubs in space: Popular nearest neighbors in high-dimensional data.
    Journal of Machine Learning Research, 11(Sep):2487–2531, 2010

    :param nbrs_indices: array, shape (n_queries, k_neighbors)
        Indices of the nearest points in the population matrix.
        if using sklearn:
            nbrs = sklearn.neighbors.NearestNeighbors(..).fit(..)
            distances, indices = nbrs.kneighbors(..)
            nbrs_indices = indices[:, 1:]
    :param k_neighbors: if provided, take the first k columns of nbrs_indices
    :param nonlinear_rescaling: if True, return exp(-|skewness|) instead of the
        raw skewness
    :return: Tuple: hubness value, k-occurrences histogram where
        distribution[c] is the number of points that occur exactly c times
    """
    nbrs_indices_ = nbrs_indices if k_neighbors is None else nbrs_indices[:, :k_neighbors]
    k_occ = k_occurrences(nbrs_indices_)
    # NOTE(review): k_occ only extends to the largest index that actually
    # occurs, so never-selected points with higher indices are not counted
    # here -- confirm whether callers need them included.
    distribution = np.bincount(k_occ)
    if k_occ.size > 0 and k_occ.min() == k_occ.max():
        # All points occur equally often: the variance is zero and the
        # skewness is 0/0, so report a perfectly symmetric distribution.
        index = 0.0
    else:
        # The skewness is taken over the per-point occurrence counts
        # themselves, not over the heights of their histogram.
        index = skew(k_occ)
    if nonlinear_rescaling:
        index = np.exp(-np.abs(index))
    return index, distribution
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment