Created
October 27, 2017 17:37
-
-
Save amueller/9853d77d9a08f4445f7ee1f7cffe4241 to your computer and use it in GitHub Desktop.
bench feature agglomeration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Benchmark np.bincount against np.mean for feature agglomeration in
../sklearn/cluster/_feature_agglomeration. Using np.bincount provides
a significant speed-up when the pooling function is np.mean.
np.bincount performs better especially as the size of X and n_clusters
increase.
""" | |
import matplotlib.pyplot as plt | |
import numpy as np | |
from sklearn.cluster import FeatureAgglomeration | |
import time | |
def fit_agglomeration(n_features, n_clusters):
    """Fit a FeatureAgglomeration estimator on freshly drawn random data.

    A matrix with 200 rows and ``n_features`` columns is drawn with
    ``rng.randn`` (``rng`` is the module-level random state created in the
    ``__main__`` section) and a ``FeatureAgglomeration`` configured with
    ``n_clusters`` clusters is fitted on it.

    Returns
    -------
    tuple
        ``(training_data, fitted_estimator)``.
    """
    training_data = rng.randn(200, n_features)
    estimator = FeatureAgglomeration(n_clusters=n_clusters)
    estimator.fit(training_data)
    return training_data, estimator
def get_transformed_array(X, agglo, method):
    """Pool the features of ``X`` cluster-wise by averaging them.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data whose columns are to be pooled.
    agglo : object
        Anything exposing ``labels_``, an array holding one non-negative
        integer cluster label per column of ``X``.
    method : {"bincount", "np_mean"}
        ``"bincount"`` computes, for every row, the weighted label counts
        with ``np.bincount`` and divides by the cluster sizes;
        ``"np_mean"`` calls ``np.mean`` on the columns of each cluster.

    Returns
    -------
    ndarray
        One row per sample and one column per cluster, containing the mean
        of the features assigned to that cluster.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of columns of ``X``
        or if ``method`` is not one of the two supported values.

    Notes
    -----
    The ``"bincount"`` path yields ``max(labels_) + 1`` columns, so both
    methods agree only when the labels are the contiguous range
    ``0 .. n_clusters - 1``.
    """
    labels = agglo.labels_
    if len(labels) != X.shape[1]:
        raise ValueError("X has a different number of features than "
                         "during fitting.")

    if method == "bincount":
        # a fast way to compute the mean of grouped features
        # The cluster sizes are only needed here, so they are computed inside
        # this branch to keep the "np_mean" timing free of unrelated work.
        cluster_sizes = np.bincount(labels)
        return np.array([np.bincount(labels, row) / cluster_sizes
                         for row in X])

    if method == "np_mean":
        pooled = [np.mean(X[:, labels == label], axis=1)
                  for label in np.unique(labels)]
        # pooled is (n_clusters, n_samples); transpose to samples-first.
        return np.array(pooled).T

    raise ValueError("Method can have a value of 'bincount' or 'np_mean'")
if __name__ == "__main__":
    # Benchmark driver: for every (n_clusters, n_features) combination, fit
    # an agglomeration on a small random matrix, then time both pooling
    # methods on a large random matrix and plot the timings.
    # `rng` is read as a global by fit_agglomeration.
    rng = np.random.RandomState(0)

    for n_clusters in [1, 5, 10, 50, 100]:
        times_mean = []
        times_bincount = []
        n_features_this = []

        for n_features in [2, 10, 100, 200, 500, 1000]:
            # Skip combinations without more features than clusters.
            if n_clusters >= n_features:
                continue
            n_features_this.append(n_features)
            print(n_features, n_clusters)

            # Only the fitted estimator is needed; the training matrix is
            # replaced by a much larger one for the timing runs.
            _, agglo = fit_agglomeration(n_features, n_clusters)
            X = rng.randn(100000, n_features)

            # perf_counter is monotonic and high-resolution, unlike
            # time.time(), so short runs do not collapse to a zero duration
            # (which would break the speed-up ratio below).
            tick = time.perf_counter()
            get_transformed_array(X, agglo, "bincount")
            time_bincount = time.perf_counter() - tick

            tick = time.perf_counter()
            get_transformed_array(X, agglo, "np_mean")
            time_np_mean = time.perf_counter() - tick

            print('==================')
            print('Took %s seconds using np.bincount' % (time_bincount))
            print('Took %s seconds using np.mean' % (time_np_mean))
            print('==================')
            print("np.bincount is %s times faster"
                  % (time_np_mean / time_bincount))

            times_mean.append(time_np_mean)
            times_bincount.append(time_bincount)

        # One dashed (np.mean) and one solid (np.bincount) curve per
        # n_clusters value, all drawn on the same axes.
        plt.plot(n_features_this, times_mean, '--',
                 label="mean n_clusters={}".format(n_clusters))
        plt.plot(n_features_this, times_bincount,
                 label="bincount n_clusters={}".format(n_clusters))

    plt.xlabel("n_features")
    plt.ylabel("time")
    plt.legend()
    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment