Skip to content

Instantly share code, notes, and snippets.

@jmquintana79
Last active August 31, 2024 18:15
Show Gist options
  • Save jmquintana79/923989bad9bd0419857c977a01dc8933 to your computer and use it in GitHub Desktop.
Save jmquintana79/923989bad9bd0419857c977a01dc8933 to your computer and use it in GitHub Desktop.
from sklearn.metrics import silhouette_samples
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
## quantification of clustering quality via silhouette metric
def quantification_clustering_quality(X: np.ndarray, y_km: np.ndarray, verbose: bool = False) -> list:
    """
    Quantification of clustering quality via silhouette metric.

    The silhouette value of every record is computed with the Euclidean
    metric and then summarised (mean, std) per cluster and for all records.

    X -- Array of features used to estimate the clustering.
    y_km -- Labels returned by the clustering method to be evaluated
            (array-like, it is converted with np.asarray).
    verbose -- Display or not extra information (default, False).
    return -- List of [name, mean, std] rows: one "C<label>" row per unique
              label (in the order given by np.unique) and a final "ALL" row
              computed over every record.
    """
    # coerce the labels to an array: with a plain list `y_km == c` would be a
    # single bool instead of an element-wise mask and select nothing
    labels = np.asarray(y_km)
    # estimate silhouette values (for all records)
    silhouette_vals = silhouette_samples(X, labels, metric='euclidean')
    # initialize
    statistics = []
    # loop of cluster labels
    for c in np.unique(labels):
        # collect silhouette values per cluster
        c_silhouette_vals = silhouette_vals[labels == c]
        # estimate statistics for each cluster (computed once, reused for display)
        row = [f"C{c}", np.mean(c_silhouette_vals), np.std(c_silhouette_vals)]
        statistics.append(row)
        # display
        if verbose:
            print(*row)
    # final statistics over all records
    row_all = ["ALL", np.mean(silhouette_vals), np.std(silhouette_vals)]
    statistics.append(row_all)
    # display
    if verbose:
        print(*row_all)
    # return the list of rows
    return statistics
## estimate the optimal number of clusters for k-Means algorithm
def estimate_optimal_num_clusters_kmeans(X: np.ndarray, max_num_clusters: int = 10, with_plot: bool = False) -> int:
    """
    Estimate the optimal number of clusters for k-Means algorithm.

    A k-Means model is fitted for every number of clusters from 2 to
    max_num_clusters (both included) and the one with the highest mean
    silhouette value over all records is selected. If several candidates
    share the highest value, the smallest number of clusters is returned.

    X -- Array of features used to estimate the clustering.
    max_num_clusters -- Maximum number of clusters to be tested (at least 2).
    with_plot -- Plot or not extra information (default, False).
    return -- The most optimal number of clusters.
    raise -- ValueError if max_num_clusters is lower than 2.
    """
    # validation: without at least one candidate there is nothing to compare
    if max_num_clusters < 2:
        raise ValueError(
            f"max_num_clusters must be at least 2, got {max_num_clusters}"
        )
    # number of clusters to be tested
    num_clusters_to_test = np.arange(2, max_num_clusters + 1, 1)
    # initialize (a list is used to avoid copying the array on every append)
    metrics = []
    # loop of number of clusters
    for nc in num_clusters_to_test:
        ## k-means clustering definition, fit and prediction
        km = KMeans(n_clusters=nc, init='k-means++', n_init=10, max_iter=300, tol=1e-04, random_state=0)
        y_km = km.fit_predict(X)
        ## cluster quantification: the last row is "ALL" and its index 1 is the mean
        metrics.append(quantification_clustering_quality(X, y_km)[-1][1])
    # to array
    metrics = np.asarray(metrics)
    # get the most optimal (np.argmax returns the first maximum found)
    num_cluster_optimal = int(num_clusters_to_test[np.argmax(metrics)])
    # with plot if it is required
    if with_plot:
        ## plot
        plt.plot(num_clusters_to_test, metrics, color="grey")
        plt.scatter(num_clusters_to_test, metrics,
                    c='white', marker='o', edgecolor='black', s=25)
        plt.axvline(num_cluster_optimal, color="red", linestyle="--")
        plt.grid()
        plt.title(f"Optimal Number of Clusters = {num_cluster_optimal}")
        plt.xlabel("Number of clusters")
        plt.ylabel('Silhouette coefficient')
        # adjust the layout once the title and labels exist so they are considered
        plt.tight_layout()
        plt.show()
    # return
    return num_cluster_optimal
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment