import json
import random
import time
from collections import Counter

import chromadb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score as sil_score
from sklearn.metrics.pairwise import euclidean_distances
from tqdm import tqdm

from utils import parseMeta, findLocalMax, findMaxAfterDip, findMaxEps
# Open the persistent Chroma store that lives in the local 'db' directory
# and list what collections it contains.
client = chromadb.PersistentClient(path='db')
availableCollections = client.list_collections()
print("Available collections:\n", availableCollections)
Available collections:
[Collection(name=codebert), Collection(name=salesforce), Collection(name=openai)]
# All tuning below runs against the OpenAI-embedding collection.
COLLECTION_NAME = 'openai'
collection = client.get_collection(name=COLLECTION_NAME)
print(f"{collection.name} {collection.count()}")
openai 9668
# Pull every stored embedding (plus its source document) into memory and
# stack the embeddings into a single (n_points, n_dims) numpy matrix.
results = collection.get(include=['embeddings', 'documents'])
ids = results['ids']
points = np.array(results['embeddings'])
print(results.keys())
print(points)
print(points.shape)
dict_keys(['ids', 'embeddings', 'metadatas', 'documents'])
[[ 6.77902391e-03 2.29288395e-02 -1.22946519e-02 ... 3.76131153e-03
7.13638449e-03 -1.24895759e-02]
[ 2.31891249e-06 3.05070207e-02 -1.98971070e-02 ... 8.37958045e-03
-9.39272868e-04 -1.61822289e-02]
[ 5.08600706e-03 4.22003046e-02 4.75074397e-03 ... 2.82298354e-03
1.47087602e-02 -1.53507534e-02]
...
[-1.17532834e-02 -1.08433021e-02 7.84571841e-03 ... 1.23726819e-02
9.51274205e-03 -5.44459186e-03]
[-1.99144091e-02 1.66793598e-03 7.24539580e-03 ... -1.85098313e-02
1.54642612e-02 -2.55188122e-02]
[-7.31560402e-03 8.45605973e-03 -7.87887815e-03 ... -1.04101328e-02
6.89836452e-03 -3.00968960e-02]]
(9668, 1536)
As the eps value increases the silhouette score also increases, but only because the clustering is approaching a degenerate state in which every point is assigned to a single label (not useful). So we take the best eps value to be the local maximum that occurs before the curve shoots off toward that limit.
# Sweep candidate eps values and record the silhouette score for each.
# The sweep stops as soon as DBSCAN collapses everything into one cluster,
# since silhouette_score is undefined for fewer than two clusters.
bestMaxEps = findMaxEps(points)
print(f"Calculated best max eps of: {bestMaxEps}")
# NOTE: the sweep uses a fixed 0.1-1.0 range rather than bestMaxEps as the
# upper bound (an np.linspace(0.1, bestMaxEps, 20) variant was tried too).
epsvals = np.linspace(0.1, 1.0, 20)
realEpsVals = []
scores = []
for eps in epsvals:
    db = DBSCAN(eps=eps).fit(points)
    clusterCount = len(set(db.labels_) - {-1})
    if clusterCount <= 1:
        print(f"BREAKING BECAUSE SETTLED TO ONE CLUSTER")
        break
    silhouette = sil_score(points, db.labels_)
    realEpsVals.append(eps)
    scores.append(silhouette)
    print(eps, silhouette, clusterCount)
Calculated best max eps of: 0.5200515581787151
0.1 -0.15129322858532404 48
0.1473684210526316 -0.15968241353213491 61
0.19473684210526315 -0.09074044385137103 87
0.24210526315789474 -0.08670239441405392 87
0.2894736842105263 -0.058398391857078395 81
0.33684210526315794 -0.057293095069103725 46
0.38421052631578945 -0.06447678739478983 57
0.43157894736842106 -0.002065662739259727 35
0.4789473684210527 0.08230728589265478 4
0.5263157894736842 0.11841273959816147 2
0.5736842105263158 0.15965548510401592 2
0.6210526315789474 0.2016545872676634 2
BREAKING BECAUSE SETTLED TO ONE CLUSTER
# Choose eps as the local maximum of the silhouette curve (per the note
# above: the global max is the degenerate one-cluster regime).
bestEps, bestScore = findLocalMax(realEpsVals, scores)
print(f"Best eps value: {bestEps}")
print(f"Best score : {bestScore}")
# Plot score vs eps with the chosen eps marked by a dashed red line.
plt.plot(realEpsVals, scores)
plt.axvline(x=bestEps, color='r', linestyle='--')
plt.title('Eps scores (with local max)')
plt.show()
Best eps value: 0.33684210526315794
Best score : -0.057293095069103725
# Sweep min_samples at the chosen eps and record silhouette scores.
minSamplesVals = range(0, 55, 5)
# range() starts at 0, which is not a usable min_samples; clamp the first
# value up to REAL_START (so the sweep effectively runs 3, 5, 10, ..., 50).
REAL_START = 3
realMinSamplesVals = []
minSampleScores = []
for minSamples in minSamplesVals:
    minSamples = max(REAL_START, minSamples)
    db = DBSCAN(eps=bestEps, min_samples=minSamples).fit(points)
    numLabels = len(set(db.labels_) - {-1})
    # silhouette_score raises ValueError with fewer than 2 clusters; guard
    # against a collapse here just like the eps sweep does, instead of
    # crashing mid-scan.
    if numLabels <= 1:
        print(f"BREAKING BECAUSE SETTLED TO ONE CLUSTER")
        break
    score = sil_score(points, db.labels_)
    minSampleScores.append(score)
    realMinSamplesVals.append(minSamples)
    print(minSamples, score, numLabels)
3 -0.07141319290974064 222
5 -0.057293095069103725 46
10 -0.008364826936755528 18
15 -0.010360524881265652 19
20 0.006339389966989534 14
25 0.00793206912456556 12
30 0.004598409107331424 11
35 0.014858205943250453 10
40 0.014120495837015254 11
45 0.013293592608278563 11
50 0.016344644401964942 10
# Pick min_samples as the maximum score occurring after the initial dip in
# the score curve.
bestMinSamples, bestMinSamplesScore = findMaxAfterDip(realMinSamplesVals, minSampleScores)
print(f"Best min samples value: {bestMinSamples}")
print(f"Best min samples score: {bestMinSamplesScore}")
plt.plot(realMinSamplesVals, minSampleScores)
plt.axvline(x=bestMinSamples, color='red', linestyle='--')
plt.show()
# Manual override of the automated pick (which chose 50): 25 scores nearly
# the same while keeping more clusters (12 vs 10) — presumably chosen after
# inspecting the plot above.
bestMinSamples = 25
Best min samples value: 50
Best min samples score: 0.016344644401964942
# Refit DBSCAN with the tuned hyper-parameters and report the final result.
db = DBSCAN(eps=bestEps, min_samples=bestMinSamples).fit(points)
labels = db.labels_
# -1 marks noise, so it is excluded from the cluster count.
numLabels = len(set(labels) - {-1})
bestScore = sil_score(points, labels)
print(f"Best score: {bestScore}")
print(f"num labels: {numLabels}")
print(len(labels))
print(labels)
# Persist the tuning results onto the collection's metadata, printing the
# metadata before and after so the change is visible.
print(json.dumps(collection.metadata, indent=2))
newMeta = {
    **collection.metadata,
    'bestEps': bestEps,
    'bestMinSamples': bestMinSamples,
    'bestScore': bestScore,
    'numLabels': len(set(labels) - {-1}),
    'dims': points.shape[-1],
}
collection.modify(metadata=newMeta)
print(json.dumps(collection.metadata, indent=2))
{
"endpoint": "openai api",
"modelName": "text-embedding-ada-002",
"bestEps": 0.33684210526315794,
"bestMinSamples": 50,
"bestScore": 0.00793206912456556,
"numLabels": 12,
"dims": 1536
}
{
"endpoint": "openai api",
"modelName": "text-embedding-ada-002",
"bestEps": 0.33684210526315794,
"bestMinSamples": 25,
"bestScore": 0.00793206912456556,
"numLabels": 12,
"dims": 1536
}
How many points actually get a cluster label (unlabeled points default to -1)?
How many points fall under each label?
# Fraction of points DBSCAN actually assigned to a cluster (noise is -1).
# Vectorized: use ndarray.astype / .mean() / .sum() instead of wrapping the
# boolean mask in np.array() and looping with the Python builtin sum().
isAssigned = (labels > -1).astype(int)
assignRate = isAssigned.mean()
print(f"Assign rate: {assignRate*100:.3f} %")
print(f"Total num of assigned points: {isAssigned.sum():,} / {len(isAssigned):,}")
plt.hist(isAssigned, bins=2)
plt.title('Unassigned vs Assigned')
plt.show()
Assign rate: 43.184 %
Total num of assigned points: 4,175 / 9,668
# Per-cluster sizes for the assigned points (noise label -1 removed).
realLabels = labels[labels > -1]
print("labels ranges:", min(labels), max(labels))
# collections.Counter replaces the hand-rolled dict-of-counts loop.
counts = Counter(realLabels)
for i in sorted(counts.keys()):
    print(f"{str(i):>2} {str(counts[i]):>4}")
plt.hist(realLabels, bins=len(set(realLabels)))
plt.title(f"Real label counts ({len(set(realLabels))} clusters)")
plt.show()
labels ranges: -1 11
0 573
1 1545
2 71
3 62
4 796
5 222
6 39
7 328
8 292
9 46
10 27
11 174
# Write each point's cluster label back into its per-id metadata so the
# labels can be queried from the collection later.
newMetas = [
    {**parseMeta(pointId), 'clusterLabel': int(clusterLabel)}
    for pointId, clusterLabel in zip(ids, labels)
]
collection.update(ids=ids, metadatas=newMetas)
print(f"FINISHED UPDATING IN COLLECTION: {collection.name}")
FINISHED UPDATING IN COLLECTION: openai