Last active
January 4, 2022 16:07
-
-
Save SuperKogito/a8b1980105557f034ac9b4dadc331380 to your computer and use it in GitHub Desktop.
Answer to https://stackoverflow.com/questions/56153726/plot-k-nearest-neighbor-graph-with-8-features
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports: plotting (seaborn/matplotlib), data handling (numpy/pandas)
# and k-NN classification utilities (scikit-learn).
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import neighbors, datasets
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Silence all warnings globally (e.g. sklearn/matplotlib deprecation
# notices) to keep the console output readable.
warnings.filterwarnings("ignore")
def plot_correlation(data):
    """Show and save a heatmap of pairwise feature correlations.

    The annotated correlation matrix is displayed on screen and then
    written to 'corr.png' in the working directory.
    """
    # widen the canvas so every annotated cell stays readable
    rcParams['figure.figsize'] = 15, 20
    fig = plt.figure()
    corr_matrix = data.corr()
    sns.heatmap(corr_matrix, annot=True, fmt=".2f")
    plt.show()
    fig.savefig('corr.png')
def plot_densities(data):
    """Plot each feature's density curve, split by the binary 'Outcome' column.

    One stacked subplot per feature (every column except the last, assumed
    to be 'Outcome'): the red curve is the Outcome == 0 subset, the green
    curve the Outcome == 1 subset. The figure is shown on screen and saved
    to 'densities.png'.

    Relies on the module-level ``names`` list of column names.
    """
    # change fig size to fit all subplots beautifully
    rcParams['figure.figsize'] = 15, 20
    # separate data based on outcome values
    outcome_0 = data[data['Outcome'] == 0]
    outcome_1 = data[data['Outcome'] == 1]
    # one axes row per feature; len(names) - 1 instead of a hard-coded 8
    # so the function works for any column count (8 for the diabetes set)
    fig, axs = plt.subplots(len(names) - 1, 1)
    fig.suptitle('Features densities for different outcomes 0/1')
    plt.subplots_adjust(left=0.25, right=0.9, bottom=0.1, top=0.95,
                        wspace=0.2, hspace=0.9)
    # enumerate avoids the O(n^2) names.index() lookup on every iteration
    # and stays correct even if a column name were duplicated
    for idx, column_name in enumerate(names[:-1]):
        ax = axs[idx]
        outcome_0[column_name].plot(kind='density', ax=ax, subplots=True,
                                    sharex=False, color="red", legend=True,
                                    label=column_name + ' for Outcome = 0')
        outcome_1[column_name].plot(kind='density', ax=ax, subplots=True,
                                    sharex=False, color="green", legend=True,
                                    label=column_name + ' for Outcome = 1')
        ax.set_xlabel(column_name + ' values')
        ax.set_title(column_name + ' density')
        ax.grid('on')
    plt.show()
    fig.savefig('densities.png')
def accuracy(k, X_train, y_train, X_test, y_test):
    """Fit a k-nearest-neighbours classifier and score it on the test split.

    Trains a KNeighborsClassifier with ``k`` neighbours on (X_train,
    y_train) and returns the fraction of correctly classified samples
    in (X_test, y_test).
    """
    # fit and predict in one chained expression
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))
def classify_and_plot(X, y):
    '''
    Split the data, pick the best k by test-set accuracy, then fit k-NN
    classifiers with 'uniform' and 'distance' weights, plot each decision
    boundary (saved as '<weights>.png') and print a classification report.

    X is expected to be a 2-column feature matrix — the mesh-grid plotting
    below indexes only columns 0 and 1 (per the comment further down,
    x-axis = 'Glucose', y-axis = 'BMI'). y holds binary 0/1 labels.
    Reads the module-level ``rows_nbr`` to bound the k search.
    '''
    # split data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 41)
    # scan k = 1 .. rows_nbr/2 - 1 and keep the k with the highest test
    # accuracy (np.argmax is 0-based, hence the + 1).
    # NOTE(review): k is chosen on the test set itself, which leaks test
    # information into model selection — a validation split or CV would be
    # cleaner; confirm this is acceptable for this demo.
    best_n_neighbours = np.argmax(np.array([accuracy(k, X_train, y_train, X_test, y_test) for k in range(1, int(rows_nbr/2))])) + 1
    print('----------------------------------------------------------------------')
    print('For best accuracy use k = ', best_n_neighbours)
    print('----------------------------------------------------------------------')
    # init vars
    n_neighbors = best_n_neighbours
    h = .02  # step size in the mesh
    # Create color maps: light shades for the decision regions, bold
    # colors for the individual sample points.
    cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#0000FF'])
    rcParams['figure.figsize'] = 5, 5
    for weights in ['uniform', 'distance']:
        # we create an instance of Neighbours Classifier and fit the data.
        clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        clf.fit(X_train, y_train)
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        # classify every mesh point to color its region
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        fig = plt.figure()
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
        # Plot also the training points, x-axis = 'Glucose', y-axis = "BMI"
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.title("0/1 outcome classification (k = %i, weights = '%s')" % (n_neighbors, weights))
        plt.show()
        # one image per weighting scheme: 'uniform.png' / 'distance.png'
        fig.savefig(weights +'.png')
        # evaluate on the held-out test split
        y_expected = y_test
        y_predicted = clf.predict(X_test)
        # print results
        print('----------------------------------------------------------------------')
        print('Classification report')
        print('----------------------------------------------------------------------')
        print('\n', classification_report(y_expected, y_predicted))
        print('----------------------------------------------------------------------')
        print('Accuracy = %5s' % round(accuracy(n_neighbors, X_train, y_train, X_test, y_test), 3))
        print('----------------------------------------------------------------------')
# ---- script entry point ------------------------------------------------
# Load the diabetes data set; the slicing below assumes the standard
# column layout (Glucose at index 1, BMI at 5, Outcome at 8).
data = pd.read_csv('diabetes.csv')
names = list(data.columns)

# exploratory plots: correlation heatmap and per-feature densities
plot_correlation(data)
plot_densities(data)

# restrict to the first 30 rows for a fast demo (use data.shape[0] for
# the full set); classify_and_plot() also reads rows_nbr for its k scan
rows_nbr = 30
# keep only the two best features: Glucose (col 1) and BMI (col 5);
# feature scaling (preprocessing.scale) is deliberately left off here
X = np.array(data.iloc[:rows_nbr, [1, 5]])
y = np.array(data.iloc[:rows_nbr, 8])

# fit k-NN, draw decision boundaries and print the evaluation report
classify_and_plot(X, y)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment