Last active
January 4, 2022 16:07
-
-
Save SuperKogito/a8b1980105557f034ac9b4dadc331380 to your computer and use it in GitHub Desktop.
Answer to https://stackoverflow.com/questions/56153726/plot-k-nearest-neighbor-graph-with-8-features
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports: plotting (seaborn/matplotlib), data handling (numpy/pandas)
# and k-NN classification utilities (scikit-learn).
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import neighbors, datasets
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Silence all warnings globally (e.g. sklearn/matplotlib deprecation
# notices) to keep the console output readable.
warnings.filterwarnings("ignore")
def plot_correlation(data):
    """Show and save a heatmap of pairwise feature correlations.

    The annotated correlation matrix is displayed on screen and then
    written to 'corr.png' in the working directory.
    """
    # widen the canvas so every annotated cell stays readable
    rcParams['figure.figsize'] = 15, 20
    fig = plt.figure()
    corr_matrix = data.corr()
    sns.heatmap(corr_matrix, annot=True, fmt=".2f")
    plt.show()
    fig.savefig('corr.png')
def plot_densities(data):
    """Plot each feature's density curve, split by the binary 'Outcome' column.

    One stacked subplot per feature (every column except the last, assumed
    to be 'Outcome'): the red curve is the Outcome == 0 subset, the green
    curve the Outcome == 1 subset. The figure is shown on screen and saved
    to 'densities.png'.

    Relies on the module-level ``names`` list of column names.
    """
    # change fig size to fit all subplots beautifully
    rcParams['figure.figsize'] = 15, 20
    # separate data based on outcome values
    outcome_0 = data[data['Outcome'] == 0]
    outcome_1 = data[data['Outcome'] == 1]
    # one axes row per feature; len(names) - 1 instead of a hard-coded 8
    # so the function works for any column count (8 for the diabetes set)
    fig, axs = plt.subplots(len(names) - 1, 1)
    fig.suptitle('Features densities for different outcomes 0/1')
    plt.subplots_adjust(left=0.25, right=0.9, bottom=0.1, top=0.95,
                        wspace=0.2, hspace=0.9)
    # enumerate avoids the O(n^2) names.index() lookup on every iteration
    # and stays correct even if a column name were duplicated
    for idx, column_name in enumerate(names[:-1]):
        ax = axs[idx]
        outcome_0[column_name].plot(kind='density', ax=ax, subplots=True,
                                    sharex=False, color="red", legend=True,
                                    label=column_name + ' for Outcome = 0')
        outcome_1[column_name].plot(kind='density', ax=ax, subplots=True,
                                    sharex=False, color="green", legend=True,
                                    label=column_name + ' for Outcome = 1')
        ax.set_xlabel(column_name + ' values')
        ax.set_title(column_name + ' density')
        ax.grid('on')
    plt.show()
    fig.savefig('densities.png')
def accuracy(k, X_train, y_train, X_test, y_test):
    """Fit a k-nearest-neighbours classifier and score it on the test split.

    Trains a KNeighborsClassifier with ``k`` neighbours on (X_train,
    y_train) and returns the fraction of correctly classified samples
    in (X_test, y_test).
    """
    # fit and predict in one chained expression
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))
def classify_and_plot(X, y):
    '''
    Split the data, pick the best k by test-set accuracy, then fit k-NN
    classifiers with 'uniform' and 'distance' weights, plot each decision
    boundary (saved as '<weights>.png') and print a classification report.

    X is expected to be a 2-column feature matrix — the mesh-grid plotting
    below indexes only columns 0 and 1 (per the comment further down,
    x-axis = 'Glucose', y-axis = 'BMI'). y holds binary 0/1 labels.
    Reads the module-level ``rows_nbr`` to bound the k search.
    '''
    # split data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 41)
    # scan k = 1 .. rows_nbr/2 - 1 and keep the k with the highest test
    # accuracy (np.argmax is 0-based, hence the + 1).
    # NOTE(review): k is chosen on the test set itself, which leaks test
    # information into model selection — a validation split or CV would be
    # cleaner; confirm this is acceptable for this demo.
    best_n_neighbours = np.argmax(np.array([accuracy(k, X_train, y_train, X_test, y_test) for k in range(1, int(rows_nbr/2))])) + 1
    print('----------------------------------------------------------------------')
    print('For best accuracy use k = ', best_n_neighbours)
    print('----------------------------------------------------------------------')
    # init vars
    n_neighbors = best_n_neighbours
    h = .02  # step size in the mesh
    # Create color maps: light shades for the decision regions, bold
    # colors for the individual sample points.
    cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#0000FF'])
    rcParams['figure.figsize'] = 5, 5
    for weights in ['uniform', 'distance']:
        # we create an instance of Neighbours Classifier and fit the data.
        clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        clf.fit(X_train, y_train)
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        # classify every mesh point to color its region
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        fig = plt.figure()
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
        # Plot also the training points, x-axis = 'Glucose', y-axis = "BMI"
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.title("0/1 outcome classification (k = %i, weights = '%s')" % (n_neighbors, weights))
        plt.show()
        # one image per weighting scheme: 'uniform.png' / 'distance.png'
        fig.savefig(weights +'.png')
        # evaluate on the held-out test split
        y_expected = y_test
        y_predicted = clf.predict(X_test)
        # print results
        print('----------------------------------------------------------------------')
        print('Classification report')
        print('----------------------------------------------------------------------')
        print('\n', classification_report(y_expected, y_predicted))
        print('----------------------------------------------------------------------')
        print('Accuracy = %5s' % round(accuracy(n_neighbors, X_train, y_train, X_test, y_test), 3))
        print('----------------------------------------------------------------------')
# ---- script entry point ------------------------------------------------
# Load the diabetes data set; the slicing below assumes the standard
# column layout (Glucose at index 1, BMI at 5, Outcome at 8).
data = pd.read_csv('diabetes.csv')
names = list(data.columns)

# exploratory plots: correlation heatmap and per-feature densities
plot_correlation(data)
plot_densities(data)

# restrict to the first 30 rows for a fast demo (use data.shape[0] for
# the full set); classify_and_plot() also reads rows_nbr for its k scan
rows_nbr = 30
# keep only the two best features: Glucose (col 1) and BMI (col 5);
# feature scaling (preprocessing.scale) is deliberately left off here
X = np.array(data.iloc[:rows_nbr, [1, 5]])
y = np.array(data.iloc[:rows_nbr, 8])

# fit k-NN, draw decision boundaries and print the evaluation report
classify_and_plot(X, y)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment