Created
July 20, 2021 16:07
-
-
Save devarshi16/cd09e245ffaf64eaf780ab346b2d0599 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library
import gzip
import os
import time

# Third party
import matplotlib.pyplot as plt
import numpy as np
import requests
from numpy import loadtxt, savetxt
from tqdm import tqdm

# Min-max scaling may divide by a zero column range (0/0); silence the warnings.
np.seterr(divide='ignore', invalid='ignore')
###################----DOWNLOADING THE PIMA-INDIAN-DIABETES-DATASET-----###################
url = 'https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv'
target = 'diabetes.csv'
response = requests.get(url, stream=True)
if response.status_code != 200:
    print("status code:", response.status_code)
    print("Unable to download:", url)
else:
    # NOTE(review): response.raw is the undecoded HTTP stream, so the saved
    # bytes may still carry the server's gzip content-encoding — the gzip
    # handling in pre_data_prep appears to rely on exactly that; confirm.
    with open(target, 'wb') as f:
        f.write(response.raw.read())
##########################--------HELPER FUNCTIONS---------###########################
def scalify_min_max(np_dataframe):
    """Min-max scale every column of ``np_dataframe`` into [0, 1].

    A constant column produces 0/0 entries; the module-level np.seterr call
    suppresses the resulting warnings.
    """
    col_min = np.amin(np_dataframe, axis=0)
    col_max = np.amax(np_dataframe, axis=0)
    return (np_dataframe - col_min) / (col_max - col_min)
def accuracy_calculator(Y_out, Y):
    """Compute binary-classification metrics for 0/1 predictions.

    Parameters
    ----------
    Y_out : array of 0/1 predictions, same shape as ``Y``.
    Y     : array of 0/1 ground-truth labels; ``Y.shape[0]`` is the sample count.

    Returns
    -------
    list : [accuracy, precision, recall, F1_score].
    """
    # XOR is 0 exactly where prediction matches label, so NOT(XOR) counts matches.
    accuracy = np.sum(np.logical_not(np.logical_xor(Y_out, Y))) / Y.shape[0]
    true_positives = np.sum(np.logical_and(Y_out, Y))
    false_positives = np.sum(np.logical_and(Y_out, np.logical_not(Y)))
    false_negatives = np.sum(np.logical_and(np.logical_not(Y_out), Y))
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    # BUG FIX: F1 is the harmonic mean 2PR/(P+R); the factor of 2 was missing.
    F1_score = 2 * precision * recall / (precision + recall)
    return [accuracy, precision, recall, F1_score]
####################-----DATA PREP FUNCTIONS------###############################
def pre_data_prep(filename,dest_fileloc):
    """Decompress, min-max scale, and re-save the raw diabetes dataset.

    Reads ``filename`` as a gzip stream, skips the first line (presumably the
    CSV header), scales every feature column to [0, 1], re-attaches the label
    column unchanged, and writes the result to ``dest_fileloc``.

    NOTE(review): a ``.csv`` file is opened through GzipFile here — this looks
    like it depends on the download above having saved the server's
    gzip-encoded bytes verbatim (response.raw is undecoded). Confirm the
    downloaded file really is gzip-compressed before reusing this elsewhere.
    """
    with open(filename,'rb') as f:
        gzip_fd=gzip.GzipFile(fileobj=f)
        next(gzip_fd)  # skip the first line (header row)
        diabetes_df = loadtxt(gzip_fd,delimiter=',',dtype=np.float32)
    #diabetes_df = loadtxt(filename,delimiter=',',dtype=np.float32)
    Y=diabetes_df[:,-1]  # last column is the 0/1 outcome label
    scaled_diabetes_df = scalify_min_max(diabetes_df[:,:-1])
    # Re-append the (unscaled) label column after the scaled features.
    concat_diabetes = np.concatenate((scaled_diabetes_df,np.array([Y]).T),axis=1)
    savetxt(dest_fileloc,concat_diabetes,delimiter=',')
def dataprep(fileloc, split):
    """Load the scaled CSV and slice it into train/validation/test partitions.

    ``split`` is a 3-element list of fractions summing to 1, ordered
    (train, validation, test). Rows are taken contiguously in file order.
    Returns X_train, X_val, X_test, Y_train, Y_val, Y_test, with each Y
    kept as an (m, 1) column vector.
    """
    assert len(split) == 3
    assert sum(split) == 1
    records = loadtxt(fileloc, delimiter=',', dtype=np.float32)
    # Labels are the last column, reshaped to a column vector.
    Y = np.array([records[:, -1]]).T
    assert len(np.unique(Y)) == 2  # binary classification only
    X = records[:, :-1]
    m = X.shape[0]
    print(m, X.shape, Y.shape)
    train_end = int(split[0] * m)
    val_end = train_end + int(split[1] * m)
    X_train, Y_train = X[:train_end], Y[:train_end]
    X_val, Y_val = X[train_end:val_end], Y[train_end:val_end]
    X_test, Y_test = X[val_end:], Y[val_end:]
    return X_train, X_val, X_test, Y_train, Y_val, Y_test
# Evaluating the learned model
def evaluate(theta_params, X, Y=None, thresh=0.5):
    """Apply a linear model and threshold its raw output into 0/1 predictions.

    Parameters
    ----------
    theta_params : (n+1, 1) weights, bias first (matches the prepended ones column).
    X            : (m, n) feature matrix.
    Y            : optional (m, 1) labels; when given, the mean-squared-error
                   cost is also computed.
    thresh       : decision threshold applied to the raw linear output.

    Returns
    -------
    (pred, cost) : pred is an (m, 1) int array of 0/1; cost is None when Y is None.
    """
    data_size = X.shape[0]
    X_extend = np.concatenate((np.ones((data_size, 1)), X), axis=1)
    scores = np.matmul(X_extend, theta_params)
    pred = np.greater(scores, thresh) * 1
    # BUG FIX: the original computed the cost unconditionally, so the
    # documented default Y=None raised a TypeError instead of skipping it.
    cost = None if Y is None else np.sum(np.square(scores - Y)) / (data_size * 2)
    return pred, cost
###############--------REGRESSION FUNCTION----------############################
def linear_regression(X, Y, learning_rate=0.001, num_iters=100, thresh=0.5, rand_seed=None):
    """Train a thresholded linear model with batch gradient descent.

    Parameters
    ----------
    X             : (m, n) feature matrix.
    Y             : (m, 1) column of 0/1 labels.
    learning_rate : gradient-descent step size.
    num_iters     : number of full-batch update steps.
    thresh        : decision threshold for the final 0/1 predictions.
    rand_seed     : optional seed for the random weight initialization.

    Returns
    -------
    (theta_params, accuracy, cost) : learned (n+1, 1) weights, training
    accuracy, and the per-iteration MSE history as a numpy array.
    """
    if rand_seed is not None:
        np.random.seed(rand_seed)
    data_size = X.shape[0]
    # Random (n+1, 1) weight column; the extra weight is the bias term.
    theta_params = np.array([np.random.randn(X.shape[1] + 1)]).T
    # Prepend a ones column so the bias folds into the matrix product.
    X_extend = np.concatenate((np.ones((data_size, 1)), X), axis=1)
    cost = []
    for i in tqdm(range(num_iters), desc="Training.."):
        grad = np.matmul((np.matmul(theta_params.T, X_extend.T) - Y.T), X_extend).T / data_size
        theta_params = theta_params - learning_rate * grad
        # BUG FIX: the original indexed [0] before summing, so the recorded
        # cost covered only the first sample instead of the full-batch MSE.
        cost.append(np.sum(np.square(np.matmul(X_extend, theta_params) - Y)) / (data_size * 2))
    final_pred = np.greater(np.matmul(X_extend, theta_params), thresh) * 1
    accuracy = np.sum(np.logical_not(np.logical_xor(final_pred, Y))) / data_size
    return theta_params, accuracy, np.array(cost)
###############--------LOGISTIC FUNCTION------------############################
def sigmoid_func(theta, X):
    """Elementwise logistic sigmoid of theta.T @ X.

    With theta of shape (n, 1) and X of shape (n, m), the result is a
    (1, m) row of values in (0, 1).
    """
    z = np.matmul(theta.T, X)
    return 1.0 / (1.0 + np.exp(-z))
def logistic_regression(X, Y, learning_rate=0.001, num_iters=100, thresh=0.5, rand_seed=None):
    """Train a logistic-regression classifier with batch gradient descent.

    X is (m, n), Y is an (m, 1) column of 0/1 labels. Returns the learned
    (n+1, 1) weights, the training accuracy, and the per-iteration
    cross-entropy cost history as a numpy array.
    """
    if rand_seed is not None:
        np.random.seed(rand_seed)
    m = X.shape[0]
    # Random (n+1, 1) weight column; the extra weight is the bias term.
    theta_params = np.array([np.random.randn(X.shape[1] + 1)]).T
    # Transposed design matrix with a leading ones row: shape (n+1, m).
    X_extend = np.concatenate((np.ones((m, 1)), X), axis=1).T
    history = []
    for step in tqdm(range(num_iters), desc="Training.."):
        h_theta = sigmoid_func(theta_params, X_extend).T  # (m, 1) probabilities
        # Cross-entropy gradient: (n+1, m) @ (m, 1) -> (n+1, 1).
        theta_params = theta_params - learning_rate * np.matmul(X_extend, (h_theta - Y)) / m
        # Cost uses the pre-update probabilities, matching the update just taken.
        history.append(-1 * np.sum(Y * np.log(h_theta) + (1 - Y) * np.log(1 - h_theta)) / m)
    final_pred = np.greater(np.matmul(X_extend.T, theta_params), thresh) * 1
    accuracy = np.sum(np.logical_not(np.logical_xor(final_pred, Y))) / m
    return theta_params, accuracy, np.array(history)
###############--------REGRESSION RUNNER---------###############################
def regression_runner(fileloc, data_split_ratios, seed_values):
    """Train linear-regression models over several seeds and test the best one.

    Trains one model per seed, plots every training-cost curve, scores each
    model on the validation split, then reports test metrics for the model
    with the highest validation accuracy.
    """
    X_train, X_val, X_test, Y_train, Y_val, Y_test = dataprep(fileloc, data_split_ratios)
    num_iters = 500
    iterations = np.arange(num_iters)
    models, val_accuracies = [], []
    for trial, seed in enumerate(seed_values):
        model, train_accuracy, cost = linear_regression(X_train, Y_train, rand_seed=seed, num_iters=num_iters)
        print("Trial:", trial, ".Train Accuracy:", train_accuracy)
        models.append(model)
        plt.plot(iterations, cost, label=str(seed))
        val_prediction, val_cost = evaluate(model, X_val, Y_val)
        metrics = accuracy_calculator(val_prediction, Y_val)
        val_accuracies.append(metrics[0])
        print("Validation Accuracy:", metrics, "Validation Cost:", val_cost)
    plt.title("Linear Regression")
    plt.xlabel('Number of iterations')
    plt.ylabel('Cost')
    plt.show()
    # argmax returns the first index of the maximum, matching the original
    # np.where(... == np.amax(...))[0][0] lookup.
    best_model = models[int(np.argmax(val_accuracies))]
    test_pred, test_cost = evaluate(best_model, X_test, Y_test)
    test_accuracy, test_precision, test_recall, test_f1 = accuracy_calculator(test_pred, Y_test)
    print("Test accuracy:", test_accuracy, ".Test cost:", test_cost)
#####################-------------LOGISTIC RUNNER--------------##########################
def logistic_runner(fileloc,data_split_ratios,seed_values):
    """Train logistic-regression models over several seeds and test the best one.

    Same flow as regression_runner: one model per seed, plot the training
    cost curves, pick the model with the highest validation accuracy, and
    report its test metrics.
    """
    X_train,X_val,X_test,Y_train,Y_val,Y_test = dataprep(fileloc,data_split_ratios)
    all_models=[]
    all_val_accuracies=[]
    random_seeds=seed_values
    num_iters=1500
    x_axis=np.arange(num_iters)
    # BUG FIX: the loop bound was a hard-coded range(10), which raises an
    # IndexError when fewer than 10 seeds are supplied (and silently ignores
    # extras); iterate over the seeds actually given, as regression_runner does.
    for i in range(len(random_seeds)):
        model,train_accuracy,cost=logistic_regression(X_train,Y_train,rand_seed=random_seeds[i],num_iters=num_iters)
        print("Trial:",i,".Train Accuracy:",train_accuracy)
        all_models.append(model)
        plt.plot(x_axis,cost,label=str(random_seeds[i]))
        val_prediction,val_cost=evaluate(model,X_val,Y_val)
        accuracy_precision=accuracy_calculator(val_prediction,Y_val)
        all_val_accuracies.append(accuracy_precision[0])
        print("Validation Accuracy:",accuracy_precision,"Validation Cost:",val_cost)
    plt.title("Logistic Regression")
    plt.xlabel('Number of iterations')
    plt.ylabel('Cost')
    plt.show()
    # First index of the best validation accuracy.
    max_accuracy_idx=np.where(all_val_accuracies==np.amax(all_val_accuracies))[0][0]
    best_model=all_models[max_accuracy_idx]
    test_pred,test_cost=evaluate(best_model,X_test,Y_test)
    test_accuracy,test_precision,test_recall,test_f1=accuracy_calculator(test_pred,Y_test)
    print("Test accuracy:",test_accuracy,".Test cost:",test_cost)
if __name__ == "__main__":
    # Scale the raw dataset once, then train both model families on it.
    scaled_csv = 'diabetes_scaled.csv'
    pre_data_prep('diabetes.csv', scaled_csv)
    split = [0.7, 0.15, 0.15]
    # Each runner trains one model per seed and keeps the best by validation accuracy.
    linear_seeds = [12345, 65432, 872485, 13500, 198613, 426713, 923451, 155978, 18289, 1050]
    regression_runner(scaled_csv, split, linear_seeds)
    logistic_seeds = [12345, 8123, 872485, 1350, 198613, 426713, 923451, 155978, 18289, 167]
    logistic_runner(scaled_csv, split, logistic_seeds)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment