-
-
Save newtonmwai/46a500bc5e55e3a2da05bfbf850b0ca0 to your computer and use it in GitHub Desktop.
This module is to ease your analysis with Scikit_Learn in Python.
It gives a few functionalities that the current Scikit_Learn library does not offer.
Please feel free to download and use it.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=UTF8 | |
######################################################################### | |
# This class is to help sklearn to handle statistical process # | |
# Author: Joon Lim from Master of Science in Analytics at Northwestern # | |
# Date: 04.23.2013 # | |
######################################################################### | |
'''This module is built on top of numpy and sklearn.'''
#### covariance Matrix function | |
def CovMat(X):
    """Return the sample covariance matrix of the columns of ``X``.

    ``X`` may be a 2-D ``numpy.ndarray`` or a ``pandas.DataFrame``; rows are
    treated as observations and columns as variables.  The sum of products
    of the centred columns is scaled by ``1 / (n - 1)`` where ``n`` is
    ``len(X)``.

    Args:
        X: 2-D array-like exposing ``mean``, ``T`` and ``dot``.

    Returns:
        A square matrix (same container type as ``X``) with one row and one
        column per variable.

    Raises:
        ZeroDivisionError: if ``X`` has exactly one row.
    """
    # Centre the data once and reuse it on both sides of the product.
    centered = X - X.mean(axis=0)
    scale = 1.0 / (len(X) - 1)
    return scale * centered.T.dot(centered)
#### correlation Matrix function | |
def CorrMat(X):
    """Return the sample correlation matrix of the columns of ``X``.

    ``X`` may be a 2-D ``numpy.ndarray`` or a ``pandas.DataFrame``; rows are
    treated as observations and columns as variables.  Each column is
    centred and divided by its sample standard deviation (``ddof=1``), and
    the product of the standardised data with itself is scaled by
    ``1 / (n - 1)`` where ``n`` is ``len(X)``.

    Args:
        X: 2-D array-like exposing ``mean``, ``std``, ``T`` and ``dot``.

    Returns:
        A square matrix (same container type as ``X``) with ones on the
        diagonal for every column that has non-zero variance.

    Raises:
        ZeroDivisionError: if ``X`` has exactly one row.
    """
    # Standardise once; the original expression rebuilt this matrix twice.
    standardized = (X - X.mean(axis=0)) / X.std(axis=0, ddof=1)
    scale = 1.0 / (len(X) - 1)
    return scale * standardized.T.dot(standardized)
#### Adjusted R-squared, derived from metrics.r2_score and the number of fitted coefficients
def adj_r2_score(self, model, y, yhat=None):
    """Return the adjusted R-squared of a fitted linear model.

    The function can be called in two ways:

    * ``adj_r2_score(model, y, yhat)`` -- the documented form;
    * ``adj_r2_score(obj, model, y, yhat)`` -- the legacy four-argument
      form, where the first argument is ignored.

    Args:
        self: ignored placeholder in the four-argument form; the fitted
            model in the three-argument form.
        model: fitted estimator exposing ``coef_``; ``len(model.coef_)`` is
            used as the number of predictors.
        y: observed target values.
        yhat: predicted target values.

    Returns:
        ``1 - (n - 1) / (n - p - 1) * (1 - R^2)`` where ``n = len(y)``,
        ``p = len(model.coef_)`` and ``R^2 = metrics.r2_score(y, yhat)``.

    Raises:
        ZeroDivisionError: if ``len(y) - len(model.coef_) - 1`` is zero.

    Example:
        In [142]: metrics.r2_score(diabetes_y_train, yhat)
        Out[142]: 0.51222621477934993
        In [144]: adj_r2_score(lm, diabetes_y_train, yhat)
        Out[144]: 0.50035823946984515
    """
    from sklearn import metrics

    if yhat is None:
        # Three-argument call: the values arrived one position to the left.
        model, y, yhat = self, model, y

    n_obs = len(y)
    # NOTE(review): for a 2-D ``coef_`` (multi-target models) ``len`` counts
    # targets rather than features -- confirm this is only used with 1-D coef_.
    n_predictors = len(model.coef_)
    r2 = metrics.r2_score(y, yhat)
    return 1 - float(n_obs - 1) / (n_obs - n_predictors - 1) * (1 - r2)
### sample list of one generator | |
def one(p):
    """Return an integer NumPy array of ones with shape ``p``.

    Args:
        p: length of the array (int) or a shape tuple.

    Returns:
        ``numpy.ndarray`` of integer dtype filled with ``1``.

    Example:
        In [1]: one(10)
        Out[1]: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    """
    # Local import: no top-level numpy import is visible in this module.
    import numpy as np

    # Build the array directly instead of drawing "random" integers from the
    # single-value range [1, 2).
    return np.ones(p, dtype=int)
### sample list of zero generator | |
def zero(p):
    """Return an integer NumPy array of zeros with shape ``p``.

    Args:
        p: length of the array (int) or a shape tuple.

    Returns:
        ``numpy.ndarray`` of integer dtype filled with ``0``.

    Example:
        In [1]: zero(10)
        Out[1]: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    """
    # Local import: no top-level numpy import is visible in this module.
    import numpy as np

    # Build the array directly instead of drawing "random" integers from the
    # single-value range [0, 1).
    return np.zeros(p, dtype=int)
### Summary Statistic that we can get in 'R' | |
def summary(df):
    """Return R-style summary statistics for every column of ``df``.

    For each column the result holds, in this row order: ``Min``, ``Mean``,
    ``std`` (pandas' default sample standard deviation), ``Median``, ``Max``
    and ``count`` (number of non-null values).

    Args:
        df: ``pandas.DataFrame`` with numeric columns.

    Returns:
        ``pandas.DataFrame`` indexed by the six statistic names, with the
        same columns as ``df``.

    Example:
        In [1]: summary(pd.DataFrame({'a': [1.0, 2.0, 3.0, 10.0]}))
        # rows: Min 1.0, Mean 4.0, std 4.082483, Median 2.5, Max 10.0, count 4.0
    """
    import pandas as pd

    labels = ['Min', 'Mean', 'std', 'Median', 'Max', 'count']

    def _describe(col):
        # Values are listed in the same order as ``labels``; ``count`` skips
        # missing entries rather than reporting the raw column length.
        stats = [
            col.min(),
            col.mean(),
            col.std(),
            col.median(),
            col.max(),
            col.count(),
        ]
        return pd.Series(stats, index=labels)

    return df.apply(_describe)
### sklearn viewer for predict_proba & predict | |
def viewer(mat, yhat):
    """Put class probabilities and predicted labels side by side.

    Args:
        mat: 2-D array-like, e.g. the output of ``predict_proba``.
        yhat: 1-D array-like with one value per row of ``mat``, e.g. the
            output of ``predict``.

    Returns:
        Float ``numpy.ndarray`` with the same number of rows as ``mat`` and
        one extra column; the last column holds ``yhat``.

    Example:
        In [418]: viewer(lgm.predict_proba(X), lgm.predict(X))
        Out[418]:
        array([[ 0.52038098,  0.47961902,  0.        ],
               [ 0.27792502,  0.72207498,  1.        ],
               [ 0.12013796,  0.87986204,  1.        ]])
    """
    # Local import: no top-level numpy import is visible in this module.
    import numpy as np

    n_rows, n_cols = np.shape(mat)
    # Every cell is overwritten below, so an uninitialised buffer is enough.
    combined = np.empty((n_rows, n_cols + 1), dtype=float)
    combined[:, :-1] = mat
    combined[:, -1] = yhat
    return combined
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment