learning curve, kfold and gridsearch
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold, learning_curve
from sklearn.ensemble import GradientBoostingClassifier


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Generate a simple plot of the test and training learning curve."""
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    # Shaded bands show one standard deviation around the mean scores.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt


# Cross-validate the model with stratified k-fold cross-validation
kfold = StratifiedKFold(n_splits=10)

# Gradient boosting tuning (X_train and Y_train are assumed to be defined
# elsewhere as the training features and labels)
GBC = GradientBoostingClassifier()
gb_param_grid = {'loss': ["deviance"]}  # "deviance" was renamed to "log_loss" in newer scikit-learn versions
gsGBC = GridSearchCV(GBC, param_grid=gb_param_grid, cv=kfold,
                     scoring="accuracy", n_jobs=4, verbose=1)
gsGBC.fit(X_train, Y_train)
GBC_best = gsGBC.best_estimator_

# Best cross-validation score
print(gsGBC.best_score_)

g = plot_learning_curve(gsGBC.best_estimator_, "GradientBoosting learning curves",
                        X_train, Y_train, cv=kfold)
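The snippet assumes X_train and Y_train already exist. As a minimal sketch of how they could be produced so the gist runs end to end (the breast-cancer dataset, the 80/20 split, and the random_state below are illustrative assumptions, not part of the original):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Illustrative data setup (an assumption; any classification dataset works)
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# With X_train / Y_train defined, the grid search and learning-curve plot
# above run as written; gsGBC.best_params_ reports the winning settings
# and plt.show() displays the figure.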