Skip to content

Instantly share code, notes, and snippets.

@raghavrv
Last active February 22, 2016 14:16
Show Gist options
  • Save raghavrv/881d377ad5061a27b564 to your computer and use it in GitHub Desktop.
Save raghavrv/881d377ad5061a27b564 to your computer and use it in GitHub Desktop.
RF Missing Value Benchmark script
*swp
*ipynb_checkpoints
*build
*.dat
The shape of the dataset is (581012, 54)
The number of trees for this benchmarking is 100
Score with the entire dataset = 0.95
Score RF with the 1.82173199202 % missing = 0.94
Score RF+Imp. with the 1.82173199202 % missing = 0.95
Score RF with the 3.55273467929 % missing = 0.96
Score RF+Imp. with the 3.55273467929 % missing = 0.96
Score RF with the 5.19537302857 % missing = 0.96
Score RF+Imp. with the 5.19537302857 % missing = 0.96
Score RF with the 6.75907503408 % missing = 0.97
Score RF+Imp. with the 6.75907503408 % missing = 0.97
Score RF with the 8.24469488869 % missing = 0.97
Score RF+Imp. with the 8.24469488869 % missing = 0.97
Score RF with the 9.65472823791 % missing = 0.97
Score RF+Imp. with the 9.65472823791 % missing = 0.97
Score RF with the 10.9961424906 % missing = 0.97
Score RF+Imp. with the 10.9961424906 % missing = 0.98
Score RF with the 12.2712707406 % missing = 0.98
Score RF+Imp. with the 12.2712707406 % missing = 0.98
Score RF with the 13.4793894739 % missing = 0.98
Score RF+Imp. with the 13.4793894739 % missing = 0.98
Score RF with the 14.6313482146 % missing = 0.98
Score RF+Imp. with the 14.6313482146 % missing = 0.98
Score RF with the 15.722225792 % missing = 0.98
Score RF+Imp. with the 15.722225792 % missing = 0.98
Score RF with the 16.7580461779 % missing = 0.98
Score RF+Imp. with the 16.7580461779 % missing = 0.98
Score RF with the 17.7444126226 % missing = 0.98
Score RF+Imp. with the 17.7444126226 % missing = 0.98
Score RF with the 18.6789537846 % missing = 0.98
Score RF+Imp. with the 18.6789537846 % missing = 0.99
Score RF with the 19.5694338945 % missing = 0.98
Score RF+Imp. with the 19.5694338945 % missing = 0.98
Score RF with the 20.4148903918 % missing = 0.98
Score RF+Imp. with the 20.4148903918 % missing = 0.98
Score RF with the 21.2185360613 % missing = 0.98
Score RF+Imp. with the 21.2185360613 % missing = 0.99
Score RF with the 21.9800458 % missing = 0.98
Score RF+Imp. with the 21.9800458 % missing = 0.99
Score RF with the 22.7055551348 % missing = 0.98
Score RF+Imp. with the 22.7055551348 % missing = 0.99
Score RF with the 23.3928265904 % missing = 0.98
Score RF+Imp. with the 23.3928265904 % missing = 0.99
Score RF with the 24.0459845159 % missing = 0.98
Score RF+Imp. with the 24.0459845159 % missing = 0.99
Score RF with the 24.6653890746 % missing = 0.98
Score RF+Imp. with the 24.6653890746 % missing = 0.99
Score RF with the 25.2552315487 % missing = 0.98
Score RF+Imp. with the 25.2552315487 % missing = 0.99
Score RF with the 25.8147979859 % missing = 0.98
Score RF+Imp. with the 25.8147979859 % missing = 0.99
Score RF with the 26.347275673 % missing = 0.98
Score RF+Imp. with the 26.347275673 % missing = 0.99
Score RF with the 26.8534359334 % missing = 0.98
Score RF+Imp. with the 26.8534359334 % missing = 0.99
Score RF with the 27.3326508715 % missing = 0.98
Score RF+Imp. with the 27.3326508715 % missing = 0.99
Score RF with the 27.7895579896 % missing = 0.98
Score RF+Imp. with the 27.7895579896 % missing = 0.99
Score RF with the 28.2251995305 % missing = 0.98
Score RF+Imp. with the 28.2251995305 % missing = 0.99
Score RF with the 28.6352089113 % missing = 0.98
Score RF+Imp. with the 28.6352089113 % missing = 0.99
Score RF with the 29.0269296408 % missing = 0.99
Score RF+Imp. with the 29.0269296408 % missing = 0.99
Score RF with the 29.3992557303 % missing = 0.99
Score RF+Imp. with the 29.3992557303 % missing = 0.99
Score RF with the 29.7540389935 % missing = 0.98
Score RF+Imp. with the 29.7540389935 % missing = 0.99
Score RF with the 30.0889495238 % missing = 0.99
Score RF+Imp. with the 30.0889495238 % missing = 0.99
Score RF with the 30.4072383537 % missing = 0.99
Score RF+Imp. with the 30.4072383537 % missing = 0.99
Score RF with the 30.7117740413 % missing = 0.99
Score RF+Imp. with the 30.7117740413 % missing = 0.99
Score RF with the 30.9991270659 % missing = 0.99
Score RF+Imp. with the 30.9991270659 % missing = 0.99
Score RF with the 31.2725134 % missing = 0.99
Score RF+Imp. with the 31.2725134 % missing = 0.99
Score RF with the 31.5319171071 % missing = 0.99
Score RF+Imp. with the 31.5319171071 % missing = 0.99
Score RF with the 31.7783836172 % missing = 0.99
Score RF+Imp. with the 31.7783836172 % missing = 0.99
he
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_covtype, load_digits, load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score
rng = np.random.RandomState(0)
from time import time
# --- Dataset -----------------------------------------------------------------
# Alternative (smaller) datasets that can be swapped in for quick runs:
# dataset = load_digits()
# dataset = load_iris()
dataset = fetch_covtype()
X, y = dataset.data, dataset.target

# Optional restriction to a subset of the classes (disabled):
# mask = y < 3
# mask = (y == 1) | (y == 2)
# X = X[mask]
# y = y[mask]
# plt.hist(y)
# plt.show()

# Keep every second sample to halve the run time
# (use X[::20], y[::20] for a much faster smoke run).
X, y = X[::2].copy(), y[::2].copy()
n_samples, n_features = X.shape

# --- Experiment configuration ------------------------------------------------
n_estimators = 100
n_jobs = -1
# Generator used further down to draw the random missingness pattern.
rng = np.random.RandomState(42)
# The splitter gets an integer seed rather than the shared `rng` instance.
# A RandomState instance is consumed on every split() call, so each
# cross_val_score() call would be evaluated on different train/test
# partitions and the scores of the competing estimators would not be
# directly comparable.
# NOTE(review): `n_iter` is the keyword of the scikit-learn development
# version this script was written against -- confirm for the installed version.
cv = StratifiedShuffleSplit(n_iter=3, test_size=0.3, random_state=42)

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("The shape of the dataset is %s" % str(X.shape))
print("The number of trees for this benchmarking is %s" % n_estimators)

# --- Baseline: score on the complete dataset, with no missing values ---------
start = time()
# NOTE(review): `missing_values` is presumably a parameter of the experimental
# RandomForestClassifier being benchmarked here -- confirm it exists in the
# installed scikit-learn.
estimator = RandomForestClassifier(random_state=0, n_estimators=n_estimators,
                                   missing_values=None, n_jobs=n_jobs)
score = cross_val_score(estimator, X, y, cv=cv).mean()
end = time()
print("Score with the entire dataset = %.2f in %d seconds"
      % (score, end - start))
baseline_score = score
# Per-round results: mean cross-validated score of each estimator and the
# fraction of entries that were missing in that round.
scores_missing = []
scores_impute = []
missing_fraction_range = []

# Forest that is handed the NaN-containing data directly.
rf_missing = RandomForestClassifier(random_state=0, n_estimators=n_estimators,
                                    missing_values='NaN', n_jobs=n_jobs)
# Reference pipeline: median imputation followed by a plain forest.
rf_impute = Pipeline([
    ("imputer", Imputer(missing_values='NaN', strategy="median", axis=0)),
    ("forest", RandomForestClassifier(random_state=0,
                                      n_estimators=n_estimators,
                                      n_jobs=n_jobs)),
])

# The mask only ever grows, so the missing fraction increases every round.
missing_mask = np.zeros(X.shape, dtype=bool)
X_missing = X.copy()
# Rank of the cut-off value: the entries holding the smallest 5% of the
# random draws are marked as missing in each round.
n_new_missing = int(0.05 * n_samples * n_features)

for _ in range(70):
    rv = rng.randn(*X.shape)
    # Only the n_new_missing-th smallest draw is needed, so a partial
    # selection replaces a full sort of every draw.
    thresh = np.partition(rv.ravel(), n_new_missing)[n_new_missing]
    missing_mask |= rv < thresh
    # Values go missing only in samples of class 2; rows of every other
    # class stay complete.
    missing_mask[y != 2] = False
    missing_fraction = np.mean(missing_mask)
    missing_fraction_range.append(missing_fraction)
    X_missing[missing_mask] = np.nan

    # An alternative evaluation (fit on X_missing[train], score on the
    # complete X[test] of a single split) was tried and is disabled; both
    # estimators are cross-validated on the data with missing values.
    start = time()
    score_missing = cross_val_score(rf_missing, X_missing, y, cv=cv).mean()
    end = time()
    scores_missing.append(score_missing)
    print("Score RF with the %s %% missing = %.2f in %d seconds"
          % (missing_fraction*100, score_missing, end - start))

    start = time()
    score_impute = cross_val_score(rf_impute, X_missing, y, cv=cv).mean()
    end = time()
    scores_impute.append(score_impute)
    print("Score RF+Imp. with the %s %% missing = %.2f in %d seconds"
          % (missing_fraction*100, score_impute, end - start))

# Persist the results so they can be plotted separately.
# NOTE(review): the original indentation was lost; the saves are assumed to
# run once after the loop rather than after every round -- confirm.
np.save('scores_missing.npy', scores_missing)
np.save('scores_impute.npy', scores_impute)
np.save('missing_fraction_range.npy', missing_fraction_range)
np.save('baseline_score.npy', baseline_score)
import numpy as np
import matplotlib.pyplot as plt
# Reload the arrays written by the benchmark run.
baseline_score, missing_fraction_range, scores_missing, scores_impute = [
    np.load(npy_file)
    for npy_file in ('baseline_score.npy',
                     'missing_fraction_range.npy',
                     'scores_missing.npy',
                     'scores_impute.npy')
]

# Start from a clean slate in case earlier figures are still open.
plt.close('all')

# One dashed curve per estimator, both against the missing fraction.
for curve, colour, legend_text in ((scores_missing, 'r', 'RF mv'),
                                   (scores_impute, 'b', 'RF imp.')):
    plt.plot(missing_fraction_range, curve, 'o--',
             color=colour, label=legend_text)

# Horizontal reference line: the score obtained without missing values.
plt.axhline(baseline_score, label='no missing', color='k')

plt.xlabel('Missing fraction')
plt.ylabel('Score')
plt.legend(loc='best')
plt.show()
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment