Created
June 16, 2019 22:23
-
-
Save afraenkel/577df3687d09dd97c239699a8eef0d28 to your computer and use it in GitHub Desktop.
Groupwise leave one out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
import numpy as np

# Download data for the example.
# `fetch_california_housing` is imported from the public `sklearn.datasets`
# namespace: the private `sklearn.datasets.california_housing` module path
# was deprecated and is no longer importable in current scikit-learn.
from sklearn.datasets import fetch_california_housing

d = fetch_california_housing()
df = pd.DataFrame(d['data'], columns=d['feature_names'])
df['price'] = d['target']
# Bucket rows into coarse regions by rounding latitude to the nearest
# whole degree; this column is what the rows are grouped by.
df = df.assign(region=df['Latitude'].round())
# ---------------------------
# import regressor / group-wise data splitter
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold

# Put out-of-group predictions here; same index as the data.
# `np.nan` is used instead of `np.NaN`: the capitalised alias was removed
# in NumPy 2.0.
preds = pd.Series(np.nan, index=df.index)

# Create variables and target arrays; create a column of groups
variables = df.drop('price', axis=1)
target = df['price']
groups = df['region']

# Create a "leave one group out" iterator: the number of folds equals the
# number of distinct groups, so each fold holds out a single group.
n_groups = groups.nunique()
group_kfold = GroupKFold(n_splits=n_groups)

# train = positions of the rows used for fitting the model
# test = positions of the rows on which the model predicts
for train, test in group_kfold.split(variables, target, groups):
    # define the model (could be different)
    mdl = RandomForestRegressor(n_estimators=50)
    # fit the model on train ("outside of group").
    # split() yields positional indices, so rows are selected with .iloc;
    # .loc would only be right while the index is the default 0..n-1 range.
    mdl.fit(variables.iloc[train], target.iloc[train])
    # predict using model on test ("in group"); add it to preds
    preds.iloc[test] = mdl.predict(variables.iloc[test])
# calculate RMSE
def rmse(preds, target):
    """Return the root-mean-squared error between `preds` and `target`.

    Both arguments are combined element-wise with `-`, so they may be
    NumPy arrays or pandas Series (Series are aligned on their index by
    pandas before subtracting).

    The square root is applied to the mean squared difference, so the
    result is in the same units as `target`.
    """
    return np.sqrt(np.mean((preds - target) ** 2))
# overall error across all held-out predictions
print(rmse(preds, target))

# add preds to original dataframe; calculate RMSE by group
results = pd.concat([preds.rename('predictions'), target, df['region']], axis=1)
# Select the two score columns explicitly so the lambda only ever sees the
# data it uses, independent of how pandas treats the grouping column.
rmse_by_region = results.groupby('region')[['predictions', 'price']].apply(
    lambda x: rmse(x['predictions'], x['price'])
)
# Print explicitly: a bare expression is only echoed in a notebook/REPL and
# is silently discarded when this file is run as a script.
print(rmse_by_region)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment