afraenkel · June 16, 2019 22:23
diff --git a/groupwise_leave_one_out.py b/groupwise_leave_one_out.py
 import pandas as pd
 import numpy as np

 # download data for example

 # Import from the public sklearn.datasets package: the private
 # sklearn.datasets.california_housing module path was deprecated and later
 # removed from scikit-learn, so importing from it fails on current releases.
 from sklearn.datasets import fetch_california_housing

 d = fetch_california_housing()

 # feature columns named after d['feature_names'], plus the target as 'price'
 df = pd.DataFrame(d['data'], columns=d['feature_names'])
 df['price'] = d['target']
 # group label: latitude rounded to the nearest whole number
 df = df.assign(region=df['Latitude'].round())

 # ---------------------------
 # import regressor / group-wise data splitter

 from sklearn.ensemble import RandomForestRegressor
 from sklearn.model_selection import GroupKFold

 # put predictions here; same index as data
 # (np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0)
 preds = pd.Series(np.nan, index=df.index)

 # Create variables and target arrays; create a column of groups
 # NOTE: only 'price' is dropped, so 'region' stays among the model inputs.
 variables = df.drop(columns='price')
 target = df['price']
 groups = df['region']

 # Create a "leave one group out" iterator:
 # one fold per distinct group, so each split holds out exactly one group
 n_groups = groups.nunique()
 group_kfold = GroupKFold(n_splits=n_groups)

 # train = data used for fitting the model
 # test = data on which the model predicts
 # NOTE: split() yields positional integer indices, so rows are selected with
 # .iloc; label-based .loc only matched while df kept its default 0..n-1 index.
 for train, test in group_kfold.split(variables, target, groups):

     # define the model (could be different)
     mdl = RandomForestRegressor(n_estimators=50)

     # fit the model on train ("outside of group")
     mdl.fit(variables.iloc[train], target.iloc[train])

     # predict using model on test ("in group"); add it to preds
     preds.iloc[test] = mdl.predict(variables.iloc[test])

    
 # calculate RMSE

 def rmse(preds, target):
     """Return the root-mean-squared error between preds and target.

     Both arguments must support element-wise subtraction (for example
     NumPy arrays or pandas Series).  The previous implementation returned
     the mean squared error, i.e. it omitted the square root.
     """
     return np.sqrt(np.mean((preds - target) ** 2))

 # overall score across every held-out prediction
 print(rmse(preds, target))

 # line predictions up with the true price and the region label,
 # then score each region separately
 # (the grouped result is not stored or printed; it only shows up in an
 # interactive session)
 results = pd.concat(
     [preds.rename('predictions'), target, df['region']],
     axis=1,
 )
 results.groupby('region').apply(
     lambda grp: rmse(grp['predictions'], grp['price'])
 )
	import pandas as pd
	import numpy as np

	# download data for example

	# Import from the public sklearn.datasets package: the private
	# sklearn.datasets.california_housing module path was deprecated and later
	# removed from scikit-learn, so importing from it fails on current releases.
	from sklearn.datasets import fetch_california_housing

	d = fetch_california_housing()

	# feature columns named after d['feature_names'], plus the target as 'price'
	df = pd.DataFrame(d['data'], columns=d['feature_names'])
	df['price'] = d['target']
	# group label: latitude rounded to the nearest whole number
	df = df.assign(region=df['Latitude'].round())

	# ---------------------------
	# import regressor / group-wise data splitter

	from sklearn.ensemble import RandomForestRegressor
	from sklearn.model_selection import GroupKFold

	# put predictions here; same index as data
	# (np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0)
	preds = pd.Series(np.nan, index=df.index)

	# Create variables and target arrays; create a column of groups
	# NOTE: only 'price' is dropped, so 'region' stays among the model inputs.
	variables = df.drop(columns='price')
	target = df['price']
	groups = df['region']

	# Create a "leave one group out" iterator:
	# one fold per distinct group, so each split holds out exactly one group
	n_groups = groups.nunique()
	group_kfold = GroupKFold(n_splits=n_groups)

	# train = data used for fitting the model
	# test = data on which the model predicts
	# NOTE: split() yields positional integer indices, so rows are selected with
	# .iloc; label-based .loc only matched while df kept its default 0..n-1 index.
	for train, test in group_kfold.split(variables, target, groups):

	    # define the model (could be different)
	    mdl = RandomForestRegressor(n_estimators=50)

	    # fit the model on train ("outside of group")
	    mdl.fit(variables.iloc[train], target.iloc[train])

	    # predict using model on test ("in group"); add it to preds
	    preds.iloc[test] = mdl.predict(variables.iloc[test])


	# calculate RMSE

	def rmse(preds, target):
	    """Return the root-mean-squared error between preds and target.

	    Both arguments must support element-wise subtraction (for example
	    NumPy arrays or pandas Series).  The previous implementation returned
	    the mean squared error, i.e. it omitted the square root.
	    """
	    return np.sqrt(np.mean((preds - target) ** 2))

	# overall score across every held-out prediction
	print(rmse(preds, target))

	# line predictions up with the true price and the region label,
	# then score each region separately
	# (the grouped result is not stored or printed; it only shows up in an
	# interactive session)
	results = pd.concat(
	    [preds.rename('predictions'), target, df['region']],
	    axis=1,
	)
	results.groupby('region').apply(
	    lambda grp: rmse(grp['predictions'], grp['price'])
	)