thistleknot · January 5, 2023 14:27
diff --git a/random_feature_selection.py b/random_feature_selection.py
 """
 Based on Damien Benveniste, PhD 'quick Feature Selection' method

 original post: https://lnkd.in/gCDSEJcF

 quick FEATURE SELECTION 
 	 train a Supervised Learning algorithm with a Feature Importance measure
 	 This is also a method that can be used for highly non-linear data as opposed to LASSO (for example) that tends to only understand linear relationships in the data. The random feature is a "Random Bar" because this is the minimum bar a feature needs to beat to be a part of the potentially useful features set. Now it doesn't mean there are not additional features that could be beneficial to further remove to optimize your model. 
 		 
 	This is a technique I like to perform a quick FEATURE SELECTION for Machine Learning applications. I tend to call it the "Random Bar" method! Let's assume you have a feature set X and a target Y. Let's create a random vector V (for example np.random.normal(size=(1, 100))) and append that vector as a new feature to X:

 	X' = [X, V]

 	X' is just the original feature set with additionally the new random feature. Keep in mind that this new feature cannot possibly help to predict the target Y since it is random! Now, take that data (X', Y) and train a Supervised Learning algorithm with a Feature Importance measure that is relevant for you application. Intuitively, the mean entropy gain per split of tree based algorithms (Random Forest, Xgboost, ...) is a convincing measure of feature importance to me. The statistical fluctuation of the data is such that even the random feature will be attributed a non-zero feature importance by the algorithm, but we know it is artificial. Any feature with a lower feature importance than the random feature has to be useless to predict the target and the features with a higher feature importance are at least better than random noise at predicting the target. 

 	This is especially useful if you have thousands of features and you want to weed out quickly the ones that won't have any impact in the learning process. This is also a method that can be used for highly non-linear data as opposed to LASSO (for example) that tends to only understand linear relationships in the data. The random feature is a "Random Bar" because this is the minimum bar a feature needs to beat to be a part of the potentially useful features set. Now it doesn't mean there are not additional features that could be beneficial to further remove to optimize your model. Do you know if this method has a more jargon-y name?  

 """



 # Import necessary libraries 
 from sklearn import tree
 from sklearn.ensemble import BaggingRegressor
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import KFold
 from sklearn.model_selection import cross_val_score
 from sklearn.model_selection import train_test_split
 from sklearn.tree import DecisionTreeRegressor
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import sys
 import termplotlib as tpl
 import warnings
 from sklearn.metrics import mean_squared_error
 import math

 #load in your data
 #data = pd.read_csv('states.csv').set_index('States')
 data = pd.read_csv('https://raw.githubusercontent.com/thistleknot/python-ml/master/data/raw/states.csv').set_index('States')

 data.columns

 models = ['rf']

 def MAPE(Y_actual,Y_Predicted):
    mape = np.mean(np.abs((Y_actual - Y_Predicted)/Y_actual))*100
    return mape

 if not sys.warnoptions:
    warnings.simplefilter("ignore")

    for m in models:
        print("Model: ",m)
        # Declare independent variable
        for independent in data.columns:
            print("independent:", independent)
            #independent = 'Traf Deaths'

            #X = data
            kf = KFold(n_splits=5)

            useful_features = []

            internal_predictions = []
            internal_test = []

            MAPEs = []
            RMSEs = []
            outer_cv = []

            for train_i, test_i in kf.split(data):
                #print("%s %s" % (train_i, test_i))

                train = data.iloc[train_i]
                test = data.iloc[test_i]

                feature_importances = []

                #after reading https://towardsdatascience.com/bagging-decision-trees-clearly-explained-57d4d19ed2d3
                #went with RandomForest to avoid correlated variables being reselected for each pass, also makes model more robust because it does subsampling (which is what I was trying to do with cross validation)

                for i in range(20):
                    # Instantiate model 

                    df = train
                    df['random'] = np.random.normal(size=len(df))

                    if(m=='rf'):                        
                        model = RandomForestRegressor(n_estimators=20, bootstrap=True, 
                                           max_samples=0.8, n_jobs=-1)
                    elif(m=='dt'):                        
                        model = BaggingRegressor(n_jobs=-1)

                    # Define X and y
                    X = df.drop(independent, axis=1)
                    y = df[independent]

                    # Fit model
                    model.fit(X, y)

                    # Get feature importance
                    importances = pd.DataFrame(model.feature_importances_,
                                               index = X.columns,
                                               columns=['importance']).sort_values('importance',ascending=False)
                    #print(importances)
                    # Append feature importance to list
                    feature_importances.append(importances)

                #bagged importances

                # Calculate mean feature importance across iterations 
                mean_importances = pd.concat(feature_importances).groupby(level=0).mean()

                # Any feature with a lower feature importance than the random feature has to be useless to predict the target
                random_feature_importance = mean_importances.loc['random','importance']

                # Select features above random feature importance
                internal_useful_features = mean_importances[mean_importances['importance'] > random_feature_importance]
                useful_features.append(internal_useful_features.sort_values(by='importance',ascending=False))

                # Print useful features
                if(False):
                    print("internal: ", internal_useful_features.sort_values(by='importance',ascending=False))    
                    print(internal_useful_features.sort_values(by='importance',ascending=False).sum())

                #create an array of features

                features = internal_useful_features.index

                #create an array of target
                target = train[independent].values

                if(m == 'rf'):
                    model = RandomForestRegressor(n_estimators=20, bootstrap=True, 
                                           max_samples=0.8, n_jobs=-1)
                elif(m == 'dt'):
                    model = BaggingRegressor(n_jobs=-1)
                    
                model.fit(train[features], target)
                try:
                    #fit the model
                    
                    predictions_rf = model.predict(test[features])

                    internal_predictions.append(predictions_rf)
                    internal_test.append(test[independent])
                    MAPEs.append(MAPE(test[independent],predictions_rf))
                    MSE = mean_squared_error(test[independent], predictions_rf)
                    RMSE = math.sqrt(MSE)
                    RMSEs.append(RMSE)

                    outer_cv.append(predictions_rf-test[independent])
                except:
                    pass

            print("OOS MAPE: ", np.nanmean(MAPEs))
            print("OOS MAPE std: ", np.nanstd(MAPEs))
            print("OOS RMSEs ", np.nanmean(RMSEs))
            print("OOS RMSEs std ", np.nanstd(RMSEs))
            #show out of sample predictions
            print('out of sample predictions')
            plt.scatter(internal_predictions,internal_test)
            plt.show()

            external_importance = pd.DataFrame(np.nanmean(pd.concat(useful_features,axis=1),axis=1),index=pd.concat(useful_features,axis=1).index,columns=['importance']).sort_values(by='importance',ascending=False)
            print("external: ", external_importance)
            #print(outer_cv,np.nanmean(outer_cv))
            print("sum: ", external_importance.sum())

            model = RandomForestRegressor(n_estimators=20, bootstrap=True, 
                                           max_samples=0.8, n_jobs=-1)

            #fit the model
            model.fit(data[external_importance.index], data[independent])

            #make predictions
            predictions_rf = model.predict(data[external_importance.index])

            print('best features on entire dataset')
            plt.scatter(predictions_rf,data[independent])
            plt.show()
	"""
	Based on Damien Benveniste, PhD 'quick Feature Selection' method

	original post: https://lnkd.in/gCDSEJcF

	quick FEATURE SELECTION
	train a Supervised Learning algorithm with a Feature Importance measure
	This is also a method that can be used for highly non-linear data as opposed to LASSO (for example) that tends to only understand linear relationships in the data. The random feature is a "Random Bar" because this is the minimum bar a feature needs to beat to be a part of the potentially useful features set. Now it doesn't mean there are not additional features that could be beneficial to further remove to optimize your model.

	This is a technique I like to perform a quick FEATURE SELECTION for Machine Learning applications. I tend to call it the "Random Bar" method! Let's assume you have a feature set X and a target Y. Let's create a random vector V (for example np.random.normal(size=(1, 100))) and append that vector as a new feature to X:

	X' = [X, V]

	X' is just the original feature set with additionally the new random feature. Keep in mind that this new feature cannot possibly help to predict the target Y since it is random! Now, take that data (X', Y) and train a Supervised Learning algorithm with a Feature Importance measure that is relevant for you application. Intuitively, the mean entropy gain per split of tree based algorithms (Random Forest, Xgboost, ...) is a convincing measure of feature importance to me. The statistical fluctuation of the data is such that even the random feature will be attributed a non-zero feature importance by the algorithm, but we know it is artificial. Any feature with a lower feature importance than the random feature has to be useless to predict the target and the features with a higher feature importance are at least better than random noise at predicting the target.

	This is especially useful if you have thousands of features and you want to weed out quickly the ones that won't have any impact in the learning process. This is also a method that can be used for highly non-linear data as opposed to LASSO (for example) that tends to only understand linear relationships in the data. The random feature is a "Random Bar" because this is the minimum bar a feature needs to beat to be a part of the potentially useful features set. Now it doesn't mean there are not additional features that could be beneficial to further remove to optimize your model. Do you know if this method has a more jargon-y name?

	"""



	# Import necessary libraries
	from sklearn import tree
	from sklearn.ensemble import BaggingRegressor
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.model_selection import GridSearchCV
	from sklearn.model_selection import KFold
	from sklearn.model_selection import cross_val_score
	from sklearn.model_selection import train_test_split
	from sklearn.tree import DecisionTreeRegressor
	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import sys
	import termplotlib as tpl
	import warnings
	from sklearn.metrics import mean_squared_error
	import math

	#load in your data
	#data = pd.read_csv('states.csv').set_index('States')
	data = pd.read_csv('https://raw.githubusercontent.com/thistleknot/python-ml/master/data/raw/states.csv').set_index('States')

	data.columns

	models = ['rf']

	def MAPE(Y_actual,Y_Predicted):
	mape = np.mean(np.abs((Y_actual - Y_Predicted)/Y_actual))*100
	return mape

	if not sys.warnoptions:
	warnings.simplefilter("ignore")

	for m in models:
	print("Model: ",m)
	# Declare independent variable
	for independent in data.columns:
	print("independent:", independent)
	#independent = 'Traf Deaths'

	#X = data
	kf = KFold(n_splits=5)

	useful_features = []

	internal_predictions = []
	internal_test = []

	MAPEs = []
	RMSEs = []
	outer_cv = []

	for train_i, test_i in kf.split(data):
	#print("%s %s" % (train_i, test_i))

	train = data.iloc[train_i]
	test = data.iloc[test_i]

	feature_importances = []

	#after reading https://towardsdatascience.com/bagging-decision-trees-clearly-explained-57d4d19ed2d3
	#went with RandomForest to avoid correlated variables being reselected for each pass, also makes model more robust because it does subsampling (which is what I was trying to do with cross validation)

	for i in range(20):
	# Instantiate model

	df = train
	df['random'] = np.random.normal(size=len(df))

	if(m=='rf'):
	model = RandomForestRegressor(n_estimators=20, bootstrap=True,
	max_samples=0.8, n_jobs=-1)
	elif(m=='dt'):
	model = BaggingRegressor(n_jobs=-1)

	# Define X and y
	X = df.drop(independent, axis=1)
	y = df[independent]

	# Fit model
	model.fit(X, y)

	# Get feature importance
	importances = pd.DataFrame(model.feature_importances_,
	index = X.columns,
	columns=['importance']).sort_values('importance',ascending=False)
	#print(importances)
	# Append feature importance to list
	feature_importances.append(importances)

	#bagged importances

	# Calculate mean feature importance across iterations
	mean_importances = pd.concat(feature_importances).groupby(level=0).mean()

	# Any feature with a lower feature importance than the random feature has to be useless to predict the target
	random_feature_importance = mean_importances.loc['random','importance']

	# Select features above random feature importance
	internal_useful_features = mean_importances[mean_importances['importance'] > random_feature_importance]
	useful_features.append(internal_useful_features.sort_values(by='importance',ascending=False))

	# Print useful features
	if(False):
	print("internal: ", internal_useful_features.sort_values(by='importance',ascending=False))
	print(internal_useful_features.sort_values(by='importance',ascending=False).sum())

	#create an array of features

	features = internal_useful_features.index

	#create an array of target
	target = train[independent].values

	if(m == 'rf'):
	model = RandomForestRegressor(n_estimators=20, bootstrap=True,
	max_samples=0.8, n_jobs=-1)
	elif(m == 'dt'):
	model = BaggingRegressor(n_jobs=-1)

	model.fit(train[features], target)
	try:
	#fit the model

	predictions_rf = model.predict(test[features])

	internal_predictions.append(predictions_rf)
	internal_test.append(test[independent])
	MAPEs.append(MAPE(test[independent],predictions_rf))
	MSE = mean_squared_error(test[independent], predictions_rf)
	RMSE = math.sqrt(MSE)
	RMSEs.append(RMSE)

	outer_cv.append(predictions_rf-test[independent])
	except:
	pass

	print("OOS MAPE: ", np.nanmean(MAPEs))
	print("OOS MAPE std: ", np.nanstd(MAPEs))
	print("OOS RMSEs ", np.nanmean(RMSEs))
	print("OOS RMSEs std ", np.nanstd(RMSEs))
	#show out of sample predictions
	print('out of sample predictions')
	plt.scatter(internal_predictions,internal_test)
	plt.show()

	external_importance = pd.DataFrame(np.nanmean(pd.concat(useful_features,axis=1),axis=1),index=pd.concat(useful_features,axis=1).index,columns=['importance']).sort_values(by='importance',ascending=False)
	print("external: ", external_importance)
	#print(outer_cv,np.nanmean(outer_cv))
	print("sum: ", external_importance.sum())

	model = RandomForestRegressor(n_estimators=20, bootstrap=True,
	max_samples=0.8, n_jobs=-1)

	#fit the model
	model.fit(data[external_importance.index], data[independent])

	#make predictions
	predictions_rf = model.predict(data[external_importance.index])

	print('best features on entire dataset')
	plt.scatter(predictions_rf,data[independent])
	plt.show()