srishtis · October 9, 2018 13:51
diff --git a/hpp_feature_engg_1.py b/hpp_feature_engg_1.py
 # Feature engineering begins with the predictors the author identified as most
 # correlated with the target: OverallQual, GrLivArea, GarageCars, TotalBsmtSF.

 # OverallQual: build the numeric transforms first, from a single integer cast
 # of the rating; the column itself is re-typed as a string further down.
 quality_rating = total_df["OverallQual"].astype(int)
 for exponent in (2, 3):
     # quadratic and cubic terms as candidate regression inputs
     total_df["OverallQual_%d" % exponent] = quality_rating ** exponent
 total_df["OverallQual_sqrt"] = np.sqrt(quality_rating)

 # The rating is really a set of category labels stored as integers, so the
 # original column is kept as a categorical (string) variable from here on.
 total_df["OverallQual"] = total_df["OverallQual"].astype(str)

 # GrLivArea: polynomial, square-root and log transforms of the raw column
 living_area = total_df["GrLivArea"]
 for exponent in (2, 3):
     total_df["GrLivArea_%d" % exponent] = living_area ** exponent
 total_df["GrLivArea_sqrt"] = np.sqrt(living_area)
 # log1p computes log(1 + x)
 total_df["GrLivArea_log"] = np.log1p(living_area)

 # Bucket the area into six equal-width bands labelled "1" .. "6"
 area_bands = pd.cut(living_area, bins=6, labels=[str(k) for k in range(1, 7)])
 total_df["GrLivArea_Band"] = area_bands
 # show which band labels actually occur in the data
 print(total_df["GrLivArea_Band"].unique())

 # The band is treated as a categorical feature, so the labels are stored
 # as plain strings rather than as a pandas Categorical.
 total_df["GrLivArea_Band"] = total_df["GrLivArea_Band"].astype(str)

 # TotalBsmtSF: polynomial, square-root and log transforms of the raw column
 basement_area = total_df["TotalBsmtSF"]
 for exponent in (2, 3):
     total_df["TotalBsmtSF_%d" % exponent] = basement_area ** exponent
 total_df["TotalBsmtSF_sqrt"] = np.sqrt(basement_area)
 # log1p computes log(1 + x), so an area of 0 maps to 0
 total_df["TotalBsmtSF_log"] = np.log1p(basement_area)

 # 'HasBsmt' flag: 'Y' when the basement area is positive, otherwise 'N'
 # (a missing area compares False and therefore also yields 'N').
 basement_present = basement_area > 0
 total_df["HasBsmt"] = np.where(basement_present, "Y", "N")

 # Bucket the area into three equal-width bands labelled "1" .. "3"
 basement_bands = pd.cut(basement_area, bins=3, labels=[str(k) for k in range(1, 4)])
 total_df["TotalBsmtSF_Band"] = basement_bands
 # show which band labels actually occur in the data
 print(total_df["TotalBsmtSF_Band"].unique())

 # The band is treated as a categorical feature, so the labels are stored
 # as plain strings rather than as a pandas Categorical.
 total_df["TotalBsmtSF_Band"] = total_df["TotalBsmtSF_Band"].astype(str)

 # GarageCars: polynomial, square-root and log transforms of the raw column
 garage_cars = total_df["GarageCars"]
 for exponent in (2, 3):
     # quadratic and cubic terms
     total_df["GarageCars_%d" % exponent] = garage_cars ** exponent
 total_df["GarageCars_sqrt"] = np.sqrt(garage_cars)

 # log1p computes log(1 + x), so a count of 0 maps to 0
 total_df["GarageCars_log"] = np.log1p(garage_cars)
	# Feature engineering begins with the predictors the author identified as most
	# correlated with the target: OverallQual, GrLivArea, GarageCars, TotalBsmtSF.

	# OverallQual: build the numeric transforms first, from a single integer cast
	# of the rating; the column itself is re-typed as a string further down.
	quality_rating = total_df["OverallQual"].astype(int)
	for exponent in (2, 3):
		# quadratic and cubic terms as candidate regression inputs
		total_df["OverallQual_%d" % exponent] = quality_rating ** exponent
	total_df["OverallQual_sqrt"] = np.sqrt(quality_rating)

	# The rating is really a set of category labels stored as integers, so the
	# original column is kept as a categorical (string) variable from here on.
	total_df["OverallQual"] = total_df["OverallQual"].astype(str)

	# GrLivArea: polynomial, square-root and log transforms of the raw column
	living_area = total_df["GrLivArea"]
	for exponent in (2, 3):
		total_df["GrLivArea_%d" % exponent] = living_area ** exponent
	total_df["GrLivArea_sqrt"] = np.sqrt(living_area)
	# log1p computes log(1 + x)
	total_df["GrLivArea_log"] = np.log1p(living_area)

	# Bucket the area into six equal-width bands labelled "1" .. "6"
	area_bands = pd.cut(living_area, bins=6, labels=[str(k) for k in range(1, 7)])
	total_df["GrLivArea_Band"] = area_bands
	# show which band labels actually occur in the data
	print(total_df["GrLivArea_Band"].unique())

	# The band is treated as a categorical feature, so the labels are stored
	# as plain strings rather than as a pandas Categorical.
	total_df["GrLivArea_Band"] = total_df["GrLivArea_Band"].astype(str)

	# TotalBsmtSF: polynomial, square-root and log transforms of the raw column
	basement_area = total_df["TotalBsmtSF"]
	for exponent in (2, 3):
		total_df["TotalBsmtSF_%d" % exponent] = basement_area ** exponent
	total_df["TotalBsmtSF_sqrt"] = np.sqrt(basement_area)
	# log1p computes log(1 + x), so an area of 0 maps to 0
	total_df["TotalBsmtSF_log"] = np.log1p(basement_area)

	# 'HasBsmt' flag: 'Y' when the basement area is positive, otherwise 'N'
	# (a missing area compares False and therefore also yields 'N').
	basement_present = basement_area > 0
	total_df["HasBsmt"] = np.where(basement_present, "Y", "N")

	# Bucket the area into three equal-width bands labelled "1" .. "3"
	basement_bands = pd.cut(basement_area, bins=3, labels=[str(k) for k in range(1, 4)])
	total_df["TotalBsmtSF_Band"] = basement_bands
	# show which band labels actually occur in the data
	print(total_df["TotalBsmtSF_Band"].unique())

	# The band is treated as a categorical feature, so the labels are stored
	# as plain strings rather than as a pandas Categorical.
	total_df["TotalBsmtSF_Band"] = total_df["TotalBsmtSF_Band"].astype(str)

	# GarageCars: polynomial, square-root and log transforms of the raw column
	garage_cars = total_df["GarageCars"]
	for exponent in (2, 3):
		# quadratic and cubic terms
		total_df["GarageCars_%d" % exponent] = garage_cars ** exponent
	total_df["GarageCars_sqrt"] = np.sqrt(garage_cars)

	# log1p computes log(1 + x), so a count of 0 maps to 0
	total_df["GarageCars_log"] = np.log1p(garage_cars)