srishtis · October 9, 2018 11:32
diff --git a/hpp_missing_values_2.py b/hpp_missing_values_2.py
 # Let us first focus on the lesser null percentages (except LoTFrontage)
 # Let us see the distribution of data across these fields

 # first up: Utilities
 total_df.groupby(['Utilities']).size() # only one NoSeWa value and 2 nulls 
 train_df.groupby(['Utilities']).size() # train data contains the 'NoSeWa'i.e. Test has no NoSeWa value
 # 2 null values come from Test data
 ## intuitively this will not play a significant role in our model prediction
 # for now let us populate the nulls with the most frequent value 'AllPub'-- can drop it later
 total_df['Utilities'] = total_df['Utilities'].fillna(total_df['Utilities'].mode()[0]) 

 # next is: Functional
 # Similarly for Functional
 #Functional : by the definition of the column, 'NA' means typical
 total_df.groupby(['Functional']).size() # typ has 2717 as of now
 # Since 'typ' is also the most frequent value, let us replace 'NA' with 'typ'
 total_df["Functional"] = total_df["Functional"].fillna("Typ")
 total_df.groupby(['Functional']).size() # typ= 2719 now

 # Let us now look at: Electrical
 total_df.groupby(['Electrical']).size() # this has one missing value in Train i.e. SBrKr (currently 2671)
 # Let us just populate the NA with the most frequent entry
 total_df['Electrical'] = total_df['Electrical'].fillna(total_df['Electrical'].mode()[0])
 total_df.groupby(['Electrical']).size() # now SBrKr= 2672

 # Like Electrical, KitchenQual has 1 missing value
 total_df.groupby(['KitchenQual']).size() # the missing value is in Test; most frequent value is 'TA'= 1492
 # Let us just replace null with 'TA'
 total_df['KitchenQual'] = total_df['KitchenQual'].fillna(total_df['KitchenQual'].mode()[0])
 total_df.groupby(['KitchenQual']).size() # 'TA'= 1493

 # The next column is SaleType
 total_df.groupby(['SaleType']).size() # one NA in Test, most frequent value is 'WD'=2525
 #populating nulls with the most frequent values
 total_df['SaleType'] = total_df['SaleType'].fillna(total_df['SaleType'].mode()[0])
 total_df.groupby(['SaleType']).size() # 'WD'= 2526'

 # Doing the same thing for Exterior1st and 2nd
 total_df['Exterior1st'] = total_df['Exterior1st'].fillna(total_df['Exterior1st'].mode()[0])
 total_df['Exterior2nd'] = total_df['Exterior2nd'].fillna(total_df['Exterior2nd'].mode()[0])

 # Moving on to the higher null percentages: MSZoninng
 total_df.groupby(['MSZoning']).size() #most frequent value is 'RL'=2265
 # Let us just substitute the 4 nulls with the most frequent values
 total_df['MSZoning'] = total_df['MSZoning'].fillna(total_df['MSZoning'].mode()[0])
 total_df.groupby(['MSZoning']).size()
	# Let us first focus on the lesser null percentages (except LoTFrontage)
	# Let us see the distribution of data across these fields

	# first up: Utilities
	total_df.groupby(['Utilities']).size() # only one NoSeWa value and 2 nulls
	train_df.groupby(['Utilities']).size() # train data contains the 'NoSeWa'i.e. Test has no NoSeWa value
	# 2 null values come from Test data
	## intuitively this will not play a significant role in our model prediction
	# for now let us populate the nulls with the most frequent value 'AllPub'-- can drop it later
	total_df['Utilities'] = total_df['Utilities'].fillna(total_df['Utilities'].mode()[0])

	# next is: Functional
	# Similarly for Functional
	#Functional : by the definition of the column, 'NA' means typical
	total_df.groupby(['Functional']).size() # typ has 2717 as of now
	# Since 'typ' is also the most frequent value, let us replace 'NA' with 'typ'
	total_df["Functional"] = total_df["Functional"].fillna("Typ")
	total_df.groupby(['Functional']).size() # typ= 2719 now

	# Let us now look at: Electrical
	total_df.groupby(['Electrical']).size() # this has one missing value in Train i.e. SBrKr (currently 2671)
	# Let us just populate the NA with the most frequent entry
	total_df['Electrical'] = total_df['Electrical'].fillna(total_df['Electrical'].mode()[0])
	total_df.groupby(['Electrical']).size() # now SBrKr= 2672

	# Like Electrical, KitchenQual has 1 missing value
	total_df.groupby(['KitchenQual']).size() # the missing value is in Test; most frequent value is 'TA'= 1492
	# Let us just replace null with 'TA'
	total_df['KitchenQual'] = total_df['KitchenQual'].fillna(total_df['KitchenQual'].mode()[0])
	total_df.groupby(['KitchenQual']).size() # 'TA'= 1493

	# The next column is SaleType
	total_df.groupby(['SaleType']).size() # one NA in Test, most frequent value is 'WD'=2525
	#populating nulls with the most frequent values
	total_df['SaleType'] = total_df['SaleType'].fillna(total_df['SaleType'].mode()[0])
	total_df.groupby(['SaleType']).size() # 'WD'= 2526'

	# Doing the same thing for Exterior1st and 2nd
	total_df['Exterior1st'] = total_df['Exterior1st'].fillna(total_df['Exterior1st'].mode()[0])
	total_df['Exterior2nd'] = total_df['Exterior2nd'].fillna(total_df['Exterior2nd'].mode()[0])

	# Moving on to the higher null percentages: MSZoninng
	total_df.groupby(['MSZoning']).size() #most frequent value is 'RL'=2265
	# Let us just substitute the 4 nulls with the most frequent values
	total_df['MSZoning'] = total_df['MSZoning'].fillna(total_df['MSZoning'].mode()[0])
	total_df.groupby(['MSZoning']).size()