This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Impute with scikit-learn SimpleImputer
from sklearn.impute import SimpleImputer

# strategy='most_frequent' replaces missing entries with each column's most frequent value.
imputer = SimpleImputer(strategy='most_frequent')
# fit_transform learns the per-column fill values and applies them in one call.
# NOTE(review): by default the result is a NumPy array, so imputed_df stops being
# a DataFrame after this line — confirm downstream code expects that.
imputed_df = imputer.fit_transform(imputed_df)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# check number of missing values | |
print(f"Missing Values Before Flagging: {df['OCCUPATION_TYPE'].isnull().sum()}") | |
# check values of OCCUPATION_TYPE feature | |
print(df['OCCUPATION_TYPE'].value_counts()) | |
# replace values with flag 1=data missing 0=data present
df['OCCUPATION_TYPE'] = np.where(df['OCCUPATION_TYPE'].isnull(), # condition | |
1, # value if true | |
0 # value if false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# impute using fillna | |
# make a copy of dataframe for example purposes | |
imputed_df = df_threshold.copy() | |
# list of columns to impute | |
impute_cols = ['AMT_REQ_CREDIT_BUREAU_YEAR', | |
'AMT_REQ_CREDIT_BUREAU_HOUR', | |
'AMT_REQ_CREDIT_BUREAU_DAY', | |
'AMT_REQ_CREDIT_BUREAU_WEEK', | |
'AMT_REQ_CREDIT_BUREAU_MON', |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# remove columns based on the fraction of missing values (threshold is a ratio between 0 and 1)
def drop_missing_values(dataframe, threshold): | |
# create list of features with missing values over threshold | |
to_drop = [col for col in dataframe if \ | |
(dataframe[col].isnull().sum()/len(dataframe) >= threshold)] | |
print('Columns to drop: ' , (len(to_drop))) | |
# Drop features | |
dataframe = dataframe.drop(columns=to_drop) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create a function to show missing value info | |
def get_missing_values_info(df): | |
# find missing values in each column | |
count_missing = df.isnull().sum() | |
# get missing values as percent | |
percent_missing = (100 * count_missing / df.shape[0]).round(1) | |
# Make dataframe with the results |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Report how many columns contain missing values relative to the total column count.
cols_missing_vals = len(missing_values_list)
df_cols = df.shape[1]
print(f'Columns With Missing Values: {cols_missing_vals}')
print(f'Total Columns: {df_cols}')
# Share of columns with at least one missing value, as a percentage rounded to one decimal.
pct_cols_missing = round(cols_missing_vals / df_cols * 100, 1)
print(f'Percent of Columns with Missing Values: {pct_cols_missing}%')
>>> Columns With Missing Values: 67 | |
>>> Total Columns: 122 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Rename columns using a mapping that contains extra entries
# (presumably labels that are not columns of worker_df — confirm).
# errors='raise' makes rename() raise a KeyError for any mapping key
# that does not match an existing column.
column_name_map = {
    'Worker Name': 'worker_name',
    'Worker DOB': 'worker_dob',
    'Termination Date': 'termination_date',
    'Team': 'team',
    'Worker Status': 'worker_status',
    'Hire Date': 'hire_date',
    'Worker ID': 'worker_id',
}
worker_df_renamed = worker_df.rename(columns=column_name_map, errors='raise')
worker_df_renamed.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Assigning to .columns is positional: each label replaces the column at the
# same index, so the list must follow the DataFrame's existing column order.
worker_df.columns = [
    'worker name',
    'worker id',
    'hire date',
    'worker status',
    'team',
]
worker_df.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Replace all column headers at once; labels are applied by position,
# and the list length must equal the number of columns.
worker_df.columns = [
    'worker_id',
    'worker_name',
    'hire_date',
    'worker_status',
    'team',
]
worker_df.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Rename columns using a mapping that contains extra entries
# (presumably labels that are not columns of worker_df — confirm).
# With rename()'s default errors='ignore', mapping keys that do not match
# an existing column are skipped without raising.
column_name_map = {
    'Worker Name': 'worker_name',
    'Worker DOB': 'worker_dob',
    'Termination Date': 'termination_date',
    'Team': 'team',
    'Worker Status': 'worker_status',
    'Hire Date': 'hire_date',
    'Worker ID': 'worker_id',
}
worker_df_renamed = worker_df.rename(columns=column_name_map)
worker_df_renamed.head()
NewerOlder