Tara terrah27

I'm a pharmacist turned data engineer specializing in robotic process automation, reporting, machine learning, and creating beautiful visualizations.

terrah27 / missing_imputer.py

Created March 12, 2022 02:10

	# Fill missing entries using scikit-learn's SimpleImputer with the
	# 'most_frequent' strategy
	from sklearn.impute import SimpleImputer
	imputer = SimpleImputer(strategy='most_frequent')
	# learn the fill values and apply them to the same data in a single call
	# NOTE(review): the result is presumably a NumPy array rather than a DataFrame - confirm
	imputed_df = imputer.fit_transform(imputed_df)

terrah27 / missing_flag.py

Created March 12, 2022 01:51

	# check number of missing values
	print(f"Missing Values Before Flagging: {df['OCCUPATION_TYPE'].isnull().sum()}")

	# check values of OCCUPATION_TYPE feature
	print(df['OCCUPATION_TYPE'].value_counts())

	# replace values with flag 1=data missing 0=data present
	df['OCCUPATION_TYPE'] = np.where(df['OCCUPATION_TYPE'].isnull(), # condition
	1, # value if true
	0 # value if false

terrah27 / impute_mean.py

Created March 12, 2022 01:35

	# impute using fillna
	# make a copy of dataframe for example purposes
	imputed_df = df_threshold.copy()

	# list of columns to impute
	impute_cols = ['AMT_REQ_CREDIT_BUREAU_YEAR',
	'AMT_REQ_CREDIT_BUREAU_HOUR',
	'AMT_REQ_CREDIT_BUREAU_DAY',
	'AMT_REQ_CREDIT_BUREAU_WEEK',
	'AMT_REQ_CREDIT_BUREAU_MON',

terrah27 / drop_missing.py

Last active March 12, 2022 00:30

	# remove columns based on percentage of missing values
	def drop_missing_values(dataframe, threshold):

	# create list of features with missing values over threshold
	to_drop = [col for col in dataframe if \
	(dataframe[col].isnull().sum()/len(dataframe) >= threshold)]

	print('Columns to drop: ' , (len(to_drop)))
	# Drop features
	dataframe = dataframe.drop(columns=to_drop)

terrah27 / missing_vals_function.py

Last active March 12, 2022 00:51

	# create a function to show missing value info
	def get_missing_values_info(df):

	# find missing values in each column
	count_missing = df.isnull().sum()

	# get missing values as percent
	percent_missing = (100 * count_missing / df.shape[0]).round(1)

	# Make dataframe with the results

terrah27 / missing_values.py

Last active March 11, 2022 23:17

	# count the columns that contain missing values and express that count
	# as a percentage of all columns in the dataframe
	cols_missing_vals = len(missing_values_list)
	df_cols = df.shape[1]
	pct_cols_missing = round(cols_missing_vals/df_cols*100,1)

	# assemble the three report lines, then print them in one call
	summary_lines = [
	    f'Columns With Missing Values: {cols_missing_vals}',
	    f'Total Columns: {df_cols}',
	    f'Percent of Columns with Missing Values: {pct_cols_missing}%',
	]
	print('\n'.join(summary_lines))

	>>> Columns With Missing Values: 67
	>>> Total Columns: 122

terrah27 / rename4.py

Created March 7, 2022 01:39

	# rename columns using a mapping that lists more columns than the five
	# assigned in the direct-assignment examples; with errors='raise', rename
	# raises a KeyError for any mapping key that is not an existing column
	column_map = {
	    'Worker Name': 'worker_name',
	    'Worker DOB': 'worker_dob',
	    'Termination Date': 'termination_date',
	    'Team': 'team',
	    'Worker Status': 'worker_status',
	    'Hire Date': 'hire_date',
	    'Worker ID': 'worker_id',
	}
	worker_df_renamed = worker_df.rename(columns=column_map, errors='raise')
	worker_df_renamed.head()

terrah27 / rename_direct_out_of_order.py

Created March 7, 2022 01:31

	# direct assignment applies the new names purely by position, so the
	# list must follow the dataframe's existing column order
	new_headers = ['worker name', 'worker id', 'hire date', 'worker status', 'team']
	worker_df.columns = new_headers
	worker_df.head()

terrah27 / rename_direct.py

Created March 7, 2022 01:28

	# replace all column headers at once by assigning a list of new names
	new_headers = ['worker_id', 'worker_name', 'hire_date', 'worker_status', 'team']
	worker_df.columns = new_headers
	worker_df.head()

terrah27 / rename3.py

Created March 7, 2022 01:19

	# rename columns using a mapping that lists more columns than the five
	# assigned in the direct-assignment examples; no errors argument is passed,
	# so rename's default handling of unmatched keys applies
	column_map = {
	    'Worker Name': 'worker_name',
	    'Worker DOB': 'worker_dob',
	    'Termination Date': 'termination_date',
	    'Team': 'team',
	    'Worker Status': 'worker_status',
	    'Hire Date': 'hire_date',
	    'Worker ID': 'worker_id',
	}
	worker_df_renamed = worker_df.rename(columns=column_map)
	worker_df_renamed.head()