Created
August 9, 2018 19:54
-
-
Save gabefair/da976005c64e432030d479b4141956c2 to your computer and use it in GitHub Desktop.
merge rows of two csv files based on a column-key and place one of them ahead infront of the other file's rows after the merge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys, os | |
try: | |
import pandas as pd | |
except ImportError: | |
print("Please run `pip install pandas` from a command line") | |
exit() | |
def get_file_head(file_name): | |
with open(file_name, "r") as f: | |
counter = 0 | |
for line in f: | |
print(line) | |
counter += 1 | |
if counter == 5: break | |
def merge_files(new_name_list, old_map_df, source_file_column_name, target_file_column_name): | |
# Extract only the matches | |
print("Now spliting matches from not matches") | |
new_and_old_matches = old_map_df[old_map_df.full_name_h.isin(new_name_list[target_file_column_name])]#.dropna(how='any', axis=0) | |
remaining_old = old_map_df[~old_map_df.full_name_h.isin(new_name_list[target_file_column_name])]#.dropna(how='any', axis=0) | |
file_name = source_file_column_name + "_and_" + target_file_column_name + "_merged.csv" | |
print("Now saving to disk") | |
# First dump the new ones to the top of the csv file | |
new_and_old_matches.to_csv(file_name, index=False) | |
# Now append the rows from the old file | |
remaining_old.to_csv(file_name, header=False, index=False, mode='a') | |
# pandas python2 bug adding index. Unable to drop or ignore it. Reading it back in to save it again without it | |
remaining_old = pd.read_csv(file_name, header=0, index_col='Unnamed: 0') | |
remaining_old.to_csv(file_name, index=False) | |
print("Merge Complete. Here is the merged file's head()") | |
get_file_head(file_name) | |
def read_in_files(target_file, source_file, source_file_column_name, target_file_column_name, target_file_format, source_file_format): | |
print("Reading in files now. This will take ~.15mins") | |
# Read in new file | |
if (target_file_format == '.csv'): | |
new_name_list = pd.read_csv(target_file, header=0) | |
new_name_list.columns = new_name_list.columns.str.strip() #removes any spaces that might be in the column names | |
else: | |
new_name_list = pd.read_csv(target_file, header=None) | |
# Read in old file | |
if (source_file_format == '.csv'): | |
old_map_df = pd.read_csv(source_file, header=0) | |
else: | |
old_map_df = pd.read_csv(source_file, header=None) | |
merge_files(new_name_list, old_map_df, source_file_column_name, target_file_column_name) | |
def get_file_format(file_name): | |
filename, file_extension = os.path.splitext(file_name) | |
return file_extension | |
def reconcile_inputs(arg_length, target_file, source_file, source_file_column_name): | |
target_file_format = get_file_format(target_file) | |
source_file_format = get_file_format(source_file) | |
if (arg_length < 4): | |
target_file_column_name = source_file_column_name | |
else: | |
target_file_column_name = sys.argv[4] | |
read_in_files(target_file, source_file, source_file_column_name, target_file_column_name, target_file_format, source_file_format) | |
def main(): | |
arg_length = len(sys.argv) | |
if (arg_length < 3): | |
print("Please specify at least 3 parameters: <name_of_file_to_merge_on> <name_of_sorce_file_to_merge_from> <source_file_column_name> <OPTIONAL: target_file_column_name>") | |
exit() | |
reconcile_inputs(arg_length, sys.argv[1], sys.argv[2], sys.argv[3]) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment