gabefair · August 9, 2018 19:54
diff --git a/setup.py b/setup.py
 import sys, os 
 
 try: 
    import pandas as pd 
 except ImportError: 
    print("Please run `pip install pandas` from a command line") 
    exit() 
 
 
 def get_file_head(file_name): 
    with open(file_name, "r") as f: 
        counter = 0 
        for line in f: 
            print(line) 
            counter += 1 
            if counter == 5: break 
 
 
 def merge_files(new_name_list, old_map_df, source_file_column_name, target_file_column_name): 
    # Extract only the matches 
    print("Now spliting matches from not matches") 

 	new_and_old_matches = old_map_df[old_map_df.full_name_h.isin(new_name_list[target_file_column_name])]#.dropna(how='any', axis=0) 
 	remaining_old = old_map_df[~old_map_df.full_name_h.isin(new_name_list[target_file_column_name])]#.dropna(how='any', axis=0) 

 
    file_name = source_file_column_name + "_and_" + target_file_column_name + "_merged.csv"
 
    print("Now saving to disk") 
    # First dump the new ones to the top of the csv file 
    new_and_old_matches.to_csv(file_name, index=False) 
 
    # Now append the rows from the old file 
    remaining_old.to_csv(file_name, header=False, index=False, mode='a') 
 
    # pandas python2 bug adding index. Unable to drop or ignore it. Reading it back in to save it again without it 
    remaining_old = pd.read_csv(file_name, header=0, index_col='Unnamed: 0') 
    remaining_old.to_csv(file_name, index=False) 
 
    print("Merge Complete. Here is the merged file's head()") 
    get_file_head(file_name) 
 
 
 def read_in_files(target_file, source_file, source_file_column_name, target_file_column_name, target_file_format, source_file_format): 
    print("Reading in files now. This will take ~.15mins") 
 
    # Read in new file 
    if (target_file_format == '.csv'): 
        new_name_list = pd.read_csv(target_file, header=0) 
        new_name_list.columns = new_name_list.columns.str.strip() #removes any spaces that might be in the column names 
    else: 
        new_name_list = pd.read_csv(target_file, header=None) 
 
    # Read in old file 
    if (source_file_format == '.csv'): 
        old_map_df = pd.read_csv(source_file, header=0) 
    else: 
        old_map_df = pd.read_csv(source_file, header=None) 
 
    merge_files(new_name_list, old_map_df, source_file_column_name, target_file_column_name) 
 
 
 def get_file_format(file_name): 
    filename, file_extension = os.path.splitext(file_name) 
    return file_extension 
 
 
 def reconcile_inputs(arg_length, target_file, source_file, source_file_column_name): 
    target_file_format = get_file_format(target_file) 
    source_file_format = get_file_format(source_file) 
 
 	if (arg_length < 4): 
 		target_file_column_name = source_file_column_name 
 	else: 
 		target_file_column_name = sys.argv[4] 
 
 
    read_in_files(target_file, source_file, source_file_column_name, target_file_column_name, target_file_format, source_file_format) 
 
 
 def main(): 
    arg_length = len(sys.argv) 
    if (arg_length < 3): 
        print("Please specify at least 3 parameters: <name_of_file_to_merge_on> <name_of_sorce_file_to_merge_from> <source_file_column_name> <OPTIONAL: target_file_column_name>") 
        exit() 
    reconcile_inputs(arg_length, sys.argv[1], sys.argv[2], sys.argv[3]) 
 
 
 if __name__ == '__main__': 
    main()
	import sys, os

	try:
	import pandas as pd
	except ImportError:
	print("Please run `pip install pandas` from a command line")
	exit()


	def get_file_head(file_name):
	with open(file_name, "r") as f:
	counter = 0
	for line in f:
	print(line)
	counter += 1
	if counter == 5: break


	def merge_files(new_name_list, old_map_df, source_file_column_name, target_file_column_name):
	# Extract only the matches
	print("Now spliting matches from not matches")

	new_and_old_matches = old_map_df[old_map_df.full_name_h.isin(new_name_list[target_file_column_name])]#.dropna(how='any', axis=0)
	remaining_old = old_map_df[~old_map_df.full_name_h.isin(new_name_list[target_file_column_name])]#.dropna(how='any', axis=0)


	file_name = source_file_column_name + "_and_" + target_file_column_name + "_merged.csv"

	print("Now saving to disk")
	# First dump the new ones to the top of the csv file
	new_and_old_matches.to_csv(file_name, index=False)

	# Now append the rows from the old file
	remaining_old.to_csv(file_name, header=False, index=False, mode='a')

	# pandas python2 bug adding index. Unable to drop or ignore it. Reading it back in to save it again without it
	remaining_old = pd.read_csv(file_name, header=0, index_col='Unnamed: 0')
	remaining_old.to_csv(file_name, index=False)

	print("Merge Complete. Here is the merged file's head()")
	get_file_head(file_name)


	def read_in_files(target_file, source_file, source_file_column_name, target_file_column_name, target_file_format, source_file_format):
	print("Reading in files now. This will take ~.15mins")

	# Read in new file
	if (target_file_format == '.csv'):
	new_name_list = pd.read_csv(target_file, header=0)
	new_name_list.columns = new_name_list.columns.str.strip() #removes any spaces that might be in the column names
	else:
	new_name_list = pd.read_csv(target_file, header=None)

	# Read in old file
	if (source_file_format == '.csv'):
	old_map_df = pd.read_csv(source_file, header=0)
	else:
	old_map_df = pd.read_csv(source_file, header=None)

	merge_files(new_name_list, old_map_df, source_file_column_name, target_file_column_name)


	def get_file_format(file_name):
	filename, file_extension = os.path.splitext(file_name)
	return file_extension


	def reconcile_inputs(arg_length, target_file, source_file, source_file_column_name):
	target_file_format = get_file_format(target_file)
	source_file_format = get_file_format(source_file)

	if (arg_length < 4):
	target_file_column_name = source_file_column_name
	else:
	target_file_column_name = sys.argv[4]


	read_in_files(target_file, source_file, source_file_column_name, target_file_column_name, target_file_format, source_file_format)


	def main():
	arg_length = len(sys.argv)
	if (arg_length < 3):
	print("Please specify at least 3 parameters: <name_of_file_to_merge_on> <name_of_sorce_file_to_merge_from> <source_file_column_name> <OPTIONAL: target_file_column_name>")
	exit()
	reconcile_inputs(arg_length, sys.argv[1], sys.argv[2], sys.argv[3])


	if __name__ == '__main__':
	main()