Created
December 15, 2016 20:07
-
-
Save JohnLonginotto/728d3527fe0483c599894d901096e7b8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Post-process a Mash distance table: clean sample names, apply optional
aliases, and optionally invert distances into similarities."""
import os
import csv
import argparse

parser = argparse.ArgumentParser(description='Process output from Mash')
parser.add_argument('-i', '--input', help='input file', metavar='FILE', required=True)
parser.add_argument('-o', '--output', help='output file', metavar='FILE')
parser.add_argument('-a', '--alias', help='alias file', metavar='FILE')
parser.add_argument('-n', '--nearness', help='invert distance', action="store_true")
args = parser.parse_args()

# Optional alias file: a two-column TSV mapping raw sample names to display names.
# Validated row by row so a malformed file fails loudly instead of producing a
# silently-wrong mapping (dict(csv.reader(...)) would accept duplicates and
# crash confusingly on rows with the wrong number of columns).
aliases = {}
if args.alias is not None:
    # Text mode with newline='' is what the csv module requires on Python 3;
    # the original 'rb' is a Python 2 idiom and raises a TypeError here.
    with open(args.alias, newline='') as alias_file:
        for line_no, row in enumerate(csv.reader(alias_file, delimiter='\t', quotechar='|'), start=1):
            if len(row) != 2:
                raise SystemExit('ERROR: Line %d in alias file %s did not contain exactly two columns!' % (line_no, args.alias))
            alias_from, alias_to = row
            if alias_from in aliases:
                raise SystemExit('ERROR: Line %d in alias file %s contained an alias previously mapped!' % (line_no, args.alias))
            if alias_to in aliases.values():
                raise SystemExit('ERROR: Line %d in alias file %s wants to map %s to %s but the latter has already been assigned!' % (line_no, args.alias, alias_from, alias_to))
            aliases[alias_from] = alias_to

# Default output path: the input path with '.2' inserted before its extension.
if args.output is None:
    path, extension = os.path.splitext(os.path.abspath(os.path.expanduser(args.input)))
    args.output = path + '.2' + extension
def clean_file_name(file_path):
    """Return the display name for *file_path*.

    The bare name is the file's basename with its (last) extension stripped;
    if that bare name has an entry in the module-level ``aliases`` mapping,
    the alias is returned instead.
    """
    bare_name = os.path.basename(os.path.splitext(file_path)[0])
    return aliases.get(bare_name, bare_name)
# Open both files in text mode; newline='' is what the csv module requires on
# Python 3 (the original 'rb'/'wb' are Python 2 idioms and fail here).
with open(args.input, newline='') as input_file, \
     open(args.output, 'w', newline='') as output_file:
    input_csv = csv.reader(input_file, delimiter='\t', quotechar='|')
    output_csv = csv.writer(output_file, delimiter='\t', quotechar='|')
    # Header row: [1:] drops the leading '#query' cell, then every sample name is cleaned.
    output_csv.writerow([clean_file_name(name) for name in next(input_csv)[1:]])
    for input_data in input_csv:
        filename = [clean_file_name(input_data[0])]
        if args.nearness:
            # Mash distances lie in [0, 1]; 1 - d inverts them into similarities.
            values = [1 - float(x) for x in input_data[1:]]
        else:
            values = input_data[1:]
        output_csv.writerow(filename + values)
# print() call: the Python 2 'print x, y' statement form is a SyntaxError on Python 3.
print("All done! See ", args.output)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you do not wish to remove all file extensions, then instead of using
re.sub(r'\.fn?a(st[aq])?$', '', name)
and similar, which no biologist would/should ever understand, I would just take the extension and check whether it is in a set of extensions you want stripped, then use the path with or without the extension accordingly. This will be faster and simpler for your users :)