malisas · April 10, 2016 07:56
diff --git a/reciprocal_best_hits_for_blastp.py b/reciprocal_best_hits_for_blastp.py
 #!/usr/bin/python
 #Author: Malisa Smith     08-12-2015
 import optparse
 from operator import itemgetter

 """
 This program takes as input the results of 2 blastp runs to identify pairs of orthologous genes
 between two species. File 1 is the result of performing blastp with your query organism against
 the subject database. File 2 is the result of performing blastp with your subject organism against
 query database.

 It identifies orthologs using the logic of "reciprocal best hits" .
 For example, if Gene X from organism A identifies Gene Y from organism B
 as its highest scoring hit (lowest e-value), then Gene X and Gene Y are
 identified as reciprocal best hits if Gene Y from organism B identifies
 Gene X from organism A as its highest scoring hit. These genes are now
 identified as orthologs.

 This program prints out the total number of query and subject genes, and of
 those genes, how many pairs of orthologs were identified. The program
 finally outputs the set of identified orthologs.
 """
 ##########################################
 """
 Let the user specify the path+name/name of the two input blastp output files and the path+name/name of the output file

 Input files should be formatted to have three tab-separated columns:
 query_gene	hit1_from_subject	e-value for this hit
 query_gene  hit2_from_subject   e-value for this hit

 """
 p = optparse.OptionParser()
 p.add_option("--in1", action="store", default="G_X_test_2genes", dest="file1")
 p.add_option("--in2", action="store", default="X_G_test_2genes_unsorted", dest="file2")
 p.add_option("-o", action="store", default="out_test",dest="out")
 opts, args = p.parse_args()
 file1_path = opts.file1 #forward blastp hits
 file2_path = opts.file2 #backwards blastp hits
 out_file_path = opts.out #file to write the orthologs to

 #Open the first file.
 file1_handle = open(file1_path, "r")

 #Create a dictionary from file1: place all the hits from each gene into its own entry
 file1_dict = {}
 for line in file1_handle:
    line = line.strip('\n')
    line = line.split('\t')
    #convert e-values into floats
    line[2] = float(line[2])
    key = line[0]
    if key in file1_dict:
        file1_dict[key].append(line[1:])
    else:
        file1_dict[key] = [line[1:]]

 #Close the first file.
 file1_handle.close()

 #Open the second file.
 file2_handle = open(file2_path, "r")

 #Create a dictionary from file2: place all the hits from each gene into its own entry
 file2_dict = {}
 for line in file2_handle:
    line = line.strip('\n')
    line = line.split('\t')
    #convert e-values into floats
    line[2] = float(line[2])
    key = line[0]
    if key in file2_dict:
        file2_dict[key].append(line[1:])
    else:
        file2_dict[key] = [line[1:]]

 #Close the second file.
 file2_handle.close()

 ##################################################
 #Now find the reciprocal best hits 

 #Open the output file
 out_handle = open(out_file_path, "w")

 """
 Loop through the list of query genes (with their hits) in the first dictionary, file1_dict.

 	For each query gene, sort the query hits by ascending e-value.
 	Then loop through the query hits until:
        1) you find the reciprocal best hit in the inner loop described below
        2) you run out of query hits
        3) the e-value of the next hit is greater than the original e-value

 		A query hit can be thought of as a subject gene.

 		For each query hit/subject gene, sort the subject hits by ascending e-value, in file2_dict.
 		Then loop through the list of subject hits until:
                1) you find the reciprocal best hit (meaning the query gene == subject hit)
                2) you run out of subject hits
                3) the e-value of the next hit is greater than the original e-value

 Because the program stops searching through query hits as soon as one reciprocal best hit is identified, this program does not identify co-orthologs.
 """

 #Keep track of the number of orthologs identified
 num_orthologs_identified = 0
 #Loop through all the genes in file1_dict
 for gene,hits in file1_dict.items():
    #First sort the hits for the query gene by ascending e-value. This may be unnecessary if hits are already in order in blastp output
    hits = sorted(hits, key=itemgetter(1))

    found_reciprocal_best_hit = False
    current_query_hit = 0 #index of the current hit in hits
    query_lowest_e_value = hits[current_query_hit][1] #the e-value of the current hit, which will be the lowest e-value among the hits because of sorting
    num_query_hits = len(hits)

    #Loop through the hits for the query gene until you find a reciprocal best hit in the innermost loop or reach then end of all hits with the lowest e-value
    while not found_reciprocal_best_hit and current_query_hit < num_query_hits and not (hits[current_query_hit][1] > query_lowest_e_value):
        current_subject_gene_name = hits[current_query_hit][0] #this is the same thing as the corresponding name for current_query_hit
    	
 	#Make sure there is an entry for the query hit in file2_dict
        if not current_subject_gene_name in file2_dict:
            current_query_hit+=1
        else:
            #First sort the hits for the subject gene by ascending e-value. This may be unnecessary if hits are already in order in blastp output
            file2_dict[current_subject_gene_name] = sorted(file2_dict[current_subject_gene_name], key=itemgetter(1))

            current_subject_hit = 0
            subject_lowest_e_value = file2_dict[current_subject_gene_name][current_subject_hit][1]
            num_subject_hits = len(file2_dict[current_subject_gene_name])

            #Loop through the list of subject hits until you find the reciprocal best hit or reach the end of all hits with the lowest e-value
            while not found_reciprocal_best_hit and current_subject_hit < num_subject_hits and not (file2_dict[current_subject_gene_name][current_subject_hit][1] > subject_lowest_e_value):
                current_subject_hit_name = file2_dict[current_subject_gene_name][current_subject_hit][0]

            	#Make sure there is an entry for the subject hit in file1_dict
                if not file2_dict[current_subject_gene_name][current_subject_hit][0] in file1_dict:
                    current_subject_hit+=1
                else:
                    #Check to see if the reciprocal hit (current subject hit name) is the same as the query gene (gene)
                    if current_subject_hit_name == gene:
                        #You've got a match!
                        #Write the name of the orthologs in the output file
                        out_handle.write(gene+"\t"+current_subject_gene_name+"\n")
                        #Increment counter of total orthologs found
                        num_orthologs_identified+=1
                        #End the loop by setting found_reciprocal_best_hit to True
                        found_reciprocal_best_hit = True
                current_subject_hit+=1
            current_query_hit+=1
        
 print "Number of genes in " + file1_path + ": " + str(len(file1_dict))
 print "Number of genes in " + file2_path + ": " + str(len(file2_dict))
 print "Number of orthologs identified: " + str(num_orthologs_identified)

 out_handle.close()
	#!/usr/bin/python
	#Author: Malisa Smith 08-12-2015
	import optparse
	from operator import itemgetter

	"""
	This program takes as input the results of 2 blastp runs to identify pairs of orthologous genes
	between two species. File 1 is the result of performing blastp with your query organism against
	the subject database. File 2 is the result of performing blastp with your subject organism against
	query database.

	It identifies orthologs using the logic of "reciprocal best hits" .
	For example, if Gene X from organism A identifies Gene Y from organism B
	as its highest scoring hit (lowest e-value), then Gene X and Gene Y are
	identified as reciprocal best hits if Gene Y from organism B identifies
	Gene X from organism A as its highest scoring hit. These genes are now
	identified as orthologs.

	This program prints out the total number of query and subject genes, and of
	those genes, how many pairs of orthologs were identified. The program
	finally outputs the set of identified orthologs.
	"""
	##########################################
	"""
	Let the user specify the path+name/name of the two input blastp output files and the path+name/name of the output file

	Input files should be formatted to have three tab-separated columns:
	query_gene hit1_from_subject e-value for this hit
	query_gene hit2_from_subject e-value for this hit

	"""
	p = optparse.OptionParser()
	p.add_option("--in1", action="store", default="G_X_test_2genes", dest="file1")
	p.add_option("--in2", action="store", default="X_G_test_2genes_unsorted", dest="file2")
	p.add_option("-o", action="store", default="out_test",dest="out")
	opts, args = p.parse_args()
	file1_path = opts.file1 #forward blastp hits
	file2_path = opts.file2 #backwards blastp hits
	out_file_path = opts.out #file to write the orthologs to

	#Open the first file.
	file1_handle = open(file1_path, "r")

	#Create a dictionary from file1: place all the hits from each gene into its own entry
	file1_dict = {}
	for line in file1_handle:
	line = line.strip('\n')
	line = line.split('\t')
	#convert e-values into floats
	line[2] = float(line[2])
	key = line[0]
	if key in file1_dict:
	file1_dict[key].append(line[1:])
	else:
	file1_dict[key] = [line[1:]]

	#Close the first file.
	file1_handle.close()

	#Open the second file.
	file2_handle = open(file2_path, "r")

	#Create a dictionary from file2: place all the hits from each gene into its own entry
	file2_dict = {}
	for line in file2_handle:
	line = line.strip('\n')
	line = line.split('\t')
	#convert e-values into floats
	line[2] = float(line[2])
	key = line[0]
	if key in file2_dict:
	file2_dict[key].append(line[1:])
	else:
	file2_dict[key] = [line[1:]]

	#Close the second file.
	file2_handle.close()

	##################################################
	#Now find the reciprocal best hits

	#Open the output file
	out_handle = open(out_file_path, "w")

	"""
	Loop through the list of query genes (with their hits) in the first dictionary, file1_dict.

	For each query gene, sort the query hits by ascending e-value.
	Then loop through the query hits until:
	1) you find the reciprocal best hit in the inner loop described below
	2) you run out of query hits
	3) the e-value of the next hit is greater than the original e-value

	A query hit can be thought of as a subject gene.

	For each query hit/subject gene, sort the subject hits by ascending e-value, in file2_dict.
	Then loop through the list of subject hits until:
	1) you find the reciprocal best hit (meaning the query gene == subject hit)
	2) you run out of subject hits
	3) the e-value of the next hit is greater than the original e-value

	Because the program stops searching through query hits as soon as one reciprocal best hit is identified, this program does not identify co-orthologs.
	"""

	#Keep track of the number of orthologs identified
	num_orthologs_identified = 0
	#Loop through all the genes in file1_dict
	for gene,hits in file1_dict.items():
	#First sort the hits for the query gene by ascending e-value. This may be unnecessary if hits are already in order in blastp output
	hits = sorted(hits, key=itemgetter(1))

	found_reciprocal_best_hit = False
	current_query_hit = 0 #index of the current hit in hits
	query_lowest_e_value = hits[current_query_hit][1] #the e-value of the current hit, which will be the lowest e-value among the hits because of sorting
	num_query_hits = len(hits)

	#Loop through the hits for the query gene until you find a reciprocal best hit in the innermost loop or reach then end of all hits with the lowest e-value
	while not found_reciprocal_best_hit and current_query_hit < num_query_hits and not (hits[current_query_hit][1] > query_lowest_e_value):
	current_subject_gene_name = hits[current_query_hit][0] #this is the same thing as the corresponding name for current_query_hit

	#Make sure there is an entry for the query hit in file2_dict
	if not current_subject_gene_name in file2_dict:
	current_query_hit+=1
	else:
	#First sort the hits for the subject gene by ascending e-value. This may be unnecessary if hits are already in order in blastp output
	file2_dict[current_subject_gene_name] = sorted(file2_dict[current_subject_gene_name], key=itemgetter(1))

	current_subject_hit = 0
	subject_lowest_e_value = file2_dict[current_subject_gene_name][current_subject_hit][1]
	num_subject_hits = len(file2_dict[current_subject_gene_name])

	#Loop through the list of subject hits until you find the reciprocal best hit or reach the end of all hits with the lowest e-value
	while not found_reciprocal_best_hit and current_subject_hit < num_subject_hits and not (file2_dict[current_subject_gene_name][current_subject_hit][1] > subject_lowest_e_value):
	current_subject_hit_name = file2_dict[current_subject_gene_name][current_subject_hit][0]

	#Make sure there is an entry for the subject hit in file1_dict
	if not file2_dict[current_subject_gene_name][current_subject_hit][0] in file1_dict:
	current_subject_hit+=1
	else:
	#Check to see if the reciprocal hit (current subject hit name) is the same as the query gene (gene)
	if current_subject_hit_name == gene:
	#You've got a match!
	#Write the name of the orthologs in the output file
	out_handle.write(gene+"\t"+current_subject_gene_name+"\n")
	#Increment counter of total orthologs found
	num_orthologs_identified+=1
	#End the loop by setting found_reciprocal_best_hit to True
	found_reciprocal_best_hit = True
	current_subject_hit+=1
	current_query_hit+=1

	print "Number of genes in " + file1_path + ": " + str(len(file1_dict))
	print "Number of genes in " + file2_path + ": " + str(len(file2_dict))
	print "Number of orthologs identified: " + str(num_orthologs_identified)

	out_handle.close()