Created
April 10, 2016 07:56
-
-
Save malisas/792a1366bd1b3d04ab571a759abfdd4b to your computer and use it in GitHub Desktop.
A script for finding pairs of orthologous genes between two species, based on their blastp alignment results.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
#Author: Malisa Smith 08-12-2015 | |
import optparse | |
from operator import itemgetter | |
""" | |
This program takes as input the results of 2 blastp runs to identify pairs of orthologous genes | |
between two species. File 1 is the result of performing blastp with your query organism against | |
the subject database. File 2 is the result of performing blastp with your subject organism against | |
query database. | |
It identifies orthologs using the logic of "reciprocal best hits" . | |
For example, if Gene X from organism A identifies Gene Y from organism B | |
as its highest scoring hit (lowest e-value), then Gene X and Gene Y are | |
identified as reciprocal best hits if Gene Y from organism B identifies | |
Gene X from organism A as its highest scoring hit. These genes are now | |
identified as orthologs. | |
This program prints out the total number of query and subject genes, and of | |
those genes, how many pairs of orthologs were identified. The program | |
finally outputs the set of identified orthologs. | |
""" | |
########################################## | |
""" | |
Let the user specify the path+name/name of the two input blastp output files and the path+name/name of the output file | |
Input files should be formatted to have three tab-separated columns: | |
query_gene hit1_from_subject e-value for this hit | |
query_gene hit2_from_subject e-value for this hit | |
""" | |
p = optparse.OptionParser() | |
p.add_option("--in1", action="store", default="G_X_test_2genes", dest="file1") | |
p.add_option("--in2", action="store", default="X_G_test_2genes_unsorted", dest="file2") | |
p.add_option("-o", action="store", default="out_test",dest="out") | |
opts, args = p.parse_args() | |
file1_path = opts.file1 #forward blastp hits | |
file2_path = opts.file2 #backwards blastp hits | |
out_file_path = opts.out #file to write the orthologs to | |
#Open the first file. | |
file1_handle = open(file1_path, "r") | |
#Create a dictionary from file1: place all the hits from each gene into its own entry | |
file1_dict = {} | |
for line in file1_handle: | |
line = line.strip('\n') | |
line = line.split('\t') | |
#convert e-values into floats | |
line[2] = float(line[2]) | |
key = line[0] | |
if key in file1_dict: | |
file1_dict[key].append(line[1:]) | |
else: | |
file1_dict[key] = [line[1:]] | |
#Close the first file. | |
file1_handle.close() | |
#Open the second file. | |
file2_handle = open(file2_path, "r") | |
#Create a dictionary from file2: place all the hits from each gene into its own entry | |
file2_dict = {} | |
for line in file2_handle: | |
line = line.strip('\n') | |
line = line.split('\t') | |
#convert e-values into floats | |
line[2] = float(line[2]) | |
key = line[0] | |
if key in file2_dict: | |
file2_dict[key].append(line[1:]) | |
else: | |
file2_dict[key] = [line[1:]] | |
#Close the second file. | |
file2_handle.close() | |
################################################## | |
#Now find the reciprocal best hits | |
#Open the output file | |
out_handle = open(out_file_path, "w") | |
""" | |
Loop through the list of query genes (with their hits) in the first dictionary, file1_dict. | |
For each query gene, sort the query hits by ascending e-value. | |
Then loop through the query hits until: | |
1) you find the reciprocal best hit in the inner loop described below | |
2) you run out of query hits | |
3) the e-value of the next hit is greater than the original e-value | |
A query hit can be thought of as a subject gene. | |
For each query hit/subject gene, sort the subject hits by ascending e-value, in file2_dict. | |
Then loop through the list of subject hits until: | |
1) you find the reciprocal best hit (meaning the query gene == subject hit) | |
2) you run out of subject hits | |
3) the e-value of the next hit is greater than the original e-value | |
Because the program stops searching through query hits as soon as one reciprocal best hit is identified, this program does not identify co-orthologs. | |
""" | |
#Keep track of the number of orthologs identified | |
num_orthologs_identified = 0 | |
#Loop through all the genes in file1_dict | |
for gene,hits in file1_dict.items(): | |
#First sort the hits for the query gene by ascending e-value. This may be unnecessary if hits are already in order in blastp output | |
hits = sorted(hits, key=itemgetter(1)) | |
found_reciprocal_best_hit = False | |
current_query_hit = 0 #index of the current hit in hits | |
query_lowest_e_value = hits[current_query_hit][1] #the e-value of the current hit, which will be the lowest e-value among the hits because of sorting | |
num_query_hits = len(hits) | |
#Loop through the hits for the query gene until you find a reciprocal best hit in the innermost loop or reach then end of all hits with the lowest e-value | |
while not found_reciprocal_best_hit and current_query_hit < num_query_hits and not (hits[current_query_hit][1] > query_lowest_e_value): | |
current_subject_gene_name = hits[current_query_hit][0] #this is the same thing as the corresponding name for current_query_hit | |
#Make sure there is an entry for the query hit in file2_dict | |
if not current_subject_gene_name in file2_dict: | |
current_query_hit+=1 | |
else: | |
#First sort the hits for the subject gene by ascending e-value. This may be unnecessary if hits are already in order in blastp output | |
file2_dict[current_subject_gene_name] = sorted(file2_dict[current_subject_gene_name], key=itemgetter(1)) | |
current_subject_hit = 0 | |
subject_lowest_e_value = file2_dict[current_subject_gene_name][current_subject_hit][1] | |
num_subject_hits = len(file2_dict[current_subject_gene_name]) | |
#Loop through the list of subject hits until you find the reciprocal best hit or reach the end of all hits with the lowest e-value | |
while not found_reciprocal_best_hit and current_subject_hit < num_subject_hits and not (file2_dict[current_subject_gene_name][current_subject_hit][1] > subject_lowest_e_value): | |
current_subject_hit_name = file2_dict[current_subject_gene_name][current_subject_hit][0] | |
#Make sure there is an entry for the subject hit in file1_dict | |
if not file2_dict[current_subject_gene_name][current_subject_hit][0] in file1_dict: | |
current_subject_hit+=1 | |
else: | |
#Check to see if the reciprocal hit (current subject hit name) is the same as the query gene (gene) | |
if current_subject_hit_name == gene: | |
#You've got a match! | |
#Write the name of the orthologs in the output file | |
out_handle.write(gene+"\t"+current_subject_gene_name+"\n") | |
#Increment counter of total orthologs found | |
num_orthologs_identified+=1 | |
#End the loop by setting found_reciprocal_best_hit to True | |
found_reciprocal_best_hit = True | |
current_subject_hit+=1 | |
current_query_hit+=1 | |
print "Number of genes in " + file1_path + ": " + str(len(file1_dict)) | |
print "Number of genes in " + file2_path + ": " + str(len(file2_dict)) | |
print "Number of orthologs identified: " + str(num_orthologs_identified) | |
out_handle.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment