Skip to content

Instantly share code, notes, and snippets.

@malisas
Created April 10, 2016 07:56
Show Gist options
  • Save malisas/792a1366bd1b3d04ab571a759abfdd4b to your computer and use it in GitHub Desktop.
Save malisas/792a1366bd1b3d04ab571a759abfdd4b to your computer and use it in GitHub Desktop.
A script for finding pairs of orthologous genes between two species, based on their blastp alignment results.
#!/usr/bin/python
#Author: Malisa Smith 08-12-2015
import optparse
from operator import itemgetter
"""
This program takes as input the results of 2 blastp runs to identify pairs of orthologous genes
between two species. File 1 is the result of performing blastp with your query organism against
the subject database. File 2 is the result of performing blastp with your subject organism against
query database.
It identifies orthologs using the logic of "reciprocal best hits" .
For example, if Gene X from organism A identifies Gene Y from organism B
as its highest scoring hit (lowest e-value), then Gene X and Gene Y are
identified as reciprocal best hits if Gene Y from organism B identifies
Gene X from organism A as its highest scoring hit. These genes are now
identified as orthologs.
This program prints out the total number of query and subject genes, and of
those genes, how many pairs of orthologs were identified. The program
finally outputs the set of identified orthologs.
"""
##########################################
"""
Let the user specify the path+name/name of the two input blastp output files and the path+name/name of the output file
Input files should be formatted to have three tab-separated columns:
query_gene hit1_from_subject e-value for this hit
query_gene hit2_from_subject e-value for this hit
"""
p = optparse.OptionParser()
p.add_option("--in1", action="store", default="G_X_test_2genes", dest="file1")
p.add_option("--in2", action="store", default="X_G_test_2genes_unsorted", dest="file2")
p.add_option("-o", action="store", default="out_test",dest="out")
opts, args = p.parse_args()
file1_path = opts.file1 #forward blastp hits
file2_path = opts.file2 #backwards blastp hits
out_file_path = opts.out #file to write the orthologs to
#Open the first file.
file1_handle = open(file1_path, "r")
#Create a dictionary from file1: place all the hits from each gene into its own entry
file1_dict = {}
for line in file1_handle:
line = line.strip('\n')
line = line.split('\t')
#convert e-values into floats
line[2] = float(line[2])
key = line[0]
if key in file1_dict:
file1_dict[key].append(line[1:])
else:
file1_dict[key] = [line[1:]]
#Close the first file.
file1_handle.close()
#Open the second file.
file2_handle = open(file2_path, "r")
#Create a dictionary from file2: place all the hits from each gene into its own entry
file2_dict = {}
for line in file2_handle:
line = line.strip('\n')
line = line.split('\t')
#convert e-values into floats
line[2] = float(line[2])
key = line[0]
if key in file2_dict:
file2_dict[key].append(line[1:])
else:
file2_dict[key] = [line[1:]]
#Close the second file.
file2_handle.close()
##################################################
#Now find the reciprocal best hits
#Open the output file
out_handle = open(out_file_path, "w")
"""
Loop through the list of query genes (with their hits) in the first dictionary, file1_dict.
For each query gene, sort the query hits by ascending e-value.
Then loop through the query hits until:
1) you find the reciprocal best hit in the inner loop described below
2) you run out of query hits
3) the e-value of the next hit is greater than the original e-value
A query hit can be thought of as a subject gene.
For each query hit/subject gene, sort the subject hits by ascending e-value, in file2_dict.
Then loop through the list of subject hits until:
1) you find the reciprocal best hit (meaning the query gene == subject hit)
2) you run out of subject hits
3) the e-value of the next hit is greater than the original e-value
Because the program stops searching through query hits as soon as one reciprocal best hit is identified, this program does not identify co-orthologs.
"""
#Keep track of the number of orthologs identified
num_orthologs_identified = 0
#Loop through all the genes in file1_dict
for gene,hits in file1_dict.items():
#First sort the hits for the query gene by ascending e-value. This may be unnecessary if hits are already in order in blastp output
hits = sorted(hits, key=itemgetter(1))
found_reciprocal_best_hit = False
current_query_hit = 0 #index of the current hit in hits
query_lowest_e_value = hits[current_query_hit][1] #the e-value of the current hit, which will be the lowest e-value among the hits because of sorting
num_query_hits = len(hits)
#Loop through the hits for the query gene until you find a reciprocal best hit in the innermost loop or reach then end of all hits with the lowest e-value
while not found_reciprocal_best_hit and current_query_hit < num_query_hits and not (hits[current_query_hit][1] > query_lowest_e_value):
current_subject_gene_name = hits[current_query_hit][0] #this is the same thing as the corresponding name for current_query_hit
#Make sure there is an entry for the query hit in file2_dict
if not current_subject_gene_name in file2_dict:
current_query_hit+=1
else:
#First sort the hits for the subject gene by ascending e-value. This may be unnecessary if hits are already in order in blastp output
file2_dict[current_subject_gene_name] = sorted(file2_dict[current_subject_gene_name], key=itemgetter(1))
current_subject_hit = 0
subject_lowest_e_value = file2_dict[current_subject_gene_name][current_subject_hit][1]
num_subject_hits = len(file2_dict[current_subject_gene_name])
#Loop through the list of subject hits until you find the reciprocal best hit or reach the end of all hits with the lowest e-value
while not found_reciprocal_best_hit and current_subject_hit < num_subject_hits and not (file2_dict[current_subject_gene_name][current_subject_hit][1] > subject_lowest_e_value):
current_subject_hit_name = file2_dict[current_subject_gene_name][current_subject_hit][0]
#Make sure there is an entry for the subject hit in file1_dict
if not file2_dict[current_subject_gene_name][current_subject_hit][0] in file1_dict:
current_subject_hit+=1
else:
#Check to see if the reciprocal hit (current subject hit name) is the same as the query gene (gene)
if current_subject_hit_name == gene:
#You've got a match!
#Write the name of the orthologs in the output file
out_handle.write(gene+"\t"+current_subject_gene_name+"\n")
#Increment counter of total orthologs found
num_orthologs_identified+=1
#End the loop by setting found_reciprocal_best_hit to True
found_reciprocal_best_hit = True
current_subject_hit+=1
current_query_hit+=1
print "Number of genes in " + file1_path + ": " + str(len(file1_dict))
print "Number of genes in " + file2_path + ": " + str(len(file2_dict))
print "Number of orthologs identified: " + str(num_orthologs_identified)
out_handle.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment