Skip to content

Instantly share code, notes, and snippets.

@parashardhapola
Created July 17, 2016 10:36
Show Gist options
  • Save parashardhapola/595ea962f044834e787e76be1d60bd3a to your computer and use it in GitHub Desktop.
Save parashardhapola/595ea962f044834e787e76be1d60bd3a to your computer and use it in GitHub Desktop.
import sys
import numpy as np
class CuffDiff(object):
def __init__(self, filename, up_log_fc_cutoff=1, down_log_fc_cutoff=-1, alpha=0.05):
self.filename = filename
self.upLogFCcutoff = up_log_fc_cutoff
self.downLogFCcutoff = down_log_fc_cutoff
self.alpha = alpha
self.conditions = self._get_conditions()
self.data_dict = self._read()
self.filtered_data_dict = self._filter()
self.totalGenes, self.filteredGenes = len(self.data_dict), len(self.filtered_data_dict)
self.upGenes, self.downGenes = self._get_diff_genes()
def _get_conditions(self):
with open(self.filename) as h:
h.next()
c = h.next().rstrip('\n').split('\t')
return {'1':c[4], '2':c[5]}
def _read(self):
data = {}
with open(self.filename) as h:
h.next()
for l in h:
c = l.rstrip('\n').split('\t')
if c[2] in data:
data[c[2]]['fpkm_cond1'].append(float(c[7]))
data[c[2]]['fpkm_cond2'].append(float(c[8]))
data[c[2]]['log2_fc'].append(float(c[9]))
data[c[2]]['p_val'].append(float(c[11]))
data[c[2]]['q_val'].append(float(c[12]))
else:
data[c[2]] = {
'fpkm_cond1': [float(c[7])],
'fpkm_cond2': [float(c[8])],
'log2_fc': [float(c[9])],
'p_val': [float(c[11])],
'q_val': [float(c[12])]
}
return data
def _filter(self):
data = {}
for gene in self.data_dict:
if np.sum(self.data_dict[gene]['fpkm_cond1']) + np.sum(self.data_dict[gene]['fpkm_cond2']) > 2:
data[gene] = self.data_dict[gene]
return data
def _get_diff_genes(self):
up_genes = {}
down_genes = {}
for gene in self.filtered_data_dict:
for i in range(len(self.filtered_data_dict[gene]['q_val'])):
if self.filtered_data_dict[gene]['q_val'][i] <= self.alpha:
f1 = max(self.filtered_data_dict[gene]['fpkm_cond1'][i], 1)
f2 = max(self.filtered_data_dict[gene]['fpkm_cond2'][i], 1)
if self.filtered_data_dict[gene]['log2_fc'][i] >= self.upLogFCcutoff:
up_genes[gene] = [f1, f2]
break
elif self.filtered_data_dict[gene]['log2_fc'][i] <= self.downLogFCcutoff:
down_genes[gene] = [f1, f2]
break
return up_genes, down_genes
def _write_output(self):
fpkms = ["\t".join(["Genes", self.conditions['1'], self.conditions['2']])]
for gene in self.upGenes:
fpkms.append("\t".join(map(str, [gene, np.log2(self.upGenes[gene][0]), np.log2(self.upGenes[gene][1])])))
with open('up_genes.txt', 'w') as outfile:
outfile.write("\n".join(fpkms))
fpkms = ["\t".join(["Genes", self.conditions['1'], self.conditions['2']])]
for gene in self.downGenes:
fpkms.append("\t".join(map(str, [gene, np.log2(self.downGenes[gene][0]), np.log2(self.downGenes[gene][1])])))
with open('down_genes.txt', 'w') as outfile:
outfile.write("\n".join(fpkms))
if __name__ == "__main__":
res = CuffDiff(sys.argv[1], up_log_fc_cutoff=np.log2(2), down_log_fc_cutoff=np.log2(0.5))
print "Total %d genes processed." % res.totalGenes
print "Total genes after filtering low expressing genes: %d" % res.filteredGenes
print "Condition 1: %s; Condition 2: %s" % (res.conditions['1'], res.conditions['2'])
print "Number of up-regulated genes: %d" % len(res.upGenes)
print "Number of down-regulated genes: %d" % len(res.downGenes)
res._write_output()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment