Skip to content

Instantly share code, notes, and snippets.

@parashardhapola
Created April 19, 2018 09:04
Show Gist options
  • Save parashardhapola/e55665162867edb1bab7736d38334bd0 to your computer and use it in GitHub Desktop.
Save parashardhapola/e55665162867edb1bab7736d38334bd0 to your computer and use it in GitHub Desktop.
Remove mean variance trend in single cell RNA-Seq data
import numpy as np
import pandas as pd
from statsmodels.nonparametric.smoothers_lowess import lowess
def fix_var(gene_stats, n_bins = 200, lowess_frac = 0.4):
# gene_stats is a dataframe with gene names as index (rownames) and two
# columns:
# 'm': mean value of each gene
# 'v': variance of each gene
# find out bin positions
bin_edges = np.histogram(gene_stats.m, bins=n_bins)[1]
bin_edges[-1] += 0.1 # For inclusion of last gene
bin_genes = [] # contains names of genes in each bin
for i in range(n_bins):
# idx is a boolean vector where each element corresponds to a gene
# value is set to true is gene is present in current bin (value of
# i in this loop)
idx = (gene_stats.m >= bin_edges[i]) & \
(gene_stats.m < bin_edges[i + 1])
if idx.sum() > 0: # making sure that the bin contains at least one gene
bin_genes.append(list(idx[idx].index))
bin_vals = [] # holds the variance and mean value of such a gene
# from each bin that has lowest variance in that bin
for genes in bin_genes:
# subset the full dataframe to extract genes of current bin
temp_stat = gene_stats.reindex(genes)
# identify gene with lowest varaince in current bin
temp_gene = temp_stat.idxmin().v
bin_vals.append(
[temp_stat.v[temp_gene], temp_stat.m[temp_gene]])
# transpose to have a shape : (2, nbins) such that first row contains
# variances and second contain the means
bin_vals = np.array(bin_vals).T
# Fit lowess curve and get predicted y values
bin_cor_fac = lowess(bin_vals[0], bin_vals[1], return_sorted=False, frac=lowess_frac, it=100).T
fixed_var = {}
for bcf, genes in zip(bin_cor_fac, bin_genes):
for gene in genes:
# correct predicted y values from all
# genes of the corresponding bin
fixed_var[gene] = gene_stats.v[gene] - bcf
fixed_var = pd.Series(fixed_var)
return fixed_var
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment