Created
May 25, 2011 00:41
-
-
Save DarioS/990079 to your computer and use it in GitHub Desktop.
Count sequencing reads in exons and over junctions.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
setGeneric("rnaCounts", function(alns, tx.db, ...) {standardGeneric("rnaCounts")}) | |
setMethod("unique", "GRangesList", | |
function(x) | |
{ | |
gr <- unname(unlist(x)) | |
values(gr)$gene <- rep(names(x), elementLengths(x)) | |
gr.df <- as.data.frame(gr) | |
gr.df <- gr.df[!duplicated(gr.df), ] | |
split(GRanges(gr.df$seqnames, IRanges(gr.df$start, gr.df$end), gr.df$strand), | |
factor(gr.df$gene, levels = names(x))) | |
}) | |
# Get junctions from introns by transcripts. | |
.junctsByGene <- function(tx.db) | |
{ | |
db.list <- as.list(tx.db) | |
tx.juncts <- intronsByTranscript(tx.db) | |
all.juncts <- unlist(tx.juncts) | |
names(all.juncts) <- values(all.juncts) <- NULL | |
tx_id <- rep(1:length(tx.juncts), elementLengths(tx.juncts)) | |
gene_id <- factor(db.list$genes$gene_id[match(tx_id, db.list$genes$tx_id)], | |
unique(db.list$genes$gene_id)) | |
unique(split(all.juncts, gene_id)) | |
} | |
library(org.Hs.eg.db) | |
# alns is a list of GappedAlignments | |
setMethod("rnaCounts", c("list", "TranscriptDb"), | |
function(alns, tx.db, genes.symbols = as.list(org.Hs.egSYMBOL), verbose = TRUE) | |
{ | |
if(verbose) message("Gathering exons for each gene.") | |
gene.exons <- exonsBy(tx.db, "gene") | |
exons.genes <- rep(names(gene.exons), elementLengths(gene.exons)) | |
exons <- unlist(gene.exons) | |
values(exons) <- NULL | |
gene.exons <- split(exons, exons.genes) | |
if(verbose) message("Disjoining overlapping exons. This may take a few minutes.") | |
exons.disjoint <- isDisjoint(gene.exons) | |
gene.exons[!exons.disjoint] <- endoapply(gene.exons[!exons.disjoint], disjoin) | |
if(verbose) message("Determining junctions for each gene.") | |
gene.juncts <- .junctsByGene(tx.db)[names(gene.exons)] | |
if(verbose) message("Determining intronic regions for each gene.") | |
intronics <- psetdiff(gene.juncts, gene.exons) | |
# findOverlaps can't do 'within' overlaps when a chr is circular. | |
isCircular(seqinfo(gene.exons)) <- rep(FALSE, length(seqlevels(gene.exons))) | |
isCircular(seqinfo(gene.juncts)) <- rep(FALSE, length(seqlevels(gene.juncts))) | |
isCircular(seqinfo(intronics)) <- rep(FALSE, length(seqlevels(intronics))) | |
if(verbose) message("Finding alignment gaps.") | |
alns.gaps <- lapply(alns, function(x) | |
{ | |
gaps.grl <- grglist(x[ngap(x) > 0]) | |
alns.exons <- elementLengths(gaps.grl) | |
exons.sums <- cumsum(alns.exons) | |
index.offsets <- c(0, exons.sums[-length(exons.sums)]) | |
left.exons <- unlist(mapply(function(x, y) y + (1:(x - 1)), | |
as.list(alns.exons), as.list(index.offsets), | |
SIMPLIFY = FALSE)) | |
right.exons <- unlist(mapply(function(x, y) y + (2:x), | |
as.list(alns.exons), as.list(index.offsets), | |
SIMPLIFY = FALSE)) | |
gaps.gr <- unlist(gaps.grl) | |
gaps.ranges <- ranges(gaps.gr) | |
GRanges(seqnames(gaps.gr[left.exons]), | |
pgap(gaps.ranges[left.exons], gaps.ranges[right.exons]), | |
strand(gaps.gr[left.exons])) | |
}) | |
if(verbose) message("Counting in exons.") | |
ex.all <- unname(unlist(gene.exons)) | |
values(ex.all) <- NULL | |
ex.counts <- lapply(alns, function(x) | |
table(factor(subjectHits(findOverlaps(granges(x), ex.all, type = "within")), 1:length(ex.all)))) | |
ex.counts <- as.data.frame(do.call(cbind, ex.counts)) | |
ex.counts$gene <- rep(names(gene.exons), elementLengths(gene.exons)) | |
if(verbose) message("Counting across junctions.") | |
junct.all <- unname(unlist(gene.juncts)) | |
junct.counts <- lapply(alns.gaps, function(x) | |
countOverlaps(junct.all, x, type = "equal")) | |
junct.counts <- as.data.frame(do.call(cbind, junct.counts)) | |
junct.counts$gene <- rep(names(gene.juncts), elementLengths(gene.juncts)) | |
if(verbose) message("Counting inside intronic regions.") | |
intr.all <- unname(unlist(intronics)) | |
intr.counts <- lapply(alns, function(x) | |
table(factor(subjectHits(findOverlaps(granges(x), intr.all, type = "within")), 1:length(intr.all)))) | |
intr.counts <- as.data.frame(do.call(cbind, intr.counts)) | |
intr.counts$gene <- rep(names(intronics), elementLengths(intronics)) | |
all.counts <- rbind(ex.counts, junct.counts, intr.counts) | |
if(!is.null(genes.symbols)) | |
{ | |
if(verbose) message("Annotating with gene symbols.") | |
genes.symbols <- genes.symbols[all.counts$gene] | |
all.counts$gene.symbol <- sapply(genes.symbols, function(x) ifelse(length(x) > 0, x[1], NA)) | |
} | |
features.all <- c(ex.all, junct.all, intr.all) | |
values(features.all)$type <- rep(c("exon", "junction", "intronic"), | |
c(length(ex.all), length(junct.all), length(intr.all))) | |
values(features.all) <- cbind(values(features.all), DataFrame(all.counts)) | |
meta.columns <- colnames(values(features.all)) | |
metadata(features.all) <- list(totals = elementLengths(alns), | |
counts.cols = which(!meta.columns %in% c("type", "gene", "gene.symbol"))) | |
features.all | |
}) |
Overlapping exons are partitioned. Total reads and column indexes of count data are stored in GRanges metadata, making less variables to be passed to isoDiff and to calcFPKMs.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
20 July : The exon counts part only includes reads that fall wholly within an exon now.