Skip to content

Instantly share code, notes, and snippets.

@cwhelan
Created November 20, 2012 21:02
Show Gist options
  • Save cwhelan/4121097 to your computer and use it in GitHub Desktop.
Save cwhelan/4121097 to your computer and use it in GitHub Desktop.
Using BSSmooth / bsseq to plot methylation
library(GenomicRanges)
library(rtracklayer)
library(bsseq)
library(Homo.sapiens)
# methylation summaries are data frames with columns, "chr", "start", "strand", "meth", "unmeth"
# get them all in one data frame
mergedSummary <- merge(normalMethylationSummary, cancerMethylationSummary,
by=c("chr", "start", "strand", "end"),
suffixes=c("normal","cancer"))
# build the bsseq object with methylation (M) and coverage (C) matrices
bs.fit <- BSseq(M=as.matrix(mergedSummary[,c("methnormal","methcancer")]),
C=as.matrix(cbind(mergedSummary[,"methnormal"] + mergedSummary[,"unmethnormal"],
mergedSummary[,"methcancer"] + mergedSummary[,"unmethcancer"])),
pos=mergedSummary$start,
chr=mergedSummary$chr,
sampleNames=c("normal","cancer"))
# need to order the BSseq object
bs.fit <- orderBSseq(bs.fit)
# compute the smoothed fit in parallel by chromosome using 4 cores (takes a while)
bs.fit <- BSmooth(bs.fit, mc.cores = 4, verbose = TRUE, parallelBy = "chromosome")
# load breakpoints
bpAnchoringRegions <- import.bed('LC3961_vs_41_stringent_high_score_anchoring_regions.bed', asRangedData=FALSE)
# find AML Genes (geneMaxRanges is contains the max start and end of all transcripts for a gene, precomputed)
amlGeneSymbols <- c("ZBTB20", "CAV2", "NBEAL1", "CTDSPL", "WIF1", "SFRP1", "SFRP2", "SFRP4", "SFRP5", "DKK1", "PRDM16", "CDKN2B", "CDKN2A", "WT1", "DNMT3A", "DNMT3L", "DNMT3B", "DNMT1", "CEBPA",\
"CDH1", "RUNX1")
amlGeneIds <- select(Homo.sapiens, keys=amlGeneSymbols, cols=c("ENTREZID"), keytype="SYMBOL")$ENTREZID
amlGenes <- geneMaxRanges[amlGeneIds]
# load other features
cpgIslands <- import.gff('/u0/dbase/genomes/human/features/human_cpgislands.gff', asRangedData=FALSE)
repmask <- import.gff3('/u0/dbase/genomes/human/features/human_repmask.gff', asRangedData=FALSE)
repsByClass <- split(repmask, as.factor(mcols(repmask)$repClass))
# build an annotations object
annotations <- c(list(breakpoints=bpAnchoringRegions, exons=exons(hg19UCSCGenes),
"CpG islands"=cpgIslands),
as.list(repsByClass[c("LINE","SINE","Satellite","Simple_repeat")]))
# set the colors for each sample
pData <- pData(bs.fit)
pData$col <- c("blue", "red")
pData(bs.fit) <- pData
# plot the smoothed values for each of the AML gene regions
pdf('amlGenesSmoothed.pdf', width=10, height=9)
plotManyRegions(bs.fit, unlist(amlGenes), extend=5000, main=amlGeneSymbols,
annoTrack=annotations,
)
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment