-
-
Save crazyhottommy/042aa268bbc8c6a4067545600da10d24 to your computer and use it in GitHub Desktop.
```{r} | |
TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276", | |
"FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2") | |
``` | |
```{r} | |
TCGAExprsAndPheno2 <- function(gene_vector, cancer_types = "all", expression_type = "TPM"){ | |
library(recount3) | |
## Get all possible projects | |
all_proj <- available_projects() | |
tcga_proj <- all_proj %>% filter(file_source == "tcga") | |
if (cancer_types != "all") { | |
tcga_proj <- tcga_proj[tcga_proj$project %in% cancer_types,] | |
} | |
final_tcga <- NULL | |
for (tcga_ind in seq(nrow(tcga_proj))) { | |
proj_info <- tcga_proj[tcga_ind, c("project", "organism", "project_home")] | |
rse_gene_temp<- create_rse(proj_info) | |
count_matrix <- rse_gene_temp@assays@data$raw_counts | |
## Select gene ind | |
if (!identical("all", gene_vector)) { | |
gene_ind <- rse_gene_temp@rowRanges$gene_name %in% gene_vector | |
} else { | |
gene_ind <- seq(rse_gene_temp@rowRanges$gene_name) | |
} | |
if (expression_type == "TPM") { | |
#### Creating TPM | |
kilobase_length <- rse_gene_temp@rowRanges$bp_length | |
reads_per_rpk <- count_matrix/kilobase_length | |
per_mil_scale <- colSums(reads_per_rpk)/1000000 | |
tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale) | |
## Make sure they match the ENSG and gene order | |
exprs_vector <- tpm_matrix[gene_ind,] | |
} else{ | |
#log2 normalize the count matrix | |
total_reads<- colSums(count_matrix) | |
normalized_counts<- log2(t(t(count_matrix)*10000000/total_reads) + 1) | |
exprs_vector <- normalized_counts[gene_ind,] | |
} | |
tcga_data_frame <- rse_gene_temp@colData@listData |> | |
as.data.frame() | |
if (nrow(exprs_vector) > length(gene_vector)) { | |
ind_vector <- rownames(exprs_vector) | |
} else { | |
ind_vector <- rse_gene_temp@rowRanges[gene_ind, ]$gene_name | |
} | |
tcga_data_frame[,ind_vector] <- exprs_vector | |
tcga_data_frame <- select(tcga_data_frame, !!ind_vector, everything()) | |
final_tcga <- rbind(final_tcga, tcga_data_frame) | |
} | |
## Group cancer data | |
final_tcga_filt <- final_tcga %>% | |
mutate(sample_type = case_when( | |
tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer", | |
tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic", | |
tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic", | |
tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer", | |
tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer", | |
tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer", | |
tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal" | |
)) | |
if (startsWith(colnames(final_tcga_filt)[1], "ENSG")) { | |
message("Multiple columns share gene names, returning ENSG") | |
} | |
return(final_tcga_filt) | |
} | |
``` | |
```{r} | |
tcga_TAA <- TCGAExprsAndPheno2(gene_vector = c(TAAs, "CD24"),cancer_type="all") | |
tcga_df<- tcga_TAA %>% | |
select(any_of(TAAs), CD24, tcga.tcga_barcode, sample_type, study) %>% | |
group_by(sample_type, study) %>% | |
summarise(across(1:20, ~log2(.x+1))) %>% | |
summarise(across(1:20, median)) %>% | |
arrange(study) %>% | |
filter(!is.na(sample_type)) | |
tcga_df<- tcga_df %>% | |
filter(sample_type == "cancer") | |
tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix() | |
rownames(tcga_mat)<- tcga_df %>% pull(study) | |
cell_fun = function(j, i, x, y, w, h, fill) { | |
grid.rect(x = x, y = y, width = w, height = h, | |
gp = gpar(col = "black", fill = NA)) | |
} | |
Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun, | |
name = "log2TPM") | |
``` |
crazyhottommy
commented
Oct 20, 2023
Hi Tommy. Here is the code and the result of my analysis!
`library(cBioPortalData)
library(org.Hs.eg.db)
library(stringr)
library(ComplexHeatmap)
genes_of_interest <- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM",
"MUC16", "MUC1", "CD276", "FOLH1", "DLL3", "VTCN1",
"PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2")
cbio <- cBioPortal()
studies <- getStudies(cbio, buildReport = TRUE)
TCGA_studies <- studies[grep("PanCancer", studies$name), ]
df <- setNames(as.data.frame(lapply(TCGA_studies$studyId, function(study_ID) {
median_value <- unlist(lapply(genes_of_interest, function(gene_of_interest) {
entrez <- mapIds(org.Hs.eg.db, gene_of_interest, "ENTREZID", "SYMBOL")
mol_prof <- paste(study_ID, "rna_seq_v2_mrna", sep = "_" )
df <- molecularData(api = cbio, molecularProfileIds = mol_prof,
entrezGeneIds = entrez,
sampleIds = allSamples(cbio, study_ID)$sampleId)
result <- median(df[[mol_prof]]$value)
if (is.null(result)) {NA} else {result}
}))
setNames(median_value, genes_of_interest)
})), nm = TCGA_studies$studyId)
colnames(df) <- toupper(str_split(colnames(df), "_", simplify = TRUE)[, 1])
Heatmap(log(as.matrix(df) + 0.01), name = "log(RSEM)")`
Best,
Peter
Thanks, Peter! This result looks right to me, with MSLN high in MESO and FOLH1 high in PRAD.
I need to figure out what's wrong with my code working with recount3 data... likely the way I calculate TPM is wrong.
Thanks again!
Tommy
It seems the recount3 raw count is not right... I tried using a different method and the result (raw counts to TPM) is similar to the RSEM
a different data resource
library(ExperimentHub)
library(ComplexHeatmap)
eh = ExperimentHub()
query(eh , "GSE62944")
tcga_data<- eh[["EH1043"]]
colData(tcga_data)[1:5, 1:5]
count_mat<- assay(tcga_data)
count_mat[1:5, 1:5]
pheno_data<- phenoData(tcga_data)
get the gene length
#BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
#BiocManager::install("org.Hs.eg.db")
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
library(readr)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
hg19_genes<- genes(txdb)
hg19_genes
# map the Entrez ID to gene symbol
gene_symbol<- AnnotationDbi::select(org.Hs.eg.db, keys=hg19_genes$gene_id,
columns="SYMBOL", keytype="ENTREZID")
all.equal(hg19_genes$gene_id, gene_symbol$ENTREZID)
hg19_genes$symbol<- gene_symbol$SYMBOL
hg19_genes
gene_df<- data.frame(EntrezID = hg19_genes$gene_id,
Symbol = hg19_genes$symbol,
Gene_length = width(hg19_genes))
# almost 3000 genes not in the dataframe..
table(rownames(count_mat) %in% gene_df$Symbol)
indx<- intersect(rownames(count_mat), gene_df$Symbol)
setdiff(rownames(count_mat), gene_df$Symbol)
setdiff(gene_df$Symbol, rownames(count_mat))
count_mat<- count_mat[indx, ]
gene_df<- gene_df %>%
slice(match(indx, Symbol))
dim(gene_df)
convert counts to TPM
count_to_tpm<- function(mat, gene_length){
mat<- mat/gene_length
total_per_sample<- colSums(mat)
mat<- t(t(mat)/total_per_sample)
return(mat*1000000)
}
tpm_mat<- count_to_tpm(count_mat, gene_df$Gene_length)
TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "EPCAM", "MUC16", "MUC1", "CD276",
"FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2")
# can not find "NECTIN4", it might be a different name
TAAs[!TAAs %in% rownames(tpm_mat)]
tpm_sub<- tpm_mat[TAAs, ]
tpm_median<- cbind(t(tpm_sub), CancerType = as.character(colData(tcga_data)$CancerType)) %>%
as.data.frame() %>%
mutate(across(1:18, as.numeric)) %>%
group_by(CancerType) %>%
summarise(across(1:18, median))
tpm_sub_mat<- as.matrix(tpm_median[,-1])
rownames(tpm_sub_mat)<- tpm_median$CancerType
col_fun<- circlize::colorRamp2(c(-2,0,2), colors = c("blue", "white", "red"))
Heatmap(t(scale(log2(tpm_sub_mat +1))), cluster_columns = TRUE, cell_fun = cell_fun,
name = "log2TPM", col = col_fun)
I rewrote the code, and now the results make sense:
#BiocManager::install("recount3")
library(recount3)
library(purrr)
human_projects <- available_projects()
tcga_info = subset(
human_projects,
file_source == "tcga" & project_type == "data_sources"
)
seq(nrow(tcga_info))
proj_info <- map(seq(nrow(tcga_info)), ~tcga_info[.x, ])
rse_tcga <- map(proj_info, ~create_rse(.x))
/Users/tommytang/Library/Caches/org.R-project.R/R/recount3
does not exist, create directory? (yes/no): yes
TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276", "FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2", "CD24")
#### Creating TPM
count2tpm<- function(rse){
count_matrix <- rse@assays@data$raw_counts
gene_length <- rse@rowRanges$bp_length
reads_per_rpk <- count_matrix/gene_length
per_mil_scale <- colSums(reads_per_rpk)/1000000
tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale)
## Make sure they match the ENSG and gene order
gene_ind<- rse@rowRanges$gene_name %in% TAAs
tpm_submatrix <- tpm_matrix[gene_ind,]
rownames(tpm_submatrix)<- rse@rowRanges[gene_ind, ]$gene_name
return(tpm_submatrix)
}
tpm_data<- map(rse_tcga, count2tpm)
metadata<- map(rse_tcga, ~.x@colData@listData %>% as.data.frame())
tpm_data2<- reduce(tpm_data, cbind)
metadata2<- reduce(metadata, rbind)
dim(tpm_data2)
dim(metadata2)
metadata2<- metadata2 %>%
select(tcga.tcga_barcode, tcga.cgc_sample_sample_type, study) %>%
mutate(sample_type = case_when(
tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer",
tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic",
tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic",
tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer",
tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer",
tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer",
tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal"
))
final_df<- cbind(t(tpm_data2), metadata2)
library(ComplexHeatmap)
tcga_df<- final_df %>%
filter(sample_type == "cancer") %>%
group_by(sample_type, study) %>%
summarise(across(1:20, ~log2(.x+1))) %>%
summarise(across(1:20, median)) %>%
arrange(study) %>%
filter(!is.na(sample_type))
tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix()
rownames(tcga_mat)<- tcga_df %>% pull(study)
cell_fun = function(j, i, x, y, w, h, fill) {
grid.rect(x = x, y = y, width = w, height = h,
gp = gpar(col = "black", fill = NA))
}
Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun,
name = "log2TPM")
You can also scale the expression for each gene across the cancer types
Heatmap(scale(tcga_mat), cluster_columns = TRUE, cell_fun = cell_fun,
name = "scaled\nlog2TPM")
That's great, Tommy!