Last active
July 5, 2017 18:00
-
-
Save MarcinKosinski/bf8924c2ac1da613af675e66c6a12545 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
extractSurvival <- function(cohorts){ | |
survivalData <- list() | |
for(i in cohorts){ | |
get(paste0(i, ".clinical"), envir = .GlobalEnv) %>% | |
select(patient.bcr_patient_barcode, | |
patient.vital_status, | |
patient.days_to_last_followup, | |
patient.days_to_death ) %>% | |
mutate(bcr_patient_barcode = toupper(patient.bcr_patient_barcode), | |
patient.vital_status = ifelse(patient.vital_status %>% | |
as.character() =="dead",1,0), | |
barcode = patient.bcr_patient_barcode %>% | |
as.character(), | |
times = ifelse( !is.na(patient.days_to_last_followup), | |
patient.days_to_last_followup %>% | |
as.character() %>% | |
as.numeric(), | |
patient.days_to_death %>% | |
as.character() %>% | |
as.numeric() ) | |
) %>% | |
filter(!is.na(times)) -> survivalData[[i]] | |
} | |
do.call(rbind,survivalData) %>% | |
select(bcr_patient_barcode, patient.vital_status, times) %>% | |
unique | |
} | |
extractMutations <- function(cohorts, prc){ | |
mutationsData <- list() | |
for(i in cohorts){ | |
get(paste0(i, ".mutations"), envir = .GlobalEnv) %>% | |
select(Hugo_Symbol, bcr_patient_barcode) %>% | |
filter(nchar(bcr_patient_barcode)==15) %>% | |
filter(substr(bcr_patient_barcode, 14, 15)=="01") %>% | |
unique -> mutationsData[[i]] | |
} | |
do.call(rbind,mutationsData) %>% unique -> mutationsData | |
mutationsData %>% | |
group_by(Hugo_Symbol) %>% | |
summarise(count = n()) %>% | |
arrange(desc(count)) %>% | |
mutate(count_prc = count/length(unique(mutationsData$bcr_patient_barcode))) %>% | |
filter_(paste0("count_prc > ",prc)) %>% | |
select(Hugo_Symbol) %>% | |
unlist -> topGenes | |
mutationsData %>% | |
filter(Hugo_Symbol %in% topGenes) -> mutationsData_top | |
mutationsData_top %>% | |
dplyr::group_by(bcr_patient_barcode) %>% | |
dplyr::summarise(count = n()) %>% | |
group_by(count) %>% | |
summarise(total = n()) %>% | |
arrange(desc(count)) | |
# | |
# mutationsData_top %>% | |
# spread(Hugo_Symbol, bcr_patient_barcode) -> mutationsData_top_sp | |
as.data.table(mutationsData_top) -> mutationsData_top_DT | |
dcast.data.table(mutationsData_top_DT, bcr_patient_barcode ~ Hugo_Symbol , fill = 0) %>% | |
as.data.frame -> mutationsData_top_dcasted | |
mutationsData_top_dcasted[,-1][mutationsData_top_dcasted[,-1] != "0"] <- 1 | |
mutationsData_top_dcasted -> result | |
names(result) <- gsub(names(result),pattern = "-", replacement = "") | |
result | |
} | |
extractCohortIntersection <- function(){ | |
data(package = "RTCGA.mutations")$results[,3] %>% | |
gsub(".mutations", "", x = .) -> mutations_data | |
data(package = "RTCGA.clinical")$results[,3] %>% | |
gsub(".clinical", "", x = .) -> clinical_data | |
intersect(mutations_data, clinical_data) | |
} | |
prepareCoxDataSplit <- function(mutationsData, survivalData, groups, seed = 4561){ | |
mutationsData %>% | |
mutate(bcr_patient_barcode = substr(bcr_patient_barcode,1,12)) %>% | |
left_join(survivalData, | |
by = "bcr_patient_barcode") -> coxData | |
coxData <- coxData[, -c(1,2)] | |
coxData %>% | |
filter(times > 0) %>% | |
filter(!is.na(times)) -> coxData | |
apply(coxData[,-c(1092, 1093)], MARGIN = 2,function(x){ | |
as.numeric(as.character(x)) | |
}) -> coxData[,-c(1092, 1093)] | |
set.seed(seed) | |
sample(groups, replace = TRUE, size = 6085) -> groups | |
split(coxData, groups) #coxData_split | |
} | |
prepareForumlaSGD <- function(coxData){ | |
as.formula(paste("Surv(times, patient.vital_status) ~ ", | |
paste(names(coxData[[1]])[-c(1092, 1093)], | |
collapse="+"), collapse = "")) | |
} | |
full_cox_loglik_matrix <- function(beta, x, censored){ | |
order(x$times) -> order2 | |
x[order2, ] -> xORD | |
censored[order2] -> censORD | |
sum(censORD*(beta%*%x[, -which(names(x)=='times')] - | |
log(cumsum(exp(beta1*rev(x1) + beta2*rev(x2)))))) | |
} | |
library(dplyr) | |
library(RTCGA.clinical) | |
library(RTCGA.mutations) | |
library(data.table) | |
library(coxphSGD) | |
library(archivist) | |
(extractCohortIntersection() -> cohorts) | |
head(extractSurvival(cohorts) -> survivalData) | |
extractMutations(cohorts, 0.02) -> mutationsData | |
mutationsData[1:8, c(1,4,56,100,207,801)] | |
set.seed(4561) | |
prepareCoxDataSplit(mutationsData,survivalData, groups = 100) -> coxData_split | |
head(coxData_split[[1]][c(1,10), c(210,302,356,898,911,1092:1093)]) | |
prepareForumlaSGD(coxData_split) -> formulaSGD | |
coxData_split[99:100] -> testCox | |
coxData_split[1:98] -> trainCox | |
coxphSGD(formulaSGD, data = trainCox, max.iter = 490) -> model_1_over_t | |
coxphSGD(formulaSGD, data = trainCox, max.iter = 490, | |
learningRates = function(t){1/(50*sqrt(t))}) -> model_1_over_50sqrt_t | |
coxphSGD(formulaSGD, data = trainCox, max.iter = 490, | |
learningRates = function(t){1/(100*sqrt(t))}) -> model_1_over_100sqrt_t | |
saveToRepo(testCox) | |
saveToRepo(trainCox) | |
saveToRepo(model_1_over_t) | |
saveToRepo(model_1_over_50sqrt_t) | |
saveToRepo(model_1_over_100sqrt_t) | |
model_1_over_t$coefficients[c(98, 196, 294, 392, 490)] -> coeffs_1_overt | |
do.call(rbind, coeffs_1_overt) %>% | |
as.data.frame %>% | |
mutate(epoka = 1:5) %>% | |
tidyr::gather(key=epoka) ->gat | |
names(gat)[2]<- "gen" | |
library(ggplot2) | |
gat %>% | |
ggplot(aes(value)) + geom_histogram(fill=4, col = 1, binwidth = 0.01) + | |
facet_grid(epoka~.) + | |
theme_classic(base_size = 15) + | |
ylab('liczność') + xlab('wartości współczynników modelu') + | |
ggtitle('Histogramy wartości współczynników modelu po kolejnych epokach') + coord_cartesian(xlim = c(-0.5,0.5)) -> fig1 | |
model_1_over_50sqrt_t$coefficients[c(98, 196, 294, 392, 490)] -> coeffs_over_50sqrt_t | |
do.call(rbind, coeffs_over_50sqrt_t) %>% | |
as.data.frame %>% | |
mutate(epoka = 1:5) %>% | |
tidyr::gather(key=epoka) ->gat2 | |
names(gat2)[2]<- "gen" | |
library(ggplot2) | |
gat2 %>% | |
ggplot(aes(value)) + geom_histogram(fill=5, col = 1, binwidth = 0.01) + | |
facet_grid(epoka~.) + | |
theme_classic(base_size = 15) + | |
ylab('liczność') + xlab('wartości współczynników modelu') + | |
ggtitle('Histogramy wartości współczynników modelu po kolejnych epokach') + coord_cartesian(xlim = c(-0.25,0.25)) -> fig2 | |
model_1_over_100sqrt_t$coefficients[c(98, 196, 294, 392, 490)] -> coeffs_over_100sqrt_t | |
do.call(rbind, coeffs_over_100sqrt_t) %>% | |
as.data.frame %>% | |
mutate(epoka = 1:5) %>% | |
tidyr::gather(key=epoka) ->gat3 | |
names(gat3)[2]<- "gen" | |
gat3 %>% | |
ggplot(aes(value)) + geom_histogram(fill=6, col = 1, binwidth = 0.01) + | |
facet_grid(epoka~.) + | |
theme_classic(base_size = 15) + | |
ylab('liczność') + xlab('wartości współczynników modelu') + | |
ggtitle('Histogramy wartości współczynników modelu po kolejnych epokach') + coord_cartesian(xlim = c(-0.1,0.1)) -> fig3 | |
gat %>% | |
mutate(model = "model1") %>% | |
bind_rows(gat2 %>% | |
mutate(model = "model2")) %>% | |
bind_rows(gat3 %>% | |
mutate(model = "model3")) %>% | |
ggplot(aes(value)) + geom_histogram(fill=4, col = 1, binwidth = 0.001) + | |
facet_grid(epoka~model) + | |
theme_classic(base_size = 15) + | |
ylab('liczność') + xlab('wartość współczynnika modelu') + | |
ggtitle('Histogramy wartości współczynników modelu po kolejnych epokach') -> fig4 | |
do.call(rbind, coeffs_over_100sqrt_t) %>% t %>% as.data.frame -> model3 | |
do.call(rbind, coeffs_over_50sqrt_t) %>% t %>% as.data.frame -> model2 | |
do.call(rbind, coeffs_1_overt) %>% t %>% as.data.frame -> model1 | |
names(model3) <- c("epoka1", "epoka2","epoka3", "epoka4", "epoka5") | |
names(model2) <- c("epoka1", "epoka2","epoka3", "epoka4", "epoka5") | |
names(model1) <- c("epoka1", "epoka2","epoka3", "epoka4", "epoka5") | |
library(GGally) | |
ggpairs( title = 'Różnice między współczynnikami w kolejnych epokach', | |
model1, | |
upper = "blank", | |
lower = list(continuous = "points") | |
) -> fig5 | |
saveToRepo(fig1) # 3dc2e14a037bbaae9e892dd255150c28 | |
saveToRepo(fig2) # 6e5065abdbeae888a2836c1e59460424 | |
saveToRepo(fig3) # ac9a06e9c6428581cdacb2c1e56e5b57 | |
saveToRepo(fig4) # 60e327d310c600493c6c661f7330b868 | |
saveToRepo(fig5) # db5b77ff56b7128244e88d905173cf22 | |
pdf('hist_overt_t.pdf', width = 10, height = 8 ) | |
fig2 | |
dev.off() | |
alink('3dc2e14a037bbaae9e892dd255150c28', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
alink('6e5065abdbeae888a2836c1e59460424', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
alink('ac9a06e9c6428581cdacb2c1e56e5b57', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
alink('60e327d310c600493c6c661f7330b868', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
alink('db5b77ff56b7128244e88d905173cf22', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
names(tail(sort(abs(tail(model_1_over_t$coefficients,1)[[1]])),40)) -> top40 | |
tail(model_1_over_t$coefficients,1)[[1]][which(names(tail(model_1_over_t$coefficients,1)[[1]]) %in% top40)] -> real_top40 | |
data.frame(beta = sort(real_top40, decreasing = TRUE), | |
exp_beta = exp(sort(real_top40, decreasing = TRUE))) %>% xtable::xtable() | |
do.call(rbind, trainCox) -> trainCox_df | |
colMeans(trainCox_df[, -c(1092:1093)]) -> srednie_trainCox_df | |
data.frame( beta =tail(model_1_over_t$coefficients,1)[[1]], | |
beta_star = srednie_trainCox_df) -> wyniki | |
data.frame( gen = rownames(wyniki), | |
beta = round(wyniki[, 1], 3), | |
exp_beta = round(exp(wyniki[, 1]),2), | |
srednia = round(wyniki[, 2],3), | |
beta_star = round(wyniki[, 1]/wyniki[, 2], 2) ) %>% | |
arrange(desc(beta_star)) %>% | |
.[1:40,] %>% xtable::xtable() | |
aoptions('repoDir', getwd()) | |
loadFromLocalRepo('446ac4dcb7d65bf39057bb341b296f1a') | |
do.call(rbind, model_1_over_t$coefficients) -> wspolczynniki | |
qplot(1:491,wspolczynniki[ ,which(colnames(wspolczynniki) == "BRAF")]) + | |
xlab('Krok algorytmu') + | |
ylab('Wartość współczynnika') + | |
theme_minimal(base_size = 20) + | |
ggtitle('Wartości współczynnika genu BRAF') -> fig8 | |
qplot(1:491,wspolczynniki[ ,which(colnames(wspolczynniki) == "EGFR")]) + | |
xlab('Krok algorytmu') + | |
ylab('Wartość współczynnika') + | |
theme_minimal(base_size = 20) + | |
ggtitle('Wartości współczynnika genu EGFR') -> fig9 | |
saveToRepo(fig8) # 3cd8e1f6da766d3028fc2d8f5fef220f | |
alink('3cd8e1f6da766d3028fc2d8f5fef220f', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
saveToRepo(fig9) # 0f5e8e306cc2cba11fceb2bb5f34cbeb | |
alink('0f5e8e306cc2cba11fceb2bb5f34cbeb', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
pdf('fig8.pdf', width = 10, height = 8 ) | |
fig8 | |
dev.off() | |
pdf('fig9.pdf', width = 10, height = 8 ) | |
fig9 | |
dev.off() | |
library(archivist) | |
setLocalRepo(getwd()) | |
loadFromLocalRepo('1a06bef4a60a237bb65ca3e2f3f23515') | |
loadFromLocalRepo('3eebc99bd231b16a3ea4dbeec9ab5edb') | |
do.call(rbind,testCox) -> testCox_binded | |
do.call(rbind,trainCox) -> trainCox_binded | |
library(dplyr) | |
library(survMisc) | |
trainCox_binded %>% | |
select(patient.vital_status, times, BRAF, EGFR) %>% | |
survfit( Surv(times, patient.vital_status) ~ BRAF, data = .) %>% | |
survMisc:::autoplot.survfit( titleSize=12, type="CI", palette = "Set1", | |
alpha = 0.7, survLineSize=2, | |
pX = 0.3, sigP =3, title = "Przeżycie a mutacja genu BRAF" ) -> km_plot | |
xlims <- c(0,5000) | |
library(ggplot2) | |
km_plot[[1]] <- km_plot[[1]] + | |
coord_cartesian(xlim = c(xlims[1],xlims[2])) + | |
theme_light(base_size = 22) | |
km_plot[[2]] <- km_plot[[2]] + | |
coord_cartesian(xlim = xlims)+ | |
theme_light(base_size = 22) | |
pdf("BRAF.pdf", width = 10, height = 8) | |
survMisc::autoplot(km_plot) | |
dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment