Created
May 22, 2015 08:25
-
-
Save johanez/7b227aca3db10309ebad to your computer and use it in GitHub Desktop.
randomForest CV based variable selection (rfcv) wrapper, that returns the best rf_model and important variable names.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# paralell RF with foreach | |
# careful: no confusion, err.rate, mse and rsq components | |
# (as well as the corresponding components in the test compnent, if exist) | |
rf_parallel <- function(x, y, ntree=500, ncore=4, ...){ | |
require(foreach) | |
require(randomForest) | |
# optional parallel backend | |
if (ncore > 1) { | |
require(doParallel) | |
registerDoParallel(ncore) | |
} | |
rf <- foreach(ntree_i=rep(floor(ntree/ncore), ncore), | |
.combine=randomForest::combine, | |
.packages='randomForest') %do% { | |
randomForest(x, y, ntree=ntree_i, ...) | |
} | |
return(rf) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# modified rfcv that returns the rf witht he optimal ncov | |
# todo: | |
# parallelize cv instead of rf? -> (no double calc of top rf)) | |
source("rf_parallel.R") | |
rfcv_ncov <- function (trainx, trainy, cv.fold = 5, scale = "log", step = 0.5, | |
mtry = function(p) max(1, floor(sqrt(p))), recursive = FALSE, ntree=500, ncore=4, | |
...) | |
{ | |
classRF <- is.factor(trainy) | |
n <- nrow(trainx) | |
p <- ncol(trainx) | |
if (scale == "log") { | |
k <- floor(log(p, base = 1/step)) | |
n.var <- round(p * step^(0:(k - 1))) | |
same <- diff(n.var) == 0 | |
if (any(same)) | |
n.var <- n.var[-which(same)] | |
if (!1 %in% n.var) | |
n.var <- c(n.var, 1) | |
} | |
else { | |
n.var <- seq(from = p, to = 1, by = step) | |
} | |
k <- length(n.var) | |
cv.pred <- vector(k, mode = "list") | |
for (i in 1:k) cv.pred[[i]] <- trainy | |
if (classRF) { | |
f <- trainy | |
} | |
else { | |
f <- factor(rep(1:5, length = length(trainy))[order(order(trainy))]) | |
} | |
nlvl <- table(f) | |
idx <- numeric(n) | |
for (i in 1:length(nlvl)) { | |
idx[which(f == levels(f)[i])] <- sample(rep(1:cv.fold, | |
length = nlvl[i])) | |
} | |
for (i in 1:cv.fold) { | |
# replaced rf by rf_parallel | |
all.rf <- rf_parallel(x=trainx[idx != i, , drop = FALSE], | |
y=trainy[idx != i], | |
ntree=ntree, | |
ncore=ncore, | |
xtest=trainx[idx == i, , drop = FALSE], | |
ytest=trainy[idx == i], mtry = mtry(p), importance = TRUE, | |
keep.forest=FALSE, | |
...) | |
cv.pred[[1]][idx == i] <- all.rf$test$predicted | |
impvar <- (1:p)[order(all.rf$importance[, 1], decreasing = TRUE)] | |
for (j in 2:k) { | |
imp.idx <- impvar[1:n.var[j]] | |
# replaced rf by rf_parallel | |
sub.rf <- rf_parallel(x= trainx[idx != i, imp.idx, drop=F], | |
y=trainy[idx != i], | |
ntree=ntree, | |
ncore=ncore, | |
xtest= trainx[idx == i, imp.idx, drop = FALSE], | |
ytest= trainy[idx == i], | |
mtry = mtry(n.var[j]), importance = recursive, | |
keep.forest=FALSE, | |
...) | |
cv.pred[[j]][idx == i] <- sub.rf$test$predicted | |
if (recursive) { | |
impvar <- (1:length(imp.idx))[order(sub.rf$importance[, | |
1], decreasing = TRUE)] | |
} | |
NULL | |
} | |
NULL | |
cat(i, ".") | |
} | |
if (classRF) { | |
error.cv <- sapply(cv.pred, function(x) mean(trainy != x)) | |
} | |
else { | |
error.cv <- sapply(cv.pred, function(x) mean((trainy - x)^2)) | |
} | |
names(error.cv) <- names(cv.pred) <- n.var | |
n_top <- n.var[which.min(error.cv)] | |
#use seq. random forest here, to keep OOB error est. | |
rf_top <- randomForest(trainx[, impvar[1:n_top], drop = FALSE], trainy, | |
importance = T, ntree=ntree, | |
...) | |
rf_all <- randomForest(trainx, trainy, | |
importance = T, ntree=ntree, | |
...) | |
res <- data.frame(row.names=c("All cov.", "Imp. Cov."), | |
ncov=c(ncol(trainx), n_top), | |
rsqOOB=c(rf_all$rsq[ntree], rf_top$rsq[ntree]), | |
mseOOB=c(rf_all$mse[ntree], rf_top$mse[ntree]), | |
rsqCV=c(1 - (error.cv[1]/var(trainy)), 1 - (error.cv[which.min(error.cv)]/var(trainy))), | |
mseCV=c(error.cv[1], error.cv[which.min(error.cv)]) | |
) | |
cat("\n") | |
print(round(res, 5)) | |
return(list(rf=rf_top, res=res, impvar=names(trainx)[impvar[1:n_top]])) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See:
Svetnik, V., Liaw, A., Tong, C. and Wang, T., “Application of Breiman's Random Forest to Modeling Structure-Activity Relationships of Pharmaceutical Molecules”, MCS 2004, Roli, F. and Windeatt, T. (Eds.) pp. 334-343.