Last active
March 6, 2021 17:21
-
-
Save florianhartig/ed38cd4627ef4cb99958cda485cfd683 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The purpose of this script is to show that RF variable importance
# will split importance values for collinear variables evenly,
# even if collinearity is low enough that the variables are separable
# and would be correctly separated by an lm / ANOVA.

library(randomForest)

set.seed(123)

# Simulation parameters ----
n <- 3000   # number of simulated observations
col <- 0.7  # weight of x1 inside x2; the remainder is independent noise

# Create collinear predictors ----
x1 <- runif(n)
x2 <- col * x1 + (1 - col) * runif(n)

# Response is only influenced by x1 ----
y <- x1 + rnorm(n)

# Fit RF and show variable importance ----
fit <- randomForest(y ~ x1 + x2, importance = TRUE)

# Draw both plots side by side; keep the previous graphics settings so they
# can be restored once the script is done.
old_par <- par(mfrow = c(1, 2))

# Per the script's premise, the importance is expected to be shared between
# x1 and x2 because of their collinearity. type = 2 selects the node-impurity
# based importance measure.
varImpPlot(fit, type = 2, main = "RF variable importance")

# lm / ANOVA, in contrast, attribute the effect to x1 ----
# Fit the linear model once and reuse it for both the table and the plot.
lm_fit <- lm(y ~ x1 + x2)

# print() explicitly so the table also shows up when the file is source()d.
print(anova(lm_fit))
barplot(coef(lm_fit), main = "lm estimates")

# Restore the graphics layout that was active before this script ran.
par(old_par)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment