Last active
January 20, 2022 12:16
-
-
Save JoeUnsung/26e7269e55476f9064a646171cc3ca44 to your computer and use it in GitHub Desktop.
brunch_r_xgboost
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Apply xgboost to the Otto product classification data
## Data      : https://www.kaggle.com/c/otto-group-product-classification-challenge/data
## Reference : https://www.analyticsvidhya.com/blog/2016/01/xgboost-algorithm-easy-steps/

# Install only the packages that are actually missing, instead of calling
# install.packages() unconditionally on every run of the script.
pkgs <- c("Matrix", "caret", "car", "dplyr", "xgboost", "readr", "stringr")
missing_pkgs <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(Matrix)
library(xgboost)
library(readr)
library(stringr)
library(caret)
library(car)
library(dplyr)

# NOTE(review): machine-specific hard-coded path; prefer an RStudio project
# or relative paths so the script is portable. Kept so the reads below work.
setwd("C:\\Users\\ts93856\\Desktop\\datasource")
# Load data ----
df_train <- read.csv("train.csv")
df_test <- read.csv("test.csv")

# xgboost only accepts numeric input, so coerce every test column
# (factor columns become their integer codes).
df_test <- as.data.frame(lapply(df_test, as.numeric))

# Drop the first column (row id) so only the features remain.
x_test <- df_test[, -1]
## Very simple way to convert categorical data into numeric data.
## xgboost can only handle numeric input, so nominal variables (including
## the target) are coerced to their integer codes.
x <- lapply(df_train, as.numeric)
train <- as.data.frame(x)

# Drop the target (last column) and the id (first column) to keep features only.
train <- train[, -ncol(train)]
train <- train[, -1]

## Build the label vector. xgboost multiclass labels must be 0-based,
## so shift the 1..9 class codes down to 0..8.
y <- x$target
y <- y - 1
unique(y)  # sanity check: should be the values 0:8

# Interactive-only helpers removed from the script body (they open the help
# pane / data viewer and have no place in a non-interactive run):
# ?xgboost
# View(data.matrix(train))
## Modeling ----
# The R xgboost package draws its randomness from R's RNG; a `seed`
# argument in the call is silently ignored, so set the seed explicitly
# to make column subsampling reproducible.
set.seed(1)
xgb <- xgboost(data = data.matrix(train),
               label = y,
               eta = 0.3,               # learning rate / shrinkage (xgboost default is 0.3)
               max_depth = 15,          # maximum depth of each tree
               nround = 25,             # number of boosting iterations
               subsample = 1,           # row sampling ratio per tree
               colsample_bytree = 0.5,  # column sampling ratio per tree
               eval_metric = "merror",  # multiclass classification error rate
               objective = "multi:softprob",  # emit per-class probabilities
               num_class = 9,           # the Otto data has 9 product classes
               nthread = 3
               )
xgb
## Scoring ----
y_pred <- predict(xgb, data.matrix(x_test), na.action = na.pass)
sum(y_pred)

## Prediction: the flat probability vector holds 9 consecutive class
## probabilities per observation. Reshape row-wise (equivalent to the
## classic matrix(nrow = 9) %>% t() idiom) and pick the winning class.
n_class <- 9
prob_matrix <- matrix(y_pred, ncol = n_class, byrow = TRUE)
test_prediction <- data.frame(prob_matrix) %>%
  mutate(label = 1,
         max_prob = max.col(., "last"))
head(test_prediction, 3)
result <- test_prediction$max_prob
## Build the submission data ----
# NOTE(review): the original code preallocated with nrow(x) while `x` was
# still the training list (nrow() is NULL there, which errors), merged in an
# undefined `sample_sub` object (presumably the Kaggle sampleSubmission.csv
# that is never read), and one-hot encoded via a 9-branch if/else loop.
# The one-hot matrix only needs the predicted classes, so build it directly.

# One-hot encode: row i of the identity matrix is all zeros except a 1 in
# column i, so indexing diag(9) by the predicted class gives each
# observation a 1 in its predicted class column and 0 elsewhere.
sub_csv <- diag(9)[result, , drop = FALSE]

id <- seq_len(length(result))
result_submission <- data.frame(id, sub_csv)
## Write the submission file ----
# Name the columns BEFORE writing; the original wrote the CSV first and
# renamed afterwards, so the saved file carried the default X1..X9 headers
# instead of the Class_1..Class_9 headers the competition expects.
colnames(result_submission) <- c("id", paste0("Class_", 1:9))

# row.names = FALSE: a submission file must not carry an extra index column.
write.csv(result_submission, "sampleSubmission.csv", row.names = FALSE)

nrow(result_submission)
colnames(result_submission)
## Visualize which variables matter most ----
# Dump the fitted trees as text, including split statistics (gain/cover).
model <- xgb.dump(xgb, with.stats = TRUE)
model[1:10]  # first 10 lines of the tree dump

# Feature names, taken from the training matrix column names.
# (Renamed from `names` to avoid shadowing base::names().)
feature_names <- dimnames(data.matrix(train))[[2]]

# Compute the feature importance matrix and plot the top 10 features.
# top_n = 10 already limits the plot, so pre-slicing with [1:10, ] is redundant.
importance_matrix <- xgb.importance(feature_names = feature_names, model = xgb)
xgb.plot.importance(importance_matrix, top_n = 10)
## Validation step ----
# Pearson's chi-squared test of independence between feat_11 and the class
# label. NOTE(review): the original passed `data` as the second argument,
# which is never defined in this script (it resolves to the base function
# `data` and errors); the intended second variable is presumably the
# target/label vector `y` — confirm against the author's intent.
test <- chisq.test(train$feat_11, y)
print(test)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment