Skip to content

Instantly share code, notes, and snippets.

@Laurae2
Created March 29, 2020 14:16
Show Gist options
  • Save Laurae2/13d6b55848b84fcb16f5b83f8cb3a070 to your computer and use it in GitHub Desktop.
Save Laurae2/13d6b55848b84fcb16f5b83f8cb3a070 to your computer and use it in GitHub Desktop.
Benchmark xgboost and LightGBM using HIGGS dataset in R
library(xgboost)
library(lightgbm)
library(data.table)
# Data loading ----
# Directory that holds HIGGS.csv. The file is resolved through file.path()
# rather than setwd(), so the session's working directory is left untouched.
# Override the location with the GBM_PERF_DIR environment variable; the
# fallback is the path the benchmark was originally run from.
data_dir <- Sys.getenv(
  "GBM_PERF_DIR",
  unset = "/home/laurae/Documents/R/GBM-perf"
)
# Thread count handed to both libraries for training and prediction.
n_threads <- 16

# Row split: rows 1..n_train are used for training, rows n_train+1..n_total
# for validation.
n_train <- 10000000L
n_total <- 11000000L

data <- fread(file.path(data_dir, "HIGGS.csv"))
# Fail early with a readable message instead of a subscript error below.
if (nrow(data) < n_total) {
  stop(
    "HIGGS.csv has ", nrow(data), " rows; at least ", n_total,
    " are required",
    call. = FALSE
  )
}

# Column V1 is used as the target: copy it out, then drop it by reference so
# that only the feature columns are converted to a matrix.
labels <- data$V1
data[, V1 := NULL]
data <- as.matrix(data)
invisible(gc(verbose = FALSE))

train_rows <- seq_len(n_train)
valid_rows <- (n_train + 1L):n_total
data_train <- data[train_rows, ]
data_valid <- data[valid_rows, ]
labels_train <- labels[train_rows]
labels_valid <- labels[valid_rows]
# Area under the ROC curve computed from the rank-sum statistic.
#
# Arguments:
#   preds  - numeric vector of predicted scores.
#   labels - vector of the same length as `preds`; entries equal to 1 are
#            taken as positives, entries equal to 0 as negatives, and any
#            other value is ignored.
# Returns:
#   A single numeric value. The result is NaN when either class is absent.
# Errors:
#   Stops if `preds` and `labels` differ in length (this used to be silently
#   recycled by the logical subsetting).
auc <- function(preds, labels) {
  stopifnot(length(preds) == length(labels))
  pos_scores <- as.numeric(preds[labels == 1])
  neg_scores <- as.numeric(preds[labels == 0])
  # Counts are stored as doubles so the products below cannot overflow the
  # integer range on large inputs.
  n_pos <- as.numeric(length(pos_scores))
  n_neg <- as.numeric(length(neg_scores))
  # Positives are placed first, so their ranks are the first n_pos entries.
  ranks <- rank(c(pos_scores, neg_scores))
  # seq_len() gives an empty index when there are no positives; 1:0 would
  # have selected the first rank and produced Inf instead of NaN.
  pos_rank_sum <- sum(ranks[seq_len(n_pos)])
  (pos_rank_sum - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
}
invisible(gc(verbose = FALSE))

# LightGBM datasets ----
# 0.005 seconds
# NOTE(review): the near-zero timing suggests these calls only declare the
# datasets and that the actual construction cost is paid inside the first
# lgb.train() call - confirm against the lightgbm documentation.
system.time({
  dtrain_lgb <- lgb.Dataset(data = data_train, label = labels_train)
  # The validation set is created from the training dataset object.
  dvalid_lgb <- lgb.Dataset.create.valid(
    dataset = dtrain_lgb,
    data = data_valid,
    label = labels_valid
  )
})
invisible(gc(verbose = FALSE))

# Named list of validation sets, consumed by the early-stopping run.
valids_lgb <- list(valid = dvalid_lgb)
# LightGBM: fixed 500 rounds, no evaluation metric ----
# Timing comments below are kept as recorded by the author; presumably
# "[CPU seconds] elapsed seconds" - TODO confirm.
params_lgb <- list(
  max_depth = 0,
  num_leaves = 255,
  learning_rate = 0.1,
  min_data_in_leaf = 1,
  min_sum_hessian_in_leaf = 100,
  lambda_l1 = 0,
  lambda_l2 = 0,
  min_gain_to_split = 0,
  max_bin = 255,
  force_row_wise = TRUE,
  boosting = "gbdt",
  objective = "regression",
  metric = "na",
  # The thread count belongs in `params`: recent lightgbm releases no longer
  # accept extra training parameters through `...` of lgb.train().
  num_threads = n_threads
)
# [2646.234] 74.762 seconds (36 threads) / [1245.910] 79.184 seconds (16 threads)
system.time({
  model_lgb <- lgb.train(
    params = params_lgb,
    data = dtrain_lgb,
    nrounds = 500,
    verbose = 2
  )
})
invisible(gc(verbose = FALSE))
# [58.849] 1.939 seconds (36 threads) / [61.698] 3.985 seconds (16 threads)
# NOTE(review): predict() still receives num_threads through `...`; newer
# lightgbm versions may expect it via a `params` list instead - verify
# against the installed version before changing it.
system.time({
  predict_lgb <- predict(model_lgb, data_valid, num_threads = n_threads)
})
# Validation AUC of the raw predictions (recorded result in the trailing note).
auc(predict_lgb, labels_valid) # 0.8422343
invisible(gc(verbose = FALSE))
# LightGBM: early stopping on validation AUC ----
# Same settings as the fixed-round run except that the "auc" metric is
# evaluated, which is what early stopping monitors.
params_lgb <- list(
  max_depth = 0,
  num_leaves = 255,
  learning_rate = 0.1,
  min_data_in_leaf = 1,
  min_sum_hessian_in_leaf = 100,
  lambda_l1 = 0,
  lambda_l2 = 0,
  min_gain_to_split = 0,
  max_bin = 255,
  force_row_wise = TRUE,
  boosting = "gbdt",
  objective = "regression",
  metric = "auc",
  # The thread count belongs in `params`: recent lightgbm releases no longer
  # accept extra training parameters through `...` of lgb.train().
  num_threads = n_threads
)
# [15073.675] 427.270 seconds (36 threads) / [6979.774] 444.395 seconds (16 threads)
system.time({
  model_lgb <- lgb.train(
    params = params_lgb,
    data = dtrain_lgb,
    # Effectively unbounded; training ends once the validation metric has not
    # improved for `early_stopping_rounds` consecutive rounds.
    nrounds = 1000000,
    valids = valids_lgb,
    early_stopping_rounds = 10,
    verbose = 2
  )
})
invisible(gc(verbose = FALSE))
# [327.103] 9.868 seconds (36 threads) / [321.714] 20.912 seconds (16 threads)
# NOTE(review): predict() still receives num_threads through `...`; newer
# lightgbm versions may expect it via a `params` list instead - verify
# against the installed version before changing it.
system.time({
  predict_lgb <- predict(model_lgb, data_valid, num_threads = n_threads)
})
# Validation AUC of the raw predictions (recorded result in the trailing note).
auc(predict_lgb, labels_valid) # 0.8525457
invisible(gc(verbose = FALSE))
# XGBoost datasets ----
# 6.109 seconds
system.time({
  # Training and validation matrices are wrapped independently.
  dtrain_xgb <- xgb.DMatrix(data = data_train, label = labels_train)
  dvalid_xgb <- xgb.DMatrix(data = data_valid, label = labels_valid)
})
invisible(gc(verbose = FALSE))

# Named list of evaluation sets, consumed by the early-stopping run.
valids_xgb <- list(test = dvalid_xgb)
# XGBoost: fixed 500 rounds, default evaluation metric disabled ----
# Timing comments below are kept as recorded by the author; presumably
# "[CPU seconds] elapsed seconds" - TODO confirm.
params_xgb <- list(
  max_depth = 0,
  max_leaves = 255,
  eta = 0.1,
  min_child_weight = 100,
  alpha = 0,
  lambda = 0,
  gamma = 0,
  max_bin = 255,
  tree_method = "hist",
  grow_policy = "lossguide",
  objective = "reg:squarederror",
  disable_default_eval_metric = 1,
  # Booster parameters are documented to go in `params`; passing nthread
  # through `...` of xgb.train() relies on legacy argument forwarding.
  nthread = n_threads
)
# [3751.056] 105.180 seconds (36 threads) / [1826.430] 115.885 seconds (16 threads)
system.time({
  model_xgb <- xgb.train(
    params = params_xgb,
    data = dtrain_xgb,
    nrounds = 500,
    verbose = 2
  )
})
invisible(gc(verbose = FALSE))
# [40.590] 1.822 seconds (36 threads) / [39.515] 3.041 seconds (16 threads)
system.time({
  predict_xgb <- predict(model_xgb, data_valid)
})
# Validation AUC of the raw predictions (recorded result in the trailing note).
auc(predict_xgb, labels_valid) # 0.8425165
invisible(gc(verbose = FALSE))
# XGBoost: early stopping on validation AUC ----
# Same settings as the fixed-round run except that "auc" is evaluated, which
# is what early stopping monitors.
params_xgb <- list(
  max_depth = 0,
  max_leaves = 255,
  eta = 0.1,
  min_child_weight = 100,
  alpha = 0,
  lambda = 0,
  gamma = 0,
  max_bin = 255,
  tree_method = "hist",
  grow_policy = "lossguide",
  objective = "reg:squarederror",
  eval_metric = "auc",
  # Booster parameters are documented to go in `params`; passing nthread
  # through `...` of xgb.train() relies on legacy argument forwarding.
  nthread = n_threads
)
# [21136.858] 627.479 seconds (36 threads) / [9528.641] 613.953 seconds (16 threads)
# NOTE(review): newer xgboost releases appear to rename `watchlist` to
# `evals`; the original argument name is kept for compatibility with the
# version this benchmark was written against - confirm before upgrading.
system.time({
  model_xgb <- xgb.train(
    params = params_xgb,
    data = dtrain_xgb,
    # Effectively unbounded; training ends once the validation metric has not
    # improved for `early_stopping_rounds` consecutive rounds.
    nrounds = 1000000,
    watchlist = valids_xgb,
    early_stopping_rounds = 10,
    verbose = 2
  )
})
invisible(gc(verbose = FALSE))
# [214.984] 7.067 seconds (36 threads) / [228.591] 15.334 seconds (16 threads)
system.time({
  predict_xgb <- predict(model_xgb, data_valid)
})
# Validation AUC of the raw predictions (recorded result in the trailing note).
auc(predict_xgb, labels_valid) # 0.8525526
invisible(gc(verbose = FALSE))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment