Skip to content

Instantly share code, notes, and snippets.

@Laurae2
Last active October 25, 2018 18:24
Show Gist options
  • Save Laurae2/0e9b6a3cdf0b5480e12c8856e7fb10ca to your computer and use it in GitHub Desktop.
Save Laurae2/0e9b6a3cdf0b5480e12c8856e7fb10ca to your computer and use it in GitHub Desktop.
data.table benchmark: 2 billion rows
# library(data.table)
# N=2e9; K=100
# set.seed(1)
# DT <- data.table(
# id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char)
# id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char)
# id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char)
# id4 = sample(K, N, TRUE), # large groups (int)
# id5 = sample(K, N, TRUE), # large groups (int)
# id6 = sample(N/K, N, TRUE), # small groups (int)
# v1 = sample(5, N, TRUE), # int in range [1,5]
# v2 = sample(5, N, TRUE), # int in range [1,5]
# v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749
# )
# cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
# system.time( DT[, sum(v1), keyby=id1] )
# system.time( DT[, sum(v1), keyby=id1] )
# system.time( DT[, sum(v1), keyby="id1,id2"] )
# system.time( DT[, sum(v1), keyby="id1,id2"] )
# system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )
# system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )
# system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )
# system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )
# system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )
# system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )
R version 3.5.0 (2018-04-23) -- "Joy in Playing"
Copyright (C) 2018 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
Natural language support but running in an English locale
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
# Dual Xeon 6130, 32 cores / 64 threads @3.7/2.8 GHz
> library(data.table)
data.table 1.11.8 Latest news: r-datatable.com
> N=2e9; K=100
> set.seed(1)
> DT <- data.table(
+ id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char)
+ id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char)
+ id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char)
+ id4 = sample(K, N, TRUE), # large groups (int)
+ id5 = sample(K, N, TRUE), # large groups (int)
+ id6 = sample(N/K, N, TRUE), # small groups (int)
+ v1 = sample(5, N, TRUE), # int in range [1,5]
+ v2 = sample(5, N, TRUE), # int in range [1,5]
+ v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749
+ )
> cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
GB = 98.359
> system.time( DT[, sum(v1), keyby=id1] )
user system elapsed
76.547 18.953 95.518
> system.time( DT[, sum(v1), keyby=id1] )
user system elapsed
75.859 17.281 93.134
> system.time( DT[, sum(v1), keyby="id1,id2"] )
user system elapsed
185.609 17.047 202.638
> system.time( DT[, sum(v1), keyby="id1,id2"] )
user system elapsed
186.125 18.062 204.194
> system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )
user system elapsed
433.578 35.891 469.432
> system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )
user system elapsed
433.031 34.219 467.235
> system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )
user system elapsed
77.125 13.344 89.658
> system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )
user system elapsed
77.438 13.297 90.337
> system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )
user system elapsed
276.328 29.796 306.104
> system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )
user system elapsed
276.094 31.766 307.860
> sessionInfo()
R version 3.5.0 (2018-04-23)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04.4 LTS
Matrix products: default
BLAS: /usr/local/lib/R/lib/libRblas.so
LAPACK: /usr/local/lib/R/lib/libRlapack.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] data.table_1.11.8
loaded via a namespace (and not attached):
[1] compiler_3.5.0 tools_3.5.0 yaml_2.1.19
# Dual Xeon 6154, 36 cores / 72 threads @3.7/3.7 GHz
> library(data.table)
data.table 1.11.8 Latest news: r-datatable.com
> N=2e9; K=100
> set.seed(1)
> DT <- data.table(
+ id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char)
+ id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char)
+ id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char)
+ id4 = sample(K, N, TRUE), # large groups (int)
+ id5 = sample(K, N, TRUE), # large groups (int)
+ id6 = sample(N/K, N, TRUE), # small groups (int)
+ v1 = sample(5, N, TRUE), # int in range [1,5]
+ v2 = sample(5, N, TRUE), # int in range [1,5]
+ v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749
+ )
> cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
GB = 98.359
> system.time( DT[, sum(v1), keyby=id1] )
user system elapsed
51.727 16.908 68.624
> system.time( DT[, sum(v1), keyby=id1] )
user system elapsed
50.534 13.117 63.641
> system.time( DT[, sum(v1), keyby="id1,id2"] )
user system elapsed
131.590 22.851 154.409
> system.time( DT[, sum(v1), keyby="id1,id2"] )
user system elapsed
130.969 20.963 151.900
> system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )
user system elapsed
339.665 50.357 389.927
> system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )
user system elapsed
339.569 42.565 382.041
> system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )
user system elapsed
49.583 19.953 69.292
> system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )
user system elapsed
49.299 13.689 62.820
> system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )
user system elapsed
217.549 35.290 252.762
> system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )
user system elapsed
218.605 32.129 250.658
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment