Last active
June 20, 2020 22:09
-
-
Save jangorecki/0cf9170bee01ee6763719ada418c81e7 to your computer and use it in GitHub Desktop.
parallel bmerge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Sample `size` integers from 1..unq_n.
##
## unq_n: number of distinct candidate values (the values are 1..unq_n).
## size:  length of the returned vector.
## sort:  if TRUE the result is returned in increasing order, otherwise shuffled.
##
## When unq_n > size the result is `size` distinct values drawn without
## replacement. Otherwise every value in 1..unq_n appears at least once, the
## remaining size-unq_n slots are filled by sampling with replacement, and the
## combined vector is shuffled.
ssa = function(unq_n, size, sort=FALSE) {
  if (unq_n > size) {
    ans = sample.int(unq_n, size)
  } else {
    unq_sub = seq_len(unq_n)
    ## size >= unq_n on this path, so size-unq_n is the non-negative number of extra draws
    extra = sample(unq_sub, size=size-unq_n, replace=TRUE)
    ans = sample(c(unq_sub, extra))
  }
  ## apply `sort` on both paths, including the unq_n > size one
  if (sort) sort(ans) else ans
}
## Fix the RNG seed so both tables are generated reproducibly.
set.seed(108)
library(data.table)
## wide console output; auto-index and verbose are switched off explicitly --
## not needed, but future-proofs the benchmark in case forder starts calling setindex
options(width=200, datatable.auto.index=FALSE, datatable.verbose=FALSE)
N = 1e9L
## unsorted, no duplicates: ssa(N, N) is a shuffle of 1..N
d1 = data.table(x=ssa(N, N))
d1[, "v1":=seq_len(.N)]
d2 = data.table(y=ssa(N, N))
d2[, "v2":=seq_len(.N)]
## unsorted, single duplicate: swap these two lines in for the tables above
#d1 = data.table(x=ssa(N-1L, N))[, "v1":=seq_len(.N)]
#d2 = data.table(y=ssa(N-1L, N))[, "v2":=seq_len(.N)]
## no index: time the join single-threaded, then with 40 threads, and compare results
setDTthreads(1L)
system.time(b <- d1[d2, on="x==y"])
setDTthreads(40L)
system.time(B <- d1[d2, on="x==y"])
all.equal(b, B)
## index | |
## Attach an index on `cols` to `x`, storing the ordering computed by forderv
## with retGrp=TRUE (pretend we are after #4386).
##
## x:    a data.table; its "index" attribute is updated via setattr.
## cols: character vector of column names to index.
## Returns `x` invisibly.
setindexv2 = function(x, cols) {
  stopifnot(is.data.table(x), is.character(cols))
  ## create the empty "index" container attribute on first use
  if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
  ## every column already gets its own "__" prefix, so collapse with "" to get
  ## "__a__b" for multiple columns rather than "__a____b"
  idx_name = paste0("__", cols, collapse="")
  ord = data.table:::forderv(x, cols, retGrp=TRUE)
  setattr(attr(x, "index", exact=TRUE), idx_name, ord)
  invisible(x)
}
## build an index on the join column of each table, then repeat the timed joins
setindexv2(d1, "x")
setindexv2(d2, "y")
setDTthreads(1L)
system.time(b <- d1[d2, on="x==y"])
setDTthreads(40L)
system.time(B <- d1[d2, on="x==y"])
all.equal(b, B)
## sorted index: key both tables on the join column first, then index them again
setkeyv(d1, "x")
setkeyv(d2, "y")
setindexv2(d1, "x")
setindexv2(d2, "y")
setDTthreads(1L)
system.time(b <- d1[d2, on="x==y"])
setDTthreads(40L)
system.time(B <- d1[d2, on="x==y"])
all.equal(b, B)
## same sorted-index joins once more with data.table's verbose output enabled
options(datatable.verbose=TRUE)
setDTthreads(1L)
system.time(b <- d1[d2, on="x==y"])
setDTthreads(40L)
system.time(B <- d1[d2, on="x==y"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## All timings are for the whole X[Y, on=.] call, not for bmerge/smerge alone.
## bmerge is the bmerge on current master
## smerge is sort-merge
## Bmerge is parallel bmerge
## smerge as of 103b9c63cdb10c7ab60dd2a3f185caf20b2eb70d
## Bmerge as of 484384a3890730882ffe91b45f906189e0e7c23e
# no duplicates | |
## single thread no index | |
user system elapsed | |
bmerge 394.931 32.802 427.750 | |
smerge 336.447 73.674 410.139 | |
Bmerge 397.821 28.743 426.583 | |
## all threads no index | |
user system elapsed | |
bmerge 819.928 143.668 368.546 | |
smerge 1136.770 166.111 100.471 | |
Bmerge 3038.182 146.055 142.944 | |
## all threads index | |
user system elapsed | |
bmerge 658.381 103.473 377.290 | |
smerge 579.559 109.165 78.886 | |
Bmerge 2735.321 73.923 142.985 | |
## all threads sorted index | |
user system elapsed | |
bmerge 68.579 47.075 69.485 | |
smerge 37.985 42.468 20.842 | |
Bmerge 2315.506 38.060 87.270 | |
# single duplicate | |
## single thread no index | |
user system elapsed | |
bmerge 429.513 34.142 463.676 | |
smerge 367.919 80.007 447.952 | |
Bmerge 423.764 34.258 458.043 | |
## all threads no index | |
user system elapsed | |
bmerge 819.215 149.786 368.750 | |
smerge 1191.925 212.859 137.033 | |
Bmerge 3045.483 178.684 172.440 | |
## all threads index | |
user system elapsed | |
bmerge 654.823 98.173 379.881 | |
smerge 623.015 160.435 115.979 | |
Bmerge 2776.128 91.119 184.812 | |
## all threads sorted index | |
user system elapsed | |
bmerge 87.594 47.124 94.729 | |
smerge 71.851 64.715 42.599 | |
Bmerge 2290.993 44.478 109.372 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# no duplicates | |
> setDTthreads(1L) | |
> system.time(b <- d1[d2, on="x==y"]) | |
i.y has same type (integer) as x.x. No coercion needed. | |
on= matches existing key, using key | |
Starting bmerge ... | |
bmerge done in 35.4s elapsed (32.6s cpu) | |
Constructing irows for '!byjoin || nqbyjoin' ... 0.000s elapsed (0.000s cpu) | |
user system elapsed | |
52.008 16.121 68.132 | |
> setDTthreads(40L) | |
> system.time(B <- d1[d2, on="x==y"]) | |
i.y has same type (integer) as x.x. No coercion needed. | |
on= matches existing key, using key | |
Starting bmerge ... | |
bmerge done in 00:01:03 elapsed (00:38:22 cpu) | |
Constructing irows for '!byjoin || nqbyjoin' ... 0.000s elapsed (0.000s cpu) | |
user system elapsed | |
2328.887 47.733 88.212 | |
# single duplicate | |
> setDTthreads(1L) | |
> system.time(b <- d1[d2, on="x==y"]) | |
i.y has same type (integer) as x.x. No coercion needed. | |
on= matches existing key, using key | |
Starting bmerge ... | |
bmerge done in 36.9s elapsed (34.1s cpu) | |
Constructing irows for '!byjoin || nqbyjoin' ... 19.8s elapsed (15.5s cpu) | |
user system elapsed | |
70.631 20.086 90.720 | |
> setDTthreads(40L) | |
> system.time(B <- d1[d2, on="x==y"]) | |
i.y has same type (integer) as x.x. No coercion needed. | |
on= matches existing key, using key | |
Starting bmerge ... | |
bmerge done in 00:01:02 elapsed (00:37:50 cpu) | |
Constructing irows for '!byjoin || nqbyjoin' ... 19.7s elapsed (15.5s cpu) | |
user system elapsed | |
2312.471 42.391 107.200 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment