Created
May 11, 2020 01:15
-
-
Save jangorecki/ef1bb100cdd46f77f84c3aeda56fbe41 to your computer and use it in GitHub Desktop.
mergelist left-right
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Benchmark: a chain of left joins (fact -> state -> division -> region) via
## mergelist(l, how="left") versus the same chain expressed as right joins on
## the reversed list, mergelist(rev(l), how="right"). Both results are compared
## with all.equal(), ignoring column order.
library(data.table)
setDTthreads(40L)
test.data.table("mergelist.Rraw") ## warmup

set.seed(108)
N = 1e8L

## Build the four tables of one scenario and return them as
## list(fact, state, division, region).
##   n_fact, n_state, n_division, n_region: number of rows of each table.
## Each lookup table is keyed on its own id and indexed on the id of its
## parent table; fact is indexed on state_id.
make_tables = function(n_fact, n_state, n_division, n_region) {
  sizes = c(n_fact, n_state, n_division, n_region)
  stopifnot(is.numeric(sizes), !anyNA(sizes), sizes >= 1)
  ## The creation order region -> division -> state -> fact fixes the order in
  ## which sample() consumes the RNG stream; keep it so that a given seed
  ## always yields the same data.
  region = data.table(region_id=seq_len(n_region), key="region_id")
  division = data.table(division_id=seq_len(n_division), region_id=sample(n_region, n_division, TRUE), key="division_id")
  setindexv(division, "region_id")
  state = data.table(state_id=seq_len(n_state), division_id=sample(n_division, n_state, TRUE), key="state_id")
  setindexv(state, "division_id")
  fact = data.table(state_id=sample(n_state, n_fact, TRUE), population=1)
  setindexv(fact, "state_id")
  list(fact, state, division, region)
}

## Time both join directions on the list of tables `l`, printing the row
## counts, both timings and the result of the equality check.
## Returns the all.equal() result invisibly.
bench_left_right = function(l) {
  ## explicit print(): nothing is auto-printed inside a function body
  print(vapply(l, nrow, integer(1L)))
  print(system.time(ans1 <- mergelist(l, how="left")))
  print(system.time(ans2 <- mergelist(rev(l), how="right")))
  ok = all.equal(ans1, ans2, ignore.col.order=TRUE)
  print(ok)
  ## ans1/ans2 are locals, so they stop being referenced once we return and
  ## no explicit rm() is needed before the next scenario.
  invisible(ok)
}

## medium cardinality
bench_left_right(make_tables(N, N %/% 10L, N %/% 100L, N %/% 1000L))

## high cardinality
bench_left_right(make_tables(N, N %/% 2L, N %/% 4L, N %/% 8L))

## low cardinality
bench_left_right(make_tables(N, N %/% 100L, N %/% 10000L, N %/% 1000000L))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
> library(data.table) | |
data.table 1.12.9 IN DEVELOPMENT built 2020-05-11 01:05:19 UTC; jan using 20 threads (see ?getDTthreads). Latest news: r-datatable.com | |
> setDTthreads(40L) | |
> test.data.table("mergelist.Rraw") ## warmup | |
getDTthreads(verbose=TRUE): | |
omp_get_num_procs() 40 | |
R_DATATABLE_NUM_PROCS_PERCENT unset (default 50) | |
R_DATATABLE_NUM_THREADS unset | |
omp_get_thread_limit() 2147483647 | |
omp_get_max_threads() 40 | |
OMP_THREAD_LIMIT unset | |
OMP_NUM_THREADS unset | |
RestoreAfterFork true | |
data.table is using 40 threads. See ?setDTthreads. | |
test.data.table() running: /usr/local/lib/R/site-library/data.table/tests/mergelist.Rraw | |
Running test id 251.04 | |
Running test id 291.06 | |
Sun May 10 18:12:34 2020 endian==little, sizeof(long double)==16, sizeof(pointer)==8, TZ=America/Los_Angeles, locale='LC_CTYPE=en_US.UTF-8;LC_NUMERI | |
C=C;LC_TIME=C.UTF-8;LC_COLLATE=en_US.UTF-8;LC_MONETARY=C.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=C.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEAS | |
UREMENT=C.UTF-8;LC_IDENTIFICATION=C', l10n_info()='MBCS=TRUE; UTF-8=TRUE; Latin-1=FALSE', getDTthreads()='omp_get_num_procs()==40; R_DATATABLE_NUM_PR | |
OCS_PERCENT==unset (default 50); R_DATATABLE_NUM_THREADS==unset; omp_get_thread_limit()==2147483647; omp_get_max_threads()==40; OMP_THREAD_LIMIT==uns | |
et; OMP_NUM_THREADS==unset; RestoreAfterFork==true; data.table is using 40 threads. See ?setDTthreads.' | |
10 longest running tests took 1s (58% of 2s) | |
ID time nTest | |
1: 28 0.188 64 | |
2: 291 0.181 6 | |
3: 102 0.169 43 | |
4: 121 0.134 54 | |
5: 26 0.131 64 | |
6: 103 0.129 48 | |
7: 22 0.129 64 | |
8: 25 0.124 64 | |
9: 27 0.111 64 | |
10: 21 0.103 64 | |
All 957 tests in tests/mergelist.Rraw completed ok in 2.537s elapsed (00:01:26 cpu) | |
> | |
> set.seed(108) | |
> N = 1e8L | |
> | |
> ## medium cardinality | |
> region = data.table(region_id=seq_len(N/1e3), key="region_id") | |
> division = data.table(division_id=seq_len(N/1e2), region_id=sample(N/1e3, N/1e2, TRUE), key="division_id") | |
> setindexv(division, "region_id") | |
> state = data.table(state_id=seq_len(N/1e1), division_id=sample(N/1e2, N/1e1, TRUE), key="state_id") | |
> setindexv(state, "division_id") | |
> fact = data.table(state_id=sample(N/1e1, N, TRUE), population=1) | |
> setindexv(fact, "state_id") | |
> l = list(fact, state, division, region) | |
> sapply(l, nrow) | |
[1] 100000000 10000000 1000000 100000 | |
> system.time(ans1<-mergelist(l, how="left")) | |
user system elapsed | |
66.078 10.530 19.845 | |
> system.time(ans2<-mergelist(rev(l), how="right")) | |
user system elapsed | |
29.692 4.947 9.334 | |
> all.equal(ans1, ans2, ignore.col.order=TRUE) | |
[1] TRUE | |
> rm(ans1, ans2) | |
> | |
> ## high cardinality | |
> region = data.table(region_id=seq_len(N/8), key="region_id") | |
> division = data.table(division_id=seq_len(N/4), region_id=sample(N/8, N/4, TRUE), key="division_id") | |
> setindexv(division, "region_id") | |
> state = data.table(state_id=seq_len(N/2), division_id=sample(N/4, N/2, TRUE), key="state_id") | |
> setindexv(state, "division_id") | |
> fact = data.table(state_id=sample(N/2, N, TRUE), population=1) | |
> setindexv(fact, "state_id") | |
> l = list(fact, state, division, region) | |
> sapply(l, nrow) | |
[1] 100000000 50000000 25000000 12500000 | |
> system.time(ans1<-mergelist(l, how="left")) | |
user system elapsed | |
95.045 13.896 32.975 | |
> system.time(ans2<-mergelist(rev(l), how="right")) | |
user system elapsed | |
69.442 10.305 22.856 | |
> all.equal(ans1, ans2, ignore.col.order=TRUE) | |
[1] TRUE | |
> rm(ans1, ans2) | |
> | |
> ## low cardinality | |
> region = data.table(region_id=seq_len(N/1e6), key="region_id") | |
> division = data.table(division_id=seq_len(N/1e4), region_id=sample(N/1e6, N/1e4, TRUE), key="division_id") | |
> setindexv(division, "region_id") | |
> state = data.table(state_id=seq_len(N/1e2), division_id=sample(N/1e4, N/1e2, TRUE), key="state_id") | |
> setindexv(state, "division_id") | |
> fact = data.table(state_id=sample(N/1e2, N, TRUE), population=1) | |
> setindexv(fact, "state_id") | |
> l = list(fact, state, division, region) | |
> sapply(l, nrow) | |
[1] 100000000 1000000 10000 100 | |
> system.time(ans1<-mergelist(l, how="left")) | |
user system elapsed | |
42.330 11.342 16.467 | |
> system.time(ans2<-mergelist(rev(l), how="right")) | |
user system elapsed | |
23.495 5.346 7.861 | |
> all.equal(ans1, ans2, ignore.col.order=TRUE) | |
[1] TRUE | |
> rm(ans1, ans2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment