Last active
December 20, 2023 19:33
-
-
Save azat/2dc33fdadbb2feaf18e9cb591392f6cb to your computer and use it in GitHub Desktop.
Answers the question "Does cache oblivious in jemalloc still make sense?" - Yes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <time.h>
#include <bits/time.h> /* glibc-internal header; non-portable */
// Answers the question "Does cache oblivious in jemalloc still make sense?"
// The short answer is "Yes"!
//
// $ clang -O3 -g3 bench-malloc.c -o bench-malloc && prlimit --cpu=10 ./bench-malloc
//
// $ LD_PRELOAD=/src/oss/jemalloc/.build/lib/libjemalloc.so.2 ./bench-malloc
// elapsed: 205832268
// elapsed: 2061036
// elapsed: 526032
// elapsed: 515628
//
// $ LD_PRELOAD=/src/oss/jemalloc/.build-no-cache-oblivious/lib/libjemalloc.so.2 ./bench-malloc
// elapsed: 206214588
// elapsed: 3120804
// elapsed: 2628288
// elapsed: 2583684
//
// *(Numbers from AMD Ryzen Threadripper PRO 5975WX)*
//
// Refs:
// - https://github.com/jemalloc/jemalloc/issues/1098
// - https://www.cs.tau.ac.il/~mad/publications/ismm2011-CIF.pdf
// Read the x86 time-stamp counter.
//
// A CPUID (with EAX=0) is issued first: CPUID is a serializing instruction,
// so earlier instructions cannot be reordered past the measurement point.
// `static` fixes the C99 inline-linkage pitfall: a plain `inline` function
// with no extern definition may produce an undefined reference when the
// compiler chooses not to inline it.
//
// Returns: full 64-bit TSC value. x86/x86_64 only.
static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	__asm__ __volatile__ ( // serialize
		"xorl %%eax,%%eax \n cpuid"
		::: "%rax", "%rbx", "%rcx", "%rdx");
	/* We cannot use "=A", since this would use %rax on x86_64 and return
	   only the lower 32 bits of the TSC; read EDX:EAX explicitly instead. */
	__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
	return (uint64_t)hi << 32 | lo;
}
#define N 65535

// Benchmark driver: allocate N large (16 KiB) buffers, then repeatedly walk
// them with one read-modify-write per buffer, printing TSC cycles per pass.
// The first pass is dominated by demand paging (the buffers are intentionally
// left uninitialized so pages fault lazily inside the timed loop); later
// passes expose the cache/TLB behaviour of the allocator's placement policy.
int main(void)
{
	int **array = calloc(N, sizeof *array);
	if (array == NULL)
	{
		perror("calloc");
		return EXIT_FAILURE;
	}
	for (size_t i = 0; i < N; ++i)
	{
		// we need 16K or above, since only for them jemalloc cache oblivious has difference
		array[i] = malloc(16<<10);
		if (array[i] == NULL)
		{
			perror("malloc");
			return EXIT_FAILURE;
		}
	}
	for (size_t n = 0; n < 4; ++n)
	{
		uint64_t start = rdtsc();
		for (size_t i = 0; i < N; ++i)
			*array[i] *= 3; // NB: reads indeterminate memory on the first pass — deliberate, see above
		uint64_t end = rdtsc();
		// PRIu64 keeps the format specifier correct regardless of how the
		// platform defines uint64_t (plain %lu is wrong on 32-bit targets).
		printf("elapsed: %" PRIu64 "\n", end - start);
	}
	for (size_t i = 0; i < N; ++i)
		free(array[i]);
	free(array);
	return 0;
}
Increasing the number of iterations to 100 makes these numbers pop up in the profiler, and you can see that without cache oblivious there are more L1-dcache-load-misses
and dTLB-load-misses
perf stat
jemalloc without cache oblivious
$ LD_PRELOAD=/src/oss/jemalloc/.build-no-cache-oblivious/lib/libjemalloc.so.2 perf stat -ddd ./bench-malloc
...
Performance counter stats for './bench-malloc':
166.55 msec task-clock # 0.996 CPUs utilized
1 context-switches # 6.004 /sec
0 cpu-migrations # 0.000 /sec
66,852 page-faults # 401.398 K/sec
725,678,276 cycles # 4.357 GHz (17.78%)
12,979,725 stalled-cycles-frontend # 1.79% frontend cycles idle (19.45%)
21,721,064 stalled-cycles-backend # 2.99% backend cycles idle (21.25%)
749,803,299 instructions # 1.03 insn per cycle
# 0.03 stalled cycles per insn (21.49%)
146,939,234 branches # 882.264 M/sec (21.61%)
297,380 branch-misses # 0.20% of all branches (21.61%)
237,663,001 L1-dcache-loads # 1.427 G/sec (21.62%)
39,964,059 L1-dcache-load-misses # 16.82% of all L1-dcache accesses (21.61%)
<not supported> LLC-loads
<not supported> LLC-load-misses
47,339,691 L1-icache-loads # 284.241 M/sec (21.62%)
507,670 L1-icache-load-misses # 1.07% of all L1-icache accesses (21.62%)
16,649,890 dTLB-loads # 99.971 M/sec (21.11%)
4,764,871 dTLB-load-misses # 28.62% of all dTLB cache accesses (19.30%)
74 iTLB-loads # 444.316 /sec (17.50%)
288,943 iTLB-load-misses # 390463.51% of all iTLB cache accesses (16.21%)
6,954,391 L1-dcache-prefetches # 41.756 M/sec (16.21%)
<not supported> L1-dcache-prefetch-misses
0.167295842 seconds time elapsed
0.083487000 seconds user
0.083483000 seconds sys
jemalloc with cache oblivious
$ LD_PRELOAD=/src/oss/jemalloc/.build/lib/libjemalloc.so.2 perf stat -ddd ./bench-malloc
Performance counter stats for './bench-malloc':
110.19 msec task-clock # 0.995 CPUs utilized
1 context-switches # 9.075 /sec
1 cpu-migrations # 9.075 /sec
67,349 page-faults # 611.218 K/sec
481,200,036 cycles # 4.367 GHz (17.80%)
11,929,597 stalled-cycles-frontend # 2.48% frontend cycles idle (20.53%)
16,922,271 stalled-cycles-backend # 3.52% backend cycles idle (23.25%)
834,993,130 instructions # 1.74 insn per cycle
# 0.02 stalled cycles per insn (24.50%)
170,839,336 branches # 1.550 G/sec (24.50%)
178,939 branch-misses # 0.10% of all branches (24.50%)
252,957,276 L1-dcache-loads # 2.296 G/sec (24.48%)
7,838,587 L1-dcache-load-misses # 3.10% of all L1-dcache accesses (22.30%)
<not supported> LLC-loads
<not supported> LLC-load-misses
55,823,354 L1-icache-loads # 506.619 M/sec (19.70%)
828,421 L1-icache-load-misses # 1.48% of all L1-icache accesses (16.88%)
4,731,785 dTLB-loads # 42.943 M/sec (16.34%)
418,129 dTLB-load-misses # 8.84% of all dTLB cache accesses (16.22%)
42 iTLB-loads # 381.166 /sec (16.34%)
228,691 iTLB-load-misses # 544502.38% of all iTLB cache accesses (16.34%)
2,439,273 L1-dcache-prefetches # 22.137 M/sec (16.34%)
<not supported> L1-dcache-prefetch-misses
0.110748395 seconds time elapsed
0.033515000 seconds user
0.077077000 seconds sys
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Numbers from AMD Ryzen Threadripper PRO 5975WX: