Last active
December 20, 2023 19:33
-
-
Save azat/2dc33fdadbb2feaf18e9cb591392f6cb to your computer and use it in GitHub Desktop.
Answers the question "Does cache oblivious in jemalloc still make sense?" - Yes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <time.h>
#include <bits/time.h> /* glibc-internal header; non-portable */
// Answers the question "Does cache oblivious in jemalloc still make sense?"
// The short answer is "Yes"!
//
// $ clang -O3 -g3 bench-malloc.c -o bench-malloc && prlimit --cpu=10 ./bench-malloc
//
// $ LD_PRELOAD=/src/oss/jemalloc/.build/lib/libjemalloc.so.2 ./bench-malloc
// elapsed: 205832268
// elapsed: 2061036
// elapsed: 526032
// elapsed: 515628
//
// $ LD_PRELOAD=/src/oss/jemalloc/.build-no-cache-oblivious/lib/libjemalloc.so.2 ./bench-malloc
// elapsed: 206214588
// elapsed: 3120804
// elapsed: 2628288
// elapsed: 2583684
//
// *(Numbers from AMD Ryzen Threadripper PRO 5975WX)*
//
// Refs:
// - https://github.com/jemalloc/jemalloc/issues/1098
// - https://www.cs.tau.ac.il/~mad/publications/ismm2011-CIF.pdf
// Read the x86 time-stamp counter.
//
// A CPUID (with EAX=0) is issued first: CPUID is a serializing instruction,
// so earlier instructions cannot be reordered past the measurement point.
// `static` fixes the C99 inline-linkage pitfall: a plain `inline` function
// with no extern definition may produce an undefined reference when the
// compiler chooses not to inline it.
//
// Returns: full 64-bit TSC value. x86/x86_64 only.
static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	__asm__ __volatile__ ( // serialize
		"xorl %%eax,%%eax \n cpuid"
		::: "%rax", "%rbx", "%rcx", "%rdx");
	/* We cannot use "=A", since this would use %rax on x86_64 and return
	   only the lower 32 bits of the TSC; read EDX:EAX explicitly instead. */
	__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
	return (uint64_t)hi << 32 | lo;
}
#define N 65535

// Benchmark driver: allocate N large (16 KiB) buffers, then repeatedly walk
// them with one read-modify-write per buffer, printing TSC cycles per pass.
// The first pass is dominated by demand paging (the buffers are intentionally
// left uninitialized so pages fault lazily inside the timed loop); later
// passes expose the cache/TLB behaviour of the allocator's placement policy.
int main(void)
{
	int **array = calloc(N, sizeof *array);
	if (array == NULL)
	{
		perror("calloc");
		return EXIT_FAILURE;
	}
	for (size_t i = 0; i < N; ++i)
	{
		// we need 16K or above, since only for them jemalloc cache oblivious has difference
		array[i] = malloc(16<<10);
		if (array[i] == NULL)
		{
			perror("malloc");
			return EXIT_FAILURE;
		}
	}
	for (size_t n = 0; n < 4; ++n)
	{
		uint64_t start = rdtsc();
		for (size_t i = 0; i < N; ++i)
			*array[i] *= 3; // NB: reads indeterminate memory on the first pass — deliberate, see above
		uint64_t end = rdtsc();
		// PRIu64 keeps the format specifier correct regardless of how the
		// platform defines uint64_t (plain %lu is wrong on 32-bit targets).
		printf("elapsed: %" PRIu64 "\n", end - start);
	}
	for (size_t i = 0; i < N; ++i)
		free(array[i]);
	free(array);
	return 0;
}
Increasing the number of iterations to 100 makes these numbers pop up in the profiler, and you can see that without cache oblivious there are more L1-dcache-load-misses
and dTLB-load-misses
perf stat
jemalloc without cache oblivious
$ LD_PRELOAD=/src/oss/jemalloc/.build-no-cache-oblivious/lib/libjemalloc.so.2 perf stat -ddd ./bench-malloc
...
Performance counter stats for './bench-malloc':
166.55 msec task-clock # 0.996 CPUs utilized
1 context-switches # 6.004 /sec
0 cpu-migrations # 0.000 /sec
66,852 page-faults # 401.398 K/sec
725,678,276 cycles # 4.357 GHz (17.78%)
12,979,725 stalled-cycles-frontend # 1.79% frontend cycles idle (19.45%)
21,721,064 stalled-cycles-backend # 2.99% backend cycles idle (21.25%)
749,803,299 instructions # 1.03 insn per cycle
# 0.03 stalled cycles per insn (21.49%)
146,939,234 branches # 882.264 M/sec (21.61%)
297,380 branch-misses # 0.20% of all branches (21.61%)
237,663,001 L1-dcache-loads # 1.427 G/sec (21.62%)
39,964,059 L1-dcache-load-misses # 16.82% of all L1-dcache accesses (21.61%)
<not supported> LLC-loads
<not supported> LLC-load-misses
47,339,691 L1-icache-loads # 284.241 M/sec (21.62%)
507,670 L1-icache-load-misses # 1.07% of all L1-icache accesses (21.62%)
16,649,890 dTLB-loads # 99.971 M/sec (21.11%)
4,764,871 dTLB-load-misses # 28.62% of all dTLB cache accesses (19.30%)
74 iTLB-loads # 444.316 /sec (17.50%)
288,943 iTLB-load-misses # 390463.51% of all iTLB cache accesses (16.21%)
6,954,391 L1-dcache-prefetches # 41.756 M/sec (16.21%)
<not supported> L1-dcache-prefetch-misses
0.167295842 seconds time elapsed
0.083487000 seconds user
0.083483000 seconds sys
jemalloc with cache oblivious
$ LD_PRELOAD=/src/oss/jemalloc/.build/lib/libjemalloc.so.2 perf stat -ddd ./bench-malloc
Performance counter stats for './bench-malloc':
110.19 msec task-clock # 0.995 CPUs utilized
1 context-switches # 9.075 /sec
1 cpu-migrations # 9.075 /sec
67,349 page-faults # 611.218 K/sec
481,200,036 cycles # 4.367 GHz (17.80%)
11,929,597 stalled-cycles-frontend # 2.48% frontend cycles idle (20.53%)
16,922,271 stalled-cycles-backend # 3.52% backend cycles idle (23.25%)
834,993,130 instructions # 1.74 insn per cycle
# 0.02 stalled cycles per insn (24.50%)
170,839,336 branches # 1.550 G/sec (24.50%)
178,939 branch-misses # 0.10% of all branches (24.50%)
252,957,276 L1-dcache-loads # 2.296 G/sec (24.48%)
7,838,587 L1-dcache-load-misses # 3.10% of all L1-dcache accesses (22.30%)
<not supported> LLC-loads
<not supported> LLC-load-misses
55,823,354 L1-icache-loads # 506.619 M/sec (19.70%)
828,421 L1-icache-load-misses # 1.48% of all L1-icache accesses (16.88%)
4,731,785 dTLB-loads # 42.943 M/sec (16.34%)
418,129 dTLB-load-misses # 8.84% of all dTLB cache accesses (16.22%)
42 iTLB-loads # 381.166 /sec (16.34%)
228,691 iTLB-load-misses # 544502.38% of all iTLB cache accesses (16.34%)
2,439,273 L1-dcache-prefetches # 22.137 M/sec (16.34%)
<not supported> L1-dcache-prefetch-misses
0.110748395 seconds time elapsed
0.033515000 seconds user
0.077077000 seconds sys
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Numbers from AMD Ryzen Threadripper PRO 5975WX: