Last active
July 20, 2023 14:47
-
-
Save bjourne/02df9abb0712584da3a74cb325e4fbe6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Compile and run with: | |
// | |
// gcc -O3 -march=native fast-strlen.c -lpthread -o fast-strlen | |
// && ./fast-strlen | |
// | |
// Use gcc because clang is too smart and optimizes away parts of the | |
// benchmark. Results on Xeon(R) CPU E5-2650 v4 @ 2.20GHz with gcc | |
// 9.4.0: | |
// | |
// Scanning 10 times over 4.00GB... | |
// strlen_avx2 -> 5.09 seconds, 7.86 GB/s (length: 4000007776) | |
// threaded_strlen -> 1.17 seconds, 34.09 GB/s (length: 4000007776) | |
// slow_strlen -> 18.34 seconds, 2.18 GB/s (length: 4000007776) | |
// strlen -> 4.56 seconds, 8.76 GB/s (length: 4000007776) | |
// | |
// | |
// On Xeon(R) W-2245 CPU @ 3.90GHz with gcc 13.1.1: | |
// | |
// Scanning 10 times over 2.00GB... | |
// strlen_avx2 -> 1.30 seconds, 15.36 GB/s (length: 1999999999) | |
// strlen_avx512 -> 1.22 seconds, 16.36 GB/s (length: 1999999999) | |
// threaded_strlen -> 0.46 seconds, 43.56 GB/s (length: 1999999999) | |
// slow_strlen -> 1.19 seconds, 16.87 GB/s (length: 1999999999) | |
// strlen -> 1.19 seconds, 16.84 GB/s (length: 1999999999) | |
// | |
// How? The function threaded_strlen intentionally triggers a page | |
// fault to find an upper bound for the length of the string. It | |
// then has a set of threads run memchr in parallel on non-overlapping | |
// chunks of memory. | |
// | |
// Why? For fun. I was nerd sniped by the {n} times benchmark [1] and | |
// found that performance was limited by strlen (the best variant [2] | |
// is within 10% of strlen on the buffer). So if you make strlen | |
// faster that benchmark can be faster too. Besides, I haven't seen | |
// any multithreaded strlen's before. | |
// | |
// Obviously, this technique heavily relies on undefined behavior and | |
// shouldn't be used for anything other than beating benchmarks. | |
// | |
// [1]: https://owen.cafe/posts/six-times-faster-than-c/ | |
// [2]: https://gist.github.com/Const-me/3ade77faad47f0fbb0538965ae7f8e04 | |
#include <assert.h> | |
#include <immintrin.h> | |
#include <pthread.h> | |
#include <signal.h> | |
#include <stdbool.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <time.h> | |
#include <unistd.h> | |
// Size of buffer to test with and number of times the benchmarked | |
// function is called. | |
#define N_BUF (2L * 1000 * 1000 * 1000) | |
#define N_REPS 10 | |
#define N_GIG ((double)N_BUF / (1000 * 1000 * 1000)) | |
__attribute__((noinline)) size_t | |
slow_strlen(const char *s) { | |
size_t i = 0; | |
while (s[i++] != '\0'); | |
return --i; | |
} | |
#ifdef __AVX2__ | |
__attribute__((noinline)) size_t | |
strlen_avx2(const char *s) { | |
assert((uintptr_t)s % 32 == 0); | |
const __m256i zero = _mm256_setzero_si256(); | |
const char *p = s; | |
while (true) { | |
__m256i mask = _mm256_cmpeq_epi8(*(const __m256i *)p, zero); | |
if (!_mm256_testz_si256(mask, mask)) { | |
return p - s + slow_strlen(p); | |
} | |
p += 32; | |
} | |
} | |
#endif | |
#ifdef __AVX512F__ | |
__attribute__((noinline)) size_t | |
strlen_avx512(const char *s) { | |
assert((uintptr_t)s % 64 == 0); | |
const __m512i zero = _mm512_setzero_epi32(); | |
const char *p = s; | |
while (true) { | |
__m512i v = _mm512_loadu_si512(p); | |
__mmask64 mask = _mm512_cmpeq_epi8_mask(v, zero); | |
if (mask) { | |
return p - s + slow_strlen(p); | |
} | |
p += 64; | |
} | |
} | |
#endif | |
typedef struct { | |
const char *s; | |
size_t n_bytes; | |
size_t cnt; | |
bool found; | |
} strlen_job; | |
// Limit is 256TB | |
#define SIGSEGV_SWEEP_SPACE 256L * 1000 * 1000 * 1000 * 1000 | |
#define SIGSEGV_SWEEP_STEP 4L * 1000 * 1000; | |
static char * | |
sigsegv_addr = NULL; | |
static void * | |
thr_sigsegv(void *arg) { | |
size_t at = SIGSEGV_SWEEP_STEP; | |
const char *s = (const char *)arg; | |
size_t cnt = 0; | |
// Without the conditional the compiler may assume the loop is | |
// infinite and optimize away the memory read. | |
while (at < SIGSEGV_SWEEP_SPACE) { | |
if (!s[at]) { | |
cnt++; | |
} | |
at += SIGSEGV_SWEEP_STEP; | |
} | |
return (void *)cnt; | |
} | |
// Plain-old memchr appears to work the best when the length is | |
// limited. | |
static void * | |
thr_strlen(void *arg) { | |
strlen_job *job = (strlen_job *)arg; | |
const char *s = job->s; | |
const char *p = memchr(s, '\0', job->n_bytes); | |
job->found = p != NULL; | |
job->cnt = p - s; | |
return NULL; | |
} | |
static void | |
sigsegv_record(int signal, siginfo_t* siginfo, void* uap) { | |
sigsegv_addr = siginfo->si_addr; | |
pthread_exit(NULL); | |
} | |
static size_t | |
ceil_div(size_t a, size_t b) { | |
return a / b + (a % b != 0); | |
} | |
__attribute__((noinline)) size_t | |
threaded_strlen(const char *s) { | |
struct sigaction act, oldact; | |
memset(&act, 0, sizeof(struct sigaction)); | |
sigemptyset(&act.sa_mask); | |
act.sa_flags = SA_SIGINFO | SA_ONSTACK; | |
act.sa_sigaction = sigsegv_record; | |
// Launch a thread whose only job is to page fault to give us an | |
// upper bound on the number of bytes to search. | |
pthread_t h; | |
sigaction(SIGSEGV, &act, &oldact); | |
assert(!pthread_create(&h, NULL, thr_sigsegv, (void *)s)); | |
assert(!pthread_join(h, NULL)); | |
size_t n_bytes = sigsegv_addr - s; | |
// Launch a thread pool scanning for '\0' in disjoint regions of | |
// memory in parallel. | |
int n_threads = sysconf(_SC_NPROCESSORS_ONLN); | |
size_t n_bytes_per_thread = ceil_div(n_bytes, n_threads); | |
strlen_job *jobs = malloc(sizeof(strlen_job) * n_threads); | |
pthread_t *handles = malloc(sizeof(pthread_t) * n_threads); | |
for (int i = 0; i < n_threads; i++) { | |
jobs[i] = (strlen_job){ | |
s + i * n_bytes_per_thread, | |
n_bytes_per_thread, | |
0, | |
false | |
}; | |
assert(!pthread_create(&handles[i], | |
NULL, | |
thr_strlen, | |
(void *)&jobs[i])); | |
} | |
for (int i = 0; i < n_threads; i++) { | |
assert(!pthread_join(handles[i], NULL)); | |
} | |
// Restore old handler here -- thr_strlen can also fault if | |
// the buffer is small. | |
sigaction(SIGSEGV, &oldact, NULL); | |
for (int i = 0; i < n_threads; i++) { | |
strlen_job j = jobs[i]; | |
if (j.found) { | |
free(jobs); | |
free(handles); | |
return i * n_bytes_per_thread + j.cnt; | |
} | |
} | |
assert(false); | |
} | |
static uint64_t | |
nano_count() { | |
struct timespec t; | |
assert(!clock_gettime(CLOCK_MONOTONIC, &t)); | |
return (uint64_t)t.tv_sec * 1000000000 + t.tv_nsec; | |
} | |
void | |
benchmark(const char *name, | |
size_t (*func)(const char *s), | |
const char *s) { | |
size_t start = nano_count(); | |
size_t cnt = func(s); | |
for (int i = 1; i < N_REPS; i++) { | |
assert(func(s) == cnt); | |
} | |
size_t end = nano_count(); | |
double secs = (double)(end - start) / (1000 * 1000 * 1000); | |
double tot_gbs = N_GIG * N_REPS; | |
printf("%-15s -> %5.2f seconds, %5.2f GB/s (length: %ld)\n", | |
name, secs, tot_gbs / secs, cnt); | |
} | |
int | |
main(int argc, char *argv[]) { | |
char *buf = NULL; | |
assert(!posix_memalign((void *)&buf, 64, N_BUF)); | |
memset(buf, 'A', N_BUF); | |
buf[N_BUF - 1] = '\0'; | |
printf("Scanning %d times over %.2fGB...\n", N_REPS, N_GIG); | |
#ifdef __AVX2__ | |
benchmark("strlen_avx2", strlen_avx2, buf); | |
#endif | |
#ifdef __AVX512F__ | |
benchmark("strlen_avx512", strlen_avx512, buf); | |
#endif | |
benchmark("threaded_strlen", threaded_strlen, buf); | |
benchmark("slow_strlen", slow_strlen, buf); | |
benchmark("strlen", strlen, buf); | |
free(buf); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment