Skip to content

Instantly share code, notes, and snippets.

@kprotty
Last active August 21, 2024 17:26
Show Gist options
  • Save kprotty/9f45dde0eaea94a9a8d13097ee44b3cf to your computer and use it in GitHub Desktop.
Save kprotty/9f45dde0eaea94a9a8d13097ee44b3cf to your computer and use it in GitHub Desktop.
Updated to Zig 0.13
const std = @import("std");
const builtin = @import("builtin");
pub fn main() !void {
try bench(struct {
pub const name = "FAA";
pub fn inc(counter: *std.atomic.Value(usize), current: usize, rng: *u32) usize {
_ = rng;
_ = current;
return counter.fetchAdd(1, .monotonic) + 1;
}
});
try bench(struct {
pub const name = "CAS";
pub fn inc(counter: *std.atomic.Value(usize), current: usize, rng: *u32) usize {
_ = rng;
var value = current;
while (true) {
const new_value = value + 1;
value = counter.cmpxchgWeak(value, new_value, .monotonic, .monotonic) orelse return new_value;
}
}
});
try bench(struct {
pub const name = "CAS backoff";
pub fn inc(counter: *std.atomic.Value(usize), current: usize, rng: *u32) usize {
var value = current;
while (true) {
const new_value = value + 1;
_ = counter.cmpxchgWeak(value, new_value, .monotonic, .monotonic) orelse return new_value;
// reload after spinning
defer value = counter.load(.monotonic);
// On M1 & similar, WFE acts like an efficient pause yield when backing off.
if (comptime builtin.target.cpu.arch == .aarch64) {
asm volatile("wfe" ::: "memory");
continue;
}
// Spin a randomized amount of time to avoid threads resonating
// at the same frequency and causing permanent contention.
// https://github.com/apple/swift-corelibs-libdispatch/blob/542b7f32311680b11b6fc8fcb2576955460ba7da/src/shims/yield.h#L91-L120
const spin_min = 2;
const spin_max = 1024;
const spin_count = ((rng.* >> 24) & (spin_max - 1)) | (spin_min - 1);
rng.* = (rng.* *% 1103515245) +% 12345;
for (0..spin_count) |_| std.atomic.spinLoopHint();
}
}
});
}
fn bench(comptime Config: type) !void {
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena.deinit();
const allocator = arena.allocator();
const num_cpus = try std.Thread.getCpuCount();
const Shared = extern struct {
front_padding: [std.atomic.cache_line]u8 = undefined,
counter: std.atomic.Value(usize) = .{ .raw = 0 },
back_padding: [std.atomic.cache_line]u8 = undefined,
};
const Worker = struct {
shutdown: std.atomic.Value(bool) = .{ .raw = false },
thread: std.Thread = undefined,
increments: usize = 0,
shared: *Shared,
fn run(worker: *@This(), seed: usize) void {
var local_inc: usize = 0;
defer worker.increments = local_inc;
const shared = worker.shared;
var rng: u32 = @intCast(seed + 1);
var current = shared.counter.load(.monotonic);
while (!worker.shutdown.load(.monotonic)) {
current = Config.inc(&shared.counter, current, &rng);
local_inc += 1;
}
}
};
const workers = try allocator.alloc(Worker, num_cpus);
defer allocator.free(workers);
var shared = Shared{};
for (workers, 0..) |*w, i| {
w.* = .{ .shared = &shared };
w.thread = try std.Thread.spawn(.{}, Worker.run, .{w, i});
}
const measure_duration_ns = 1000 * std.time.ns_per_ms;
std.time.sleep(measure_duration_ns);
for (workers) |*w| w.shutdown.store(true, .monotonic);
for (workers) |w| w.thread.join();
var min: i128 = std.math.maxInt(i128);
var max: i128 = 0;
var sum: i128 = 0;
for (workers) |w| {
sum += w.increments;
min = @min(min, w.increments);
max = @max(max, w.increments);
}
const avg = @divFloor(sum, workers.len);
const throughput_ms = @divFloor(sum, measure_duration_ns / std.time.ns_per_ms);
var stdev: i128 = 0;
for (workers) |w| stdev += (w.increments - avg) * 2;
stdev = @divFloor(stdev, workers.len);
std.debug.print(
\\-------------------------
\\| {s}
\\-------------------------
\\| {s:>10} | {s:>8} |
\\| {d:>7}/ms | {d:>8} |
\\
\\
, .{
Config.name,
"throughput", "stdev",
@as(u128, @intCast(throughput_ms)),
@as(u128, @intCast(stdev)),
});
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment