Last active
January 7, 2024 03:41
-
-
Save notcancername/d63e27235d6ee59d9e68bb12470d668d to your computer and use it in GitHub Desktop.
Zig add vs add inline vs @addWithOverflow
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const std = @import("std"); | |
extern fn not_inlined(ptr: [*]usize, len: usize) usize; | |
extern fn inlined(ptr: [*]usize, len: usize) usize; | |
extern fn builtin(ptr: [*]usize, len: usize) usize; | |
extern fn builtin_no_destructure(ptr: [*]usize, len: usize) usize; | |
extern fn sum_not_inlined(ptr: [*]usize, len: usize) usize; | |
extern fn sum_inlined(ptr: [*]usize, len: usize) usize; | |
extern fn sum_builtin(ptr: [*]usize, len: usize) usize; | |
extern fn sum_builtin_no_destructure(ptr: [*]usize, len: usize) usize; | |
/// Benchmark driver: maps a large read-only anonymous buffer, runs the kernel
/// selected by the first character of the first command-line argument over it,
/// and fails if the kernel's result is nonzero.
///
/// Selectors: 'n', 'i', 'b', 'd' pick the doubling-sum kernels and
/// 'a', 'x', 'c', 'e' pick the plain-sum kernels; any other character yields 1
/// and therefore an error.
///
/// Errors: `error.MissingArgument` when no selector argument was passed, any
/// error from `std.os.mmap`, and `error.Shit` for a nonzero result.
pub fn xmain() !void {
    // Number of `usize` elements handed to the kernel.
    const element_count = 100 << 20;

    // Guard the argv access below: indexing argv[1] without an argument is
    // out of bounds.
    if (std.os.argv.len < 2) return error.MissingArgument;

    // Private anonymous mapping, readable only; the kernels only read it.
    // NOTE(review): relies on anonymous mappings being zero-filled so that
    // every kernel returns 0 — confirm against mmap(2) for the target OS.
    const datap: [*]usize = @ptrCast(try std.os.mmap(
        null,
        element_count * @sizeOf(usize),
        std.os.PROT.READ,
        std.os.MAP.ANONYMOUS | std.os.MAP.PRIVATE,
        -1,
        0,
    ));
    const data = datap[0..element_count];

    const r = switch (std.os.argv[1][0]) {
        'n' => not_inlined(data.ptr, data.len),
        'i' => inlined(data.ptr, data.len),
        'b' => builtin(data.ptr, data.len),
        'd' => builtin_no_destructure(data.ptr, data.len),
        'a' => sum_not_inlined(data.ptr, data.len),
        'x' => sum_inlined(data.ptr, data.len),
        'c' => sum_builtin(data.ptr, data.len),
        'e' => sum_builtin_no_destructure(data.ptr, data.len),
        else => 1,
    };
    if (r != 0) return error.Shit;
}
/// Process entry point: exit status 0 when `xmain` succeeds, 1 when it
/// returns any error (the error value itself is discarded).
pub fn main() u8 {
    xmain() catch return 1;
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/sh
# Build the kernels (test_add_inline.zig) into an object file, then link the
# benchmark driver (bench_add_inline.zig) against it. Both steps use the same
# optimization and CPU flags; the second step runs only if the first succeeds.
# NOTE(review): Zig's documented CPU-selection flag is -mcpu; confirm that
# -march is accepted by the Zig version in use.
zig build-obj -fno-lto -O ReleaseFast -march=skylake test_add_inline.zig &&
zig build-exe -fstrip -fno-lto -O ReleaseFast -march=skylake bench_add_inline.zig test_add_inline.o
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
not_inlined: | |
xor eax, eax | |
test rsi, rsi | |
je .LBB0_5 | |
xor ecx, ecx | |
.LBB0_2: | |
mov rdx, qword ptr [rdi + 8*rcx] | |
add rdx, rdx | |
setb r8b | |
setb byte ptr [rsp - 16] | |
test r8b, r8b | |
jne .LBB0_7 | |
add rax, rdx | |
setb dl | |
setb byte ptr [rsp - 8] | |
test dl, dl | |
jne .LBB0_7 | |
inc rcx | |
cmp rsi, rcx | |
jne .LBB0_2 | |
.LBB0_5: | |
ret | |
.LBB0_7: | |
mov rax, -1 | |
ret | |
inlined: | |
test rsi, rsi | |
je .LBB1_1 | |
mov rdx, qword ptr [rdi] | |
add rdx, rdx | |
setb cl | |
setb byte ptr [rsp - 16] | |
mov rax, -1 | |
test cl, cl | |
jne .LBB1_2 | |
mov ecx, 1 | |
xor r8d, r8d | |
.LBB1_6: | |
mov r9, r8 | |
mov r8, rdx | |
add r8, r9 | |
setb dl | |
setb byte ptr [rsp - 8] | |
test dl, dl | |
jne .LBB1_2 | |
cmp rsi, rcx | |
je .LBB1_8 | |
mov rdx, qword ptr [rdi + 8*rcx] | |
add rdx, rdx | |
setb r9b | |
setb byte ptr [rsp - 16] | |
inc rcx | |
test r9b, r9b | |
je .LBB1_6 | |
.LBB1_2: | |
ret | |
.LBB1_1: | |
xor eax, eax | |
ret | |
.LBB1_8: | |
mov rax, r8 | |
ret | |
builtin: | |
test rsi, rsi | |
je .LBB2_1 | |
xor ecx, ecx | |
xor edx, edx | |
.LBB2_4: | |
mov r8, qword ptr [rdi + 8*rdx] | |
add r8, r8 | |
setb byte ptr [rsp - 16] | |
setb r9b | |
setb byte ptr [rsp - 2] | |
mov rax, -1 | |
test r9b, r9b | |
jne .LBB2_2 | |
add rcx, r8 | |
setb byte ptr [rsp - 8] | |
setb r8b | |
setb byte ptr [rsp - 1] | |
test r8b, r8b | |
jne .LBB2_2 | |
inc rdx | |
mov rax, rcx | |
cmp rsi, rdx | |
jne .LBB2_4 | |
.LBB2_2: | |
ret | |
.LBB2_1: | |
xor eax, eax | |
ret | |
builtin_no_destructure: | |
test rsi, rsi | |
je .LBB3_1 | |
xor ecx, ecx | |
xor edx, edx | |
.LBB3_4: | |
mov r8, qword ptr [rdi + 8*rdx] | |
add r8, r8 | |
setb r9b | |
setb byte ptr [rsp - 16] | |
mov rax, -1 | |
test r9b, r9b | |
jne .LBB3_2 | |
add rcx, r8 | |
setb r8b | |
setb byte ptr [rsp - 8] | |
test r8b, r8b | |
jne .LBB3_2 | |
inc rdx | |
mov rax, rcx | |
cmp rsi, rdx | |
jne .LBB3_4 | |
.LBB3_2: | |
ret | |
.LBB3_1: | |
xor eax, eax | |
ret | |
sum_not_inlined: | |
xor eax, eax | |
test rsi, rsi | |
je .LBB4_5 | |
xor ecx, ecx | |
.LBB4_2: | |
add rax, qword ptr [rdi + 8*rcx] | |
setb dl | |
setb byte ptr [rsp - 8] | |
test dl, dl | |
jne .LBB4_3 | |
inc rcx | |
cmp rsi, rcx | |
jne .LBB4_2 | |
.LBB4_5: | |
ret | |
.LBB4_3: | |
mov rax, -1 | |
ret | |
sum_inlined: | |
test rsi, rsi | |
je .LBB5_1 | |
mov rcx, qword ptr [rdi] | |
mov byte ptr [rsp - 8], 0 | |
mov rax, -1 | |
xor edx, edx | |
test dl, dl | |
jne .LBB5_2 | |
mov edx, 1 | |
.LBB5_6: | |
cmp rsi, rdx | |
je .LBB5_7 | |
add rcx, qword ptr [rdi + 8*rdx] | |
setb r8b | |
setb byte ptr [rsp - 8] | |
inc rdx | |
test r8b, r8b | |
je .LBB5_6 | |
.LBB5_2: | |
ret | |
.LBB5_1: | |
xor eax, eax | |
ret | |
.LBB5_7: | |
mov rax, rcx | |
ret | |
sum_builtin: | |
xor eax, eax | |
test rsi, rsi | |
je .LBB6_5 | |
xor ecx, ecx | |
.LBB6_2: | |
add rax, qword ptr [rdi + 8*rcx] | |
setb byte ptr [rsp - 8] | |
setb dl | |
setb byte ptr [rsp - 1] | |
test dl, dl | |
jne .LBB6_3 | |
inc rcx | |
cmp rsi, rcx | |
jne .LBB6_2 | |
.LBB6_5: | |
ret | |
.LBB6_3: | |
mov rax, -1 | |
ret | |
sum_builtin_no_destructure: | |
test rsi, rsi | |
je .LBB7_1 | |
xor edx, edx | |
xor ecx, ecx | |
.LBB7_4: | |
mov rax, qword ptr [rdi + 8*rcx] | |
add rax, rdx | |
setb r8b | |
setb byte ptr [rsp - 8] | |
test r8b, r8b | |
jne .LBB7_5 | |
add rax, rdx | |
inc rcx | |
mov rdx, rax | |
cmp rsi, rcx | |
jne .LBB7_4 | |
ret | |
.LBB7_1: | |
xor eax, eax | |
ret | |
.LBB7_5: | |
mov rax, -1 | |
ret |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/sh
# Doubling-sum kernels: non-inlined add, inlined add, builtin with
# destructuring, builtin without destructuring.
poop -d 15000 --color never \
    './bench_add_inline n' \
    './bench_add_inline i' \
    './bench_add_inline b' \
    './bench_add_inline d'
# Plain-sum kernels, same four variants.
poop -d 15000 --color never \
    './bench_add_inline a' \
    './bench_add_inline x' \
    './bench_add_inline c' \
    './bench_add_inline e'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Benchmark 1 (96 runs): ./bench_add_inline n | |
measurement mean ± σ min … max outliers delta | |
wall_time 157ms ± 15.6ms 133ms … 189ms 0 ( 0%) 0% | |
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) 0% | |
cpu_cycles 444M ± 43.9M 381M … 537M 0 ( 0%) 0% | |
instructions 1.47G ± 9.04 1.47G … 1.47G 4 ( 4%) 0% | |
cache_references 26.7M ± 403K 26.0M … 27.3M 0 ( 0%) 0% | |
cache_misses 1.44M ± 557K 535K … 4.01M 1 ( 1%) 0% | |
branch_misses 254 ± 273 22 … 1.35K 4 ( 4%) 0% | |
Benchmark 2 (88 runs): ./bench_add_inline i | |
measurement mean ± σ min … max outliers delta | |
wall_time 171ms ± 15.8ms 142ms … 203ms 0 ( 0%) 💩+ 8.4% ± 2.9% | |
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) - 0.0% ± 0.0% | |
cpu_cycles 481M ± 43.4M 409M … 572M 0 ( 0%) 💩+ 8.3% ± 2.8% | |
instructions 1.68G ± 11.4 1.68G … 1.68G 2 ( 2%) 💩+ 14.3% ± 0.0% | |
cache_references 26.9M ± 319K 26.3M … 27.3M 0 ( 0%) + 0.8% ± 0.4% | |
cache_misses 1.84M ± 1.33M 695K … 8.36M 8 ( 9%) 💩+ 28.2% ± 20.2% | |
branch_misses 200 ± 230 25 … 1.01K 2 ( 2%) - 21.5% ± 28.8% | |
Benchmark 3 (68 runs): ./bench_add_inline b | |
measurement mean ± σ min … max outliers delta | |
wall_time 221ms ± 24.0ms 181ms … 284ms 0 ( 0%) 💩+ 40.4% ± 3.9% | |
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) - 0.0% ± 0.0% | |
cpu_cycles 602M ± 50.0M 521M … 708M 0 ( 0%) 💩+ 35.6% ± 3.3% | |
instructions 1.89G ± 11.9 1.89G … 1.89G 0 ( 0%) 💩+ 28.6% ± 0.0% | |
cache_references 26.9M ± 456K 26.3M … 27.7M 0 ( 0%) + 0.6% ± 0.5% | |
cache_misses 2.45M ± 1.88M 1.14M … 12.6M 8 (12%) 💩+ 70.7% ± 27.8% | |
branch_misses 201 ± 208 29 … 1.13K 2 ( 3%) - 21.0% ± 30.3% | |
Benchmark 4 (88 runs): ./bench_add_inline d | |
measurement mean ± σ min … max outliers delta | |
wall_time 172ms ± 15.8ms 142ms … 210ms 0 ( 0%) 💩+ 9.2% ± 2.9% | |
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) - 0.0% ± 0.0% | |
cpu_cycles 479M ± 43.7M 410M … 576M 0 ( 0%) 💩+ 7.9% ± 2.9% | |
instructions 1.68G ± 9.05 1.68G … 1.68G 0 ( 0%) 💩+ 14.3% ± 0.0% | |
cache_references 26.8M ± 361K 26.3M … 27.4M 0 ( 0%) + 0.3% ± 0.4% | |
cache_misses 1.53M ± 526K 658K … 3.88M 1 ( 1%) + 6.7% ± 10.9% | |
branch_misses 576 ± 495 26 … 2.43K 2 ( 2%) 💩+126.3% ± 44.9% | |
Benchmark 1 (156 runs): ./bench_add_inline a | |
measurement mean ± σ min … max outliers delta | |
wall_time 96.4ms ± 14.9ms 79.0ms … 139ms 1 ( 1%) 0% | |
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) 0% | |
cpu_cycles 268M ± 41.7M 221M … 385M 0 ( 0%) 0% | |
instructions 839M ± 69.9 839M … 839M 11 ( 7%) 0% | |
cache_references 26.3M ± 321K 25.8M … 26.7M 0 ( 0%) 0% | |
cache_misses 1.07M ± 928K 127K … 8.18M 3 ( 2%) 0% | |
branch_misses 264 ± 339 16 … 1.88K 10 ( 6%) 0% | |
Benchmark 2 (159 runs): ./bench_add_inline x | |
measurement mean ± σ min … max outliers delta | |
wall_time 93.9ms ± 12.5ms 78.4ms … 126ms 0 ( 0%) - 2.5% ± 3.2% | |
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) - 0.0% ± 0.0% | |
cpu_cycles 259M ± 34.1M 220M … 344M 0 ( 0%) - 3.4% ± 3.1% | |
instructions 839M ± 6.61 839M … 839M 4 ( 3%) + 0.0% ± 0.0% | |
cache_references 26.1M ± 329K 25.7M … 26.7M 0 ( 0%) - 0.5% ± 0.3% | |
cache_misses 1.19M ± 1.09M 96.7K … 7.24M 9 ( 6%) + 10.6% ± 20.9% | |
branch_misses 157 ± 224 16 … 1.00K 27 (17%) ⚡- 40.6% ± 24.0% | |
Benchmark 3 (112 runs): ./bench_add_inline c | |
measurement mean ± σ min … max outliers delta | |
wall_time 133ms ± 24.5ms 98.7ms … 205ms 0 ( 0%) 💩+ 38.3% ± 4.9% | |
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) - 0.0% ± 0.0% | |
cpu_cycles 343M ± 48.0M 279M … 479M 5 ( 4%) 💩+ 28.1% ± 4.0% | |
instructions 944M ± 29.6 944M … 944M 15 (13%) 💩+ 12.5% ± 0.0% | |
cache_references 26.5M ± 376K 25.8M … 27.4M 0 ( 0%) + 0.8% ± 0.3% | |
cache_misses 3.57M ± 4.00M 192K … 17.1M 13 (12%) 💩+233.2% ± 60.7% | |
branch_misses 289 ± 313 22 … 1.81K 4 ( 4%) + 9.5% ± 30.2% | |
Benchmark 4 (129 runs): ./bench_add_inline e | |
measurement mean ± σ min … max outliers delta | |
wall_time 117ms ± 14.9ms 97.5ms … 170ms 1 ( 1%) 💩+ 21.0% ± 3.6% | |
peak_rss 131KB ± 0 131KB … 131KB 0 ( 0%) - 0.0% ± 0.0% | |
cpu_cycles 319M ± 39.7M 274M … 461M 1 ( 1%) 💩+ 19.1% ± 3.6% | |
instructions 1.15G ± 32.0 1.15G … 1.15G 0 ( 0%) 💩+ 37.5% ± 0.0% | |
cache_references 26.2M ± 376K 25.7M … 26.9M 0 ( 0%) - 0.3% ± 0.3% | |
cache_misses 1.35M ± 1.14M 207K … 8.85M 5 ( 4%) 💩+ 25.6% ± 22.3% | |
branch_misses 414 ± 420 22 … 2.47K 3 ( 2%) 💩+ 56.9% ± 33.4% |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const std = @import("std"); | |
/// Checked addition of `a` and `b`.
///
/// Returns `error.Overflow` when `@addWithOverflow` reports a carry. For
/// `comptime_int` the operands are added with plain `+`. This variant is NOT
/// marked `inline`; compare with `addI`.
fn add(comptime T: type, a: T, b: T) (error{Overflow}!T) {
    if (T == comptime_int) return a + b;
    // result[0] is the wrapped sum, result[1] is the overflow bit.
    const result = @addWithOverflow(a, b);
    return if (result[1] == 0) result[0] else error.Overflow;
}
/// Checked addition of `a` and `b`, forced inline at every call site.
///
/// Same contract as `add`: returns `error.Overflow` when `@addWithOverflow`
/// reports a carry, and uses plain `+` for `comptime_int`.
inline fn addI(comptime T: type, a: T, b: T) (error{Overflow}!T) {
    if (T == comptime_int) return a + b;
    // result[0] is the wrapped sum, result[1] is the overflow bit.
    const result = @addWithOverflow(a, b);
    return if (result[1] == 0) result[0] else error.Overflow;
}
/// Sums `i + i` for each of the `len` elements at `ptr`, using the
/// non-inlined `add` helper for both additions.
///
/// Despite the name, each element is added to itself (doubled), not squared.
/// Propagates the helper's error if either addition overflows.
inline fn sumOfSquares(ptr: [*]usize, len: usize) !usize {
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        const doubled = try add(usize, value, value);
        total = try add(usize, total, doubled);
    }
    return total;
}
/// Sums `i + i` for each of the `len` elements at `ptr`, using the
/// forced-inline `addI` helper for both additions.
///
/// Despite the name, each element is added to itself (doubled), not squared.
/// Propagates the helper's error if either addition overflows.
inline fn sumOfSquaresInlined(ptr: [*]usize, len: usize) !usize {
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        const doubled = try addI(usize, value, value);
        total = try addI(usize, total, doubled);
    }
    return total;
}
/// Sums `i + i` for each of the `len` elements at `ptr`, calling
/// `@addWithOverflow` directly and destructuring its result tuple.
///
/// Despite the name, each element is added to itself (doubled), not squared.
/// Returns `maxInt(usize)` as soon as either addition overflows.
inline fn sumOfSquaresBuiltin(ptr: [*]usize, len: usize) usize {
    const overflow_sentinel: usize = std.math.maxInt(usize);
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        const doubled, const doubling_overflowed = @addWithOverflow(value, value);
        if (doubling_overflowed != 0) return overflow_sentinel;
        // Destructure straight into the running total plus a fresh overflow bit.
        total, const total_overflowed = @addWithOverflow(total, doubled);
        if (total_overflowed != 0) return overflow_sentinel;
    }
    return total;
}
/// Sums `i + i` for each of the `len` elements at `ptr`, calling
/// `@addWithOverflow` directly and reading its result tuple by index
/// (no destructuring).
///
/// Despite the name, each element is added to itself (doubled), not squared.
/// Returns `maxInt(usize)` as soon as either addition overflows.
inline fn sumOfSquaresBuiltinNoDestructure(ptr: [*]usize, len: usize) usize {
    const overflow_sentinel: usize = std.math.maxInt(usize);
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        // Index 0 is the wrapped sum, index 1 is the overflow bit.
        const doubled = @addWithOverflow(value, value);
        if (doubled[1] != 0) return overflow_sentinel;
        const next_total = @addWithOverflow(total, doubled[0]);
        if (next_total[1] != 0) return overflow_sentinel;
        total = next_total[0];
    }
    return total;
}
/// Exported entry point for `sumOfSquares` (non-inlined `add` helper).
/// Returns the kernel's total, or `maxInt(usize)` if it returned an error.
export fn not_inlined(ptr: [*]usize, len: usize) usize {
    const total = sumOfSquares(ptr, len) catch return std.math.maxInt(usize);
    return total;
}
/// Exported entry point for `sumOfSquaresInlined` (forced-inline `addI` helper).
/// Returns the kernel's total, or `maxInt(usize)` if it returned an error.
export fn inlined(ptr: [*]usize, len: usize) usize {
    const total = sumOfSquaresInlined(ptr, len) catch return std.math.maxInt(usize);
    return total;
}
/// Exported entry point for `sumOfSquaresBuiltin` (direct `@addWithOverflow`
/// with destructuring). Forwards the kernel's result unchanged.
export fn builtin(ptr: [*]usize, len: usize) usize {
    const total = sumOfSquaresBuiltin(ptr, len);
    return total;
}
/// Exported entry point for `sumOfSquaresBuiltinNoDestructure` (direct
/// `@addWithOverflow`, tuple read by index). Forwards the kernel's result
/// unchanged.
export fn builtin_no_destructure(ptr: [*]usize, len: usize) usize {
    const total = sumOfSquaresBuiltinNoDestructure(ptr, len);
    return total;
}
/// Sums the `len` elements at `ptr` using the non-inlined `add` helper.
/// Propagates the helper's error if the running total overflows.
inline fn sum(ptr: [*]usize, len: usize) !usize {
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        total = try add(usize, total, value);
    }
    return total;
}
/// Sums the `len` elements at `ptr` using the forced-inline `addI` helper.
/// Propagates the helper's error if the running total overflows.
inline fn sumInlined(ptr: [*]usize, len: usize) !usize {
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        total = try addI(usize, total, value);
    }
    return total;
}
/// Sums the `len` elements at `ptr`, calling `@addWithOverflow` directly and
/// destructuring its result tuple.
/// Returns `maxInt(usize)` as soon as the running total overflows.
inline fn sumBuiltin(ptr: [*]usize, len: usize) usize {
    const overflow_sentinel: usize = std.math.maxInt(usize);
    var total: usize = 0;
    for (ptr[0..len]) |value| {
        // Destructure straight into the running total plus a fresh overflow bit.
        total, const overflowed = @addWithOverflow(total, value);
        if (overflowed != 0) return overflow_sentinel;
    }
    return total;
}
/// Sums the `len` elements at `ptr`, calling `@addWithOverflow` directly and
/// reading its result tuple by index (no destructuring).
/// Returns `maxInt(usize)` as soon as the running total overflows.
inline fn sumBuiltinNoDestructure(ptr: [*]usize, len: usize) usize {
    var accum: usize = 0;
    for (ptr[0..len]) |i| {
        // ov[0] is the wrapped sum, ov[1] is the overflow bit.
        const ov = @addWithOverflow(accum, i);
        if (ov[1] != 0) return std.math.maxInt(usize);
        // Assign, do not `+=`: ov[0] already equals accum + i, so adding it
        // onto accum again would count the previous total twice and could
        // overflow unchecked.
        accum = ov[0];
    }
    return accum;
}
/// Exported entry point for `sum` (non-inlined `add` helper).
/// Returns the kernel's total, or `maxInt(usize)` if it returned an error.
export fn sum_not_inlined(ptr: [*]usize, len: usize) usize {
    const total = sum(ptr, len) catch return std.math.maxInt(usize);
    return total;
}
/// Exported entry point for `sumInlined` (forced-inline `addI` helper).
/// Returns the kernel's total, or `maxInt(usize)` if it returned an error.
export fn sum_inlined(ptr: [*]usize, len: usize) usize {
    const total = sumInlined(ptr, len) catch return std.math.maxInt(usize);
    return total;
}
/// Exported entry point for `sumBuiltin` (direct `@addWithOverflow` with
/// destructuring). Forwards the kernel's result unchanged.
export fn sum_builtin(ptr: [*]usize, len: usize) usize {
    const total = sumBuiltin(ptr, len);
    return total;
}
/// Exported entry point for `sumBuiltinNoDestructure` (direct
/// `@addWithOverflow`, tuple read by index). Forwards the kernel's result
/// unchanged.
export fn sum_builtin_no_destructure(ptr: [*]usize, len: usize) usize {
    const total = sumBuiltinNoDestructure(ptr, len);
    return total;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment