Skip to content

Instantly share code, notes, and snippets.

@Validark
Last active September 6, 2024 07:01
Show Gist options
  • Save Validark/dcf8d59caf462af875a388462b7fab63 to your computer and use it in GitHub Desktop.
Save Validark/dcf8d59caf462af875a388462b7fab63 to your computer and use it in GitHub Desktop.
Interleaved Vector compress on arm/aarch64
/// Byte-wise table lookup across a 64-byte table made of four 16-byte parts.
///
/// Forwards directly to the `llvm.aarch64.neon.tbl4` intrinsic: lane `n` of the
/// result is the table byte selected by `indices[n]`, where the table is
/// `table_part_1 ++ table_part_2 ++ table_part_3 ++ table_part_4`.
/// Per the Arm TBL instruction description, an index outside 0..63 produces 0
/// in that lane.
fn tbl4(
    table_part_1: @Vector(16, u8),
    table_part_2: @Vector(16, u8),
    table_part_3: @Vector(16, u8),
    table_part_4: @Vector(16, u8),
    indices: @Vector(8, u8),
) @Vector(8, u8) {
    const Part = @Vector(16, u8);
    const Lanes = @Vector(8, u8);

    // The extern is declared in a local namespace so the intrinsic's symbol
    // does not leak into the enclosing file scope.
    const intrinsics = struct {
        extern fn @"llvm.aarch64.neon.tbl4"(Part, Part, Part, Part, Lanes) Lanes;
    };

    return intrinsics.@"llvm.aarch64.neon.tbl4"(
        table_part_1,
        table_part_2,
        table_part_3,
        table_part_4,
        indices,
    );
}
/// Compacts the bytes selected by `bitstring` out of 64 bytes supplied as four
/// 4-way interleaved vectors and stores them contiguously starting at `dest`.
///
/// Layout implied by the lookup table below: logical byte `k` (0..63) is lane
/// `k / 4` of `chunk{k & 3}`. Bit `k` of `bitstring` set means byte `k` is kept.
///
/// Each of the eight steps stores a full 8-byte vector, so `dest` must have room
/// for 64 bytes regardless of how many bits are set; bytes past the kept count
/// are clobbered with filler (zeros from out-of-range table indices, later
/// partially overwritten by subsequent groups).
///
/// NOTE(review): the `[8]u8` bit-casts assume a little-endian target, so that
/// array element `i` corresponds to bits `8*i .. 8*i+7` of the `u64` — confirm
/// if this is ever built for a big-endian AArch64 configuration.
export fn compress(bitstring: u64, chunk0: @Vector(16, u8), chunk1: @Vector(16, u8), chunk2: @Vector(16, u8), chunk3: @Vector(16, u8), dest: [*]u8) void {
    // For every possible 8-bit selector, the tbl4 indices that gather the
    // selected bytes of group 0 to the front of an 8-lane vector.
    const lookups: [256]@Vector(8, u8) = comptime blk: {
        @setEvalBranchQuota(100000);
        var table: [256]@Vector(8, u8) = undefined;
        for (&table, 0..) |*slot, mask| {
            // 255 is outside the 64-byte table, so unused lanes read as 0, and
            // it stays 255 under the saturating per-group offset added below.
            var entry = [_]u8{255} ** 8;
            var used: usize = 0;
            for (0..8) |bit_i| {
                if (((mask >> bit_i) & 1) == 1) {
                    // Byte `bit_i` of the group sits in chunk `bit_i & 3`
                    // (table part => * 16) at lane `bit_i / 4`.
                    entry[used] = @intCast(bit_i / 4 + (bit_i & 3) * 16);
                    used += 1;
                }
            }
            slot.* = entry;
        }
        break :blk table;
    };

    // Byte `i` of the product is the sum of the popcounts of selector bytes
    // 0..i-1, i.e. the EXCLUSIVE prefix sum: the output offset at which group
    // `i` must start. (Multiplying by 0x0101010101010101 would give the
    // inclusive sum and place every group one group-count too far.)
    // The largest offset is 7 * 8 = 56, so no byte of the product overflows.
    const write_offsets = @as(u64, @bitCast(@as(@Vector(8, u8), @popCount(@as(@Vector(8, u8), @bitCast(bitstring)))))) *% 0x0101010101010100;

    inline for (@as([8]u8, @bitCast(bitstring)), @as([8]u8, @bitCast(write_offsets)), 0..) |byte, pos, i| {
        // Group `i` covers logical bytes 8*i .. 8*i+7, which are two lanes
        // further along in every chunk than the previous group, hence + 2*i.
        const group_indices = lookups[byte] +| @as(@Vector(8, u8), @splat(2 * i));
        // Groups are stored in ascending order so each store overwrites the
        // filler tail left by the previous one.
        dest[pos..][0..8].* = tbl4(chunk0, chunk1, chunk2, chunk3, group_indices);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment