Let's start with a simple case: the following `Code` class defines a `void` function (called through a function pointer) that takes pointers and integers as input and stores their sum at the address the fourth pointer points to.
#include <xbyak/xbyak_util.h>
/**
 * JIT-compiled kernel with the C signature
 *     void f(int* a, int x, int y, int* out);
 * It stores a[1] + x + y into *out.
 *
 * Arguments are read from the System V AMD64 integer argument registers
 * (rdi, rsi, rdx, rcx). The Windows x64 convention uses different registers
 * and is not handled here.
 */
struct Code : public Xbyak::CodeGenerator {
    Code()
    {
        // xbyak also provides helpers such as StackFrame;
        // see xbyak/sample/sf_test.cpp for how to use its other parameters
        // Xbyak::util::StackFrame sf(this, 4);

        // 1st argument (rdi) is a pointer: load the 32-bit integer stored
        // 4 bytes past it, i.e. a[1].
        mov(eax, ptr[rdi + 4]);
        // 2nd (rsi) and 3rd (rdx) arguments are 32-bit ints. Only their low
        // halves (esi, edx) are defined by the ABI, so add in 32 bits.
        add(eax, esi);
        add(eax, edx);
        // 4th argument (rcx) points at an int: store exactly 4 bytes so the
        // write cannot run past the destination.
        mov(ptr[rcx], eax);
        ret();
    }
};
/**
 * Generates the kernel, calls it as void(int*, int, int, int*) and prints
 * "ok" when the value written to `res` equals a[1] + 5 + 2.
 */
int main()
{
    Code c;
    // Two-element input kept on the stack, so there is nothing to free.
    int a[2] = {3, 4};
    // Initialised so the failure message is meaningful even if the kernel
    // never writes to it.
    int res = 0;
    void (*f)(int*, int, int, int*) = c.getCode<void(*) (int*, int, int, int*)>();
    f(a, 5, 2, &res);
    if (res == 4 + 5 + 2) {
        puts("ok");
    } else {
        printf("res = %d\n", res);
        puts("ng");
    }
}
Here we use four registers (`rdi`, `rsi`, `rdx`, `rcx`) to hold the four arguments. Xbyak supports up to 14 registers, plus `r8`, `r9` through `r15`. For a detailed introduction to registers, you can refer to any x86-64 assembly reference or http://6.s081.scripts.mit.edu/sp18/x86-64-architecture-guide.html
For larger cases, we may need to allocate space on the stack inside the function, as in the following example taken from oneDNN. By the way, I strongly recommend referring to oneDNN for Xbyak usages that are not covered in Xbyak's own samples.
void generate() {
/* this is the same as programmers allocate on the stack */
sub(rsp, STACKSIZE);,
/* here we preserve all variables into stack to prevent flushing */
/* for details you can refer to http://6.s081.scripts.mit.edu/sp18/x86-64-architecture-guide.html for caller and calle stack
mov(ptr[rsp + 0x00], rbx);
mov(ptr[rsp + 0x08], rbp);
mov(ptr[rsp + 0x10], r12);
mov(ptr[rsp + 0x18], r13);
mov(ptr[rsp + 0x20], r14);
mov(ptr[rsp + 0x28], r15);
#ifdef _WIN32
/* we don't consider windows ABI in this note */
#else
mov(Bm, qword[rdi]);
mov(Bn, qword[rsi]);
mov(K, qword[rdx]);
mov(rcx, r8);
mov(I, r9);
mov(bm, qword[rsp + STACKSIZE + 0x08]); // In case the numbers of variables beyond the limit, other auguments will be after RSP.
mov(LDC, qword[rsp + STACKSIZE + 0x08]);
#endif
}
Moreover, for readers who are new to assembly, we suggest packing all arguments into one struct and reading them at offsets from `rdi`, like the following:
// Argument bundle: three untyped pointers laid out back to back so generated
// code can reach each one at a fixed offset from the struct's address.
struct params {
    void *A;
    void *B;
    void *C;
};
using params_t = params;

// Byte offset of `field` inside params_t, usable as an address displacement.
#define GET_OFF(field) offsetof(params_t, field)
...
// Load each pointer field of the argument struct into its own register.
// Assumes rdi holds the address of a params_t (i.e. it was passed as the
// first argument under the System V calling convention).
mov(reg_A, ptr[rdi + GET_OFF(A)]); // reg_A = params->A
mov(reg_B, ptr[rdi + GET_OFF(B)]); // reg_B = params->B
mov(reg_C, ptr[rdi + GET_OFF(C)]); // reg_C = params->C
The above examples cover most of the scenarios programmers will need. Of course there are many other functions, but they can be looked up in any x86 assembly reference. Luckily, the Xbyak authors kept the Xbyak function names the same as the x86 instruction names; for example, `sal` in Xbyak does the same thing as the `SAL` instruction.
But readers of this article (if you were not working in HPC you probably would not have walked into this gist, let alone read this far) care more about SIMD instructions, which include AVX2, AVX-512, and AMX (in 2022 no one would still call SSE High-Performance Computing, right?). The following example shows how to use AVX-512 in Xbyak, which is not covered by Xbyak's official samples.
Click to see the whole case!
#include <xbyak/xbyak_util.h>
#include <climits>
#define STACKSIZE 8192 // here we define a very large stackspace to demonstrate EVEX_compress_addr
/**
 * Builds a 512-bit memory operand for [base + raw_offt], rewriting mid-range
 * offsets as a smaller displacement plus a scaled index register.
 *
 *   raw_offt in [0x200, 0x600) -> [base + rbp * 1 + (raw_offt - 0x400)]
 *   raw_offt in [0x600, 0xA00) -> [base + rbp * 2 + (raw_offt - 0x800)]
 *   any other raw_offt         -> [base + raw_offt] (unchanged)
 *
 * PRECONDITION: the rewritten forms address the intended location only if
 * rbp holds 0x400 (2 * EVEX_max_8b_offt) when the generated instruction
 * executes. This helper emits no code to load rbp; the caller must do so
 * before using an address in the rewritten ranges.
 *
 * @param base     base register of the address
 * @param raw_offt byte offset from base
 * @param bcast    when true the operand is built with zword_b, otherwise zword
 * @return the Xbyak address expression described above
 */
Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int raw_offt,
        bool bcast = false) {
    using Xbyak::RegExp;

    const int EVEX_max_8b_offt = 0x200;
    const Xbyak::Reg64 reg_EVEX_max_8b_offt = Xbyak::util::rbp;

    int offt = raw_offt;
    int scale = 0;
    if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) {
        // Fold 0x400 of the offset into rbp * 1.
        offt -= 2 * EVEX_max_8b_offt;
        scale = 1;
    } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) {
        // Fold 0x800 of the offset into rbp * 2.
        offt -= 4 * EVEX_max_8b_offt;
        scale = 2;
    }

    auto re = RegExp() + base + offt;
    if (scale) re = re + reg_EVEX_max_8b_offt * scale;

    if (bcast) return Xbyak::util::zword_b[re];
    return Xbyak::util::zword[re];
}
/**
 * JIT-compiled AVX-512 kernel with the C signature
 *     void f(int* a, int* b, int y, int* out);
 * It adds the 16 ints at b to themselves, spills the vector to a scratch area
 * on the stack, and stores a[1] + 2 * b[2] + y into *out.
 *
 * Arguments are read from the System V AMD64 integer argument registers
 * (rdi, rsi, rdx, rcx).
 */
struct Code : public Xbyak::CodeGenerator {
    Code() {
        sub(rsp, STACKSIZE); // reserve the scratch area
        mov(eax, ptr[rdi + 4]); // eax = a[1]
        vmovups(zmm0, zword[rsi]); // zword tells Xbyak the memory operand is 512 bits wide;
        vmovups(zmm1, zword[rsi]); // use yword / xword for ymm / xmm sized operands
        // The data are 32-bit integers, so use the integer add. A float add
        // (vaddps) would only give the right bit pattern for tiny values.
        vpaddd(zmm0, zmm0, zmm1);
        // Same store written without the helper: vmovdqu32(zword[rsp + 4096], zmm0);
        // 4096 is outside the range EVEX_compress_addr rewrites, so the
        // operand it returns here is plain [rsp + 4096].
        vmovdqu32(EVEX_compress_addr(rsp, 4096), zmm0);
        // Scalar reads of the spilled vector use ptr, not zword. Offset
        // 4096 + 8 is the 3rd int; load 32 bits so the 4th is not pulled in.
        mov(edi, ptr[rsp + 4104]);
        add(eax, edi);
        add(eax, edx); // 3rd argument is an int: only edx is defined
        mov(ptr[rcx], eax); // *out is an int: store exactly 4 bytes
        add(rsp, STACKSIZE); // release the scratch area
        ret();
    }
};
int main() {
Code c;
int* a = (int*)malloc(2 * sizeof(int));
int* b = (int*)malloc(16 * sizeof(int));
a[0] = 3;
a[1] = 4;
for (int i = 0; i < 16; ++i) {
b[i] = (int)i % 3;
}
int res;
void (*f)(int*, int*, int, int*) =
c.getCode<void (*)(int*, int*, int, int*)>();
f(a, b, 2, &res);
if (res == 4 + 4 + 2) {
puts("ok");
} else {
printf("res = %d\n", res);
puts("ng");
}
free(a);
free(b);
}
`EVEX_compress_addr` is a complex technique, but in brief: if the address offset is too large to fit within the instruction's length limit, you can use 3 IMM to represent the address instead of passing the whole address. Concretely, the helper above rewrites offsets in [0x200, 0xA00) as a smaller displacement plus `rbp * scale`, which assumes `rbp` holds 0x400 at run time; other offsets are left unchanged. So if you think the offset of your address will be more than `0x200`, it is always useful to utilize this function. You can find a more detailed explanation in <<Intel Architecture Instruction Set Extensions and Future Features>>, chapter 1.6.
As we all know, AMX load/store instructions need a stride in addition to the starting address.
start := tileconfig.startRow
IF start == 0 // not restarting, zero incoming state
tilezero(dst)
FI
nbytes := dst.colsb
DO WHILE start < dst.rows
memptr := base + start * stride
write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes)
start := start + 1
OD
zero_upper_rows(dst, dst.rows)
zero_tileconfig_start()
Also learned from oneDNN: the user needs to pass a starting address, an offset, and a register holding the stride, like
// Load a tile into tmm0; the address expression carries the base, the stride
// register and a constant displacement.
tileloadd(tmm0, ptr[rax + r10 + UNROLL_M * UNROLL_KK * SIZE_A * 0]);
// rax: starting address (the `base` in `memptr := base + start * stride`)
// r10: register holding the stride between consecutive tile rows
// UNROLL_M * UNROLL_KK * SIZE_A * 0: constant displacement; it may be 0, as it is here because of the `* 0`
Also attached: instructions on debugging Xbyak via GDB
hard to understand the usage of some codes, like
zero_upper_rows(dst, dst.rows)
zero_tileconfig_start()