|
.arch armv8-a |
|
.file "bli_gemm_armv8a_asm_d6x8.c" |
|
.text |
|
.align 2 |
|
.p2align 3,,7 |
|
.global _bli_sgemm_armv8a_asm_8x12 |
|
// .type bli_sgemm_armv8a_asm_8x12, %function |
|
_bli_sgemm_armv8a_asm_8x12: |
|
.LFB738: |
|
.cfi_startproc |
|
stp x19, x20, [sp, -240]! |
|
.cfi_def_cfa_offset 240 |
|
.cfi_offset 19, -240 |
|
.cfi_offset 20, -232 |
|
cmp x0, 0 |
|
add x9, x0, 3 |
|
ldr x10, [sp, 240] |
|
csel x9, x9, x0, lt |
|
negs x8, x0 |
|
and x0, x0, 3 |
|
and x8, x8, 3 |
|
asr x9, x9, 2 |
|
ldp x11, x10, [x10, 8] |
|
csneg x0, x0, x8, mi |
|
stp x21, x22, [sp, 16] |
|
stp x23, x24, [sp, 32] |
|
stp x25, x26, [sp, 48] |
|
str x27, [sp, 64] |
|
stp d8, d9, [sp, 80] |
|
stp d10, d11, [sp, 96] |
|
stp d12, d13, [sp, 112] |
|
stp d14, d15, [sp, 128] |
|
.cfi_offset 21, -224 |
|
.cfi_offset 22, -216 |
|
.cfi_offset 23, -208 |
|
.cfi_offset 24, -200 |
|
.cfi_offset 25, -192 |
|
.cfi_offset 26, -184 |
|
.cfi_offset 27, -176 |
|
.cfi_offset 72, -160 |
|
.cfi_offset 73, -152 |
|
.cfi_offset 74, -144 |
|
.cfi_offset 75, -136 |
|
.cfi_offset 76, -128 |
|
.cfi_offset 77, -120 |
|
.cfi_offset 78, -112 |
|
.cfi_offset 79, -104 |
|
stp x5, x4, [sp, 152] |
|
stp x3, x2, [sp, 168] |
|
stp x1, x11, [sp, 184] |
|
stp x10, x9, [sp, 200] |
|
stp x0, x6, [sp, 216] |
|
str x7, [sp, 232] |
|
#APP |
|
// 76 "bli_gemm_armv8a_asm_d6x8.c" 1 |
|
|
|
|
|
ldr x0,[sp, 176] |
|
ldr x1,[sp, 168] |
|
ldr x2,[sp, 152] |
|
|
|
ldr x3,[sp, 192] |
|
ldr x4,[sp, 200] |
|
|
|
ldr x5,[sp, 208] |
|
ldr x6,[sp, 216] |
|
|
|
ldr x7,[sp, 184] |
|
ldr x8,[sp, 160] |
|
|
|
ldr x9,[sp, 232] |
|
lsl x10,x9,#2 |
|
|
|
ldr x13,[sp, 224] |
|
lsl x14,x13,#2 |
|
|
|
add x16,x2,x10 |
|
add x17,x16,x10 |
|
add x18,x17,x10 |
|
add x19,x18,x10 |
|
add x20,x19,x10 |
|
add x21,x20,x10 |
|
add x22,x21,x10 |
|
add x23,x22,x10 |
|
add x24,x23,x10 |
|
add x25,x24,x10 |
|
add x26,x25,x10 |
|
|
|
prfm pldl1keep,[x2] |
|
prfm pldl1keep,[x16] |
|
prfm pldl1keep,[x17] |
|
prfm pldl1keep,[x18] |
|
prfm pldl1keep,[x19] |
|
prfm pldl1keep,[x20] |
|
prfm pldl1keep,[x21] |
|
prfm pldl1keep,[x22] |
|
prfm pldl1keep,[x23] |
|
prfm pldl1keep,[x24] |
|
prfm pldl1keep,[x25] |
|
prfm pldl1keep,[x26] |
|
|
|
dup v8.4s, wzr |
|
prfm PLDL1KEEP, [x1, #192] |
|
dup v9.4s, wzr |
|
prfm PLDL1KEEP, [x1, #256] |
|
dup v10.4s, wzr |
|
prfm PLDL1KEEP, [x1, #320] |
|
dup v11.4s, wzr |
|
dup v12.4s, wzr |
|
dup v13.4s, wzr |
|
|
|
dup v14.4s, wzr |
|
prfm PLDL1KEEP, [x0, #128] |
|
dup v15.4s, wzr |
|
prfm PLDL1KEEP, [x0, #192] |
|
dup v16.4s, wzr |
|
dup v17.4s, wzr |
|
dup v18.4s, wzr |
|
dup v19.4s, wzr |
|
|
|
dup v20.4s, wzr |
|
dup v21.4s, wzr |
|
dup v22.4s, wzr |
|
dup v23.4s, wzr |
|
dup v24.4s, wzr |
|
dup v25.4s, wzr |
|
|
|
dup v26.4s, wzr |
|
dup v27.4s, wzr |
|
dup v28.4s, wzr |
|
dup v29.4s, wzr |
|
dup v30.4s, wzr |
|
dup v31.4s, wzr |
|
|
|
cmp x5,#0 |
|
beq .SCONSIDERKLEFT |
|
|
|
ldr q0, [x0] |
|
ldr q1, [x0, #16] |
|
|
|
ldr q2, [x1] |
|
ldr q3, [x1, #16] |
|
ldr q4, [x1, #32] |
|
|
|
add x0, x0, #32 |
|
add x1, x1, #48 |
|
|
|
cmp x5,1 |
|
beq .SLASTITER |
|
|
|
.SLOOPKITER: |
|
|
|
ldr q5, [x0] |
|
fmla v8.4s, v0.4s,v2.s[0] |
|
fmla v9.4s, v1.4s,v2.s[0] |
|
ldr q6, [x0, #16] |
|
fmla v10.4s,v0.4s,v2.s[1] |
|
fmla v11.4s,v1.4s,v2.s[1] |
|
fmla v12.4s,v0.4s,v2.s[2] |
|
fmla v13.4s,v1.4s,v2.s[2] |
|
fmla v14.4s,v0.4s,v2.s[3] |
|
fmla v15.4s,v1.4s,v2.s[3] |
|
ldr q2, [x1] |
|
|
|
fmla v16.4s,v0.4s,v3.s[0] |
|
prfm PLDL1KEEP, [x1, #336] |
|
fmla v17.4s,v1.4s,v3.s[0] |
|
prfm PLDL1KEEP, [x1, #400] |
|
fmla v18.4s,v0.4s,v3.s[1] |
|
fmla v19.4s,v1.4s,v3.s[1] |
|
prfm PLDL1KEEP, [x1, #464] |
|
fmla v20.4s,v0.4s,v3.s[2] |
|
fmla v21.4s,v1.4s,v3.s[2] |
|
fmla v22.4s,v0.4s,v3.s[3] |
|
fmla v23.4s,v1.4s,v3.s[3] |
|
|
|
fmla v24.4s,v0.4s,v4.s[0] |
|
fmla v26.4s,v0.4s,v4.s[1] |
|
fmla v28.4s,v0.4s,v4.s[2] |
|
fmla v30.4s,v0.4s,v4.s[3] |
|
ldr q3, [x1, #16] |
|
|
|
fmla v25.4s,v1.4s,v4.s[0] |
|
fmla v27.4s,v1.4s,v4.s[1] |
|
fmla v29.4s,v1.4s,v4.s[2] |
|
fmla v31.4s,v1.4s,v4.s[3] |
|
ldr q4, [x1, #32] |
|
|
|
|
|
ldr q0, [x0, #32] |
|
fmla v8.4s,v5.4s,v2.s[0] |
|
fmla v9.4s,v6.4s,v2.s[0] |
|
ldr q1, [x0, #48] |
|
fmla v10.4s,v5.4s,v2.s[1] |
|
fmla v11.4s,v6.4s,v2.s[1] |
|
fmla v12.4s,v5.4s,v2.s[2] |
|
fmla v13.4s,v6.4s,v2.s[2] |
|
fmla v14.4s,v5.4s,v2.s[3] |
|
fmla v15.4s,v6.4s,v2.s[3] |
|
ldr q2, [x1, #48] |
|
|
|
fmla v16.4s,v5.4s,v3.s[0] |
|
prfm PLDL1KEEP, [x0, #224] |
|
fmla v17.4s,v6.4s,v3.s[0] |
|
prfm PLDL1KEEP, [x0, #288] |
|
fmla v18.4s,v5.4s,v3.s[1] |
|
fmla v19.4s,v6.4s,v3.s[1] |
|
fmla v20.4s,v5.4s,v3.s[2] |
|
fmla v21.4s,v6.4s,v3.s[2] |
|
fmla v22.4s,v5.4s,v3.s[3] |
|
fmla v23.4s,v6.4s,v3.s[3] |
|
|
|
fmla v24.4s,v5.4s,v4.s[0] |
|
fmla v26.4s,v5.4s,v4.s[1] |
|
fmla v28.4s,v5.4s,v4.s[2] |
|
fmla v30.4s,v5.4s,v4.s[3] |
|
ldr q3, [x1, #64] |
|
|
|
fmla v25.4s,v6.4s,v4.s[0] |
|
fmla v27.4s,v6.4s,v4.s[1] |
|
fmla v29.4s,v6.4s,v4.s[2] |
|
fmla v31.4s,v6.4s,v4.s[3] |
|
ldr q4, [x1, #80] |
|
|
|
|
|
ldr q5, [x0, #64] |
|
fmla v8.4s,v0.4s,v2.s[0] |
|
fmla v9.4s,v1.4s,v2.s[0] |
|
ldr q6, [x0, #80] |
|
fmla v10.4s,v0.4s,v2.s[1] |
|
fmla v11.4s,v1.4s,v2.s[1] |
|
fmla v12.4s,v0.4s,v2.s[2] |
|
fmla v13.4s,v1.4s,v2.s[2] |
|
fmla v14.4s,v0.4s,v2.s[3] |
|
fmla v15.4s,v1.4s,v2.s[3] |
|
ldr q2, [x1, #96] |
|
|
|
fmla v16.4s,v0.4s,v3.s[0] |
|
fmla v17.4s,v1.4s,v3.s[0] |
|
fmla v18.4s,v0.4s,v3.s[1] |
|
fmla v19.4s,v1.4s,v3.s[1] |
|
fmla v20.4s,v0.4s,v3.s[2] |
|
fmla v21.4s,v1.4s,v3.s[2] |
|
fmla v22.4s,v0.4s,v3.s[3] |
|
fmla v23.4s,v1.4s,v3.s[3] |
|
|
|
fmla v24.4s,v0.4s,v4.s[0] |
|
fmla v26.4s,v0.4s,v4.s[1] |
|
fmla v28.4s,v0.4s,v4.s[2] |
|
fmla v30.4s,v0.4s,v4.s[3] |
|
ldr q3, [x1, #112] |
|
|
|
fmla v25.4s,v1.4s,v4.s[0] |
|
fmla v27.4s,v1.4s,v4.s[1] |
|
fmla v29.4s,v1.4s,v4.s[2] |
|
fmla v31.4s,v1.4s,v4.s[3] |
|
ldr q4, [x1, #128] |
|
|
|
|
|
ldr q0, [x0, #96] |
|
fmla v8.4s,v5.4s,v2.s[0] |
|
fmla v9.4s,v6.4s,v2.s[0] |
|
ldr q1, [x0, #112] |
|
fmla v10.4s,v5.4s,v2.s[1] |
|
fmla v11.4s,v6.4s,v2.s[1] |
|
fmla v12.4s,v5.4s,v2.s[2] |
|
fmla v13.4s,v6.4s,v2.s[2] |
|
fmla v14.4s,v5.4s,v2.s[3] |
|
fmla v15.4s,v6.4s,v2.s[3] |
|
ldr q2, [x1, #144] |
|
|
|
fmla v16.4s,v5.4s,v3.s[0] |
|
fmla v17.4s,v6.4s,v3.s[0] |
|
fmla v18.4s,v5.4s,v3.s[1] |
|
fmla v19.4s,v6.4s,v3.s[1] |
|
fmla v20.4s,v5.4s,v3.s[2] |
|
fmla v21.4s,v6.4s,v3.s[2] |
|
fmla v22.4s,v5.4s,v3.s[3] |
|
fmla v23.4s,v6.4s,v3.s[3] |
|
|
|
fmla v24.4s,v5.4s,v4.s[0] |
|
fmla v26.4s,v5.4s,v4.s[1] |
|
fmla v28.4s,v5.4s,v4.s[2] |
|
fmla v30.4s,v5.4s,v4.s[3] |
|
ldr q3, [x1, #160] |
|
|
|
fmla v25.4s,v6.4s,v4.s[0] |
|
fmla v27.4s,v6.4s,v4.s[1] |
|
fmla v29.4s,v6.4s,v4.s[2] |
|
fmla v31.4s,v6.4s,v4.s[3] |
|
ldr q4, [x1, #176] |
|
add x1, x1, #192 |
|
add x0, x0, #128 |
|
|
|
sub x5,x5,1 |
|
cmp x5,1 |
|
bne .SLOOPKITER |
|
|
|
.SLASTITER: |
|
|
|
|
|
ldr q5, [x0] |
|
fmla v8.4s,v0.4s,v2.s[0] |
|
fmla v9.4s,v1.4s,v2.s[0] |
|
ldr q6, [x0, #16] |
|
fmla v10.4s,v0.4s,v2.s[1] |
|
fmla v11.4s,v1.4s,v2.s[1] |
|
fmla v12.4s,v0.4s,v2.s[2] |
|
fmla v13.4s,v1.4s,v2.s[2] |
|
fmla v14.4s,v0.4s,v2.s[3] |
|
fmla v15.4s,v1.4s,v2.s[3] |
|
ldr q2, [x1] |
|
|
|
fmla v16.4s,v0.4s,v3.s[0] |
|
fmla v17.4s,v1.4s,v3.s[0] |
|
fmla v18.4s,v0.4s,v3.s[1] |
|
fmla v19.4s,v1.4s,v3.s[1] |
|
fmla v20.4s,v0.4s,v3.s[2] |
|
fmla v21.4s,v1.4s,v3.s[2] |
|
fmla v22.4s,v0.4s,v3.s[3] |
|
fmla v23.4s,v1.4s,v3.s[3] |
|
|
|
fmla v24.4s,v0.4s,v4.s[0] |
|
fmla v26.4s,v0.4s,v4.s[1] |
|
fmla v28.4s,v0.4s,v4.s[2] |
|
fmla v30.4s,v0.4s,v4.s[3] |
|
ldr q3, [x1, #16] |
|
|
|
fmla v25.4s,v1.4s,v4.s[0] |
|
fmla v27.4s,v1.4s,v4.s[1] |
|
fmla v29.4s,v1.4s,v4.s[2] |
|
fmla v31.4s,v1.4s,v4.s[3] |
|
ldr q4, [x1, #32] |
|
|
|
|
|
ldr q0, [x0, #32] |
|
fmla v8.4s,v5.4s,v2.s[0] |
|
fmla v9.4s,v6.4s,v2.s[0] |
|
ldr q1, [x0, #48] |
|
fmla v10.4s,v5.4s,v2.s[1] |
|
fmla v11.4s,v6.4s,v2.s[1] |
|
fmla v12.4s,v5.4s,v2.s[2] |
|
fmla v13.4s,v6.4s,v2.s[2] |
|
fmla v14.4s,v5.4s,v2.s[3] |
|
fmla v15.4s,v6.4s,v2.s[3] |
|
ldr q2, [x1, #48] |
|
|
|
fmla v16.4s,v5.4s,v3.s[0] |
|
fmla v17.4s,v6.4s,v3.s[0] |
|
fmla v18.4s,v5.4s,v3.s[1] |
|
fmla v19.4s,v6.4s,v3.s[1] |
|
fmla v20.4s,v5.4s,v3.s[2] |
|
fmla v21.4s,v6.4s,v3.s[2] |
|
fmla v22.4s,v5.4s,v3.s[3] |
|
fmla v23.4s,v6.4s,v3.s[3] |
|
|
|
fmla v24.4s,v5.4s,v4.s[0] |
|
fmla v26.4s,v5.4s,v4.s[1] |
|
fmla v28.4s,v5.4s,v4.s[2] |
|
fmla v30.4s,v5.4s,v4.s[3] |
|
ldr q3, [x1, #64] |
|
|
|
fmla v25.4s,v6.4s,v4.s[0] |
|
fmla v27.4s,v6.4s,v4.s[1] |
|
fmla v29.4s,v6.4s,v4.s[2] |
|
fmla v31.4s,v6.4s,v4.s[3] |
|
ldr q4, [x1, #80] |
|
|
|
|
|
ldr q5, [x0, #64] |
|
fmla v8.4s,v0.4s,v2.s[0] |
|
fmla v9.4s,v1.4s,v2.s[0] |
|
ldr q6, [x0, #80] |
|
fmla v10.4s,v0.4s,v2.s[1] |
|
fmla v11.4s,v1.4s,v2.s[1] |
|
fmla v12.4s,v0.4s,v2.s[2] |
|
fmla v13.4s,v1.4s,v2.s[2] |
|
fmla v14.4s,v0.4s,v2.s[3] |
|
fmla v15.4s,v1.4s,v2.s[3] |
|
ldr q2, [x1, #96] |
|
|
|
fmla v16.4s,v0.4s,v3.s[0] |
|
fmla v17.4s,v1.4s,v3.s[0] |
|
fmla v18.4s,v0.4s,v3.s[1] |
|
fmla v19.4s,v1.4s,v3.s[1] |
|
fmla v20.4s,v0.4s,v3.s[2] |
|
fmla v21.4s,v1.4s,v3.s[2] |
|
fmla v22.4s,v0.4s,v3.s[3] |
|
fmla v23.4s,v1.4s,v3.s[3] |
|
|
|
fmla v24.4s,v0.4s,v4.s[0] |
|
fmla v26.4s,v0.4s,v4.s[1] |
|
fmla v28.4s,v0.4s,v4.s[2] |
|
fmla v30.4s,v0.4s,v4.s[3] |
|
ldr q3, [x1, #112] |
|
|
|
fmla v25.4s,v1.4s,v4.s[0] |
|
fmla v27.4s,v1.4s,v4.s[1] |
|
fmla v29.4s,v1.4s,v4.s[2] |
|
fmla v31.4s,v1.4s,v4.s[3] |
|
ldr q4, [x1, #128] |
|
|
|
|
|
fmla v8.4s,v5.4s,v2.s[0] |
|
fmla v9.4s,v6.4s,v2.s[0] |
|
fmla v10.4s,v5.4s,v2.s[1] |
|
fmla v11.4s,v6.4s,v2.s[1] |
|
fmla v12.4s,v5.4s,v2.s[2] |
|
fmla v13.4s,v6.4s,v2.s[2] |
|
fmla v14.4s,v5.4s,v2.s[3] |
|
fmla v15.4s,v6.4s,v2.s[3] |
|
|
|
fmla v16.4s,v5.4s,v3.s[0] |
|
fmla v17.4s,v6.4s,v3.s[0] |
|
fmla v18.4s,v5.4s,v3.s[1] |
|
fmla v19.4s,v6.4s,v3.s[1] |
|
fmla v20.4s,v5.4s,v3.s[2] |
|
fmla v21.4s,v6.4s,v3.s[2] |
|
fmla v22.4s,v5.4s,v3.s[3] |
|
fmla v23.4s,v6.4s,v3.s[3] |
|
|
|
fmla v24.4s,v5.4s,v4.s[0] |
|
fmla v26.4s,v5.4s,v4.s[1] |
|
fmla v28.4s,v5.4s,v4.s[2] |
|
fmla v30.4s,v5.4s,v4.s[3] |
|
|
|
fmla v25.4s,v6.4s,v4.s[0] |
|
fmla v27.4s,v6.4s,v4.s[1] |
|
fmla v29.4s,v6.4s,v4.s[2] |
|
fmla v31.4s,v6.4s,v4.s[3] |
|
add x1, x1, #144 |
|
add x0, x0, #96 |
|
|
|
|
|
.SCONSIDERKLEFT: |
|
cmp x6,0 |
|
beq .SPOSTACCUM |
|
|
|
.SLOOPKLEFT: |
|
|
|
ldr q0, [x0],#16 |
|
ldr q1, [x0],#16 |
|
|
|
ldr q2, [x1],#16 |
|
ldr q3, [x1],#16 |
|
ldr q4, [x1],#16 |
|
|
|
sub x6,x6,1 |
|
|
|
fmla v8.4s,v0.4s,v2.s[0] |
|
fmla v9.4s,v1.4s,v2.s[0] |
|
fmla v10.4s,v0.4s,v2.s[1] |
|
fmla v11.4s,v1.4s,v2.s[1] |
|
fmla v12.4s,v0.4s,v2.s[2] |
|
fmla v13.4s,v1.4s,v2.s[2] |
|
fmla v14.4s,v0.4s,v2.s[3] |
|
fmla v15.4s,v1.4s,v2.s[3] |
|
|
|
fmla v16.4s,v0.4s,v3.s[0] |
|
fmla v17.4s,v1.4s,v3.s[0] |
|
fmla v18.4s,v0.4s,v3.s[1] |
|
fmla v19.4s,v1.4s,v3.s[1] |
|
fmla v20.4s,v0.4s,v3.s[2] |
|
fmla v21.4s,v1.4s,v3.s[2] |
|
fmla v22.4s,v0.4s,v3.s[3] |
|
fmla v23.4s,v1.4s,v3.s[3] |
|
|
|
fmla v24.4s,v0.4s,v4.s[0] |
|
fmla v26.4s,v0.4s,v4.s[1] |
|
fmla v28.4s,v0.4s,v4.s[2] |
|
fmla v30.4s,v0.4s,v4.s[3] |
|
fmla v25.4s,v1.4s,v4.s[0] |
|
fmla v27.4s,v1.4s,v4.s[1] |
|
fmla v29.4s,v1.4s,v4.s[2] |
|
fmla v31.4s,v1.4s,v4.s[3] |
|
|
|
cmp x6,0 |
|
bne .SLOOPKLEFT |
|
|
|
.SPOSTACCUM: |
|
|
|
ld1r {v6.4s},[x7] |
|
ld1r {v7.4s},[x8] |
|
|
|
cmp x13,#1 |
|
bne .SGENSTORED |
|
|
|
.SCOLSTORED: |
|
|
|
dup v0.4s, wzr |
|
dup v1.4s, wzr |
|
dup v2.4s, wzr |
|
dup v3.4s, wzr |
|
dup v4.4s, wzr |
|
dup v5.4s, wzr |
|
|
|
fcmp s7,#0.0 |
|
beq .SBETAZEROCOLSTOREDS1 |
|
|
|
ldr q0, [x2] |
|
ldr q1, [x2, #16] |
|
ldr q2, [x16] |
|
ldr q3, [x16, #16] |
|
ldr q4, [x17] |
|
ldr q5, [x17, #16] |
|
|
|
fmul v0.4s,v0.4s,v7.s[0] |
|
fmul v1.4s,v1.4s,v7.s[0] |
|
fmul v2.4s,v2.4s,v7.s[0] |
|
fmul v3.4s,v3.4s,v7.s[0] |
|
fmul v4.4s,v4.4s,v7.s[0] |
|
fmul v5.4s,v5.4s,v7.s[0] |
|
|
|
.SBETAZEROCOLSTOREDS1: |
|
|
|
fmla v0.4s,v8.4s,v6.s[0] |
|
fmla v1.4s,v9.4s,v6.s[0] |
|
fmla v2.4s,v10.4s,v6.s[0] |
|
fmla v3.4s,v11.4s,v6.s[0] |
|
fmla v4.4s,v12.4s,v6.s[0] |
|
fmla v5.4s,v13.4s,v6.s[0] |
|
|
|
str q0, [x2] |
|
str q1, [x2, #16] |
|
str q2, [x16] |
|
str q3, [x16, #16] |
|
str q4, [x17] |
|
str q5, [x17, #16] |
|
|
|
dup v8.4s, wzr |
|
dup v9.4s, wzr |
|
dup v10.4s, wzr |
|
dup v11.4s, wzr |
|
dup v12.4s, wzr |
|
dup v13.4s, wzr |
|
|
|
fcmp s7,#0.0 |
|
beq .SBETAZEROCOLSTOREDS2 |
|
|
|
ldr q8, [x18] |
|
ldr q9, [x18, #16] |
|
ldr q10, [x19] |
|
ldr q11, [x19, #16] |
|
ldr q12, [x20] |
|
ldr q13, [x20, #16] |
|
|
|
fmul v8.4s, v8.4s, v7.s[0] |
|
fmul v9.4s, v9.4s, v7.s[0] |
|
fmul v10.4s,v10.4s,v7.s[0] |
|
fmul v11.4s,v11.4s,v7.s[0] |
|
fmul v12.4s,v12.4s,v7.s[0] |
|
fmul v13.4s,v13.4s,v7.s[0] |
|
|
|
.SBETAZEROCOLSTOREDS2: |
|
|
|
fmla v8.4s, v14.4s,v6.s[0] |
|
fmla v9.4s, v15.4s,v6.s[0] |
|
fmla v10.4s,v16.4s,v6.s[0] |
|
fmla v11.4s,v17.4s,v6.s[0] |
|
fmla v12.4s,v18.4s,v6.s[0] |
|
fmla v13.4s,v19.4s,v6.s[0] |
|
|
|
str q8, [x18] |
|
str q9, [x18, #16] |
|
str q10, [x19] |
|
str q11, [x19, #16] |
|
str q12, [x20] |
|
str q13, [x20, #16] |
|
|
|
dup v0.4s, wzr |
|
dup v1.4s, wzr |
|
dup v2.4s, wzr |
|
dup v3.4s, wzr |
|
dup v4.4s, wzr |
|
dup v5.4s, wzr |
|
|
|
fcmp s7,#0.0 |
|
beq .SBETAZEROCOLSTOREDS3 |
|
|
|
ldr q0, [x21] |
|
ldr q1, [x21, #16] |
|
ldr q2, [x22] |
|
ldr q3, [x22, #16] |
|
ldr q4, [x23] |
|
ldr q5, [x23, #16] |
|
|
|
fmul v0.4s,v0.4s,v7.s[0] |
|
fmul v1.4s,v1.4s,v7.s[0] |
|
fmul v2.4s,v2.4s,v7.s[0] |
|
fmul v3.4s,v3.4s,v7.s[0] |
|
fmul v4.4s,v4.4s,v7.s[0] |
|
fmul v5.4s,v5.4s,v7.s[0] |
|
|
|
.SBETAZEROCOLSTOREDS3: |
|
|
|
fmla v0.4s,v20.4s,v6.s[0] |
|
fmla v1.4s,v21.4s,v6.s[0] |
|
fmla v2.4s,v22.4s,v6.s[0] |
|
fmla v3.4s,v23.4s,v6.s[0] |
|
fmla v4.4s,v24.4s,v6.s[0] |
|
fmla v5.4s,v25.4s,v6.s[0] |
|
|
|
str q0, [x21] |
|
str q1, [x21, #16] |
|
str q2, [x22] |
|
str q3, [x22, #16] |
|
str q4, [x23] |
|
str q5, [x23, #16] |
|
|
|
dup v8.4s, wzr |
|
dup v9.4s, wzr |
|
dup v10.4s, wzr |
|
dup v11.4s, wzr |
|
dup v12.4s, wzr |
|
dup v13.4s, wzr |
|
|
|
fcmp s7,#0.0 |
|
beq .SBETAZEROCOLSTOREDS4 |
|
|
|
ldr q8, [x24] |
|
ldr q9, [x24, #16] |
|
ldr q10, [x25] |
|
ldr q11, [x25, #16] |
|
ldr q12, [x26] |
|
ldr q13, [x26, #16] |
|
|
|
fmul v8.4s, v8.4s, v7.s[0] |
|
fmul v9.4s, v9.4s, v7.s[0] |
|
fmul v10.4s,v10.4s,v7.s[0] |
|
fmul v11.4s,v11.4s,v7.s[0] |
|
fmul v12.4s,v12.4s,v7.s[0] |
|
fmul v13.4s,v13.4s,v7.s[0] |
|
|
|
.SBETAZEROCOLSTOREDS4: |
|
|
|
prfm pldl2keep,[x3] |
|
prfm pldl2keep,[x4] |
|
|
|
fmla v8.4s, v26.4s,v6.s[0] |
|
fmla v9.4s, v27.4s,v6.s[0] |
|
fmla v10.4s,v28.4s,v6.s[0] |
|
fmla v11.4s,v29.4s,v6.s[0] |
|
fmla v12.4s,v30.4s,v6.s[0] |
|
fmla v13.4s,v31.4s,v6.s[0] |
|
|
|
str q8, [x24] |
|
str q9, [x24, #16] |
|
str q10, [x25] |
|
str q11, [x25, #16] |
|
str q12, [x26] |
|
str q13, [x26, #16] |
|
|
|
|
|
b .SEND |
|
|
|
|
|
.SGENSTORED: |
|
|
|
|
|
dup v0.4s, wzr |
|
dup v1.4s, wzr |
|
dup v2.4s, wzr |
|
dup v3.4s, wzr |
|
dup v4.4s, wzr |
|
dup v5.4s, wzr |
|
|
|
fcmp s7,#0.0 |
|
beq .SBETAZEROGENSTOREDS1 |
|
|
|
mov x27, x2 |
|
|
|
ld1 {v0.s}[0],[x27],x14 |
|
ld1 {v0.s}[1],[x27],x14 |
|
ld1 {v0.s}[2],[x27],x14 |
|
ld1 {v0.s}[3],[x27],x14 |
|
ld1 {v1.s}[0],[x27],x14 |
|
ld1 {v1.s}[1],[x27],x14 |
|
ld1 {v1.s}[2],[x27],x14 |
|
ld1 {v1.s}[3],[x27],x14 |
|
|
|
mov x27, x16 |
|
|
|
ld1 {v2.s}[0],[x27],x14 |
|
ld1 {v2.s}[1],[x27],x14 |
|
ld1 {v2.s}[2],[x27],x14 |
|
ld1 {v2.s}[3],[x27],x14 |
|
ld1 {v3.s}[0],[x27],x14 |
|
ld1 {v3.s}[1],[x27],x14 |
|
ld1 {v3.s}[2],[x27],x14 |
|
ld1 {v3.s}[3],[x27],x14 |
|
|
|
mov x27, x17 |
|
|
|
ld1 {v4.s}[0],[x27],x14 |
|
ld1 {v4.s}[1],[x27],x14 |
|
ld1 {v4.s}[2],[x27],x14 |
|
ld1 {v4.s}[3],[x27],x14 |
|
ld1 {v5.s}[0],[x27],x14 |
|
ld1 {v5.s}[1],[x27],x14 |
|
ld1 {v5.s}[2],[x27],x14 |
|
ld1 {v5.s}[3],[x27],x14 |
|
|
|
fmul v0.4s,v0.4s,v7.s[0] |
|
fmul v1.4s,v1.4s,v7.s[0] |
|
fmul v2.4s,v2.4s,v7.s[0] |
|
fmul v3.4s,v3.4s,v7.s[0] |
|
fmul v4.4s,v4.4s,v7.s[0] |
|
fmul v5.4s,v5.4s,v7.s[0] |
|
|
|
.SBETAZEROGENSTOREDS1: |
|
|
|
fmla v0.4s, v8.4s,v6.s[0] |
|
fmla v1.4s, v9.4s,v6.s[0] |
|
fmla v2.4s,v10.4s,v6.s[0] |
|
fmla v3.4s,v11.4s,v6.s[0] |
|
fmla v4.4s,v12.4s,v6.s[0] |
|
fmla v5.4s,v13.4s,v6.s[0] |
|
|
|
mov x27, x2 |
|
|
|
st1 {v0.s}[0],[x27],x14 |
|
st1 {v0.s}[1],[x27],x14 |
|
st1 {v0.s}[2],[x27],x14 |
|
st1 {v0.s}[3],[x27],x14 |
|
st1 {v1.s}[0],[x27],x14 |
|
st1 {v1.s}[1],[x27],x14 |
|
st1 {v1.s}[2],[x27],x14 |
|
st1 {v1.s}[3],[x27],x14 |
|
|
|
mov x27, x16 |
|
|
|
st1 {v2.s}[0],[x27],x14 |
|
st1 {v2.s}[1],[x27],x14 |
|
st1 {v2.s}[2],[x27],x14 |
|
st1 {v2.s}[3],[x27],x14 |
|
st1 {v3.s}[0],[x27],x14 |
|
st1 {v3.s}[1],[x27],x14 |
|
st1 {v3.s}[2],[x27],x14 |
|
st1 {v3.s}[3],[x27],x14 |
|
|
|
mov x27, x17 |
|
|
|
st1 {v4.s}[0],[x27],x14 |
|
st1 {v4.s}[1],[x27],x14 |
|
st1 {v4.s}[2],[x27],x14 |
|
st1 {v4.s}[3],[x27],x14 |
|
st1 {v5.s}[0],[x27],x14 |
|
st1 {v5.s}[1],[x27],x14 |
|
st1 {v5.s}[2],[x27],x14 |
|
st1 {v5.s}[3],[x27],x14 |
|
|
|
dup v8.4s, wzr |
|
dup v9.4s, wzr |
|
dup v10.4s, wzr |
|
dup v11.4s, wzr |
|
dup v12.4s, wzr |
|
dup v13.4s, wzr |
|
|
|
fcmp s7,#0.0 |
|
beq .SBETAZEROGENSTOREDS2 |
|
|
|
mov x27, x18 |
|
|
|
ld1 {v8.s}[0],[x27],x14 |
|
ld1 {v8.s}[1],[x27],x14 |
|
ld1 {v8.s}[2],[x27],x14 |
|
ld1 {v8.s}[3],[x27],x14 |
|
ld1 {v9.s}[0],[x27],x14 |
|
ld1 {v9.s}[1],[x27],x14 |
|
ld1 {v9.s}[2],[x27],x14 |
|
ld1 {v9.s}[3],[x27],x14 |
|
|
|
mov x27, x19 |
|
|
|
ld1 {v10.s}[0],[x27],x14 |
|
ld1 {v10.s}[1],[x27],x14 |
|
ld1 {v10.s}[2],[x27],x14 |
|
ld1 {v10.s}[3],[x27],x14 |
|
ld1 {v11.s}[0],[x27],x14 |
|
ld1 {v11.s}[1],[x27],x14 |
|
ld1 {v11.s}[2],[x27],x14 |
|
ld1 {v11.s}[3],[x27],x14 |
|
|
|
mov x27, x20 |
|
|
|
ld1 {v12.s}[0],[x27],x14 |
|
ld1 {v12.s}[1],[x27],x14 |
|
ld1 {v12.s}[2],[x27],x14 |
|
ld1 {v12.s}[3],[x27],x14 |
|
ld1 {v13.s}[0],[x27],x14 |
|
ld1 {v13.s}[1],[x27],x14 |
|
ld1 {v13.s}[2],[x27],x14 |
|
ld1 {v13.s}[3],[x27],x14 |
|
|
|
fmul v8.4s, v8.4s, v7.s[0] |
|
fmul v9.4s, v9.4s, v7.s[0] |
|
fmul v10.4s,v10.4s,v7.s[0] |
|
fmul v11.4s,v11.4s,v7.s[0] |
|
fmul v12.4s,v12.4s,v7.s[0] |
|
fmul v13.4s,v13.4s,v7.s[0] |
|
|
|
.SBETAZEROGENSTOREDS2: |
|
|
|
fmla v8.4s, v14.4s,v6.s[0] |
|
fmla v9.4s, v15.4s,v6.s[0] |
|
fmla v10.4s,v16.4s,v6.s[0] |
|
fmla v11.4s,v17.4s,v6.s[0] |
|
fmla v12.4s,v18.4s,v6.s[0] |
|
fmla v13.4s,v19.4s,v6.s[0] |
|
|
|
mov x27, x18 |
|
|
|
st1 {v8.s}[0],[x27],x14 |
|
st1 {v8.s}[1],[x27],x14 |
|
st1 {v8.s}[2],[x27],x14 |
|
st1 {v8.s}[3],[x27],x14 |
|
st1 {v9.s}[0],[x27],x14 |
|
st1 {v9.s}[1],[x27],x14 |
|
st1 {v9.s}[2],[x27],x14 |
|
st1 {v9.s}[3],[x27],x14 |
|
|
|
mov x27, x19 |
|
|
|
st1 {v10.s}[0],[x27],x14 |
|
st1 {v10.s}[1],[x27],x14 |
|
st1 {v10.s}[2],[x27],x14 |
|
st1 {v10.s}[3],[x27],x14 |
|
st1 {v11.s}[0],[x27],x14 |
|
st1 {v11.s}[1],[x27],x14 |
|
st1 {v11.s}[2],[x27],x14 |
|
st1 {v11.s}[3],[x27],x14 |
|
|
|
mov x27, x20 |
|
|
|
st1 {v12.s}[0],[x27],x14 |
|
st1 {v12.s}[1],[x27],x14 |
|
st1 {v12.s}[2],[x27],x14 |
|
st1 {v12.s}[3],[x27],x14 |
|
st1 {v13.s}[0],[x27],x14 |
|
st1 {v13.s}[1],[x27],x14 |
|
st1 {v13.s}[2],[x27],x14 |
|
st1 {v13.s}[3],[x27],x14 |
|
|
|
dup v0.4s, wzr |
|
dup v1.4s, wzr |
|
dup v2.4s, wzr |
|
dup v3.4s, wzr |
|
dup v4.4s, wzr |
|
dup v5.4s, wzr |
|
|
|
fcmp s7,#0.0 |
|
beq .SBETAZEROGENSTOREDS3 |
|
|
|
mov x27, x21 |
|
|
|
ld1 {v0.s}[0],[x27],x14 |
|
ld1 {v0.s}[1],[x27],x14 |
|
ld1 {v0.s}[2],[x27],x14 |
|
ld1 {v0.s}[3],[x27],x14 |
|
ld1 {v1.s}[0],[x27],x14 |
|
ld1 {v1.s}[1],[x27],x14 |
|
ld1 {v1.s}[2],[x27],x14 |
|
ld1 {v1.s}[3],[x27],x14 |
|
|
|
mov x27, x22 |
|
|
|
ld1 {v2.s}[0],[x27],x14 |
|
ld1 {v2.s}[1],[x27],x14 |
|
ld1 {v2.s}[2],[x27],x14 |
|
ld1 {v2.s}[3],[x27],x14 |
|
ld1 {v3.s}[0],[x27],x14 |
|
ld1 {v3.s}[1],[x27],x14 |
|
ld1 {v3.s}[2],[x27],x14 |
|
ld1 {v3.s}[3],[x27],x14 |
|
|
|
mov x27, x23 |
|
|
|
ld1 {v4.s}[0],[x27],x14 |
|
ld1 {v4.s}[1],[x27],x14 |
|
ld1 {v4.s}[2],[x27],x14 |
|
ld1 {v4.s}[3],[x27],x14 |
|
ld1 {v5.s}[0],[x27],x14 |
|
ld1 {v5.s}[1],[x27],x14 |
|
ld1 {v5.s}[2],[x27],x14 |
|
ld1 {v5.s}[3],[x27],x14 |
|
|
|
fmul v0.4s,v0.4s,v7.s[0] |
|
fmul v1.4s,v1.4s,v7.s[0] |
|
fmul v2.4s,v2.4s,v7.s[0] |
|
fmul v3.4s,v3.4s,v7.s[0] |
|
fmul v4.4s,v4.4s,v7.s[0] |
|
fmul v5.4s,v5.4s,v7.s[0] |
|
|
|
.SBETAZEROGENSTOREDS3: |
|
|
|
fmla v0.4s,v20.4s,v6.s[0] |
|
fmla v1.4s,v21.4s,v6.s[0] |
|
fmla v2.4s,v22.4s,v6.s[0] |
|
fmla v3.4s,v23.4s,v6.s[0] |
|
fmla v4.4s,v24.4s,v6.s[0] |
|
fmla v5.4s,v25.4s,v6.s[0] |
|
|
|
mov x27, x21 |
|
|
|
st1 {v0.s}[0],[x27],x14 |
|
st1 {v0.s}[1],[x27],x14 |
|
st1 {v0.s}[2],[x27],x14 |
|
st1 {v0.s}[3],[x27],x14 |
|
st1 {v1.s}[0],[x27],x14 |
|
st1 {v1.s}[1],[x27],x14 |
|
st1 {v1.s}[2],[x27],x14 |
|
st1 {v1.s}[3],[x27],x14 |
|
|
|
mov x27, x22 |
|
|
|
st1 {v2.s}[0],[x27],x14 |
|
st1 {v2.s}[1],[x27],x14 |
|
st1 {v2.s}[2],[x27],x14 |
|
st1 {v2.s}[3],[x27],x14 |
|
st1 {v3.s}[0],[x27],x14 |
|
st1 {v3.s}[1],[x27],x14 |
|
st1 {v3.s}[2],[x27],x14 |
|
st1 {v3.s}[3],[x27],x14 |
|
|
|
mov x27, x23 |
|
|
|
st1 {v4.s}[0],[x27],x14 |
|
st1 {v4.s}[1],[x27],x14 |
|
st1 {v4.s}[2],[x27],x14 |
|
st1 {v4.s}[3],[x27],x14 |
|
st1 {v5.s}[0],[x27],x14 |
|
st1 {v5.s}[1],[x27],x14 |
|
st1 {v5.s}[2],[x27],x14 |
|
st1 {v5.s}[3],[x27],x14 |
|
|
|
dup v8.4s, wzr |
|
dup v9.4s, wzr |
|
dup v10.4s, wzr |
|
dup v11.4s, wzr |
|
dup v12.4s, wzr |
|
dup v13.4s, wzr |
|
|
|
fcmp s7,#0.0 |
|
beq .SBETAZEROGENSTOREDS4 |
|
|
|
mov x27, x24 |
|
|
|
ld1 {v8.s}[0],[x27],x14 |
|
ld1 {v8.s}[1],[x27],x14 |
|
ld1 {v8.s}[2],[x27],x14 |
|
ld1 {v8.s}[3],[x27],x14 |
|
ld1 {v9.s}[0],[x27],x14 |
|
ld1 {v9.s}[1],[x27],x14 |
|
ld1 {v9.s}[2],[x27],x14 |
|
ld1 {v9.s}[3],[x27],x14 |
|
|
|
mov x27, x25 |
|
|
|
ld1 {v10.s}[0],[x27],x14 |
|
ld1 {v10.s}[1],[x27],x14 |
|
ld1 {v10.s}[2],[x27],x14 |
|
ld1 {v10.s}[3],[x27],x14 |
|
ld1 {v11.s}[0],[x27],x14 |
|
ld1 {v11.s}[1],[x27],x14 |
|
ld1 {v11.s}[2],[x27],x14 |
|
ld1 {v11.s}[3],[x27],x14 |
|
|
|
mov x27, x26 |
|
|
|
ld1 {v12.s}[0],[x27],x14 |
|
ld1 {v12.s}[1],[x27],x14 |
|
ld1 {v12.s}[2],[x27],x14 |
|
ld1 {v12.s}[3],[x27],x14 |
|
ld1 {v13.s}[0],[x27],x14 |
|
ld1 {v13.s}[1],[x27],x14 |
|
ld1 {v13.s}[2],[x27],x14 |
|
ld1 {v13.s}[3],[x27],x14 |
|
|
|
fmul v8.4s, v8.4s, v7.s[0] |
|
fmul v9.4s, v9.4s, v7.s[0] |
|
fmul v10.4s,v10.4s,v7.s[0] |
|
fmul v11.4s,v11.4s,v7.s[0] |
|
fmul v12.4s,v12.4s,v7.s[0] |
|
fmul v13.4s,v13.4s,v7.s[0] |
|
|
|
.SBETAZEROGENSTOREDS4: |
|
|
|
prfm pldl2keep,[x3] |
|
prfm pldl2keep,[x4] |
|
|
|
fmla v8.4s, v26.4s,v6.s[0] |
|
fmla v9.4s, v27.4s,v6.s[0] |
|
fmla v10.4s,v28.4s,v6.s[0] |
|
fmla v11.4s,v29.4s,v6.s[0] |
|
fmla v12.4s,v30.4s,v6.s[0] |
|
fmla v13.4s,v31.4s,v6.s[0] |
|
|
|
mov x27, x24 |
|
|
|
st1 {v8.s}[0],[x27],x14 |
|
st1 {v8.s}[1],[x27],x14 |
|
st1 {v8.s}[2],[x27],x14 |
|
st1 {v8.s}[3],[x27],x14 |
|
st1 {v9.s}[0],[x27],x14 |
|
st1 {v9.s}[1],[x27],x14 |
|
st1 {v9.s}[2],[x27],x14 |
|
st1 {v9.s}[3],[x27],x14 |
|
|
|
mov x27, x25 |
|
|
|
st1 {v10.s}[0],[x27],x14 |
|
st1 {v10.s}[1],[x27],x14 |
|
st1 {v10.s}[2],[x27],x14 |
|
st1 {v10.s}[3],[x27],x14 |
|
st1 {v11.s}[0],[x27],x14 |
|
st1 {v11.s}[1],[x27],x14 |
|
st1 {v11.s}[2],[x27],x14 |
|
st1 {v11.s}[3],[x27],x14 |
|
|
|
mov x27, x26 |
|
|
|
st1 {v12.s}[0],[x27],x14 |
|
st1 {v12.s}[1],[x27],x14 |
|
st1 {v12.s}[2],[x27],x14 |
|
st1 {v12.s}[3],[x27],x14 |
|
st1 {v13.s}[0],[x27],x14 |
|
st1 {v13.s}[1],[x27],x14 |
|
st1 {v13.s}[2],[x27],x14 |
|
st1 {v13.s}[3],[x27],x14 |
|
|
|
.SEND: |
|
|
|
|
|
// 0 "" 2 |
|
#NO_APP |
|
ldp x21, x22, [sp, 16] |
|
ldp x23, x24, [sp, 32] |
|
ldp x25, x26, [sp, 48] |
|
ldr x27, [sp, 64] |
|
ldp d8, d9, [sp, 80] |
|
ldp d10, d11, [sp, 96] |
|
ldp d12, d13, [sp, 112] |
|
ldp d14, d15, [sp, 128] |
|
ldp x19, x20, [sp], 240 |
|
.cfi_restore 20 |
|
.cfi_restore 19 |
|
.cfi_restore 78 |
|
.cfi_restore 79 |
|
.cfi_restore 76 |
|
.cfi_restore 77 |
|
.cfi_restore 74 |
|
.cfi_restore 75 |
|
.cfi_restore 72 |
|
.cfi_restore 73 |
|
.cfi_restore 27 |
|
.cfi_restore 25 |
|
.cfi_restore 26 |
|
.cfi_restore 23 |
|
.cfi_restore 24 |
|
.cfi_restore 21 |
|
.cfi_restore 22 |
|
.cfi_def_cfa_offset 0 |
|
ret |
|
.cfi_endproc |
|
.LFE738: |
|
// .size bli_sgemm_armv8a_asm_8x12, .-bli_sgemm_armv8a_asm_8x12 |
|
.align 2 |
|
.p2align 3,,7 |
|
.global _bli_dgemm_armv8a_asm_6x8 |
|
// .type bli_dgemm_armv8a_asm_6x8, %function |
|
_bli_dgemm_armv8a_asm_6x8: |
|
.LFB739: |
|
.cfi_startproc |
|
stp x20, x21, [sp, -224]! |
|
.cfi_def_cfa_offset 224 |
|
.cfi_offset 20, -224 |
|
.cfi_offset 21, -216 |
|
cmp x0, 0 |
|
add x9, x0, 3 |
|
ldr x10, [sp, 224] |
|
csel x9, x9, x0, lt |
|
negs x8, x0 |
|
and x0, x0, 3 |
|
and x8, x8, 3 |
|
asr x9, x9, 2 |
|
ldp x11, x10, [x10, 8] |
|
csneg x0, x0, x8, mi |
|
stp x22, x23, [sp, 16] |
|
stp x24, x25, [sp, 32] |
|
stp x26, x27, [sp, 48] |
|
stp d8, d9, [sp, 64] |
|
stp d10, d11, [sp, 80] |
|
stp d12, d13, [sp, 96] |
|
stp d14, d15, [sp, 112] |
|
.cfi_offset 22, -208 |
|
.cfi_offset 23, -200 |
|
.cfi_offset 24, -192 |
|
.cfi_offset 25, -184 |
|
.cfi_offset 26, -176 |
|
.cfi_offset 27, -168 |
|
.cfi_offset 72, -160 |
|
.cfi_offset 73, -152 |
|
.cfi_offset 74, -144 |
|
.cfi_offset 75, -136 |
|
.cfi_offset 76, -128 |
|
.cfi_offset 77, -120 |
|
.cfi_offset 78, -112 |
|
.cfi_offset 79, -104 |
|
stp x5, x4, [sp, 136] |
|
stp x3, x2, [sp, 152] |
|
stp x1, x11, [sp, 168] |
|
stp x10, x9, [sp, 184] |
|
stp x0, x6, [sp, 200] |
|
str x7, [sp, 216] |
|
#APP |
|
// 1130 "bli_gemm_armv8a_asm_d6x8.c" 1 |
|
|
|
ldr x0,[sp, 160] |
|
ldr x1,[sp, 152] |
|
ldr x2,[sp, 136] |
|
|
|
ldr x3,[sp, 176] |
|
ldr x4,[sp, 184] |
|
|
|
ldr x5,[sp, 192] |
|
ldr x6,[sp, 200] |
|
|
|
ldr x7,[sp, 168] |
|
ldr x8,[sp, 144] |
|
|
|
ldr x9,[sp, 216] |
|
lsl x10,x9,#3 |
|
|
|
ldr x13,[sp, 208] |
|
lsl x14,x13,#3 |
|
|
|
add x20,x2,x10 |
|
add x21,x20,x10 |
|
add x22,x21,x10 |
|
add x23,x22,x10 |
|
add x24,x23,x10 |
|
add x25,x24,x10 |
|
add x26,x25,x10 |
|
|
|
prfm pldl1keep,[x2] |
|
prfm pldl1keep,[x20] |
|
prfm pldl1keep,[x21] |
|
prfm pldl1keep,[x22] |
|
prfm pldl1keep,[x23] |
|
prfm pldl1keep,[x24] |
|
prfm pldl1keep,[x25] |
|
prfm pldl1keep,[x26] |
|
|
|
dup v8.2d, xzr |
|
prfm PLDL1KEEP, [x1, #256] |
|
dup v9.2d, xzr |
|
prfm PLDL1KEEP, [x1, #320] |
|
dup v10.2d, xzr |
|
prfm PLDL1KEEP, [x1, #384] |
|
dup v11.2d, xzr |
|
prfm PLDL1KEEP, [x1, #448] |
|
dup v12.2d, xzr |
|
dup v13.2d, xzr |
|
|
|
dup v14.2d, xzr |
|
prfm PLDL1KEEP, [x0, #192] |
|
dup v15.2d, xzr |
|
prfm PLDL1KEEP, [x0, #256] |
|
dup v16.2d, xzr |
|
prfm PLDL1KEEP, [x0, #320] |
|
dup v17.2d, xzr |
|
dup v18.2d, xzr |
|
dup v19.2d, xzr |
|
|
|
dup v20.2d, xzr |
|
dup v21.2d, xzr |
|
dup v22.2d, xzr |
|
dup v23.2d, xzr |
|
dup v24.2d, xzr |
|
dup v25.2d, xzr |
|
|
|
dup v26.2d, xzr |
|
dup v27.2d, xzr |
|
dup v28.2d, xzr |
|
dup v29.2d, xzr |
|
dup v30.2d, xzr |
|
dup v31.2d, xzr |
|
|
|
|
|
cmp x5,#0 |
|
beq .DCONSIDERKLEFT |
|
|
|
ldr q0, [x0] |
|
ldr q1, [x0, #16] |
|
ldr q2, [x0, #32] |
|
|
|
ldr q3, [x1] |
|
ldr q4, [x1, #16] |
|
ldr q5, [x1, #32] |
|
ldr q6, [x1, #48] |
|
|
|
add x0, x0, #48 |
|
add x1, x1, #64 |
|
|
|
cmp x5,1 |
|
beq .DLASTITER |
|
|
|
DLOOP: |
|
|
|
fmla v8.2d ,v0.2d,v3.d[0] |
|
prfm PLDL1KEEP, [x1, #448] |
|
fmla v9.2d ,v1.2d,v3.d[0] |
|
prfm PLDL1KEEP, [x1, #512] |
|
fmla v10.2d,v2.2d,v3.d[0] |
|
prfm PLDL1KEEP, [x1, #576] |
|
|
|
fmla v11.2d,v0.2d,v3.d[1] |
|
fmla v12.2d,v1.2d,v3.d[1] |
|
fmla v13.2d,v2.2d,v3.d[1] |
|
|
|
fmla v14.2d,v0.2d,v4.d[0] |
|
fmla v15.2d,v1.2d,v4.d[0] |
|
fmla v16.2d,v2.2d,v4.d[0] |
|
ldr q3, [x1] |
|
|
|
fmla v17.2d,v0.2d,v4.d[1] |
|
fmla v18.2d,v1.2d,v4.d[1] |
|
fmla v19.2d,v2.2d,v4.d[1] |
|
ldr q7, [x0, #32] |
|
|
|
fmla v20.2d,v0.2d,v5.d[0] |
|
fmla v21.2d,v1.2d,v5.d[0] |
|
fmla v22.2d,v2.2d,v5.d[0] |
|
ldr q4, [x1, #16] |
|
|
|
fmla v23.2d,v0.2d,v5.d[1] |
|
fmla v24.2d,v1.2d,v5.d[1] |
|
fmla v25.2d,v2.2d,v5.d[1] |
|
ldr q5, [x1, #32] |
|
|
|
fmla v26.2d,v0.2d,v6.d[0] |
|
fmla v29.2d,v0.2d,v6.d[1] |
|
ldr q0, [x0] |
|
|
|
fmla v27.2d,v1.2d,v6.d[0] |
|
fmla v30.2d,v1.2d,v6.d[1] |
|
ldr q1, [x0, #16] |
|
|
|
fmla v28.2d,v2.2d,v6.d[0] |
|
fmla v31.2d,v2.2d,v6.d[1] |
|
ldr q6, [x1, #48] |
|
|
|
fmla v8.2d ,v0.2d,v3.d[0] |
|
prfm PLDL1KEEP, [x1, #640] |
|
fmla v9.2d ,v1.2d,v3.d[0] |
|
prfm PLDL1KEEP, [x0, #336] |
|
fmla v10.2d,v7.2d,v3.d[0] |
|
prfm PLDL1KEEP, [x0, #400] |
|
|
|
fmla v11.2d,v0.2d,v3.d[1] |
|
fmla v12.2d,v1.2d,v3.d[1] |
|
fmla v13.2d,v7.2d,v3.d[1] |
|
|
|
fmla v14.2d,v0.2d,v4.d[0] |
|
fmla v15.2d,v1.2d,v4.d[0] |
|
fmla v16.2d,v7.2d,v4.d[0] |
|
ldr q3, [x1, #64] |
|
|
|
fmla v17.2d,v0.2d,v4.d[1] |
|
fmla v18.2d,v1.2d,v4.d[1] |
|
fmla v19.2d,v7.2d,v4.d[1] |
|
ldr q2, [x0, #80] |
|
|
|
fmla v20.2d,v0.2d,v5.d[0] |
|
fmla v21.2d,v1.2d,v5.d[0] |
|
fmla v22.2d,v7.2d,v5.d[0] |
|
ldr q4, [x1, #80] |
|
|
|
fmla v23.2d,v0.2d,v5.d[1] |
|
fmla v24.2d,v1.2d,v5.d[1] |
|
fmla v25.2d,v7.2d,v5.d[1] |
|
ldr q5, [x1, #96] |
|
|
|
fmla v26.2d,v0.2d,v6.d[0] |
|
fmla v29.2d,v0.2d,v6.d[1] |
|
ldr q0, [x0, #48] |
|
|
|
fmla v27.2d,v1.2d,v6.d[0] |
|
fmla v30.2d,v1.2d,v6.d[1] |
|
ldr q1, [x0, #64] |
|
|
|
fmla v28.2d,v7.2d,v6.d[0] |
|
fmla v31.2d,v7.2d,v6.d[1] |
|
ldr q6, [x1, #112] |
|
|
|
fmla v8.2d ,v0.2d,v3.d[0] |
|
prfm PLDL1KEEP, [x0, #464] |
|
fmla v9.2d ,v1.2d,v3.d[0] |
|
fmla v10.2d,v2.2d,v3.d[0] |
|
|
|
fmla v11.2d,v0.2d,v3.d[1] |
|
fmla v12.2d,v1.2d,v3.d[1] |
|
fmla v13.2d,v2.2d,v3.d[1] |
|
|
|
fmla v14.2d,v0.2d,v4.d[0] |
|
fmla v15.2d,v1.2d,v4.d[0] |
|
fmla v16.2d,v2.2d,v4.d[0] |
|
ldr q3, [x1, #128] |
|
|
|
fmla v17.2d,v0.2d,v4.d[1] |
|
fmla v18.2d,v1.2d,v4.d[1] |
|
fmla v19.2d,v2.2d,v4.d[1] |
|
ldr q7, [x0, #128] |
|
|
|
fmla v20.2d,v0.2d,v5.d[0] |
|
fmla v21.2d,v1.2d,v5.d[0] |
|
fmla v22.2d,v2.2d,v5.d[0] |
|
ldr q4, [x1, #144] |
|
|
|
fmla v23.2d,v0.2d,v5.d[1] |
|
fmla v24.2d,v1.2d,v5.d[1] |
|
fmla v25.2d,v2.2d,v5.d[1] |
|
ldr q5, [x1, #160] |
|
|
|
fmla v26.2d,v0.2d,v6.d[0] |
|
fmla v29.2d,v0.2d,v6.d[1] |
|
ldr q0, [x0, #96] |
|
|
|
fmla v27.2d,v1.2d,v6.d[0] |
|
fmla v30.2d,v1.2d,v6.d[1] |
|
ldr q1, [x0, #112] |
|
|
|
fmla v28.2d,v2.2d,v6.d[0] |
|
fmla v31.2d,v2.2d,v6.d[1] |
|
ldr q6, [x1, #176] |
|
|
|
fmla v8.2d ,v0.2d,v3.d[0] |
|
fmla v9.2d ,v1.2d,v3.d[0] |
|
fmla v10.2d,v7.2d,v3.d[0] |
|
|
|
fmla v11.2d,v0.2d,v3.d[1] |
|
fmla v12.2d,v1.2d,v3.d[1] |
|
fmla v13.2d,v7.2d,v3.d[1] |
|
ldr q3, [x1, #192] |
|
|
|
fmla v14.2d,v0.2d,v4.d[0] |
|
fmla v15.2d,v1.2d,v4.d[0] |
|
fmla v16.2d,v7.2d,v4.d[0] |
|
ldr q2, [x0, #176] |
|
|
|
fmla v17.2d,v0.2d,v4.d[1] |
|
fmla v18.2d,v1.2d,v4.d[1] |
|
fmla v19.2d,v7.2d,v4.d[1] |
|
ldr q4, [x1, #208] |
|
|
|
fmla v20.2d,v0.2d,v5.d[0] |
|
fmla v21.2d,v1.2d,v5.d[0] |
|
fmla v22.2d,v7.2d,v5.d[0] |
|
|
|
fmla v23.2d,v0.2d,v5.d[1] |
|
fmla v24.2d,v1.2d,v5.d[1] |
|
fmla v25.2d,v7.2d,v5.d[1] |
|
ldr q5, [x1, #224] |
|
|
|
fmla v26.2d,v0.2d,v6.d[0] |
|
fmla v29.2d,v0.2d,v6.d[1] |
|
ldr q0, [x0, #144] |
|
|
|
fmla v27.2d,v1.2d,v6.d[0] |
|
fmla v30.2d,v1.2d,v6.d[1] |
|
ldr q1, [x0, #160] |
|
|
|
fmla v28.2d,v7.2d,v6.d[0] |
|
fmla v31.2d,v7.2d,v6.d[1] |
|
ldr q6, [x1, #240] |
|
|
|
add x0, x0, #192 |
|
add x1, x1, #256 |
|
|
|
sub x5,x5,1 |
|
cmp x5,1 |
|
bne DLOOP |
|
|
|
.DLASTITER:
    // Final unrolled group: four rounds of 24 fmla each, updating the
    // accumulators v8-v31.  Round 1 consumes operands already held in
    // v0-v6; rounds 2-4 consume operands loaded here from x0 (three
    // q-registers per round) and x1 (four q-registers per round).  No
    // operands are loaded for a following group; x0 advances by 144 bytes
    // and x1 by 192 bytes, exactly the amount read in this block.
    // NOTE(review): x0/x1 are presumably the packed A and B panels; their
    // setup is outside this chunk - confirm.
    // The third x0 operand alternates between v2 and v7 so the next round's
    // value can be loaded while the current one is still in use.

    // ---- round 1: operands v0,v1,v2 x v3..v6 ----
    fmla v8.2d ,v0.2d,v3.d[0]
    fmla v9.2d ,v1.2d,v3.d[0]
    fmla v10.2d,v2.2d,v3.d[0]

    fmla v11.2d,v0.2d,v3.d[1]
    fmla v12.2d,v1.2d,v3.d[1]
    fmla v13.2d,v2.2d,v3.d[1]
    ldr q3, [x1]

    fmla v14.2d,v0.2d,v4.d[0]
    fmla v15.2d,v1.2d,v4.d[0]
    fmla v16.2d,v2.2d,v4.d[0]
    ldr q7, [x0, #32]

    fmla v17.2d,v0.2d,v4.d[1]
    fmla v18.2d,v1.2d,v4.d[1]
    fmla v19.2d,v2.2d,v4.d[1]
    ldr q4, [x1, #16]

    fmla v20.2d,v0.2d,v5.d[0]
    fmla v21.2d,v1.2d,v5.d[0]
    fmla v22.2d,v2.2d,v5.d[0]

    fmla v23.2d,v0.2d,v5.d[1]
    fmla v24.2d,v1.2d,v5.d[1]
    fmla v25.2d,v2.2d,v5.d[1]
    ldr q5, [x1, #32]

    fmla v26.2d,v0.2d,v6.d[0]
    fmla v29.2d,v0.2d,v6.d[1]
    ldr q0, [x0]

    fmla v27.2d,v1.2d,v6.d[0]
    fmla v30.2d,v1.2d,v6.d[1]
    ldr q1, [x0, #16]

    fmla v28.2d,v2.2d,v6.d[0]
    fmla v31.2d,v2.2d,v6.d[1]
    ldr q6, [x1, #48]

    // ---- round 2: operands v0,v1,v7 x v3..v6 ----
    fmla v8.2d ,v0.2d,v3.d[0]
    fmla v9.2d ,v1.2d,v3.d[0]
    fmla v10.2d,v7.2d,v3.d[0]

    fmla v11.2d,v0.2d,v3.d[1]
    fmla v12.2d,v1.2d,v3.d[1]
    fmla v13.2d,v7.2d,v3.d[1]
    ldr q3, [x1, #64]

    fmla v14.2d,v0.2d,v4.d[0]
    fmla v15.2d,v1.2d,v4.d[0]
    fmla v16.2d,v7.2d,v4.d[0]
    ldr q2, [x0, #80]

    fmla v17.2d,v0.2d,v4.d[1]
    fmla v18.2d,v1.2d,v4.d[1]
    fmla v19.2d,v7.2d,v4.d[1]
    ldr q4, [x1, #80]

    fmla v20.2d,v0.2d,v5.d[0]
    fmla v21.2d,v1.2d,v5.d[0]
    fmla v22.2d,v7.2d,v5.d[0]

    fmla v23.2d,v0.2d,v5.d[1]
    fmla v24.2d,v1.2d,v5.d[1]
    fmla v25.2d,v7.2d,v5.d[1]
    ldr q5, [x1, #96]

    fmla v26.2d,v0.2d,v6.d[0]
    fmla v29.2d,v0.2d,v6.d[1]
    ldr q0, [x0, #48]

    fmla v27.2d,v1.2d,v6.d[0]
    fmla v30.2d,v1.2d,v6.d[1]
    ldr q1, [x0, #64]

    fmla v28.2d,v7.2d,v6.d[0]
    fmla v31.2d,v7.2d,v6.d[1]
    ldr q6, [x1, #112]

    // ---- round 3: operands v0,v1,v2 x v3..v6 ----
    fmla v8.2d ,v0.2d,v3.d[0]
    fmla v9.2d ,v1.2d,v3.d[0]
    fmla v10.2d,v2.2d,v3.d[0]

    fmla v11.2d,v0.2d,v3.d[1]
    fmla v12.2d,v1.2d,v3.d[1]
    fmla v13.2d,v2.2d,v3.d[1]
    ldr q3, [x1, #128]

    fmla v14.2d,v0.2d,v4.d[0]
    fmla v15.2d,v1.2d,v4.d[0]
    fmla v16.2d,v2.2d,v4.d[0]
    ldr q7, [x0, #128]

    fmla v17.2d,v0.2d,v4.d[1]
    fmla v18.2d,v1.2d,v4.d[1]
    fmla v19.2d,v2.2d,v4.d[1]
    ldr q4, [x1, #144]

    fmla v20.2d,v0.2d,v5.d[0]
    fmla v21.2d,v1.2d,v5.d[0]
    fmla v22.2d,v2.2d,v5.d[0]

    fmla v23.2d,v0.2d,v5.d[1]
    fmla v24.2d,v1.2d,v5.d[1]
    fmla v25.2d,v2.2d,v5.d[1]
    ldr q5, [x1, #160]

    fmla v26.2d,v0.2d,v6.d[0]
    fmla v29.2d,v0.2d,v6.d[1]
    ldr q0, [x0, #96]

    fmla v27.2d,v1.2d,v6.d[0]
    fmla v30.2d,v1.2d,v6.d[1]
    ldr q1, [x0, #112]

    fmla v28.2d,v2.2d,v6.d[0]
    fmla v31.2d,v2.2d,v6.d[1]
    ldr q6, [x1, #176]

    // ---- round 4: operands v0,v1,v7 x v3..v6; nothing further is loaded ----
    fmla v8.2d ,v0.2d,v3.d[0]
    fmla v9.2d ,v1.2d,v3.d[0]
    fmla v10.2d,v7.2d,v3.d[0]

    fmla v11.2d,v0.2d,v3.d[1]
    fmla v12.2d,v1.2d,v3.d[1]
    fmla v13.2d,v7.2d,v3.d[1]

    fmla v14.2d,v0.2d,v4.d[0]
    fmla v15.2d,v1.2d,v4.d[0]
    fmla v16.2d,v7.2d,v4.d[0]

    fmla v17.2d,v0.2d,v4.d[1]
    fmla v18.2d,v1.2d,v4.d[1]
    fmla v19.2d,v7.2d,v4.d[1]

    fmla v20.2d,v0.2d,v5.d[0]
    fmla v21.2d,v1.2d,v5.d[0]
    fmla v22.2d,v7.2d,v5.d[0]

    fmla v23.2d,v0.2d,v5.d[1]
    fmla v24.2d,v1.2d,v5.d[1]
    fmla v25.2d,v7.2d,v5.d[1]

    fmla v26.2d,v0.2d,v6.d[0]
    add x1, x1, #192                // step x1 past the 12 q-registers read above
    fmla v29.2d,v0.2d,v6.d[1]

    fmla v27.2d,v1.2d,v6.d[0]
    fmla v30.2d,v1.2d,v6.d[1]

    fmla v28.2d,v7.2d,v6.d[0]
    fmla v31.2d,v7.2d,v6.d[1]

    add x0, x0, #144                // step x0 past the 9 q-registers read above

.DCONSIDERKLEFT:
    // Skip the single-step remainder loop when the counter in x6 is zero;
    // otherwise fall through into .DLOOPKLEFT.
    // NOTE(review): x6 is presumably the k remainder (k mod 4); it is set up
    // outside this chunk - confirm.
    cmp x6,0
    beq .DPOSTACCUM

.DLOOPKLEFT:
    // Remainder loop, one step per pass.  Each pass reads three q-registers
    // from x0 and four from x1 (both post-incremented by 16 per load),
    // performs the same 24 fmla updates of v8-v31 as one round of the
    // unrolled code, and decrements x6.  Repeats until x6 reaches zero.
    // Only reached with x6 != 0 (checked at .DCONSIDERKLEFT).

    ldr q0, [x0],#16
    ldr q1, [x0],#16
    ldr q2, [x0],#16

    ldr q3, [x1],#16
    ldr q4, [x1],#16
    ldr q5, [x1],#16
    ldr q6, [x1],#16

    sub x6,x6,1                     // one fewer step remaining

    fmla v8.2d ,v0.2d,v3.d[0]
    fmla v9.2d ,v1.2d,v3.d[0]
    fmla v10.2d,v2.2d,v3.d[0]

    fmla v11.2d,v0.2d,v3.d[1]
    fmla v12.2d,v1.2d,v3.d[1]
    fmla v13.2d,v2.2d,v3.d[1]

    fmla v14.2d,v0.2d,v4.d[0]
    fmla v15.2d,v1.2d,v4.d[0]
    fmla v16.2d,v2.2d,v4.d[0]

    fmla v17.2d,v0.2d,v4.d[1]
    fmla v18.2d,v1.2d,v4.d[1]
    fmla v19.2d,v2.2d,v4.d[1]

    fmla v20.2d,v0.2d,v5.d[0]
    fmla v21.2d,v1.2d,v5.d[0]
    fmla v22.2d,v2.2d,v5.d[0]

    fmla v23.2d,v0.2d,v5.d[1]
    fmla v24.2d,v1.2d,v5.d[1]
    fmla v25.2d,v2.2d,v5.d[1]

    fmla v26.2d,v0.2d,v6.d[0]
    fmla v29.2d,v0.2d,v6.d[1]

    fmla v27.2d,v1.2d,v6.d[0]
    fmla v30.2d,v1.2d,v6.d[1]

    fmla v28.2d,v2.2d,v6.d[0]
    fmla v31.2d,v2.2d,v6.d[1]

    cmp x6,0
    bne .DLOOPKLEFT

.DPOSTACCUM:
    // Broadcast the two scalars used by the store paths: the double at [x7]
    // into both lanes of v6 and the double at [x8] into both lanes of v7.
    // The store paths multiply the accumulators by v6.d[0] and the existing
    // destination values by v7.d[0].
    // NOTE(review): presumably alpha (x7) and beta (x8) - confirm against
    // the register setup outside this chunk.
    ld1r {v6.2d},[x7]
    ld1r {v7.2d},[x8]

    // x13 == 1 falls through to the contiguous q-register store path;
    // any other value takes the element-by-element strided path.
    // NOTE(review): x13 is presumably the row stride of C - confirm.
    cmp x13,#1
    bne .DGENSTORED

.DCOLSTORED:
    // Contiguous store path (reached by fall-through when x13 == 1).
    // Writes the 24 accumulators v8-v31 to eight destinations, 48 bytes
    // (three q-registers) each: x2, x20, x21, x22, x23, x24, x25, x26.
    // The work is done in four stages of two destinations each:
    //   1. zero six temporaries;
    //   2. if d7 (lane 0 of v7) compares unequal to 0.0, load the current
    //      destination contents and scale them by v7.d[0]; when d7 == 0.0
    //      the destination memory is not read at all;
    //   3. add accumulator * v6.d[0] into the temporaries;
    //   4. store the temporaries.
    // Stages 1 and 3 use v0-v5 as temporaries; stages 2 and 4 reuse v8-v13,
    // whose accumulated values were already consumed by stage 1.
    // NOTE(review): x2/x20-x26 are presumably the eight column addresses of
    // the C tile and v6/v7 alpha/beta - set up outside this chunk, confirm.

    // ---- stage 1: v8-v10 -> [x2], v11-v13 -> [x20] ----
    dup v0.2d, xzr
    dup v1.2d, xzr
    dup v2.2d, xzr
    dup v3.2d, xzr
    dup v4.2d, xzr
    dup v5.2d, xzr

    fcmp d7,#0.0
    beq .DBETAZEROCOLSTOREDS1

    ldr q0, [x2]
    ldr q1, [x2, #16]
    ldr q2, [x2, #32]

    ldr q3, [x20]
    ldr q4, [x20, #16]
    ldr q5, [x20, #32]

    fmul v0.2d,v0.2d,v7.d[0]
    fmul v1.2d,v1.2d,v7.d[0]
    fmul v2.2d,v2.2d,v7.d[0]
    fmul v3.2d,v3.2d,v7.d[0]
    fmul v4.2d,v4.2d,v7.d[0]
    fmul v5.2d,v5.2d,v7.d[0]

.DBETAZEROCOLSTOREDS1:

    fmla v0.2d,v8.2d,v6.d[0]
    fmla v1.2d,v9.2d,v6.d[0]
    fmla v2.2d,v10.2d,v6.d[0]
    fmla v3.2d,v11.2d,v6.d[0]
    fmla v4.2d,v12.2d,v6.d[0]
    fmla v5.2d,v13.2d,v6.d[0]

    str q0, [x2]
    str q1, [x2, #16]
    str q2, [x2, #32]

    str q3, [x20]
    str q4, [x20, #16]
    str q5, [x20, #32]

    // ---- stage 2: v14-v16 -> [x21], v17-v19 -> [x22] ----
    dup v8.2d, xzr
    dup v9.2d, xzr
    dup v10.2d, xzr
    dup v11.2d, xzr
    dup v12.2d, xzr
    dup v13.2d, xzr

    fcmp d7,#0.0
    beq .DBETAZEROCOLSTOREDS2

    ldr q8, [x21]
    ldr q9, [x21, #16]
    ldr q10, [x21, #32]

    ldr q11, [x22]
    ldr q12, [x22, #16]
    ldr q13, [x22, #32]

    fmul v8.2d, v8.2d, v7.d[0]
    fmul v9.2d, v9.2d, v7.d[0]
    fmul v10.2d,v10.2d,v7.d[0]
    fmul v11.2d,v11.2d,v7.d[0]
    fmul v12.2d,v12.2d,v7.d[0]
    fmul v13.2d,v13.2d,v7.d[0]

.DBETAZEROCOLSTOREDS2:

    fmla v8.2d, v14.2d,v6.d[0]
    fmla v9.2d, v15.2d,v6.d[0]
    fmla v10.2d,v16.2d,v6.d[0]
    fmla v11.2d,v17.2d,v6.d[0]
    fmla v12.2d,v18.2d,v6.d[0]
    fmla v13.2d,v19.2d,v6.d[0]

    str q8, [x21]
    str q9, [x21, #16]
    str q10, [x21, #32]

    str q11, [x22]
    str q12, [x22, #16]
    str q13, [x22, #32]

    // ---- stage 3: v20-v22 -> [x23], v23-v25 -> [x24] ----
    dup v0.2d, xzr
    dup v1.2d, xzr
    dup v2.2d, xzr
    dup v3.2d, xzr
    dup v4.2d, xzr
    dup v5.2d, xzr

    fcmp d7,#0.0
    beq .DBETAZEROCOLSTOREDS3

    ldr q0, [x23]
    ldr q1, [x23, #16]
    ldr q2, [x23, #32]

    ldr q3, [x24]
    ldr q4, [x24, #16]
    ldr q5, [x24, #32]

    fmul v0.2d,v0.2d,v7.d[0]
    fmul v1.2d,v1.2d,v7.d[0]
    fmul v2.2d,v2.2d,v7.d[0]
    fmul v3.2d,v3.2d,v7.d[0]
    fmul v4.2d,v4.2d,v7.d[0]
    fmul v5.2d,v5.2d,v7.d[0]

.DBETAZEROCOLSTOREDS3:

    fmla v0.2d,v20.2d,v6.d[0]
    fmla v1.2d,v21.2d,v6.d[0]
    fmla v2.2d,v22.2d,v6.d[0]
    fmla v3.2d,v23.2d,v6.d[0]
    fmla v4.2d,v24.2d,v6.d[0]
    fmla v5.2d,v25.2d,v6.d[0]

    str q0, [x23]
    str q1, [x23, #16]
    str q2, [x23, #32]

    str q3, [x24]
    str q4, [x24, #16]
    str q5, [x24, #32]

    // ---- stage 4: v26-v28 -> [x25], v29-v31 -> [x26] ----
    dup v8.2d, xzr
    dup v9.2d, xzr
    dup v10.2d, xzr
    dup v11.2d, xzr
    dup v12.2d, xzr
    dup v13.2d, xzr

    fcmp d7,#0.0
    beq .DBETAZEROCOLSTOREDS4

    ldr q8, [x25]
    ldr q9, [x25, #16]
    ldr q10, [x25, #32]

    ldr q11, [x26]
    ldr q12, [x26, #16]
    ldr q13, [x26, #32]

    fmul v8.2d, v8.2d, v7.d[0]
    fmul v9.2d, v9.2d, v7.d[0]
    fmul v10.2d,v10.2d,v7.d[0]
    fmul v11.2d,v11.2d,v7.d[0]
    fmul v12.2d,v12.2d,v7.d[0]
    fmul v13.2d,v13.2d,v7.d[0]

.DBETAZEROCOLSTOREDS4:

    // Prefetch hints for the addresses held in x3 and x4.
    // NOTE(review): presumably the next A and B panels - confirm.
    prfm pldl2keep,[x3]
    prfm pldl2keep,[x4]

    fmla v8.2d, v26.2d,v6.d[0]
    fmla v9.2d, v27.2d,v6.d[0]
    fmla v10.2d,v28.2d,v6.d[0]
    fmla v11.2d,v29.2d,v6.d[0]
    fmla v12.2d,v30.2d,v6.d[0]
    fmla v13.2d,v31.2d,v6.d[0]

    str q8, [x25]
    str q9, [x25, #16]
    str q10, [x25, #32]

    str q11, [x26]
    str q12, [x26, #16]
    str q13, [x26, #32]

    b .DEND                         // skip the strided store path

.DGENSTORED:
    // Strided store path (taken when x13 != 1).  Same four-stage
    // scale-and-accumulate scheme as the contiguous path, but every double
    // is moved individually with a lane ld1/st1 that post-increments the
    // cursor x27 by the byte stride in x14.  For each destination base
    // (x2, x20, x21, x22, x23, x24, x25, x26) six elements are visited.
    // When d7 compares equal to 0.0 the destination is never read; the
    // temporaries stay zero before the accumulators are added.
    // NOTE(review): x14 is presumably the row stride of C in bytes and the
    // eight bases the column addresses of the C tile - both are set up
    // outside this chunk, confirm.

    // ---- stage 1: v8-v10 -> base x2, v11-v13 -> base x20 ----
    dup v0.2d, xzr
    dup v1.2d, xzr
    dup v2.2d, xzr
    dup v3.2d, xzr
    dup v4.2d, xzr
    dup v5.2d, xzr

    fcmp d7,#0.0
    beq .DBETAZEROGENSTOREDS1

    mov x27, x2

    ld1 {v0.d}[0],[x27],x14
    ld1 {v0.d}[1],[x27],x14
    ld1 {v1.d}[0],[x27],x14
    ld1 {v1.d}[1],[x27],x14
    ld1 {v2.d}[0],[x27],x14
    ld1 {v2.d}[1],[x27],x14

    mov x27, x20

    ld1 {v3.d}[0],[x27],x14
    ld1 {v3.d}[1],[x27],x14
    ld1 {v4.d}[0],[x27],x14
    ld1 {v4.d}[1],[x27],x14
    ld1 {v5.d}[0],[x27],x14
    ld1 {v5.d}[1],[x27],x14

    fmul v0.2d,v0.2d,v7.d[0]
    fmul v1.2d,v1.2d,v7.d[0]
    fmul v2.2d,v2.2d,v7.d[0]
    fmul v3.2d,v3.2d,v7.d[0]
    fmul v4.2d,v4.2d,v7.d[0]
    fmul v5.2d,v5.2d,v7.d[0]

.DBETAZEROGENSTOREDS1:

    fmla v0.2d,v8.2d,v6.d[0]
    fmla v1.2d,v9.2d,v6.d[0]
    fmla v2.2d,v10.2d,v6.d[0]
    fmla v3.2d,v11.2d,v6.d[0]
    fmla v4.2d,v12.2d,v6.d[0]
    fmla v5.2d,v13.2d,v6.d[0]

    mov x27, x2

    st1 {v0.d}[0],[x27],x14
    st1 {v0.d}[1],[x27],x14
    st1 {v1.d}[0],[x27],x14
    st1 {v1.d}[1],[x27],x14
    st1 {v2.d}[0],[x27],x14
    st1 {v2.d}[1],[x27],x14

    mov x27, x20

    st1 {v3.d}[0],[x27],x14
    st1 {v3.d}[1],[x27],x14
    st1 {v4.d}[0],[x27],x14
    st1 {v4.d}[1],[x27],x14
    st1 {v5.d}[0],[x27],x14
    st1 {v5.d}[1],[x27],x14

    // ---- stage 2: v14-v16 -> base x21, v17-v19 -> base x22 ----
    dup v8.2d, xzr
    dup v9.2d, xzr
    dup v10.2d, xzr
    dup v11.2d, xzr
    dup v12.2d, xzr
    dup v13.2d, xzr

    fcmp d7,#0.0
    beq .DBETAZEROGENSTOREDS2

    mov x27, x21

    ld1 {v8.d}[0], [x27],x14
    ld1 {v8.d}[1], [x27],x14
    ld1 {v9.d}[0], [x27],x14
    ld1 {v9.d}[1], [x27],x14
    ld1 {v10.d}[0],[x27],x14
    ld1 {v10.d}[1],[x27],x14

    mov x27, x22

    ld1 {v11.d}[0],[x27],x14
    ld1 {v11.d}[1],[x27],x14
    ld1 {v12.d}[0],[x27],x14
    ld1 {v12.d}[1],[x27],x14
    ld1 {v13.d}[0],[x27],x14
    ld1 {v13.d}[1],[x27],x14

    fmul v8.2d, v8.2d, v7.d[0]
    fmul v9.2d, v9.2d, v7.d[0]
    fmul v10.2d,v10.2d,v7.d[0]
    fmul v11.2d,v11.2d,v7.d[0]
    fmul v12.2d,v12.2d,v7.d[0]
    fmul v13.2d,v13.2d,v7.d[0]

.DBETAZEROGENSTOREDS2:

    fmla v8.2d, v14.2d,v6.d[0]
    fmla v9.2d, v15.2d,v6.d[0]
    fmla v10.2d,v16.2d,v6.d[0]
    fmla v11.2d,v17.2d,v6.d[0]
    fmla v12.2d,v18.2d,v6.d[0]
    fmla v13.2d,v19.2d,v6.d[0]

    mov x27, x21

    st1 {v8.d}[0], [x27],x14
    st1 {v8.d}[1], [x27],x14
    st1 {v9.d}[0], [x27],x14
    st1 {v9.d}[1], [x27],x14
    st1 {v10.d}[0],[x27],x14
    st1 {v10.d}[1],[x27],x14

    mov x27, x22

    st1 {v11.d}[0],[x27],x14
    st1 {v11.d}[1],[x27],x14
    st1 {v12.d}[0],[x27],x14
    st1 {v12.d}[1],[x27],x14
    st1 {v13.d}[0],[x27],x14
    st1 {v13.d}[1],[x27],x14

    // ---- stage 3: v20-v22 -> base x23, v23-v25 -> base x24 ----
    dup v0.2d, xzr
    dup v1.2d, xzr
    dup v2.2d, xzr
    dup v3.2d, xzr
    dup v4.2d, xzr
    dup v5.2d, xzr

    fcmp d7,#0.0
    beq .DBETAZEROGENSTOREDS3

    mov x27, x23

    ld1 {v0.d}[0],[x27],x14
    ld1 {v0.d}[1],[x27],x14
    ld1 {v1.d}[0],[x27],x14
    ld1 {v1.d}[1],[x27],x14
    ld1 {v2.d}[0],[x27],x14
    ld1 {v2.d}[1],[x27],x14

    mov x27, x24

    ld1 {v3.d}[0],[x27],x14
    ld1 {v3.d}[1],[x27],x14
    ld1 {v4.d}[0],[x27],x14
    ld1 {v4.d}[1],[x27],x14
    ld1 {v5.d}[0],[x27],x14
    ld1 {v5.d}[1],[x27],x14

    fmul v0.2d,v0.2d,v7.d[0]
    fmul v1.2d,v1.2d,v7.d[0]
    fmul v2.2d,v2.2d,v7.d[0]
    fmul v3.2d,v3.2d,v7.d[0]
    fmul v4.2d,v4.2d,v7.d[0]
    fmul v5.2d,v5.2d,v7.d[0]

.DBETAZEROGENSTOREDS3:

    fmla v0.2d,v20.2d,v6.d[0]
    fmla v1.2d,v21.2d,v6.d[0]
    fmla v2.2d,v22.2d,v6.d[0]
    fmla v3.2d,v23.2d,v6.d[0]
    fmla v4.2d,v24.2d,v6.d[0]
    fmla v5.2d,v25.2d,v6.d[0]

    mov x27, x23

    st1 {v0.d}[0],[x27],x14
    st1 {v0.d}[1],[x27],x14
    st1 {v1.d}[0],[x27],x14
    st1 {v1.d}[1],[x27],x14
    st1 {v2.d}[0],[x27],x14
    st1 {v2.d}[1],[x27],x14

    mov x27, x24

    st1 {v3.d}[0],[x27],x14
    st1 {v3.d}[1],[x27],x14
    st1 {v4.d}[0],[x27],x14
    st1 {v4.d}[1],[x27],x14
    st1 {v5.d}[0],[x27],x14
    st1 {v5.d}[1],[x27],x14

    // ---- stage 4: v26-v28 -> base x25, v29-v31 -> base x26 ----
    dup v8.2d, xzr
    dup v9.2d, xzr
    dup v10.2d, xzr
    dup v11.2d, xzr
    dup v12.2d, xzr
    dup v13.2d, xzr

    fcmp d7,#0.0
    beq .DBETAZEROGENSTOREDS4

    mov x27, x25

    ld1 {v8.d}[0], [x27],x14
    ld1 {v8.d}[1], [x27],x14
    ld1 {v9.d}[0], [x27],x14
    ld1 {v9.d}[1], [x27],x14
    ld1 {v10.d}[0],[x27],x14
    ld1 {v10.d}[1],[x27],x14

    mov x27, x26

    ld1 {v11.d}[0],[x27],x14
    ld1 {v11.d}[1],[x27],x14
    ld1 {v12.d}[0],[x27],x14
    ld1 {v12.d}[1],[x27],x14
    ld1 {v13.d}[0],[x27],x14
    ld1 {v13.d}[1],[x27],x14

    fmul v8.2d, v8.2d, v7.d[0]
    fmul v9.2d, v9.2d, v7.d[0]
    fmul v10.2d,v10.2d,v7.d[0]
    fmul v11.2d,v11.2d,v7.d[0]
    fmul v12.2d,v12.2d,v7.d[0]
    fmul v13.2d,v13.2d,v7.d[0]

.DBETAZEROGENSTOREDS4:

    // Prefetch hints for the addresses held in x3 and x4.
    // NOTE(review): presumably the next A and B panels - confirm.
    prfm pldl2keep,[x3]
    prfm pldl2keep,[x4]

    fmla v8.2d, v26.2d,v6.d[0]
    fmla v9.2d, v27.2d,v6.d[0]
    fmla v10.2d,v28.2d,v6.d[0]
    fmla v11.2d,v29.2d,v6.d[0]
    fmla v12.2d,v30.2d,v6.d[0]
    fmla v13.2d,v31.2d,v6.d[0]

    mov x27, x25

    st1 {v8.d}[0], [x27],x14
    st1 {v8.d}[1], [x27],x14
    st1 {v9.d}[0], [x27],x14
    st1 {v9.d}[1], [x27],x14
    st1 {v10.d}[0],[x27],x14
    st1 {v10.d}[1],[x27],x14

    mov x27, x26

    st1 {v11.d}[0],[x27],x14
    st1 {v11.d}[1],[x27],x14
    st1 {v12.d}[0],[x27],x14
    st1 {v12.d}[1],[x27],x14
    st1 {v13.d}[0],[x27],x14
    st1 {v13.d}[1],[x27],x14

.DEND:
    // Common exit of both store paths; the inline-asm region ends here and
    // the compiler-generated epilogue follows.

// 0 "" 2
#NO_APP
    // Restore the callee-saved registers x20-x27 and d8-d15 from the
    // 224-byte frame; the final ldp also pops the frame (post-index 224).
    // NOTE(review): the matching stores are in the prologue, which is
    // outside this chunk - confirm the offsets agree.
    ldp x22, x23, [sp, 16]
    ldp x24, x25, [sp, 32]
    ldp x26, x27, [sp, 48]
    ldp d8, d9, [sp, 64]
    ldp d10, d11, [sp, 80]
    ldp d12, d13, [sp, 96]
    ldp d14, d15, [sp, 112]
    ldp x20, x21, [sp], 224
    // Unwind info: DWARF registers 20-27 are x20-x27, 72-79 are v8-v15.
    .cfi_restore 21
    .cfi_restore 20
    .cfi_restore 78
    .cfi_restore 79
    .cfi_restore 76
    .cfi_restore 77
    .cfi_restore 74
    .cfi_restore 75
    .cfi_restore 72
    .cfi_restore 73
    .cfi_restore 26
    .cfi_restore 27
    .cfi_restore 24
    .cfi_restore 25
    .cfi_restore 22
    .cfi_restore 23
    .cfi_def_cfa_offset 0
    ret
    .cfi_endproc
.LFE739:
// .size bli_dgemm_armv8a_asm_6x8, .-bli_dgemm_armv8a_asm_6x8
    .ident "GCC: (ARM-build-5) 9.2.0"
// .section .note.GNU-stack,"",@progbits