Split-Radix recombine loop
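For orientation (notation mine; the sign/direction convention here may differ from the table layout the code expects), this loop implements the standard split-radix recombination: with Z0, Z1 the two halves of the half-size sub-transform, Z2, Z3 the two quarter-size sub-transforms, and w = exp(-2*pi*i/N), for each k:

X[k]        = Z0[k] + (w^k * Z2[k] + w^(3k) * Z3[k])
X[k + N/2]  = Z0[k] - (w^k * Z2[k] + w^(3k) * Z3[k])
X[k + N/4]  = Z1[k] - i*(w^k * Z2[k] - w^(3k) * Z3[k])
X[k + 3N/4] = Z1[k] + i*(w^k * Z2[k] - w^(3k) * Z3[k])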
/* Radix-2 butterfly: difference into x, sum into y */
#define BF(x, y, a, b) \
    do {               \
        x = (a) - (b); \
        y = (a) + (b); \
    } while (0)
/* Butterfly stage with the rotated products passed in explicitly;
 * defined here but unused in this snippet (t3/t4 are not declared below) */
#define BUTTERFLIES_MIX(a0,a1,a2,a3, P1, P2, P5, P6) \
    do {                          \
        r0 = a0.re;               \
        i0 = a0.im;               \
        r1 = a1.re;               \
        i1 = a1.im;               \
        BF(t3, P5, P5, P1);       \
        BF(a2.re, a0.re, r0, P5); \
        BF(a3.im, a1.im, i1, t3); \
        BF(t4, P6, P2, P6);       \
        BF(a3.re, a1.re, r1, t4); \
        BF(a2.im, a0.im, i0, P6); \
    } while (0)
#undef BUTTERFLIES
#define BUTTERFLIES(a0,a1,a2,a3)  \
    do {                          \
        r0 = a0.re;               \
        i0 = a0.im;               \
        r1 = a1.re;               \
        i1 = a1.im;               \
        BF(q3, q5, q5, q1);       \
        BF(a2.re, a0.re, r0, q5); \
        BF(a3.im, a1.im, i1, q3); \
        BF(q4, q6, q2, q6);       \
        BF(a3.re, a1.re, r1, q4); \
        BF(a2.im, a0.im, i0, q6); \
    } while (0)
#undef TRANSFORM
#define TRANSFORM(a0,a1,a2,a3,wre,wim)         \
    do {                                       \
        CMUL(q1, q2, a2.re, a2.im, wre, -wim); \
        CMUL(q5, q6, a3.re, a3.im, wre,  wim); \
        BUTTERFLIES(a0, a1, a2, a3);           \
    } while (0)

#define CMUL(dre, dim, are, aim, bre, bim)     \
    do {                                       \
        (dre) = (are) * (bre) - (aim) * (bim); \
        (dim) = (are) * (bim) + (aim) * (bre); \
    } while (0)
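/* CMUL(dre, dim, are, aim, bre, bim) is the textbook complex product
 * d = a*b. TRANSFORM therefore rotates a2 by (wre - i*wim) and a3 by
 * (wre + i*wim) before feeding the products (q1 + i*q2, q5 + i*q6) to the
 * butterflies. */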
static void recombine_new(AVTXContext *s, FFTComplex *z, const FFTSample *cos,
                          unsigned int n, FFTComplex *temp_buf)
{
    const int o1 = 2*n;
    const int o2 = 4*n;
    const int o3 = 6*n;
    const FFTSample *wim = cos + o1 - 7;
    FFTSample q1, q2, q3, q4, q5, q6, r0, i0, r1, i1;

    // LOAD AND ROTATE VIA VPERM2F128! 1 cycle less!
    for (int i = 0; i < n; i += 4) {
        FFTSample h[8], j[8], r[8], w[8];
        FFTSample t[8];
        FFTComplex *m0 = &z[0];
        FFTComplex *m1 = &z[4];
        FFTComplex *m2 = &z[o2 + 0];
        FFTComplex *m3 = &z[o2 + 4];
        const FFTSample *t1 = &cos[0];
        const FFTSample *t2 = &wim[0];

        /* 2 loads (tabs) */
        /* 2 vperm2fs, 2 shufs (im), 2 shufs (tabs) */
        /* 1 xor, 1 add, 1 sub, 4 mults OR 2 mults, 2 fmas */
        /* 13 OR 10ish (-2 each for second passovers!) */
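        /* In TRANSFORM terms: w[] holds a2 * (cos - i*wim) and j[] holds
         * a3 * (cos + i*wim), both stored imaginary-first (the "rotated"
         * layout), so that t[] = (q1+q5, q2+q6) and r[] = (q2-q6, q5-q1)
         * reproduce the BUTTERFLIES temporaries. */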
        w[0] = m2[0].im*t1[0] - m2[0].re*t2[7];
        w[1] = m2[0].re*t1[0] + m2[0].im*t2[7];
        w[2] = m2[1].im*t1[2] - m2[1].re*t2[5];
        w[3] = m2[1].re*t1[2] + m2[1].im*t2[5];
        w[4] = m3[0].im*t1[4] - m3[0].re*t2[3];
        w[5] = m3[0].re*t1[4] + m3[0].im*t2[3];
        w[6] = m3[1].im*t1[6] - m3[1].re*t2[1];
        w[7] = m3[1].re*t1[6] + m3[1].im*t2[1];

        j[0] = m2[2].im*t1[0] + m2[2].re*t2[7];
        j[1] = m2[2].re*t1[0] - m2[2].im*t2[7];
        j[2] = m2[3].im*t1[2] + m2[3].re*t2[5];
        j[3] = m2[3].re*t1[2] - m2[3].im*t2[5];
        j[4] = m3[2].im*t1[4] + m3[2].re*t2[3];
        j[5] = m3[2].re*t1[4] - m3[2].im*t2[3];
        j[6] = m3[3].im*t1[6] + m3[3].re*t2[1];
        j[7] = m3[3].re*t1[6] - m3[3].im*t2[1];

        /* 1 add + 1 shuf */
        t[1] = j[0] + w[0];
        t[0] = j[1] + w[1];
        t[3] = j[2] + w[2];
        t[2] = j[3] + w[3];
        t[5] = j[4] + w[4];
        t[4] = j[5] + w[5];
        t[7] = j[6] + w[6];
        t[6] = j[7] + w[7];

        /* 1 sub + 1 xor */
        r[0] =  (w[0] - j[0]);
        r[1] = -(w[1] - j[1]);
        r[2] =  (w[2] - j[2]);
        r[3] = -(w[3] - j[3]);
        r[4] =  (w[4] - j[4]);
        r[5] = -(w[5] - j[5]);
        r[6] =  (w[6] - j[6]);
        r[7] = -(w[7] - j[7]);

        /* Min: 2 subs, 2 adds, 2 vperm2fs (OPTIONAL) */
        m2[0].re = m0[0].re - t[0];
        m2[0].im = m0[0].im - t[1];
        m2[1].re = m0[1].re - t[2];
        m2[1].im = m0[1].im - t[3];
        m3[0].re = m0[2].re - t[4];
        m3[0].im = m0[2].im - t[5];
        m3[1].re = m0[3].re - t[6];
        m3[1].im = m0[3].im - t[7];

        m2[2].re = m1[0].re - r[0];
        m2[2].im = m1[0].im - r[1];
        m2[3].re = m1[1].re - r[2];
        m2[3].im = m1[1].im - r[3];
        m3[2].re = m1[2].re - r[4];
        m3[2].im = m1[2].im - r[5];
        m3[3].re = m1[3].re - r[6];
        m3[3].im = m1[3].im - r[7];

        m0[0].re = m0[0].re + t[0];
        m0[0].im = m0[0].im + t[1];
        m0[1].re = m0[1].re + t[2];
        m0[1].im = m0[1].im + t[3];
        m0[2].re = m0[2].re + t[4];
        m0[2].im = m0[2].im + t[5];
        m0[3].re = m0[3].re + t[6];
        m0[3].im = m0[3].im + t[7];

        m1[0].re = m1[0].re + r[0];
        m1[0].im = m1[0].im + r[1];
        m1[1].re = m1[1].re + r[2];
        m1[1].im = m1[1].im + r[3];
        m1[2].re = m1[2].re + r[4];
        m1[2].im = m1[2].im + r[5];
        m1[3].re = m1[3].re + r[6];
        m1[3].im = m1[3].im + r[7];
#if 1
        /* Same pass as above, repeated for the odd-index quarter: o1/o3
         * offsets and the odd twiddles cos[1,3,5,7] / wim[6,4,2,0] */
        m0 = &z[o1];
        m1 = &z[o1 + 4];
        m2 = &z[o3 + 0];
        m3 = &z[o3 + 4];
        t1 = &cos[1];
        t2 = &wim[-1];
#endif
        w[0] = m2[0].im*t1[0] - m2[0].re*t2[7];
        w[1] = m2[0].re*t1[0] + m2[0].im*t2[7];
        w[2] = m2[1].im*t1[2] - m2[1].re*t2[5];
        w[3] = m2[1].re*t1[2] + m2[1].im*t2[5];
        w[4] = m3[0].im*t1[4] - m3[0].re*t2[3];
        w[5] = m3[0].re*t1[4] + m3[0].im*t2[3];
        w[6] = m3[1].im*t1[6] - m3[1].re*t2[1];
        w[7] = m3[1].re*t1[6] + m3[1].im*t2[1];

        j[0] = m2[2].im*t1[0] + m2[2].re*t2[7];
        j[1] = m2[2].re*t1[0] - m2[2].im*t2[7];
        j[2] = m2[3].im*t1[2] + m2[3].re*t2[5];
        j[3] = m2[3].re*t1[2] - m2[3].im*t2[5];
        j[4] = m3[2].im*t1[4] + m3[2].re*t2[3];
        j[5] = m3[2].re*t1[4] - m3[2].im*t2[3];
        j[6] = m3[3].im*t1[6] + m3[3].re*t2[1];
        j[7] = m3[3].re*t1[6] - m3[3].im*t2[1];

        /* 1 add + 1 shuf */
        t[1] = j[0] + w[0];
        t[0] = j[1] + w[1];
        t[3] = j[2] + w[2];
        t[2] = j[3] + w[3];
        t[5] = j[4] + w[4];
        t[4] = j[5] + w[5];
        t[7] = j[6] + w[6];
        t[6] = j[7] + w[7];

        /* 1 sub + 1 xor */
        r[0] =  (w[0] - j[0]);
        r[1] = -(w[1] - j[1]);
        r[2] =  (w[2] - j[2]);
        r[3] = -(w[3] - j[3]);
        r[4] =  (w[4] - j[4]);
        r[5] = -(w[5] - j[5]);
        r[6] =  (w[6] - j[6]);
        r[7] = -(w[7] - j[7]);

        /* Min: 2 subs, 2 adds, 2 vperm2fs (OPTIONAL) */
        m2[0].re = m0[0].re - t[0];
        m2[0].im = m0[0].im - t[1];
        m2[1].re = m0[1].re - t[2];
        m2[1].im = m0[1].im - t[3];
        m3[0].re = m0[2].re - t[4];
        m3[0].im = m0[2].im - t[5];
        m3[1].re = m0[3].re - t[6];
        m3[1].im = m0[3].im - t[7];

        m2[2].re = m1[0].re - r[0];
        m2[2].im = m1[0].im - r[1];
        m2[3].re = m1[1].re - r[2];
        m2[3].im = m1[1].im - r[3];
        m3[2].re = m1[2].re - r[4];
        m3[2].im = m1[2].im - r[5];
        m3[3].re = m1[3].re - r[6];
        m3[3].im = m1[3].im - r[7];

        m0[0].re = m0[0].re + t[0];
        m0[0].im = m0[0].im + t[1];
        m0[1].re = m0[1].re + t[2];
        m0[1].im = m0[1].im + t[3];
        m0[2].re = m0[2].re + t[4];
        m0[2].im = m0[2].im + t[5];
        m0[3].re = m0[3].re + t[6];
        m0[3].im = m0[3].im + t[7];

        m1[0].re = m1[0].re + r[0];
        m1[0].im = m1[0].im + r[1];
        m1[1].re = m1[1].re + r[2];
        m1[1].im = m1[1].im + r[3];
        m1[2].re = m1[2].re + r[4];
        m1[2].im = m1[2].im + r[5];
        m1[3].re = m1[3].re + r[6];
        m1[3].im = m1[3].im + r[7];
#if 0
        TRANSFORM(z[ 0 + 0], z[ 0 + 4], z[o2 + 0], z[o2 + 2], cos[0], wim[7]);
        TRANSFORM(z[ 0 + 1], z[ 0 + 5], z[o2 + 1], z[o2 + 3], cos[2], wim[5]);
        TRANSFORM(z[ 0 + 2], z[ 0 + 6], z[o2 + 4], z[o2 + 6], cos[4], wim[3]);
        TRANSFORM(z[ 0 + 3], z[ 0 + 7], z[o2 + 5], z[o2 + 7], cos[6], wim[1]);
        TRANSFORM(z[o1 + 0], z[o1 + 4], z[o3 + 0], z[o3 + 2], cos[1], wim[6]);
        TRANSFORM(z[o1 + 1], z[o1 + 5], z[o3 + 1], z[o3 + 3], cos[3], wim[4]);
        TRANSFORM(z[o1 + 2], z[o1 + 6], z[o3 + 4], z[o3 + 6], cos[5], wim[2]);
        TRANSFORM(z[o1 + 3], z[o1 + 7], z[o3 + 5], z[o3 + 7], cos[7], wim[0]);
#endif
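        /* The eight TRANSFORM calls above are the scalar reference the
         * unrolled code replaces: the first four use cos[0,2,4,6]/wim[7,5,3,1]
         * (the t1 = &cos[0], t2 = &wim[0] pass), the last four use
         * cos[1,3,5,7]/wim[6,4,2,0] (the t1 = &cos[1], t2 = &wim[-1] pass). */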
#if 0
        ff_fft32_float_fma3(s, temp_buf, temp_buf, 0);
        FFTComplex zztop[] = {
            m2[2], m2[3], m3[2], m3[3],
        };
        FFTComplex *out = (FFTComplex *)zztop;
        FFTComplex *temp = temp_buf;
        temp += 0; out += temp - temp_buf;
        FFTComplex *ptr = (void *)out;
        printf("\nIDX = 0re 0im 1re 1im 2re 2im 3re 3im\n"
               "REF = %f %f %f %f %f %f %f %f\n"
               "OUT = %f %f %f %f %f %f %f %f\n"
               "TST = %i %i %i %i %i %i %i %i\n\n",
               ptr[0].re, ptr[0].im, ptr[1].re, ptr[1].im,
               ptr[2].re, ptr[2].im, ptr[3].re, ptr[3].im,
               temp[0].re, temp[0].im, temp[1].re, temp[1].im,
               temp[2].re, temp[2].im, temp[3].re, temp[3].im,
               fabsf(ptr[0].re - temp[0].re) <= 3*FLT_EPSILON, fabsf(ptr[0].im - temp[0].im) <= 3*FLT_EPSILON,
               fabsf(ptr[1].re - temp[1].re) <= 3*FLT_EPSILON, fabsf(ptr[1].im - temp[1].im) <= 3*FLT_EPSILON,
               fabsf(ptr[2].re - temp[2].re) <= 3*FLT_EPSILON, fabsf(ptr[2].im - temp[2].im) <= 3*FLT_EPSILON,
               fabsf(ptr[3].re - temp[3].re) <= 3*FLT_EPSILON, fabsf(ptr[3].im - temp[3].im) <= 3*FLT_EPSILON);
#endif

        z   += 2*4;
        cos += 2*4;
        wim -= 2*4;
    }
}
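/*
 * For cross-checking, a minimal textbook form of the same recombination.
 * Illustrative sketch only: the function name, the four-separate-arrays
 * layout and the forward sign convention w = exp(-2*pi*i/N) are assumptions
 * of this sketch, not the in-place permuted layout used above.
 */
#include <complex.h>

static void split_radix_recombine_ref(float complex *z0, float complex *z1,
                                      float complex *z2, float complex *z3,
                                      int n4)
{
    const int N = 4 * n4;
    for (int k = 0; k < n4; k++) {
        const float ph = -6.283185307179586f * k / N; /* -2*pi*k/N          */
        float complex w1 = cexpf(I * ph);             /* w^k                */
        float complex w3 = cexpf(I * 3 * ph);         /* w^3k               */
        float complex a  = w1 * z2[k] + w3 * z3[k];   /* rotated sum        */
        float complex b  = w1 * z2[k] - w3 * z3[k];   /* rotated difference */
        float complex u0 = z0[k], u1 = z1[k];
        z0[k] = u0 + a;     /* X[k]        */
        z2[k] = u0 - a;     /* X[k + N/2]  */
        z1[k] = u1 - I * b; /* X[k + N/4]  */
        z3[k] = u1 + I * b; /* X[k + 3N/4] */
    }
}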
#if 0
; Combines m0...m7 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs.
; Uses all 16 registers.
; Output is slightly permuted such that tx2,3's coefficients are interleaved
; on a 2-point basis (see `doc/transforms.md`)
%macro SPLIT_RADIX_COMBINE 11
%if %11 && mmsize == 32
    vperm2f128 m12, %5, %6, 0x20 ; m2[0], m2[1], m3[0], m3[1] even
    vperm2f128 m14, %8, %7, 0x20 ; m2[0], m2[1], m3[0], m3[1] odd
    vperm2f128 m13, %5, %6, 0x31 ; m2[2], m2[3], m3[2], m3[3] even
    vperm2f128 m15, %8, %7, 0x31 ; m2[2], m2[3], m3[2], m3[3] odd
%endif

    shufps m10, %9, %9, q2200    ; cos00224466
    shufps m11, %10, %10, q1133  ; wim77553311
    movshdup %9, %9              ; cos11335577
    shufps %10, %10, q0022       ; wim66442200

%if %11 && mmsize == 32
    shufps %5, m12, m12, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
    shufps %7, m14, m14, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
    shufps %6, m13, m13, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
    shufps %8, m15, m15, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd

    ; reorder the multiplies to save movs reg, reg in the %if above
    mulps m12, m11               ; m2[0123]reim * wim7531 even
    mulps m14, %10               ; m2[0123]reim * wim7531 odd
    mulps m13, m11               ; m3[0123]reim * wim7531 even
    mulps m15, %10               ; m3[0123]reim * wim7531 odd
%else
    mulps m12, %5, m11           ; m2,3[01]reim * wim7531 even
    mulps m14, %7, %10           ; m2,3[01]reim * wim7531 odd
    mulps m13, %6, m11           ; m2,3[23]reim * wim7531 even
    mulps m15, %8, %10           ; m2,3[23]reim * wim7531 odd

    shufps %5, %5, q2301         ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps %7, %7, q2301         ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
    shufps %6, %6, q2301         ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
    shufps %8, %8, q2301         ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
%endif

%if cpuflag(fma3) ; 6 instructions saved through FMA!
    fmaddsubps %5, %5, m10, m12  ; w[0..8] even
    fmaddsubps %7, %7, %9, m14   ; w[0..8] odd
    fmsubaddps %6, %6, m10, m13  ; j[0..8] even
    fmsubaddps %8, %8, %9, m15   ; j[0..8] odd
    movaps m11, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
%else
    mulps %5, m10                ; m2,3[01]imre * cos0246 even
    mulps %7, %9                 ; m2,3[01]imre * cos1357 odd
    movaps m11, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
    mulps %8, %9                 ; m2,3[23]imre * cos1357 odd
    mulps %6, m10                ; m2,3[23]imre * cos0246 even
    addsubps %5, m12             ; w[0..8] even
    addsubps %7, m14             ; w[0..8] odd
    xorps m13, m11               ; +-m2,3[23]imre * wim7531 even
    xorps m15, m11               ; +-m2,3[23]imre * wim7531 odd
    addps %6, m13                ; j[0..8] even
    addps %8, m15                ; j[0..8] odd
%endif

    addps m12, %5, %6            ; t10235476 even
    addps m14, %7, %8            ; t10235476 odd
    subps m13, %5, %6            ; +-r[0..7] even
    subps m15, %7, %8            ; +-r[0..7] odd

    shufps m12, m12, q2301       ; t[0..7] even
    shufps m14, m14, q2301       ; t[0..7] odd
    xorps m13, m11               ; r[0..7] even
    xorps m15, m11               ; r[0..7] odd

    subps %5, %1, m12            ; m2,3[01] even
    subps %7, %3, m14            ; m2,3[01] odd
    subps %6, %2, m13            ; m2,3[23] even
    subps %8, %4, m15            ; m2,3[23] odd

    addps %1, m12                ; m0 even
    addps %3, m14                ; m0 odd
    addps %2, m13                ; m1 even
    addps %4, m15                ; m1 odd
%endmacro
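; Register roles track the C scalars above: m12/m14 hold the wim-products and
; then t[] (even/odd), m13/m15 hold the j-side products and then +-r[], while
; m10 holds the shuffled cos table and m11 the shuffled wim table (later
; reused for the +-+- sign mask).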
; Same as above, but does a single parity at a time, selected by whether the
; first argument is 0 or 1. Uses 3 temporary registers instead of 6.
%macro SPLIT_RADIX_COMBINE_HALF 10
%if %1
    shufps %8, %6, %6, q2200   ; cos00224466
    shufps %9, %7, %7, q1133   ; wim77553311
%else
    shufps %8, %6, %6, q3311   ; cos11335577
    shufps %9, %7, %7, q0022   ; wim66442200
%endif

    mulps %10, %4, %9          ; m2,3[01]reim * wim7531
    mulps %9, %5               ; m2,3[23]reim * wim7531
    shufps %4, %4, q2301       ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre
    shufps %5, %5, q2301       ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %10 ; w[0..8]
    fmsubaddps %5, %5, %8, %9  ; j[0..8]
    movaps %8, [mask_pmpmpmpm]
%else
    mulps %4, %8               ; m2,3[01]imre * cos
    mulps %5, %8               ; m2,3[23]imre * cos
    movaps %8, [mask_pmpmpmpm]
    addsubps %4, %10           ; w[0..8]
    xorps %9, %8               ; +-m2,3[23]imre * wim7531
    addps %5, %9               ; j[0..8]
%endif

    addps %10, %4, %5          ; t10235476
    subps %9, %4, %5           ; +-r[0..7]
    shufps %10, %10, q2301     ; t[0..7]
    xorps %9, %8               ; r[0..7]

    subps %4, %2, %10          ; m2,3[01]
    subps %5, %3, %9           ; m2,3[23]
    addps %2, %10              ; m0
    addps %3, %9               ; m1
%endmacro
#endif