Split-Radix recombine loop
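For orientation (notation mine; the sign/direction convention here may differ from the table layout the code expects), this loop implements the standard split-radix recombination: with Z0, Z1 the two halves of the half-size sub-transform, Z2, Z3 the two quarter-size sub-transforms, and w = exp(-2*pi*i/N), for each k:

X[k]        = Z0[k] + (w^k * Z2[k] + w^(3k) * Z3[k])
X[k + N/2]  = Z0[k] - (w^k * Z2[k] + w^(3k) * Z3[k])
X[k + N/4]  = Z1[k] - i*(w^k * Z2[k] - w^(3k) * Z3[k])
X[k + 3N/4] = Z1[k] + i*(w^k * Z2[k] - w^(3k) * Z3[k])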
/* Radix-2 butterfly: difference into x, sum into y */
#define BF(x, y, a, b) \
    do {               \
        x = (a) - (b); \
        y = (a) + (b); \
    } while (0)
/* Butterfly stage with the rotated products passed in explicitly;
 * defined here but unused in this snippet (t3/t4 are not declared below) */
#define BUTTERFLIES_MIX(a0,a1,a2,a3, P1, P2, P5, P6) \
    do {                          \
        r0 = a0.re;               \
        i0 = a0.im;               \
        r1 = a1.re;               \
        i1 = a1.im;               \
        BF(t3, P5, P5, P1);       \
        BF(a2.re, a0.re, r0, P5); \
        BF(a3.im, a1.im, i1, t3); \
        BF(t4, P6, P2, P6);       \
        BF(a3.re, a1.re, r1, t4); \
        BF(a2.im, a0.im, i0, P6); \
    } while (0)
#undef BUTTERFLIES
#define BUTTERFLIES(a0,a1,a2,a3)  \
    do {                          \
        r0 = a0.re;               \
        i0 = a0.im;               \
        r1 = a1.re;               \
        i1 = a1.im;               \
        BF(q3, q5, q5, q1);       \
        BF(a2.re, a0.re, r0, q5); \
        BF(a3.im, a1.im, i1, q3); \
        BF(q4, q6, q2, q6);       \
        BF(a3.re, a1.re, r1, q4); \
        BF(a2.im, a0.im, i0, q6); \
    } while (0)
#undef TRANSFORM
#define TRANSFORM(a0,a1,a2,a3,wre,wim)         \
    do {                                       \
        CMUL(q1, q2, a2.re, a2.im, wre, -wim); \
        CMUL(q5, q6, a3.re, a3.im, wre,  wim); \
        BUTTERFLIES(a0, a1, a2, a3);           \
    } while (0)

#define CMUL(dre, dim, are, aim, bre, bim)     \
    do {                                       \
        (dre) = (are) * (bre) - (aim) * (bim); \
        (dim) = (are) * (bim) + (aim) * (bre); \
    } while (0)
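/* CMUL(dre, dim, are, aim, bre, bim) is the textbook complex product
 * d = a*b. TRANSFORM therefore rotates a2 by (wre - i*wim) and a3 by
 * (wre + i*wim) before feeding the products (q1 + i*q2, q5 + i*q6) to the
 * butterflies. */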
static void recombine_new(AVTXContext *s, FFTComplex *z, const FFTSample *cos,
                          unsigned int n, FFTComplex *temp_buf)
{
    const int o1 = 2*n;
    const int o2 = 4*n;
    const int o3 = 6*n;
    const FFTSample *wim = cos + o1 - 7;
    FFTSample q1, q2, q3, q4, q5, q6, r0, i0, r1, i1;

    // LOAD AND ROTATE VIA VPERM2F128! 1 cycle less!
    for (int i = 0; i < n; i += 4) {
        FFTSample h[8], j[8], r[8], w[8];
        FFTSample t[8];
        FFTComplex *m0 = &z[0];
        FFTComplex *m1 = &z[4];
        FFTComplex *m2 = &z[o2 + 0];
        FFTComplex *m3 = &z[o2 + 4];
        const FFTSample *t1 = &cos[0];
        const FFTSample *t2 = &wim[0];

        /* 2 loads (tabs) */
        /* 2 vperm2fs, 2 shufs (im), 2 shufs (tabs) */
        /* 1 xor, 1 add, 1 sub, 4 mults OR 2 mults, 2 fmas */
        /* 13 OR 10ish (-2 each for second passovers!) */
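        /* In TRANSFORM terms: w[] holds a2 * (cos - i*wim) and j[] holds
         * a3 * (cos + i*wim), both stored imaginary-first (the "rotated"
         * layout), so that t[] = (q1+q5, q2+q6) and r[] = (q2-q6, q5-q1)
         * reproduce the BUTTERFLIES temporaries. */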
        w[0] = m2[0].im*t1[0] - m2[0].re*t2[7];
        w[1] = m2[0].re*t1[0] + m2[0].im*t2[7];
        w[2] = m2[1].im*t1[2] - m2[1].re*t2[5];
        w[3] = m2[1].re*t1[2] + m2[1].im*t2[5];
        w[4] = m3[0].im*t1[4] - m3[0].re*t2[3];
        w[5] = m3[0].re*t1[4] + m3[0].im*t2[3];
        w[6] = m3[1].im*t1[6] - m3[1].re*t2[1];
        w[7] = m3[1].re*t1[6] + m3[1].im*t2[1];

        j[0] = m2[2].im*t1[0] + m2[2].re*t2[7];
        j[1] = m2[2].re*t1[0] - m2[2].im*t2[7];
        j[2] = m2[3].im*t1[2] + m2[3].re*t2[5];
        j[3] = m2[3].re*t1[2] - m2[3].im*t2[5];
        j[4] = m3[2].im*t1[4] + m3[2].re*t2[3];
        j[5] = m3[2].re*t1[4] - m3[2].im*t2[3];
        j[6] = m3[3].im*t1[6] + m3[3].re*t2[1];
        j[7] = m3[3].re*t1[6] - m3[3].im*t2[1];

        /* 1 add + 1 shuf */
        t[1] = j[0] + w[0];
        t[0] = j[1] + w[1];
        t[3] = j[2] + w[2];
        t[2] = j[3] + w[3];
        t[5] = j[4] + w[4];
        t[4] = j[5] + w[5];
        t[7] = j[6] + w[6];
        t[6] = j[7] + w[7];

        /* 1 sub + 1 xor */
        r[0] =  (w[0] - j[0]);
        r[1] = -(w[1] - j[1]);
        r[2] =  (w[2] - j[2]);
        r[3] = -(w[3] - j[3]);
        r[4] =  (w[4] - j[4]);
        r[5] = -(w[5] - j[5]);
        r[6] =  (w[6] - j[6]);
        r[7] = -(w[7] - j[7]);

        /* Min: 2 subs, 2 adds, 2 vperm2fs (OPTIONAL) */
        m2[0].re = m0[0].re - t[0];
        m2[0].im = m0[0].im - t[1];
        m2[1].re = m0[1].re - t[2];
        m2[1].im = m0[1].im - t[3];
        m3[0].re = m0[2].re - t[4];
        m3[0].im = m0[2].im - t[5];
        m3[1].re = m0[3].re - t[6];
        m3[1].im = m0[3].im - t[7];

        m2[2].re = m1[0].re - r[0];
        m2[2].im = m1[0].im - r[1];
        m2[3].re = m1[1].re - r[2];
        m2[3].im = m1[1].im - r[3];
        m3[2].re = m1[2].re - r[4];
        m3[2].im = m1[2].im - r[5];
        m3[3].re = m1[3].re - r[6];
        m3[3].im = m1[3].im - r[7];

        m0[0].re = m0[0].re + t[0];
        m0[0].im = m0[0].im + t[1];
        m0[1].re = m0[1].re + t[2];
        m0[1].im = m0[1].im + t[3];
        m0[2].re = m0[2].re + t[4];
        m0[2].im = m0[2].im + t[5];
        m0[3].re = m0[3].re + t[6];
        m0[3].im = m0[3].im + t[7];

        m1[0].re = m1[0].re + r[0];
        m1[0].im = m1[0].im + r[1];
        m1[1].re = m1[1].re + r[2];
        m1[1].im = m1[1].im + r[3];
        m1[2].re = m1[2].re + r[4];
        m1[2].im = m1[2].im + r[5];
        m1[3].re = m1[3].re + r[6];
        m1[3].im = m1[3].im + r[7];
#if 1
        /* Same pass as above, repeated for the odd-index quarter: o1/o3
         * offsets and the odd twiddles cos[1,3,5,7] / wim[6,4,2,0] */
        m0 = &z[o1];
        m1 = &z[o1 + 4];
        m2 = &z[o3 + 0];
        m3 = &z[o3 + 4];
        t1 = &cos[1];
        t2 = &wim[-1];
#endif
        w[0] = m2[0].im*t1[0] - m2[0].re*t2[7];
        w[1] = m2[0].re*t1[0] + m2[0].im*t2[7];
        w[2] = m2[1].im*t1[2] - m2[1].re*t2[5];
        w[3] = m2[1].re*t1[2] + m2[1].im*t2[5];
        w[4] = m3[0].im*t1[4] - m3[0].re*t2[3];
        w[5] = m3[0].re*t1[4] + m3[0].im*t2[3];
        w[6] = m3[1].im*t1[6] - m3[1].re*t2[1];
        w[7] = m3[1].re*t1[6] + m3[1].im*t2[1];

        j[0] = m2[2].im*t1[0] + m2[2].re*t2[7];
        j[1] = m2[2].re*t1[0] - m2[2].im*t2[7];
        j[2] = m2[3].im*t1[2] + m2[3].re*t2[5];
        j[3] = m2[3].re*t1[2] - m2[3].im*t2[5];
        j[4] = m3[2].im*t1[4] + m3[2].re*t2[3];
        j[5] = m3[2].re*t1[4] - m3[2].im*t2[3];
        j[6] = m3[3].im*t1[6] + m3[3].re*t2[1];
        j[7] = m3[3].re*t1[6] - m3[3].im*t2[1];

        /* 1 add + 1 shuf */
        t[1] = j[0] + w[0];
        t[0] = j[1] + w[1];
        t[3] = j[2] + w[2];
        t[2] = j[3] + w[3];
        t[5] = j[4] + w[4];
        t[4] = j[5] + w[5];
        t[7] = j[6] + w[6];
        t[6] = j[7] + w[7];

        /* 1 sub + 1 xor */
        r[0] =  (w[0] - j[0]);
        r[1] = -(w[1] - j[1]);
        r[2] =  (w[2] - j[2]);
        r[3] = -(w[3] - j[3]);
        r[4] =  (w[4] - j[4]);
        r[5] = -(w[5] - j[5]);
        r[6] =  (w[6] - j[6]);
        r[7] = -(w[7] - j[7]);

        /* Min: 2 subs, 2 adds, 2 vperm2fs (OPTIONAL) */
        m2[0].re = m0[0].re - t[0];
        m2[0].im = m0[0].im - t[1];
        m2[1].re = m0[1].re - t[2];
        m2[1].im = m0[1].im - t[3];
        m3[0].re = m0[2].re - t[4];
        m3[0].im = m0[2].im - t[5];
        m3[1].re = m0[3].re - t[6];
        m3[1].im = m0[3].im - t[7];

        m2[2].re = m1[0].re - r[0];
        m2[2].im = m1[0].im - r[1];
        m2[3].re = m1[1].re - r[2];
        m2[3].im = m1[1].im - r[3];
        m3[2].re = m1[2].re - r[4];
        m3[2].im = m1[2].im - r[5];
        m3[3].re = m1[3].re - r[6];
        m3[3].im = m1[3].im - r[7];

        m0[0].re = m0[0].re + t[0];
        m0[0].im = m0[0].im + t[1];
        m0[1].re = m0[1].re + t[2];
        m0[1].im = m0[1].im + t[3];
        m0[2].re = m0[2].re + t[4];
        m0[2].im = m0[2].im + t[5];
        m0[3].re = m0[3].re + t[6];
        m0[3].im = m0[3].im + t[7];

        m1[0].re = m1[0].re + r[0];
        m1[0].im = m1[0].im + r[1];
        m1[1].re = m1[1].re + r[2];
        m1[1].im = m1[1].im + r[3];
        m1[2].re = m1[2].re + r[4];
        m1[2].im = m1[2].im + r[5];
        m1[3].re = m1[3].re + r[6];
        m1[3].im = m1[3].im + r[7];
#if 0
        TRANSFORM(z[ 0 + 0], z[ 0 + 4], z[o2 + 0], z[o2 + 2], cos[0], wim[7]);
        TRANSFORM(z[ 0 + 1], z[ 0 + 5], z[o2 + 1], z[o2 + 3], cos[2], wim[5]);
        TRANSFORM(z[ 0 + 2], z[ 0 + 6], z[o2 + 4], z[o2 + 6], cos[4], wim[3]);
        TRANSFORM(z[ 0 + 3], z[ 0 + 7], z[o2 + 5], z[o2 + 7], cos[6], wim[1]);
        TRANSFORM(z[o1 + 0], z[o1 + 4], z[o3 + 0], z[o3 + 2], cos[1], wim[6]);
        TRANSFORM(z[o1 + 1], z[o1 + 5], z[o3 + 1], z[o3 + 3], cos[3], wim[4]);
        TRANSFORM(z[o1 + 2], z[o1 + 6], z[o3 + 4], z[o3 + 6], cos[5], wim[2]);
        TRANSFORM(z[o1 + 3], z[o1 + 7], z[o3 + 5], z[o3 + 7], cos[7], wim[0]);
#endif
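        /* The eight TRANSFORM calls above are the scalar reference the
         * unrolled code replaces: the first four use cos[0,2,4,6]/wim[7,5,3,1]
         * (the t1 = &cos[0], t2 = &wim[0] pass), the last four use
         * cos[1,3,5,7]/wim[6,4,2,0] (the t1 = &cos[1], t2 = &wim[-1] pass). */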
#if 0
        ff_fft32_float_fma3(s, temp_buf, temp_buf, 0);
        FFTComplex zztop[] = {
            m2[2], m2[3], m3[2], m3[3],
        };
        FFTComplex *out = (FFTComplex *)zztop;
        FFTComplex *temp = temp_buf;
        temp += 0; out += temp - temp_buf;
        FFTComplex *ptr = (void *)out;
        printf("\nIDX = 0re 0im 1re 1im 2re 2im 3re 3im\n"
               "REF = %f %f %f %f %f %f %f %f\n"
               "OUT = %f %f %f %f %f %f %f %f\n"
               "TST = %i %i %i %i %i %i %i %i\n\n",
               ptr[0].re, ptr[0].im, ptr[1].re, ptr[1].im,
               ptr[2].re, ptr[2].im, ptr[3].re, ptr[3].im,
               temp[0].re, temp[0].im, temp[1].re, temp[1].im,
               temp[2].re, temp[2].im, temp[3].re, temp[3].im,
               fabsf(ptr[0].re - temp[0].re) <= 3*FLT_EPSILON, fabsf(ptr[0].im - temp[0].im) <= 3*FLT_EPSILON,
               fabsf(ptr[1].re - temp[1].re) <= 3*FLT_EPSILON, fabsf(ptr[1].im - temp[1].im) <= 3*FLT_EPSILON,
               fabsf(ptr[2].re - temp[2].re) <= 3*FLT_EPSILON, fabsf(ptr[2].im - temp[2].im) <= 3*FLT_EPSILON,
               fabsf(ptr[3].re - temp[3].re) <= 3*FLT_EPSILON, fabsf(ptr[3].im - temp[3].im) <= 3*FLT_EPSILON);
#endif

        z   += 2*4;
        cos += 2*4;
        wim -= 2*4;
    }
}
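/*
 * For cross-checking, a minimal textbook form of the same recombination.
 * Illustrative sketch only: the function name, the four-separate-arrays
 * layout and the forward sign convention w = exp(-2*pi*i/N) are assumptions
 * of this sketch, not the in-place permuted layout used above.
 */
#include <complex.h>

static void split_radix_recombine_ref(float complex *z0, float complex *z1,
                                      float complex *z2, float complex *z3,
                                      int n4)
{
    const int N = 4 * n4;
    for (int k = 0; k < n4; k++) {
        const float ph = -6.283185307179586f * k / N; /* -2*pi*k/N          */
        float complex w1 = cexpf(I * ph);             /* w^k                */
        float complex w3 = cexpf(I * 3 * ph);         /* w^3k               */
        float complex a  = w1 * z2[k] + w3 * z3[k];   /* rotated sum        */
        float complex b  = w1 * z2[k] - w3 * z3[k];   /* rotated difference */
        float complex u0 = z0[k], u1 = z1[k];
        z0[k] = u0 + a;     /* X[k]        */
        z2[k] = u0 - a;     /* X[k + N/2]  */
        z1[k] = u1 - I * b; /* X[k + N/4]  */
        z3[k] = u1 + I * b; /* X[k + 3N/4] */
    }
}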
#if 0
; Combines m0...m7 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs.
; Uses all 16 registers.
; Output is slightly permuted such that tx2,3's coefficients are interleaved
; on a 2-point basis (see `doc/transforms.md`)
%macro SPLIT_RADIX_COMBINE 11
%if %11 && mmsize == 32
    vperm2f128 m12, %5, %6, 0x20 ; m2[0], m2[1], m3[0], m3[1] even
    vperm2f128 m14, %8, %7, 0x20 ; m2[0], m2[1], m3[0], m3[1] odd
    vperm2f128 m13, %5, %6, 0x31 ; m2[2], m2[3], m3[2], m3[3] even
    vperm2f128 m15, %8, %7, 0x31 ; m2[2], m2[3], m3[2], m3[3] odd
%endif

    shufps m10, %9, %9, q2200    ; cos00224466
    shufps m11, %10, %10, q1133  ; wim77553311
    movshdup %9, %9              ; cos11335577
    shufps %10, %10, q0022       ; wim66442200

%if %11 && mmsize == 32
    shufps %5, m12, m12, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
    shufps %7, m14, m14, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
    shufps %6, m13, m13, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
    shufps %8, m15, m15, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd

    ; reorder the multiplies to save movs reg, reg in the %if above
    mulps m12, m11               ; m2[0123]reim * wim7531 even
    mulps m14, %10               ; m2[0123]reim * wim7531 odd
    mulps m13, m11               ; m3[0123]reim * wim7531 even
    mulps m15, %10               ; m3[0123]reim * wim7531 odd
%else
    mulps m12, %5, m11           ; m2,3[01]reim * wim7531 even
    mulps m14, %7, %10           ; m2,3[01]reim * wim7531 odd
    mulps m13, %6, m11           ; m2,3[23]reim * wim7531 even
    mulps m15, %8, %10           ; m2,3[23]reim * wim7531 odd

    shufps %5, %5, q2301         ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps %7, %7, q2301         ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
    shufps %6, %6, q2301         ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
    shufps %8, %8, q2301         ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
%endif

%if cpuflag(fma3) ; 6 instructions saved through FMA!
    fmaddsubps %5, %5, m10, m12  ; w[0..8] even
    fmaddsubps %7, %7, %9, m14   ; w[0..8] odd
    fmsubaddps %6, %6, m10, m13  ; j[0..8] even
    fmsubaddps %8, %8, %9, m15   ; j[0..8] odd
    movaps m11, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
%else
    mulps %5, m10                ; m2,3[01]imre * cos0246 even
    mulps %7, %9                 ; m2,3[01]imre * cos1357 odd
    movaps m11, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
    mulps %8, %9                 ; m2,3[23]imre * cos1357 odd
    mulps %6, m10                ; m2,3[23]imre * cos0246 even
    addsubps %5, m12             ; w[0..8] even
    addsubps %7, m14             ; w[0..8] odd
    xorps m13, m11               ; +-m2,3[23]imre * wim7531 even
    xorps m15, m11               ; +-m2,3[23]imre * wim7531 odd
    addps %6, m13                ; j[0..8] even
    addps %8, m15                ; j[0..8] odd
%endif

    addps m12, %5, %6            ; t10235476 even
    addps m14, %7, %8            ; t10235476 odd
    subps m13, %5, %6            ; +-r[0..7] even
    subps m15, %7, %8            ; +-r[0..7] odd

    shufps m12, m12, q2301       ; t[0..7] even
    shufps m14, m14, q2301       ; t[0..7] odd
    xorps m13, m11               ; r[0..7] even
    xorps m15, m11               ; r[0..7] odd

    subps %5, %1, m12            ; m2,3[01] even
    subps %7, %3, m14            ; m2,3[01] odd
    subps %6, %2, m13            ; m2,3[23] even
    subps %8, %4, m15            ; m2,3[23] odd

    addps %1, m12                ; m0 even
    addps %3, m14                ; m0 odd
    addps %2, m13                ; m1 even
    addps %4, m15                ; m1 odd
%endmacro
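; Register roles track the C scalars above: m12/m14 hold the wim-products and
; then t[] (even/odd), m13/m15 hold the j-side products and then +-r[], while
; m10 holds the shuffled cos table and m11 the shuffled wim table (later
; reused for the +-+- sign mask).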
; Same as above, but does a single parity at a time, selected by whether the
; first argument is 0 or 1. Uses 3 temporary registers instead of 6.
%macro SPLIT_RADIX_COMBINE_HALF 10
%if %1
    shufps %8, %6, %6, q2200   ; cos00224466
    shufps %9, %7, %7, q1133   ; wim77553311
%else
    shufps %8, %6, %6, q3311   ; cos11335577
    shufps %9, %7, %7, q0022   ; wim66442200
%endif

    mulps %10, %4, %9          ; m2,3[01]reim * wim7531
    mulps %9, %5               ; m2,3[23]reim * wim7531
    shufps %4, %4, q2301       ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre
    shufps %5, %5, q2301       ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %10 ; w[0..8]
    fmsubaddps %5, %5, %8, %9  ; j[0..8]
    movaps %8, [mask_pmpmpmpm]
%else
    mulps %4, %8               ; m2,3[01]imre * cos
    mulps %5, %8               ; m2,3[23]imre * cos
    movaps %8, [mask_pmpmpmpm]
    addsubps %4, %10           ; w[0..8]
    xorps %9, %8               ; +-m2,3[23]imre * wim7531
    addps %5, %9               ; j[0..8]
%endif

    addps %10, %4, %5          ; t10235476
    subps %9, %4, %5           ; +-r[0..7]
    shufps %10, %10, q2301     ; t[0..7]
    xorps %9, %8               ; r[0..7]

    subps %4, %2, %10          ; m2,3[01]
    subps %5, %3, %9           ; m2,3[23]
    addps %2, %10              ; m0
    addps %3, %9               ; m1
%endmacro
#endif