Skip to content

Instantly share code, notes, and snippets.

@cyanreg
Created April 19, 2021 13:10
Show Gist options
  • Save cyanreg/50fe5ecf6bc5935490aeb189e1bb5d45 to your computer and use it in GitHub Desktop.
Save cyanreg/50fe5ecf6bc5935490aeb189e1bb5d45 to your computer and use it in GitHub Desktop.
FFmpeg transform debugging code
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define TX_FLOAT
#include "libavutil/tx_priv.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
void ff_fft2_float_sse3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
void ff_fft4_inv_float_sse2 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
void ff_fft4_fwd_float_sse2 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
void ff_fft8_float_sse3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
void ff_fft8_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
void ff_fft16_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
void ff_fft16_float_fma3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
void ff_fft32_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
void ff_fft32_float_fma3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
void ff_split_radix_fft_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
void ff_split_radix_fft_float_fma3(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
#include <float.h>
#define FFTSample av_unused FFTSample
extern const FFTSample ff_cos_16_float[];
extern const FFTSample ff_cos_32_float[];
extern const FFTSample ff_cos_64_float[];
extern const FFTSample ff_cos_128_float[];
extern const FFTSample ff_cos_256_float[];
extern const FFTSample ff_cos_512_float[];
#define BUTTERFLIES(a0,a1,a2,a3) \
do { \
r0=a0.re; \
i0=a0.im; \
r1=a1.re; \
i1=a1.im; \
BF(t3, t5, t5, t1); \
BF(a2.re, a0.re, r0, t5); \
BF(a3.im, a1.im, i1, t3); \
BF(t4, t6, t2, t6); \
BF(a3.re, a1.re, r1, t4); \
BF(a2.im, a0.im, i0, t6); \
} while (0)
#define TRANSFORM(a0,a1,a2,a3,wre,wim) \
do { \
CMUL(t1, t2, a2.re, a2.im, wre, -wim); \
CMUL(t5, t6, a3.re, a3.im, wre, wim); \
BUTTERFLIES(a0, a1, a2, a3); \
} while (0)
static void fft4(FFTComplex *z)
{
FFTSample t1, t2, t3, t4, t5, t6, t7, t8;
BF(t3, t1, z[0].re, z[1].re);
BF(t8, t6, z[3].re, z[2].re);
BF(z[2].re, z[0].re, t1, t6);
BF(t4, t2, z[0].im, z[1].im);
BF(t7, t5, z[2].im, z[3].im);
BF(z[3].im, z[1].im, t4, t8);
BF(z[3].re, z[1].re, t3, t7);
BF(z[2].im, z[0].im, t2, t5);
}
static void fft8(FFTComplex *z)
{
FFTSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
fft4(z);
BF(t1, z[5].re, z[4].re, -z[5].re);
BF(t2, z[5].im, z[4].im, -z[5].im);
BF(t5, z[7].re, z[6].re, -z[7].re);
BF(t6, z[7].im, z[6].im, -z[7].im);
BUTTERFLIES(z[0], z[2], z[4], z[6]);
TRANSFORM(z[1], z[3], z[5], z[7], RESCALE(M_SQRT1_2), RESCALE(M_SQRT1_2));
}
static void fft16(FFTComplex *z)
{
FFTSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
FFTSample cos_16_1 = TX_NAME(ff_cos_16)[1];
FFTSample cos_16_2 = TX_NAME(ff_cos_16)[2];
FFTSample cos_16_3 = TX_NAME(ff_cos_16)[3];
fft8(z + 0);
fft4(z + 8);
fft4(z + 12);
t1 = z[ 8].re;
t2 = z[ 8].im;
t5 = z[12].re;
t6 = z[12].im;
BUTTERFLIES(z[0], z[4], z[8], z[12]);
TRANSFORM(z[ 2], z[ 6], z[10], z[14], cos_16_2, cos_16_2);
TRANSFORM(z[ 1], z[ 5], z[ 9], z[13], cos_16_1, cos_16_3);
TRANSFORM(z[ 3], z[ 7], z[11], z[15], cos_16_3, cos_16_1);
}
static void split_radix_combine(FFTComplex *z, const FFTSample *cos, int n, int loops)
{
int o1 = 2*n;
int o2 = 4*n;
int o3 = 6*n;
const FFTSample *wim = cos + o1 - 7;
FFTSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
FFTComplex *oz = z + o3;
for (int i = 0; i < 4*loops; i += 4) {
TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);
TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
skip:
z += 2*4;
cos += 2*4;
wim -= 2*4;
}
}
int old_revtab[4096];
static void fft32(FFTComplex *z)
{
int n4 = 8;
fft16(z);
fft8(z+n4*2);
fft8(z+n4*3);
split_radix_combine(z, ff_cos_32_float, n4/2, 1);
}
static void fft64(FFTComplex *z, int combine)
{
int n4 = 16;
fft32(z);
fft16(z+n4*2);
fft16(z+n4*3);
split_radix_combine(z, ff_cos_64_float, n4/2, combine);
}
static void fft128(FFTComplex *z, int combine)
{
int n4 = 32;
fft64(z, 2);
fft32(z+n4*2);
fft32(z+n4*3);
split_radix_combine(z, ff_cos_128_float, n4/2, combine);
}
static void fft256(FFTComplex *z, int combine)
{
int n4 = 64;
fft128(z, 4);
fft64(z+n4*2, 2);
fft64(z+n4*3, 2);
split_radix_combine(z, ff_cos_256_float, n4/2, combine);
}
static void fft512(FFTComplex *z, int combine)
{
int n4 = 128;
fft256(z, 8);
fft128(z+n4*2, 4);
fft128(z+n4*3, 4);
split_radix_combine(z, ff_cos_512_float, n4/2, combine);
}
#define BF(x, y, a, b) \
do { \
x = (a) - (b); \
y = (a) + (b); \
} while (0)
#define BUTTERFLIES_MIX(a0,a1,a2,a3, P1, P2, P5, P6) \
do { \
r0=a0.re; \
i0=a0.im; \
r1=a1.re; \
i1=a1.im; \
BF(t3, P5, P5, P1); \
BF(a2.re, a0.re, r0, P5); \
BF(a3.im, a1.im, i1, t3); \
BF(t4, P6, P2, P6); \
BF(a3.re, a1.re, r1, t4); \
BF(a2.im, a0.im, i0, P6); \
} while (0)
#undef BUTTERFLIES
#define BUTTERFLIES(a0,a1,a2,a3) \
do { \
r0=a0.re; \
i0=a0.im; \
r1=a1.re; \
i1=a1.im; \
BF(q3, q5, q5, q1); \
BF(a2.re, a0.re, r0, q5); \
BF(a3.im, a1.im, i1, q3); \
BF(q4, q6, q2, q6); \
BF(a3.re, a1.re, r1, q4); \
BF(a2.im, a0.im, i0, q6); \
} while (0)
#undef TRANSFORM
#define TRANSFORM(a0,a1,a2,a3,wre,wim) \
do { \
CMUL(q1, q2, a2.re, a2.im, wre, -wim); \
CMUL(q5, q6, a3.re, a3.im, wre, wim); \
BUTTERFLIES(a0, a1, a2, a3); \
} while (0)
#define CMUL(dre, dim, are, aim, bre, bim) \
do { \
(dre) = (are) * (bre) - (aim) * (bim); \
(dim) = (are) * (bim) + (aim) * (bre); \
} while (0)
int64_t ff_aliens = 0;
static av_unused void recombine_new(AVTXContext *s, FFTComplex *z, const FFTSample *cos,
unsigned int n, FFTComplex *test, FFTComplex *ref)
{
const int o1 = 2*n;
const int o2 = 4*n;
const int o3 = 6*n;
const FFTSample *wim = cos + o1 - 7;
FFTSample q1, q2, q3, q4, q5, q6, r0, i0, r1, i1;
for (int i = 0; i < 8; i += 4) {
TRANSFORM(z[0 + 0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
TRANSFORM(z[0 + 1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[2], wim[5]);
TRANSFORM(z[0 + 2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[4], wim[3]);
TRANSFORM(z[0 + 3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[6], wim[1]);
TRANSFORM(z[0 + 8], z[o1 + 8], z[o2 + 8], z[o3 + 8], cos[1], wim[6]);
TRANSFORM(z[0 + 9], z[o1 + 9], z[o2 + 9], z[o3 + 9], cos[3], wim[4]);
TRANSFORM(z[0 + 10], z[o1 + 10], z[o2 + 10], z[o3 + 10], cos[5], wim[2]);
TRANSFORM(z[0 + 11], z[o1 + 11], z[o2 + 11], z[o3 + 11], cos[7], wim[0]);
z += 4;
cos += 2*4;
wim -= 2*4;
}
}
static av_unused void test_fft(AVTXContext *s, void *_out, void *_in,
ptrdiff_t stride)
{
FFTComplex *in = _in;
FFTComplex *out = _out;
FFTComplex *out_inter = av_mallocz(2 * s->m * sizeof(FFTComplex)), *out_inter_orig = out_inter;
FFTComplex *out_test = av_mallocz(2 * s->m * sizeof(FFTComplex)), *out_test_orig = out_test;
FFTComplex *out_main = av_mallocz(2 * s->m * sizeof(FFTComplex)), *out_main_orig = out_main;
FFTComplex *in_inter = av_mallocz(2 * s->m * sizeof(FFTComplex)), *in_inter_orig = in_inter;
FFTComplex *in_test = av_mallocz(2 * s->m * sizeof(FFTComplex)), *in_test_orig = in_test;
FFTComplex *in_main = av_mallocz(2 * s->m * sizeof(FFTComplex)), *in_main_orig = in_main;
for (int i = 0; i < s->m; i++) {
in_main[i] = in[i];
in_test[i] = in[i];
in_inter[i] = in[i];
out[i] = in[old_revtab[i]];
}
printf("\n");
if (s->m == 64)
fft64(out, 2);
else if (s->m == 32)
fft32(out);
else if (s->m == 128)
fft128(out, 4);
else if (s->m == 256)
fft256(out, 8);
else if (s->m == 512)
fft512(out, 16);
else {
}
ff_split_radix_fft_float_fma3(s, out_main, in_main, stride);
// recombine_new(s, out_main, ff_cos_128_float, 16, in_inter, out);
#if 1
FFTComplex *temp = out_main;
FFTComplex *ref = out + (temp - out_main);
int match;
#define MONTE 0
#if 0
int hay = -1;
int idx = 112;
float needle = ref[idx].re;
for (int i = 0; i < s->m; i++) {
int fr = fabsf(needle - temp[i].re) <= 32*FLT_EPSILON;
int fi = fabsf(needle - temp[i].im) <= 32*FLT_EPSILON;
if (fr || fi) {
hay = i;
break;
}
}
printf("z[%i] = %f %i\n", idx, needle, hay);
#endif
#if MONTE
int start = 96;
do {
FFTSample q1, q2, q3, q4, q5, q6, r0, i0, r1, i1;
int n = 16;
const int o1 = 2*n;
const int o2 = 4*n;
const int o3 = 6*n;
const FFTSample *cos = ff_cos_128_float;
const FFTSample *wim = cos + o1 - 7;
int r[6];
for (int i = 0; i < 6; i++) {
float val = (double)rand() / (double)RAND_MAX;
float val2 = val * s->m;
int lolval = lrintf(val2);
r[i] = lolval;
}
FFTComplex *z = temp + 0;
cos += 2*4*0;
wim -= 2*4*0;
FFTComplex backup[4] = {
z[0 + r[0]], z[0 + r[1]], z[0 + r[2]], z[0 + r[3]],
};
TRANSFORM(z[0 + r[0]], z[0 + r[1]], z[0 + r[2]], z[0 + r[3]], cos[r[4]], cos[r[5]]);
#endif
int found = 0;
int i = 0;
match = 0;
for (i = 0; i < s->m; i++)
{
int correct = 0;
int j = 0;
for (j = 0; j < s->m; j++)
{
int fr = fabsf(ref[i].re - temp[j].re) <= 32*FLT_EPSILON;
int fi = fabsf(ref[i].im - temp[j].im) <= 32*FLT_EPSILON;
correct += fr && fi;
found += correct;
if (fr && fi) {
match++;
if (correct > 0)
printf("%i %i\n", i, j);
break;
}
}
if (!correct) {
printf("%i nope\n", i);
}
}
#if MONTE
if (match > start) {
printf("Matches = { %i %i %i %i %i %i } %i\n", r[0], r[1], r[2], r[3], r[4], r[5], match);
start++;
} else {
z[0 + r[0]] = backup[0];
z[0 + r[1]] = backup[1];
z[0 + r[2]] = backup[2];
z[0 + r[3]] = backup[3];
}
} while (1);
#endif
for (int i = 0; i < s->m; i++) {
int fr = fabsf(0 - temp[i].re) <= 32*FLT_EPSILON;
int fi = fabsf(0 - temp[i].im) <= 32*FLT_EPSILON;
// if (fr && fi)
// printf("Quack at %i\n", i);
}
temp += 0;
ref = out + (temp - out_main);
printf("\nIDX = 0re 0im 1re 1im 2re 2im 3re 3im\n"
"REF = %f %f %f %f %f %f %f %f\n"
"OUT = %f %f %f %f %f %f %f %f\n"
"TST = %i %i %i %i %i %i %i %i\n"
"MCH = %i/%i, %li\n\n",
ref[0].re, ref[0].im, ref[1].re, ref[1].im,
ref[2].re, ref[2].im, ref[3].re, ref[3].im,
temp[0].re, temp[0].im, temp[1].re, temp[1].im,
temp[2].re, temp[2].im, temp[3].re, temp[3].im,
fabsf(ref[0].re - temp[0].re) <= 8*FLT_EPSILON, fabsf(ref[0].im - temp[0].im) <= 8*FLT_EPSILON,
fabsf(ref[1].re - temp[1].re) <= 8*FLT_EPSILON, fabsf(ref[1].im - temp[1].im) <= 8*FLT_EPSILON,
fabsf(ref[2].re - temp[2].re) <= 8*FLT_EPSILON, fabsf(ref[2].im - temp[2].im) <= 8*FLT_EPSILON,
fabsf(ref[3].re - temp[3].re) <= 8*FLT_EPSILON, fabsf(ref[3].im - temp[3].im) <= 8*FLT_EPSILON,
match, s->m, ff_aliens);
#endif
// memcpy(out, orig, s->m*sizeof(*test));
av_free(in_inter_orig);
av_free(in_test_orig);
av_free(in_main_orig);
av_free(out_inter_orig);
av_free(out_test_orig);
av_free(out_main_orig);
}
av_cold void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx)
{
int cpu_flags = av_get_cpu_flags();
int gen_revtab = 0, basis, revtab_interleave;
if (s->flags & AV_TX_UNALIGNED)
return;
if (ff_tx_type_is_mdct(s->type))
return;
#define TXFN(fn, gentab, sr_basis, interleave) \
do { \
*tx = fn; \
gen_revtab = gentab; \
basis = sr_basis; \
revtab_interleave = interleave; \
} while (0)
if (s->n == 1) {
if (EXTERNAL_SSE2(cpu_flags)) {
if (s->m == 4 && s->inv)
TXFN(ff_fft4_inv_float_sse2, 0, 0, 0);
else if (s->m == 4)
TXFN(ff_fft4_fwd_float_sse2, 0, 0, 0);
}
if (EXTERNAL_SSE3(cpu_flags)) {
if (s->m == 2)
TXFN(ff_fft2_float_sse3, 0, 0, 0);
else if (s->m == 8)
TXFN(ff_fft8_float_sse3, 1, 8, 0);
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
if (s->m == 8)
TXFN(ff_fft8_float_avx, 1, 8, 0);
else if (s->m == 16)
TXFN(ff_fft16_float_avx, 1, 8, 2);
#if ARCH_X86_64
else if (s->m == 32)
TXFN(ff_fft32_float_avx, 1, 8, 2);
// else if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE))
// TXFN(ff_split_radix_fft_float_avx, 1, 8, 2);
#endif
}
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
if (s->m == 16)
TXFN(ff_fft16_float_fma3, 1, 8, 2);
#if ARCH_X86_64
else if (s->m == 32)
TXFN(ff_fft32_float_fma3, 1, 8, 2);
else if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE))
TXFN(ff_split_radix_fft_float_fma3, 1, 8, 2);
#endif
}
}
// if (s->n == 1)
// TXFN(test_fft, 1, 8, 2);
// if (s->revtab)
// memcpy(old_revtab, s->revtab, s->m*sizeof(*s->revtab));
if (gen_revtab)
ff_tx_gen_split_radix_parity_revtab(s->revtab, s->m, s->inv, basis,
revtab_interleave);
#undef TXFN
#if 0
int *revtab = av_malloc(20*s->m*sizeof(*revtab));
memcpy(revtab, s->revtab, s->m*sizeof(*revtab));
memset(s->revtab, 0, s->m * sizeof(*s->revtab));
revtab_avx(s->revtab, s->m, s->inv, 0, s->m);
for (int i = 0; i < s->m; i++) {
printf("Remap entry [%i]: %i -> %i\n", i, revtab[i], s->revtab[i]);
}
av_free(revtab);
#endif
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment