Skip to content

Instantly share code, notes, and snippets.

@nihui
Last active May 2, 2024 07:24
Show Gist options
  • Save nihui/37d98b705a6a28911d77c502282b4748 to your computer and use it in GitHub Desktop.
Save nihui/37d98b705a6a28911d77c502282b4748 to your computer and use it in GitHub Desktop.
AVX-512 matrix transpose: 16x24, 16x16, 16x12, 16x8, 16x4, 16x2, 8x24, 8x16, 8x12, 8x8, 8x4, 8x2
// g++ -mfma -mf16c -mavx512f -mavx512vnni -mavx512vl
#include <immintrin.h>
#include <stdio.h>
// Write the 16 float lanes of a 512-bit vector to stderr on a single line,
// lowest lane first, each formatted as "%4.0f ".
static void print(const __m512& _x)
{
    // _mm512_store_ps needs a 64-byte aligned destination.
    __attribute__((aligned(64))) float lanes[16];
    _mm512_store_ps(lanes, _x);

    for (const float* p = lanes; p != lanes + 16; ++p)
    {
        fprintf(stderr, "%4.0f ", *p);
    }
    fprintf(stderr, "\n");
}
// Write the 8 float lanes of a 256-bit vector to stderr on a single line,
// lowest lane first, each formatted as "%4.0f ".
static void print(const __m256& _x)
{
    // _mm256_store_ps needs a 32-byte aligned destination.
    __attribute__((aligned(32))) float lanes[8];
    _mm256_store_ps(lanes, _x);

    for (const float* p = lanes; p != lanes + 8; ++p)
    {
        fprintf(stderr, "%4.0f ", *p);
    }
    fprintf(stderr, "\n");
}
// Write the 16 signed 16-bit lanes of a 256-bit integer vector to stderr on a
// single line, lowest lane first, each formatted as "%4d ".
static void print_epi16(const __m256i& _x)
{
    // _mm256_store_si256 needs a 32-byte aligned destination.
    __attribute__((aligned(32))) short lanes[16];
    _mm256_store_si256((__m256i*)lanes, _x);

    for (const short* p = lanes; p != lanes + 16; ++p)
    {
        fprintf(stderr, "%4d ", *p);
    }
    fprintf(stderr, "\n");
}
// Write the 8 signed 16-bit lanes of a 128-bit integer vector to stderr on a
// single line, lowest lane first, each formatted as "%4d ".
static void print_epi16(const __m128i& _x)
{
    // _mm_store_si128 needs a 16-byte aligned destination.
    __attribute__((aligned(16))) short lanes[8];
    _mm_store_si128((__m128i*)lanes, _x);

    for (const short* p = lanes; p != lanes + 8; ++p)
    {
        fprintf(stderr, "%4d ", *p);
    }
    fprintf(stderr, "\n");
}
// Demo: transpose a 24-row x 16-column float matrix held in 24 __m512
// registers. The result (16 rows of 24 floats) is packed row-major back into
// the same 24 registers, so each output row spans one and a half registers.
// Both the input and the transposed registers are dumped to stderr.
void transpose_16x24_ps()
{
    enum
    {
        NUM_ROWS = 24,
        NUM_QUADS = NUM_ROWS / 4, // groups of four input rows
        NUM_PAIRS = NUM_ROWS / 2
    };

    // Test pattern: element (row, col) holds row * 100 + col.
    __attribute__((aligned(64))) float a[NUM_ROWS][16];
    for (int row = 0; row < NUM_ROWS; row++)
    {
        for (int col = 0; col < 16; col++)
        {
            a[row][col] = row * 100 + col;
        }
    }

    __m512 r[NUM_ROWS];
    for (int k = 0; k < NUM_ROWS; k++)
    {
        r[k] = _mm512_load_ps(a[k]);
    }
    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Stage 1: 4x4 transpose inside every 128-bit lane of each group of four
    // rows. quad[NUM_QUADS * c + g], lane n, holds column 4n + c of rows
    // 4g .. 4g+3.
    __m512 quad[NUM_ROWS];
    for (int g = 0; g < NUM_QUADS; g++)
    {
        const __m512 lo01 = _mm512_unpacklo_ps(r[4 * g], r[4 * g + 1]);
        const __m512 hi01 = _mm512_unpackhi_ps(r[4 * g], r[4 * g + 1]);
        const __m512 lo23 = _mm512_unpacklo_ps(r[4 * g + 2], r[4 * g + 3]);
        const __m512 hi23 = _mm512_unpackhi_ps(r[4 * g + 2], r[4 * g + 3]);

        quad[NUM_QUADS * 0 + g] = _mm512_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
        quad[NUM_QUADS * 1 + g] = _mm512_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
        quad[NUM_QUADS * 2 + g] = _mm512_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
        quad[NUM_QUADS * 3 + g] = _mm512_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));
    }

    // Stage 2: merge neighbouring quads, keeping lanes 0 and 2 in the first
    // half of the array and lanes 1 and 3 in the second half.
    __m512 half[NUM_ROWS];
    for (int k = 0; k < NUM_PAIRS; k++)
    {
        half[k] = _mm512_shuffle_f32x4(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(2, 0, 2, 0));
        half[NUM_PAIRS + k] = _mm512_shuffle_f32x4(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(3, 1, 3, 1));
    }

    // Stage 3: the same even/odd lane split once more yields the final
    // row-major layout of the transposed matrix.
    for (int k = 0; k < NUM_PAIRS; k++)
    {
        r[k] = _mm512_shuffle_f32x4(half[2 * k], half[2 * k + 1], _MM_SHUFFLE(2, 0, 2, 0));
        r[NUM_PAIRS + k] = _mm512_shuffle_f32x4(half[2 * k], half[2 * k + 1], _MM_SHUFFLE(3, 1, 3, 1));
    }

    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: transpose a 16x16 float matrix held in 16 __m512 registers. After the
// shuffles, register k holds column k of the input. Both the input and the
// transposed registers are dumped to stderr.
void transpose_16x16_ps()
{
    enum
    {
        NUM_ROWS = 16,
        NUM_QUADS = NUM_ROWS / 4, // groups of four input rows
        NUM_PAIRS = NUM_ROWS / 2
    };

    // Test pattern: element (row, col) holds row * 100 + col.
    __attribute__((aligned(64))) float a[NUM_ROWS][16];
    for (int row = 0; row < NUM_ROWS; row++)
    {
        for (int col = 0; col < 16; col++)
        {
            a[row][col] = row * 100 + col;
        }
    }

    __m512 r[NUM_ROWS];
    for (int k = 0; k < NUM_ROWS; k++)
    {
        r[k] = _mm512_load_ps(a[k]);
    }
    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Stage 1: 4x4 transpose inside every 128-bit lane of each group of four
    // rows. quad[NUM_QUADS * c + g], lane n, holds column 4n + c of rows
    // 4g .. 4g+3.
    __m512 quad[NUM_ROWS];
    for (int g = 0; g < NUM_QUADS; g++)
    {
        const __m512 lo01 = _mm512_unpacklo_ps(r[4 * g], r[4 * g + 1]);
        const __m512 hi01 = _mm512_unpackhi_ps(r[4 * g], r[4 * g + 1]);
        const __m512 lo23 = _mm512_unpacklo_ps(r[4 * g + 2], r[4 * g + 3]);
        const __m512 hi23 = _mm512_unpackhi_ps(r[4 * g + 2], r[4 * g + 3]);

        quad[NUM_QUADS * 0 + g] = _mm512_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
        quad[NUM_QUADS * 1 + g] = _mm512_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
        quad[NUM_QUADS * 2 + g] = _mm512_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
        quad[NUM_QUADS * 3 + g] = _mm512_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));
    }

    // Stage 2: merge neighbouring quads, keeping lanes 0 and 2 in the first
    // half of the array and lanes 1 and 3 in the second half.
    __m512 half[NUM_ROWS];
    for (int k = 0; k < NUM_PAIRS; k++)
    {
        half[k] = _mm512_shuffle_f32x4(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(2, 0, 2, 0));
        half[NUM_PAIRS + k] = _mm512_shuffle_f32x4(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(3, 1, 3, 1));
    }

    // Stage 3: the same even/odd lane split once more yields one full input
    // column per register.
    for (int k = 0; k < NUM_PAIRS; k++)
    {
        r[k] = _mm512_shuffle_f32x4(half[2 * k], half[2 * k + 1], _MM_SHUFFLE(2, 0, 2, 0));
        r[NUM_PAIRS + k] = _mm512_shuffle_f32x4(half[2 * k], half[2 * k + 1], _MM_SHUFFLE(3, 1, 3, 1));
    }

    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: transpose a 12-row x 16-column float matrix held in 12 __m512
// registers. The result (16 rows of 12 floats) is packed row-major back into
// the same 12 registers, so three registers hold four output rows.
// Both the input and the transposed registers are dumped to stderr.
void transpose_16x12_ps()
{
    enum
    {
        NUM_ROWS = 12,
        NUM_QUADS = NUM_ROWS / 4, // groups of four input rows
        NUM_PAIRS = NUM_ROWS / 2
    };

    // Test pattern: element (row, col) holds row * 100 + col.
    __attribute__((aligned(64))) float a[NUM_ROWS][16];
    for (int row = 0; row < NUM_ROWS; row++)
    {
        for (int col = 0; col < 16; col++)
        {
            a[row][col] = row * 100 + col;
        }
    }

    __m512 r[NUM_ROWS];
    for (int k = 0; k < NUM_ROWS; k++)
    {
        r[k] = _mm512_load_ps(a[k]);
    }
    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Stage 1: 4x4 transpose inside every 128-bit lane of each group of four
    // rows. quad[NUM_QUADS * c + g], lane n, holds column 4n + c of rows
    // 4g .. 4g+3.
    __m512 quad[NUM_ROWS];
    for (int g = 0; g < NUM_QUADS; g++)
    {
        const __m512 lo01 = _mm512_unpacklo_ps(r[4 * g], r[4 * g + 1]);
        const __m512 hi01 = _mm512_unpackhi_ps(r[4 * g], r[4 * g + 1]);
        const __m512 lo23 = _mm512_unpacklo_ps(r[4 * g + 2], r[4 * g + 3]);
        const __m512 hi23 = _mm512_unpackhi_ps(r[4 * g + 2], r[4 * g + 3]);

        quad[NUM_QUADS * 0 + g] = _mm512_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
        quad[NUM_QUADS * 1 + g] = _mm512_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
        quad[NUM_QUADS * 2 + g] = _mm512_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
        quad[NUM_QUADS * 3 + g] = _mm512_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));
    }

    // Stage 2: merge neighbouring quads, keeping lanes 0 and 2 in the first
    // half of the array and lanes 1 and 3 in the second half. Because there
    // are three quads per column, a pair may straddle two adjacent columns.
    __m512 half[NUM_ROWS];
    for (int k = 0; k < NUM_PAIRS; k++)
    {
        half[k] = _mm512_shuffle_f32x4(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(2, 0, 2, 0));
        half[NUM_PAIRS + k] = _mm512_shuffle_f32x4(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(3, 1, 3, 1));
    }

    // Stage 3: the same even/odd lane split once more yields the final
    // row-major layout of the transposed matrix.
    for (int k = 0; k < NUM_PAIRS; k++)
    {
        r[k] = _mm512_shuffle_f32x4(half[2 * k], half[2 * k + 1], _MM_SHUFFLE(2, 0, 2, 0));
        r[NUM_PAIRS + k] = _mm512_shuffle_f32x4(half[2 * k], half[2 * k + 1], _MM_SHUFFLE(3, 1, 3, 1));
    }

    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: transpose an 8-row x 16-column float matrix held in 8 __m512
// registers. The result (16 rows of 8 floats) is packed row-major back into
// the same 8 registers, two output rows per register.
// Both the input and the transposed registers are dumped to stderr.
void transpose_16x8_ps()
{
    enum
    {
        NUM_ROWS = 8,
        NUM_QUADS = NUM_ROWS / 4, // groups of four input rows
        NUM_PAIRS = NUM_ROWS / 2
    };

    // Test pattern: element (row, col) holds row * 100 + col.
    __attribute__((aligned(64))) float a[NUM_ROWS][16];
    for (int row = 0; row < NUM_ROWS; row++)
    {
        for (int col = 0; col < 16; col++)
        {
            a[row][col] = row * 100 + col;
        }
    }

    __m512 r[NUM_ROWS];
    for (int k = 0; k < NUM_ROWS; k++)
    {
        r[k] = _mm512_load_ps(a[k]);
    }
    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Stage 1: 4x4 transpose inside every 128-bit lane of each group of four
    // rows. quad[NUM_QUADS * c + g], lane n, holds column 4n + c of rows
    // 4g .. 4g+3.
    __m512 quad[NUM_ROWS];
    for (int g = 0; g < NUM_QUADS; g++)
    {
        const __m512 lo01 = _mm512_unpacklo_ps(r[4 * g], r[4 * g + 1]);
        const __m512 hi01 = _mm512_unpackhi_ps(r[4 * g], r[4 * g + 1]);
        const __m512 lo23 = _mm512_unpacklo_ps(r[4 * g + 2], r[4 * g + 3]);
        const __m512 hi23 = _mm512_unpackhi_ps(r[4 * g + 2], r[4 * g + 3]);

        quad[NUM_QUADS * 0 + g] = _mm512_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
        quad[NUM_QUADS * 1 + g] = _mm512_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
        quad[NUM_QUADS * 2 + g] = _mm512_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
        quad[NUM_QUADS * 3 + g] = _mm512_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));
    }

    // Stage 2: merge the two quads of each column, keeping lanes 0 and 2 in
    // the first half of the array and lanes 1 and 3 in the second half.
    __m512 half[NUM_ROWS];
    for (int k = 0; k < NUM_PAIRS; k++)
    {
        half[k] = _mm512_shuffle_f32x4(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(2, 0, 2, 0));
        half[NUM_PAIRS + k] = _mm512_shuffle_f32x4(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(3, 1, 3, 1));
    }

    // Stage 3: the same even/odd lane split once more yields the final
    // row-major layout of the transposed matrix.
    for (int k = 0; k < NUM_PAIRS; k++)
    {
        r[k] = _mm512_shuffle_f32x4(half[2 * k], half[2 * k + 1], _MM_SHUFFLE(2, 0, 2, 0));
        r[NUM_PAIRS + k] = _mm512_shuffle_f32x4(half[2 * k], half[2 * k + 1], _MM_SHUFFLE(3, 1, 3, 1));
    }

    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: transpose a 4-row x 16-column float matrix held in 4 __m512
// registers. The result (16 rows of 4 floats) is packed row-major back into
// the same 4 registers, four output rows per register.
// Both the input and the transposed registers are dumped to stderr.
void transpose_16x4_ps()
{
    enum
    {
        NUM_ROWS = 4,
        NUM_PAIRS = NUM_ROWS / 2
    };

    // Test pattern: element (row, col) holds row * 100 + col.
    __attribute__((aligned(64))) float a[NUM_ROWS][16];
    for (int row = 0; row < NUM_ROWS; row++)
    {
        for (int col = 0; col < 16; col++)
        {
            a[row][col] = row * 100 + col;
        }
    }

    __m512 r[NUM_ROWS];
    for (int k = 0; k < NUM_ROWS; k++)
    {
        r[k] = _mm512_load_ps(a[k]);
    }
    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Stage 1: 4x4 transpose inside every 128-bit lane.
    // quad[c], lane n, holds column 4n + c of the four input rows.
    const __m512 lo01 = _mm512_unpacklo_ps(r[0], r[1]);
    const __m512 hi01 = _mm512_unpackhi_ps(r[0], r[1]);
    const __m512 lo23 = _mm512_unpacklo_ps(r[2], r[3]);
    const __m512 hi23 = _mm512_unpackhi_ps(r[2], r[3]);

    __m512 quad[NUM_ROWS];
    quad[0] = _mm512_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
    quad[1] = _mm512_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
    quad[2] = _mm512_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
    quad[3] = _mm512_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));

    // Stage 2: merge neighbouring quads, keeping lanes 0 and 2 in the first
    // half of the array and lanes 1 and 3 in the second half.
    __m512 half[NUM_ROWS];
    for (int k = 0; k < NUM_PAIRS; k++)
    {
        half[k] = _mm512_shuffle_f32x4(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(2, 0, 2, 0));
        half[NUM_PAIRS + k] = _mm512_shuffle_f32x4(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(3, 1, 3, 1));
    }

    // Stage 3: the same even/odd lane split once more puts the columns in
    // ascending order, four consecutive columns per register.
    for (int k = 0; k < NUM_PAIRS; k++)
    {
        r[k] = _mm512_shuffle_f32x4(half[2 * k], half[2 * k + 1], _MM_SHUFFLE(2, 0, 2, 0));
        r[NUM_PAIRS + k] = _mm512_shuffle_f32x4(half[2 * k], half[2 * k + 1], _MM_SHUFFLE(3, 1, 3, 1));
    }

    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: transpose a 2-row x 16-column float matrix held in 2 __m512
// registers. The result (16 rows of 2 floats) is packed row-major back into
// the same 2 registers, i.e. the two input rows end up interleaved.
// Both the input and the transposed registers are dumped to stderr.
void transpose_16x2_ps()
{
    // Test pattern: element (row, col) holds row * 100 + col.
    __attribute__((aligned(64))) float a[2][16];
    for (int row = 0; row < 2; row++)
    {
        for (int col = 0; col < 16; col++)
        {
            a[row][col] = row * 100 + col;
        }
    }

    __m512 top = _mm512_load_ps(a[0]);
    __m512 bottom = _mm512_load_ps(a[1]);

    print(top);
    print(bottom);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Interleave the two rows inside each 128-bit lane: pairs_lo holds
    // columns 4n and 4n+1 in lane n, pairs_hi holds columns 4n+2 and 4n+3.
    const __m512 pairs_lo = _mm512_unpacklo_ps(top, bottom);
    const __m512 pairs_hi = _mm512_unpackhi_ps(top, bottom);

    // Two rounds of even/odd 128-bit lane selection put the column pairs
    // into ascending order across the two registers.
    const __m512 even_lanes = _mm512_shuffle_f32x4(pairs_lo, pairs_hi, _MM_SHUFFLE(2, 0, 2, 0));
    const __m512 odd_lanes = _mm512_shuffle_f32x4(pairs_lo, pairs_hi, _MM_SHUFFLE(3, 1, 3, 1));

    top = _mm512_shuffle_f32x4(even_lanes, odd_lanes, _MM_SHUFFLE(2, 0, 2, 0));
    bottom = _mm512_shuffle_f32x4(even_lanes, odd_lanes, _MM_SHUFFLE(3, 1, 3, 1));

    print(top);
    print(bottom);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: transpose a 24-row x 8-column float matrix held in 24 __m256
// registers. The result (8 rows of 24 floats) is packed row-major back into
// the same 24 registers, three registers per output row.
// Both the input and the transposed registers are dumped to stderr.
void transpose_8x24_ps()
{
    enum
    {
        NUM_ROWS = 24,
        NUM_QUADS = NUM_ROWS / 4, // groups of four input rows
        NUM_PAIRS = NUM_ROWS / 2
    };

    // Test pattern: element (row, col) holds row * 100 + col.
    __attribute__((aligned(32))) float a[NUM_ROWS][8];
    for (int row = 0; row < NUM_ROWS; row++)
    {
        for (int col = 0; col < 8; col++)
        {
            a[row][col] = row * 100 + col;
        }
    }

    __m256 r[NUM_ROWS];
    for (int k = 0; k < NUM_ROWS; k++)
    {
        r[k] = _mm256_load_ps(a[k]);
    }
    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Stage 1: 4x4 transpose inside both 128-bit lanes of each group of four
    // rows. quad[NUM_QUADS * c + g], lane n, holds column 4n + c of rows
    // 4g .. 4g+3.
    __m256 quad[NUM_ROWS];
    for (int g = 0; g < NUM_QUADS; g++)
    {
        const __m256 lo01 = _mm256_unpacklo_ps(r[4 * g], r[4 * g + 1]);
        const __m256 hi01 = _mm256_unpackhi_ps(r[4 * g], r[4 * g + 1]);
        const __m256 lo23 = _mm256_unpacklo_ps(r[4 * g + 2], r[4 * g + 3]);
        const __m256 hi23 = _mm256_unpackhi_ps(r[4 * g + 2], r[4 * g + 3]);

        quad[NUM_QUADS * 0 + g] = _mm256_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
        quad[NUM_QUADS * 1 + g] = _mm256_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
        quad[NUM_QUADS * 2 + g] = _mm256_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
        quad[NUM_QUADS * 3 + g] = _mm256_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));
    }

    // Stage 2: join the low lanes of neighbouring quads (columns 0..3) into
    // the first half of r[], and the high lanes (columns 4..7) into the
    // second half.
    for (int k = 0; k < NUM_PAIRS; k++)
    {
        r[k] = _mm256_permute2f128_ps(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(0, 2, 0, 0));
        r[NUM_PAIRS + k] = _mm256_permute2f128_ps(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(0, 3, 0, 1));
    }

    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: transpose a 16-row x 8-column float matrix held in 16 __m256
// registers. The result (8 rows of 16 floats) is packed row-major back into
// the same 16 registers, two registers per output row.
// Both the input and the transposed registers are dumped to stderr.
void transpose_8x16_ps()
{
    enum
    {
        NUM_ROWS = 16,
        NUM_QUADS = NUM_ROWS / 4, // groups of four input rows
        NUM_PAIRS = NUM_ROWS / 2
    };

    // Test pattern: element (row, col) holds row * 100 + col.
    __attribute__((aligned(32))) float a[NUM_ROWS][8];
    for (int row = 0; row < NUM_ROWS; row++)
    {
        for (int col = 0; col < 8; col++)
        {
            a[row][col] = row * 100 + col;
        }
    }

    __m256 r[NUM_ROWS];
    for (int k = 0; k < NUM_ROWS; k++)
    {
        r[k] = _mm256_load_ps(a[k]);
    }
    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Stage 1: 4x4 transpose inside both 128-bit lanes of each group of four
    // rows. quad[NUM_QUADS * c + g], lane n, holds column 4n + c of rows
    // 4g .. 4g+3.
    __m256 quad[NUM_ROWS];
    for (int g = 0; g < NUM_QUADS; g++)
    {
        const __m256 lo01 = _mm256_unpacklo_ps(r[4 * g], r[4 * g + 1]);
        const __m256 hi01 = _mm256_unpackhi_ps(r[4 * g], r[4 * g + 1]);
        const __m256 lo23 = _mm256_unpacklo_ps(r[4 * g + 2], r[4 * g + 3]);
        const __m256 hi23 = _mm256_unpackhi_ps(r[4 * g + 2], r[4 * g + 3]);

        quad[NUM_QUADS * 0 + g] = _mm256_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
        quad[NUM_QUADS * 1 + g] = _mm256_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
        quad[NUM_QUADS * 2 + g] = _mm256_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
        quad[NUM_QUADS * 3 + g] = _mm256_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));
    }

    // Stage 2: join the low lanes of neighbouring quads (columns 0..3) into
    // the first half of r[], and the high lanes (columns 4..7) into the
    // second half.
    for (int k = 0; k < NUM_PAIRS; k++)
    {
        r[k] = _mm256_permute2f128_ps(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(0, 2, 0, 0));
        r[NUM_PAIRS + k] = _mm256_permute2f128_ps(quad[2 * k], quad[2 * k + 1], _MM_SHUFFLE(0, 3, 0, 1));
    }

    for (int k = 0; k < NUM_ROWS; k++)
    {
        print(r[k]);
    }
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: transpose a tile held as twelve 8-float vectors (a[12][8]).
//
// The tile is filled with a[i][j] = i * 100 + j so each value encodes its
// source row and column. The twelve input vectors are dumped with print(),
// rearranged so that, read back to back, they hold the 8x12 transpose in
// row-major order (result element (j, i) is a[i][j]), and dumped again.
void transpose_8x12_ps()
{
    __attribute__((aligned(32)))
    float a[12][8];

    for (int i = 0; i < 12; i++)
    {
        float* dst = a[i];
        for (int j = 0; j < 8; j++)
        {
            dst[j] = (float)(i * 100 + j);
        }
    }

    __m256 row[12];
    for (int i = 0; i < 12; i++)
    {
        row[i] = _mm256_load_ps(a[i]);
    }
    for (int i = 0; i < 12; i++)
    {
        print(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Step 1: interleave neighbouring rows element by element
    // (independently inside each 128-bit lane).
    __m256 pair[12];
    for (int k = 0; k < 6; k++)
    {
        const int even = 2 * k;
        const int odd = even + 1;
        pair[even] = _mm256_unpacklo_ps(row[even], row[odd]);
        pair[odd] = _mm256_unpackhi_ps(row[even], row[odd]);
    }

    // Step 2: for each group g of four rows, block[4 * g + c] carries
    // column c of those rows in its low lane and column c + 4 in its high lane.
    __m256 block[12];
    for (int g = 0; g < 3; g++)
    {
        const int base = 4 * g;
        block[base + 0] = _mm256_shuffle_ps(pair[base + 0], pair[base + 2], _MM_SHUFFLE(1, 0, 1, 0));
        block[base + 1] = _mm256_shuffle_ps(pair[base + 0], pair[base + 2], _MM_SHUFFLE(3, 2, 3, 2));
        block[base + 2] = _mm256_shuffle_ps(pair[base + 1], pair[base + 3], _MM_SHUFFLE(1, 0, 1, 0));
        block[base + 3] = _mm256_shuffle_ps(pair[base + 1], pair[base + 3], _MM_SHUFFLE(3, 2, 3, 2));
    }

    // Step 3: a transposed row is 12 floats, i.e. three 128-bit pieces taken
    // from block[c], block[c + 4], block[c + 8]. Walking columns 0..3 gives
    // the piece order 0,4,8,1,5,9,2,6,10,3,7,11, consumed two pieces per
    // output vector. Low lanes fill vectors 0..5, high lanes fill 6..11.
    static const int first_piece[6] = {0, 8, 5, 2, 10, 7};
    static const int second_piece[6] = {4, 1, 9, 6, 3, 11};
    for (int k = 0; k < 6; k++)
    {
        const __m256 x = block[first_piece[k]];
        const __m256 y = block[second_piece[k]];
        row[k] = _mm256_permute2f128_ps(x, y, _MM_SHUFFLE(0, 2, 0, 0));
        row[k + 6] = _mm256_permute2f128_ps(x, y, _MM_SHUFFLE(0, 3, 0, 1));
    }

    for (int i = 0; i < 12; i++)
    {
        print(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: in-register transpose of an 8x8 float matrix.
//
// The matrix is filled with a[i][j] = i * 100 + j, loaded as eight
// 8-float vectors and dumped with print(); the vectors are then transposed
// (vector j ends up holding column j of the input) and dumped again.
void transpose_8x8_ps()
{
    __attribute__((aligned(32)))
    float a[8][8];

    for (int i = 0; i < 8; i++)
    {
        float* dst = a[i];
        for (int j = 0; j < 8; j++)
        {
            dst[j] = (float)(i * 100 + j);
        }
    }

    __m256 row[8];
    for (int i = 0; i < 8; i++)
    {
        row[i] = _mm256_load_ps(a[i]);
    }
    for (int i = 0; i < 8; i++)
    {
        print(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Step 1: interleave neighbouring rows inside each 128-bit lane.
    __m256 pair[8];
    for (int k = 0; k < 4; k++)
    {
        const int even = 2 * k;
        const int odd = even + 1;
        pair[even] = _mm256_unpacklo_ps(row[even], row[odd]);
        pair[odd] = _mm256_unpackhi_ps(row[even], row[odd]);
    }

    // Step 2: block[4 * g + c] holds column c of rows 4g..4g+3 in its low
    // lane and column c + 4 of the same rows in its high lane.
    __m256 block[8];
    for (int g = 0; g < 2; g++)
    {
        const int base = 4 * g;
        block[base + 0] = _mm256_shuffle_ps(pair[base + 0], pair[base + 2], _MM_SHUFFLE(1, 0, 1, 0));
        block[base + 1] = _mm256_shuffle_ps(pair[base + 0], pair[base + 2], _MM_SHUFFLE(3, 2, 3, 2));
        block[base + 2] = _mm256_shuffle_ps(pair[base + 1], pair[base + 3], _MM_SHUFFLE(1, 0, 1, 0));
        block[base + 3] = _mm256_shuffle_ps(pair[base + 1], pair[base + 3], _MM_SHUFFLE(3, 2, 3, 2));
    }

    // Step 3: glue the upper-row and lower-row halves of each column
    // together; low lanes give columns 0..3, high lanes give columns 4..7.
    for (int c = 0; c < 4; c++)
    {
        const __m256 upper = block[c];
        const __m256 lower = block[c + 4];
        row[c] = _mm256_permute2f128_ps(upper, lower, _MM_SHUFFLE(0, 2, 0, 0));
        row[c + 4] = _mm256_permute2f128_ps(upper, lower, _MM_SHUFFLE(0, 3, 0, 1));
    }

    for (int i = 0; i < 8; i++)
    {
        print(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: transpose a tile held as four 8-float vectors (a[4][8]).
//
// The tile is filled with a[i][j] = i * 100 + j and dumped with print().
// After the shuffle network the four vectors, read back to back, hold the
// 8x4 transpose in row-major order (two 4-float columns of the input per
// vector), and they are dumped again.
void transpose_8x4_ps()
{
    __attribute__((aligned(32)))
    float a[4][8];

    for (int i = 0; i < 4; i++)
    {
        float* dst = a[i];
        for (int j = 0; j < 8; j++)
        {
            dst[j] = (float)(i * 100 + j);
        }
    }

    __m256 row[4];
    for (int i = 0; i < 4; i++)
    {
        row[i] = _mm256_load_ps(a[i]);
    }
    for (int i = 0; i < 4; i++)
    {
        print(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Step 1: interleave rows (0,1) and rows (2,3) inside each 128-bit lane.
    __m256 pair[4];
    for (int k = 0; k < 2; k++)
    {
        const int even = 2 * k;
        const int odd = even + 1;
        pair[even] = _mm256_unpacklo_ps(row[even], row[odd]);
        pair[odd] = _mm256_unpackhi_ps(row[even], row[odd]);
    }

    // Step 2: column[c] holds input column c in its low lane and input
    // column c + 4 in its high lane.
    __m256 column[4];
    column[0] = _mm256_shuffle_ps(pair[0], pair[2], _MM_SHUFFLE(1, 0, 1, 0));
    column[1] = _mm256_shuffle_ps(pair[0], pair[2], _MM_SHUFFLE(3, 2, 3, 2));
    column[2] = _mm256_shuffle_ps(pair[1], pair[3], _MM_SHUFFLE(1, 0, 1, 0));
    column[3] = _mm256_shuffle_ps(pair[1], pair[3], _MM_SHUFFLE(3, 2, 3, 2));

    // Step 3: pack two consecutive columns per output vector; low lanes
    // cover columns 0..3, high lanes cover columns 4..7.
    for (int k = 0; k < 2; k++)
    {
        const __m256 x = column[2 * k];
        const __m256 y = column[2 * k + 1];
        row[k] = _mm256_permute2f128_ps(x, y, _MM_SHUFFLE(0, 2, 0, 0));
        row[k + 2] = _mm256_permute2f128_ps(x, y, _MM_SHUFFLE(0, 3, 0, 1));
    }

    for (int i = 0; i < 4; i++)
    {
        print(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: transpose a tile held as two 8-float vectors (a[2][8]).
//
// Row 0 is filled with 0..7 and row 1 with 100..107 (a[i][j] = i * 100 + j).
// Both vectors are dumped with print(), interleaved so that together they
// hold the pairs (a[0][j], a[1][j]) for j = 0..7 in order, and dumped again.
void transpose_8x2_ps()
{
    __attribute__((aligned(32)))
    float a[2][8];

    for (int j = 0; j < 8; j++)
    {
        a[0][j] = (float)j;
        a[1][j] = (float)(100 + j);
    }

    __m256 top = _mm256_load_ps(a[0]);
    __m256 bottom = _mm256_load_ps(a[1]);

    print(top);
    print(bottom);

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Per 128-bit lane: mixed_lo pairs up elements 0,1 (low lane) and 4,5
    // (high lane); mixed_hi pairs up elements 2,3 and 6,7.
    const __m256 mixed_lo = _mm256_unpacklo_ps(top, bottom);
    const __m256 mixed_hi = _mm256_unpackhi_ps(top, bottom);

    // Low lanes give pairs 0..3, high lanes give pairs 4..7.
    top = _mm256_permute2f128_ps(mixed_lo, mixed_hi, _MM_SHUFFLE(0, 2, 0, 0));
    bottom = _mm256_permute2f128_ps(mixed_lo, mixed_hi, _MM_SHUFFLE(0, 3, 0, 1));

    print(top);
    print(bottom);

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: in-register transpose of a 16x16 matrix of 16-bit integers.
//
// The matrix is filled with a[i][j] = i * 100 + j, loaded as sixteen
// 256-bit vectors and dumped with print_epi16(); the vectors are then
// transposed (vector j ends up holding column j of the input) and dumped
// again.
void transpose_16x16_epi16()
{
    __attribute__((aligned(32)))
    short a[16][16];

    for (int i = 0; i < 16; i++)
    {
        short* dst = a[i];
        for (int j = 0; j < 16; j++)
        {
            dst[j] = (short)(i * 100 + j);
        }
    }

    __m256i row[16];
    for (int i = 0; i < 16; i++)
    {
        row[i] = _mm256_load_si256((const __m256i*)a[i]);
    }
    for (int i = 0; i < 16; i++)
    {
        print_epi16(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Step 1: interleave 16-bit elements of neighbouring rows
    // (independently inside each 128-bit lane).
    __m256i w16[16];
    for (int k = 0; k < 8; k++)
    {
        const int even = 2 * k;
        const int odd = even + 1;
        w16[even] = _mm256_unpacklo_epi16(row[even], row[odd]);
        w16[odd] = _mm256_unpackhi_epi16(row[even], row[odd]);
    }

    // Step 2: interleave 32-bit pairs, combining four rows per group g.
    __m256i w32[16];
    for (int g = 0; g < 4; g++)
    {
        const int base = 4 * g;
        w32[base + 0] = _mm256_unpacklo_epi32(w16[base + 0], w16[base + 2]);
        w32[base + 1] = _mm256_unpackhi_epi32(w16[base + 0], w16[base + 2]);
        w32[base + 2] = _mm256_unpacklo_epi32(w16[base + 1], w16[base + 3]);
        w32[base + 3] = _mm256_unpackhi_epi32(w16[base + 1], w16[base + 3]);
    }

    // Step 3: interleave 64-bit quads, combining eight rows per half h.
    // w64[8 * h + c] holds column c of rows 8h..8h+7 in its low lane and
    // column c + 8 of the same rows in its high lane.
    __m256i w64[16];
    for (int h = 0; h < 2; h++)
    {
        for (int c = 0; c < 4; c++)
        {
            const __m256i x = w32[8 * h + c];
            const __m256i y = w32[8 * h + c + 4];
            w64[8 * h + 2 * c] = _mm256_unpacklo_epi64(x, y);
            w64[8 * h + 2 * c + 1] = _mm256_unpackhi_epi64(x, y);
        }
    }

    // Step 4: join the rows 0..7 part with the rows 8..15 part of each
    // column; low lanes give columns 0..7, high lanes give columns 8..15.
    for (int c = 0; c < 8; c++)
    {
        const __m256i upper = w64[c];
        const __m256i lower = w64[c + 8];
        row[c] = _mm256_permute2x128_si256(upper, lower, _MM_SHUFFLE(0, 2, 0, 0));
        row[c + 8] = _mm256_permute2x128_si256(upper, lower, _MM_SHUFFLE(0, 3, 0, 1));
    }

    for (int i = 0; i < 16; i++)
    {
        print_epi16(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: transpose a tile held as sixteen 8-element vectors of 16-bit
// integers (a[16][8]).
//
// The tile is filled with a[i][j] = i * 100 + j, loaded as sixteen 128-bit
// vectors and dumped with print_epi16(). The result is the 8x16 transpose:
//  - with AVX2, eight 256-bit vectors, vector j holding input column j;
//  - otherwise, sixteen 128-bit vectors, vectors 2j and 2j + 1 holding the
//    first and second half of input column j.
// The result vectors are dumped as well.
void transpose_16x8_epi16()
{
    __attribute__((aligned(32)))
    short a[16][8];

    for (int i = 0; i < 16; i++)
    {
        short* dst = a[i];
        for (int j = 0; j < 8; j++)
        {
            dst[j] = (short)(i * 100 + j);
        }
    }

    __m128i row[16];
    for (int i = 0; i < 16; i++)
    {
        row[i] = _mm_load_si128((const __m128i*)a[i]);
    }
    for (int i = 0; i < 16; i++)
    {
        print_epi16(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

#if __AVX2__
    // Stack row k (low lane) on top of row k + 8 (high lane) so both halves
    // of the tile go through the same lane-local 8x8 network at once.
    __m256i wide[8];
    for (int k = 0; k < 8; k++)
    {
        wide[k] = _mm256_inserti128_si256(_mm256_castsi128_si256(row[k]), row[k + 8], 1);
    }

    // Step 1: interleave 16-bit elements of neighbouring stacked rows.
    __m256i w16[8];
    for (int k = 0; k < 4; k++)
    {
        const int even = 2 * k;
        const int odd = even + 1;
        w16[even] = _mm256_unpacklo_epi16(wide[even], wide[odd]);
        w16[odd] = _mm256_unpackhi_epi16(wide[even], wide[odd]);
    }

    // Step 2: interleave 32-bit pairs, combining four stacked rows per group.
    __m256i w32[8];
    for (int g = 0; g < 2; g++)
    {
        const int base = 4 * g;
        w32[base + 0] = _mm256_unpacklo_epi32(w16[base + 0], w16[base + 2]);
        w32[base + 1] = _mm256_unpackhi_epi32(w16[base + 0], w16[base + 2]);
        w32[base + 2] = _mm256_unpacklo_epi32(w16[base + 1], w16[base + 3]);
        w32[base + 3] = _mm256_unpackhi_epi32(w16[base + 1], w16[base + 3]);
    }

    // Step 3: interleave 64-bit quads; each result is one full input column
    // (rows 0..7 in the low lane, rows 8..15 in the high lane).
    for (int c = 0; c < 4; c++)
    {
        const __m256i x = w32[c];
        const __m256i y = w32[c + 4];
        wide[2 * c] = _mm256_unpacklo_epi64(x, y);
        wide[2 * c + 1] = _mm256_unpackhi_epi64(x, y);
    }

    for (int k = 0; k < 8; k++)
    {
        print_epi16(wide[k]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
#else // __AVX2__
    // Step 1: interleave 16-bit elements of neighbouring rows.
    __m128i w16[16];
    for (int k = 0; k < 8; k++)
    {
        const int even = 2 * k;
        const int odd = even + 1;
        w16[even] = _mm_unpacklo_epi16(row[even], row[odd]);
        w16[odd] = _mm_unpackhi_epi16(row[even], row[odd]);
    }

    // Step 2: interleave 32-bit pairs, combining four rows per group g.
    __m128i w32[16];
    for (int g = 0; g < 4; g++)
    {
        const int base = 4 * g;
        w32[base + 0] = _mm_unpacklo_epi32(w16[base + 0], w16[base + 2]);
        w32[base + 1] = _mm_unpackhi_epi32(w16[base + 0], w16[base + 2]);
        w32[base + 2] = _mm_unpacklo_epi32(w16[base + 1], w16[base + 3]);
        w32[base + 3] = _mm_unpackhi_epi32(w16[base + 1], w16[base + 3]);
    }

    // Step 3: interleave 64-bit quads. For each column pair index c the
    // rows 0..7 part comes from w32[c], w32[c + 4] and the rows 8..15 part
    // from w32[c + 8], w32[c + 12]; the two parts of a column are stored in
    // adjacent output vectors.
    for (int c = 0; c < 4; c++)
    {
        const __m128i upper_x = w32[c];
        const __m128i upper_y = w32[c + 4];
        const __m128i lower_x = w32[c + 8];
        const __m128i lower_y = w32[c + 12];
        row[4 * c + 0] = _mm_unpacklo_epi64(upper_x, upper_y);
        row[4 * c + 1] = _mm_unpacklo_epi64(lower_x, lower_y);
        row[4 * c + 2] = _mm_unpackhi_epi64(upper_x, upper_y);
        row[4 * c + 3] = _mm_unpackhi_epi64(lower_x, lower_y);
    }

    for (int i = 0; i < 16; i++)
    {
        print_epi16(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
#endif // __AVX2__
}
// Demo: transpose a tile held as eight 16-element vectors of 16-bit
// integers (a[8][16]).
//
// The tile is filled with a[i][j] = i * 100 + j, loaded as eight 256-bit
// vectors and dumped with print_epi16(). After the shuffle network the
// eight vectors, read back to back, hold the 16x8 transpose in row-major
// order (two 8-element input columns per vector), and are dumped again.
void transpose_8x16_epi16()
{
    __attribute__((aligned(32)))
    short a[8][16];

    for (int i = 0; i < 8; i++)
    {
        short* dst = a[i];
        for (int j = 0; j < 16; j++)
        {
            dst[j] = (short)(i * 100 + j);
        }
    }

    __m256i row[8];
    for (int i = 0; i < 8; i++)
    {
        row[i] = _mm256_load_si256((const __m256i*)a[i]);
    }
    for (int i = 0; i < 8; i++)
    {
        print_epi16(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Step 1: interleave 16-bit elements of neighbouring rows
    // (independently inside each 128-bit lane).
    __m256i w16[8];
    for (int k = 0; k < 4; k++)
    {
        const int even = 2 * k;
        const int odd = even + 1;
        w16[even] = _mm256_unpacklo_epi16(row[even], row[odd]);
        w16[odd] = _mm256_unpackhi_epi16(row[even], row[odd]);
    }

    // Step 2: interleave 32-bit pairs, combining four rows per group g.
    __m256i w32[8];
    for (int g = 0; g < 2; g++)
    {
        const int base = 4 * g;
        w32[base + 0] = _mm256_unpacklo_epi32(w16[base + 0], w16[base + 2]);
        w32[base + 1] = _mm256_unpackhi_epi32(w16[base + 0], w16[base + 2]);
        w32[base + 2] = _mm256_unpacklo_epi32(w16[base + 1], w16[base + 3]);
        w32[base + 3] = _mm256_unpackhi_epi32(w16[base + 1], w16[base + 3]);
    }

    // Step 3: interleave 64-bit quads; w64[c] holds input column c in its
    // low lane and input column c + 8 in its high lane.
    __m256i w64[8];
    for (int c = 0; c < 4; c++)
    {
        const __m256i x = w32[c];
        const __m256i y = w32[c + 4];
        w64[2 * c] = _mm256_unpacklo_epi64(x, y);
        w64[2 * c + 1] = _mm256_unpackhi_epi64(x, y);
    }

    // Step 4: pack two consecutive columns per output vector; low lanes
    // cover columns 0..7, high lanes cover columns 8..15.
    for (int k = 0; k < 4; k++)
    {
        const __m256i x = w64[2 * k];
        const __m256i y = w64[2 * k + 1];
        row[k] = _mm256_permute2x128_si256(x, y, _MM_SHUFFLE(0, 2, 0, 0));
        row[k + 4] = _mm256_permute2x128_si256(x, y, _MM_SHUFFLE(0, 3, 0, 1));
    }

    for (int i = 0; i < 8; i++)
    {
        print_epi16(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: in-register transpose of an 8x8 matrix of 16-bit integers.
//
// The matrix is filled with a[i][j] = i * 100 + j, loaded as eight 128-bit
// vectors and dumped with print_epi16(); the vectors are then transposed
// (vector j ends up holding column j of the input) and dumped again.
void transpose_8x8_epi16()
{
    __attribute__((aligned(16)))
    short a[8][8];

    for (int i = 0; i < 8; i++)
    {
        short* dst = a[i];
        for (int j = 0; j < 8; j++)
        {
            dst[j] = (short)(i * 100 + j);
        }
    }

    __m128i row[8];
    for (int i = 0; i < 8; i++)
    {
        row[i] = _mm_load_si128((const __m128i*)a[i]);
    }
    for (int i = 0; i < 8; i++)
    {
        print_epi16(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");

    // Step 1: interleave 16-bit elements of neighbouring rows.
    __m128i w16[8];
    for (int k = 0; k < 4; k++)
    {
        const int even = 2 * k;
        const int odd = even + 1;
        w16[even] = _mm_unpacklo_epi16(row[even], row[odd]);
        w16[odd] = _mm_unpackhi_epi16(row[even], row[odd]);
    }

    // Step 2: interleave 32-bit pairs, combining four rows per group g.
    __m128i w32[8];
    for (int g = 0; g < 2; g++)
    {
        const int base = 4 * g;
        w32[base + 0] = _mm_unpacklo_epi32(w16[base + 0], w16[base + 2]);
        w32[base + 1] = _mm_unpackhi_epi32(w16[base + 0], w16[base + 2]);
        w32[base + 2] = _mm_unpacklo_epi32(w16[base + 1], w16[base + 3]);
        w32[base + 3] = _mm_unpackhi_epi32(w16[base + 1], w16[base + 3]);
    }

    // Step 3: interleave 64-bit quads, joining rows 0..3 with rows 4..7 to
    // form complete columns.
    for (int c = 0; c < 4; c++)
    {
        const __m128i upper = w32[c];
        const __m128i lower = w32[c + 4];
        row[2 * c] = _mm_unpacklo_epi64(upper, lower);
        row[2 * c + 1] = _mm_unpackhi_epi64(upper, lower);
    }

    for (int i = 0; i < 8; i++)
    {
        print_epi16(row[i]);
    }

    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Entry point: writes a greeting to stderr, then runs the selected demo.
// Only the 16x8 epi16 transpose is enabled; uncomment any other call in the
// list below to run that demo as well.
int main()
{
    fputs("hello!\n", stderr);

    // Float demos (disabled):
    //   transpose_16x24_ps();
    //   transpose_16x16_ps();
    //   transpose_16x12_ps();
    //   transpose_16x8_ps();
    //   transpose_16x4_ps();
    //   transpose_16x2_ps();
    //   transpose_8x24_ps();
    //   transpose_8x16_ps();
    //   transpose_8x12_ps();
    //   transpose_8x8_ps();
    //   transpose_8x4_ps();
    //   transpose_8x2_ps();

    // 16-bit integer demos:
    //   transpose_16x16_epi16();
    transpose_16x8_epi16();
    //   transpose_8x16_epi16();
    //   transpose_8x8_epi16();

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment