Last active
May 2, 2024 07:24
-
-
Save nihui/37d98b705a6a28911d77c502282b4748 to your computer and use it in GitHub Desktop.
avx512 16x24 16x16 16x12 16x8 16x4 16x2 8x24 8x16 8x12 8x8 8x4 8x2 matrix transpose
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// g++ -mfma -mf16c -mavx512f -mavx512vnni -mavx512vl | |
#include <immintrin.h> | |
#include <stdio.h> | |
// Debug helper: dump the 16 float lanes of an AVX-512 vector to stderr,
// lane 0 first, followed by a newline.
static void print(const __m512& _x)
{
    float lanes[16] __attribute__((aligned(64)));
    _mm512_store_ps(lanes, _x);
    for (int lane = 0; lane < 16; lane++)
        fprintf(stderr, "%4.0f ", lanes[lane]);
    fprintf(stderr, "\n");
}
// Debug helper: dump the 8 float lanes of an AVX vector to stderr,
// lane 0 first, followed by a newline.
static void print(const __m256& _x)
{
    float lanes[8] __attribute__((aligned(32)));
    _mm256_store_ps(lanes, _x);
    for (int lane = 0; lane < 8; lane++)
        fprintf(stderr, "%4.0f ", lanes[lane]);
    fprintf(stderr, "\n");
}
// Debug helper: dump the 16 signed 16-bit lanes of a 256-bit integer
// vector to stderr, lane 0 first, followed by a newline.
static void print_epi16(const __m256i& _x)
{
    short lanes[16] __attribute__((aligned(32)));
    _mm256_store_si256((__m256i*)lanes, _x);
    for (int lane = 0; lane < 16; lane++)
        fprintf(stderr, "%4d ", lanes[lane]);
    fprintf(stderr, "\n");
}
// Debug helper: dump the 8 signed 16-bit lanes of a 128-bit integer
// vector to stderr, lane 0 first, followed by a newline.
static void print_epi16(const __m128i& _x)
{
    short lanes[8] __attribute__((aligned(16)));
    _mm_store_si128((__m128i*)lanes, _x);
    for (int lane = 0; lane < 8; lane++)
        fprintf(stderr, "%4d ", lanes[lane]);
    fprintf(stderr, "\n");
}
// Demo: in-register transpose of a 24x16 float matrix using AVX-512.
// Fills a[i][j] = i*100 + j (value encodes its source row/column so the
// transpose is easy to eyeball), prints the 24 input vectors, transposes
// them entirely in zmm registers, then prints the 24 output vectors.
// After the transpose each output register holds 16 consecutive elements
// of the 16x24 result in row-major order.
// NOTE(review): the shuffle network below is order-critical — the exact
// register pairings and _MM_SHUFFLE immediates encode the permutation.
void transpose_16x24_ps()
{
    __attribute__((aligned(64)))
    float a[24][16];
    for (int i = 0; i < 24; i++)
    {
        for (int j = 0; j < 16; j++)
        {
            a[i][j] = i * 100 + j;
        }
    }
    // Load each 16-float row into one zmm register: rows 0..23 -> _r0.._rn.
    __m512 _r0 = _mm512_load_ps(a[0]);
    __m512 _r1 = _mm512_load_ps(a[1]);
    __m512 _r2 = _mm512_load_ps(a[2]);
    __m512 _r3 = _mm512_load_ps(a[3]);
    __m512 _r4 = _mm512_load_ps(a[4]);
    __m512 _r5 = _mm512_load_ps(a[5]);
    __m512 _r6 = _mm512_load_ps(a[6]);
    __m512 _r7 = _mm512_load_ps(a[7]);
    __m512 _r8 = _mm512_load_ps(a[8]);
    __m512 _r9 = _mm512_load_ps(a[9]);
    __m512 _ra = _mm512_load_ps(a[10]);
    __m512 _rb = _mm512_load_ps(a[11]);
    __m512 _rc = _mm512_load_ps(a[12]);
    __m512 _rd = _mm512_load_ps(a[13]);
    __m512 _re = _mm512_load_ps(a[14]);
    __m512 _rf = _mm512_load_ps(a[15]);
    __m512 _rg = _mm512_load_ps(a[16]);
    __m512 _rh = _mm512_load_ps(a[17]);
    __m512 _ri = _mm512_load_ps(a[18]);
    __m512 _rj = _mm512_load_ps(a[19]);
    __m512 _rk = _mm512_load_ps(a[20]);
    __m512 _rl = _mm512_load_ps(a[21]);
    __m512 _rm = _mm512_load_ps(a[22]);
    __m512 _rn = _mm512_load_ps(a[23]);
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    print(_r8);
    print(_r9);
    print(_ra);
    print(_rb);
    print(_rc);
    print(_rd);
    print(_re);
    print(_rf);
    print(_rg);
    print(_rh);
    print(_ri);
    print(_rj);
    print(_rk);
    print(_rl);
    print(_rm);
    print(_rn);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
    // Stage 1: interleave adjacent row pairs at 32-bit granularity within
    // each 128-bit lane (unpacklo takes elements 0,1 of each lane,
    // unpackhi elements 2,3).
    __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
    __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);
    __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3);
    __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3);
    __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5);
    __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5);
    __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7);
    __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7);
    __m512 _tmp8 = _mm512_unpacklo_ps(_r8, _r9);
    __m512 _tmp9 = _mm512_unpackhi_ps(_r8, _r9);
    __m512 _tmpa = _mm512_unpacklo_ps(_ra, _rb);
    __m512 _tmpb = _mm512_unpackhi_ps(_ra, _rb);
    __m512 _tmpc = _mm512_unpacklo_ps(_rc, _rd);
    __m512 _tmpd = _mm512_unpackhi_ps(_rc, _rd);
    __m512 _tmpe = _mm512_unpacklo_ps(_re, _rf);
    __m512 _tmpf = _mm512_unpackhi_ps(_re, _rf);
    __m512 _tmpg = _mm512_unpacklo_ps(_rg, _rh);
    __m512 _tmph = _mm512_unpackhi_ps(_rg, _rh);
    __m512 _tmpi = _mm512_unpacklo_ps(_ri, _rj);
    __m512 _tmpj = _mm512_unpackhi_ps(_ri, _rj);
    __m512 _tmpk = _mm512_unpacklo_ps(_rk, _rl);
    __m512 _tmpl = _mm512_unpackhi_ps(_rk, _rl);
    __m512 _tmpm = _mm512_unpacklo_ps(_rm, _rn);
    __m512 _tmpn = _mm512_unpackhi_ps(_rm, _rn);
    // Stage 2: merge the interleaved pairs so every 128-bit lane holds
    // four elements of a single output row (4x4 transpose within lanes).
    __m512 _tmpo = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpp = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpq = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpr = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmps = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpt = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpu = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpv = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpw = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpx = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpy = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpz = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpA = _mm512_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpB = _mm512_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpC = _mm512_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpD = _mm512_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpE = _mm512_shuffle_ps(_tmpg, _tmpi, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpF = _mm512_shuffle_ps(_tmpg, _tmpi, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpG = _mm512_shuffle_ps(_tmph, _tmpj, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpH = _mm512_shuffle_ps(_tmph, _tmpj, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpI = _mm512_shuffle_ps(_tmpk, _tmpm, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpJ = _mm512_shuffle_ps(_tmpk, _tmpm, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpK = _mm512_shuffle_ps(_tmpl, _tmpn, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpL = _mm512_shuffle_ps(_tmpl, _tmpn, _MM_SHUFFLE(3, 2, 3, 2));
    // Stage 3: gather matching 128-bit lanes across register pairs —
    // _MM_SHUFFLE(2,0,2,0) picks the even lanes, (3,1,3,1) the odd lanes.
    _tmp0 = _mm512_shuffle_f32x4(_tmpo, _tmps, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp1 = _mm512_shuffle_f32x4(_tmpw, _tmpA, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp2 = _mm512_shuffle_f32x4(_tmpE, _tmpI, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp3 = _mm512_shuffle_f32x4(_tmpp, _tmpt, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp4 = _mm512_shuffle_f32x4(_tmpx, _tmpB, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp5 = _mm512_shuffle_f32x4(_tmpF, _tmpJ, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp6 = _mm512_shuffle_f32x4(_tmpq, _tmpu, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp7 = _mm512_shuffle_f32x4(_tmpy, _tmpC, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp8 = _mm512_shuffle_f32x4(_tmpG, _tmpK, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp9 = _mm512_shuffle_f32x4(_tmpr, _tmpv, _MM_SHUFFLE(2, 0, 2, 0));
    _tmpa = _mm512_shuffle_f32x4(_tmpz, _tmpD, _MM_SHUFFLE(2, 0, 2, 0));
    _tmpb = _mm512_shuffle_f32x4(_tmpH, _tmpL, _MM_SHUFFLE(2, 0, 2, 0));
    _tmpc = _mm512_shuffle_f32x4(_tmpo, _tmps, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpd = _mm512_shuffle_f32x4(_tmpw, _tmpA, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpe = _mm512_shuffle_f32x4(_tmpE, _tmpI, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpf = _mm512_shuffle_f32x4(_tmpp, _tmpt, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpg = _mm512_shuffle_f32x4(_tmpx, _tmpB, _MM_SHUFFLE(3, 1, 3, 1));
    _tmph = _mm512_shuffle_f32x4(_tmpF, _tmpJ, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpi = _mm512_shuffle_f32x4(_tmpq, _tmpu, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpj = _mm512_shuffle_f32x4(_tmpy, _tmpC, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpk = _mm512_shuffle_f32x4(_tmpG, _tmpK, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpl = _mm512_shuffle_f32x4(_tmpr, _tmpv, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpm = _mm512_shuffle_f32x4(_tmpz, _tmpD, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpn = _mm512_shuffle_f32x4(_tmpH, _tmpL, _MM_SHUFFLE(3, 1, 3, 1));
    // Stage 4: a second lane gather completes the transpose; each _r now
    // holds one 16-element run of the transposed matrix.
    _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
    _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
    _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0));
    _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0));
    _r4 = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(2, 0, 2, 0));
    _r5 = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(2, 0, 2, 0));
    _r6 = _mm512_shuffle_f32x4(_tmpc, _tmpd, _MM_SHUFFLE(2, 0, 2, 0));
    _r7 = _mm512_shuffle_f32x4(_tmpe, _tmpf, _MM_SHUFFLE(2, 0, 2, 0));
    _r8 = _mm512_shuffle_f32x4(_tmpg, _tmph, _MM_SHUFFLE(2, 0, 2, 0));
    _r9 = _mm512_shuffle_f32x4(_tmpi, _tmpj, _MM_SHUFFLE(2, 0, 2, 0));
    _ra = _mm512_shuffle_f32x4(_tmpk, _tmpl, _MM_SHUFFLE(2, 0, 2, 0));
    _rb = _mm512_shuffle_f32x4(_tmpm, _tmpn, _MM_SHUFFLE(2, 0, 2, 0));
    _rc = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    _rd = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
    _re = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1));
    _rf = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1));
    _rg = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(3, 1, 3, 1));
    _rh = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(3, 1, 3, 1));
    _ri = _mm512_shuffle_f32x4(_tmpc, _tmpd, _MM_SHUFFLE(3, 1, 3, 1));
    _rj = _mm512_shuffle_f32x4(_tmpe, _tmpf, _MM_SHUFFLE(3, 1, 3, 1));
    _rk = _mm512_shuffle_f32x4(_tmpg, _tmph, _MM_SHUFFLE(3, 1, 3, 1));
    _rl = _mm512_shuffle_f32x4(_tmpi, _tmpj, _MM_SHUFFLE(3, 1, 3, 1));
    _rm = _mm512_shuffle_f32x4(_tmpk, _tmpl, _MM_SHUFFLE(3, 1, 3, 1));
    _rn = _mm512_shuffle_f32x4(_tmpm, _tmpn, _MM_SHUFFLE(3, 1, 3, 1));
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    print(_r8);
    print(_r9);
    print(_ra);
    print(_rb);
    print(_rc);
    print(_rd);
    print(_re);
    print(_rf);
    print(_rg);
    print(_rh);
    print(_ri);
    print(_rj);
    print(_rk);
    print(_rl);
    print(_rm);
    print(_rn);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: in-register 16x16 float matrix transpose using AVX-512.
// Fills a[i][j] = i*100 + j, prints the 16 input rows, transposes them in
// zmm registers (classic 4-stage network: unpack, shuffle_ps, then two
// shuffle_f32x4 lane stages), and prints the 16 transposed rows.
// NOTE(review): register pairings and immediates are order-critical.
void transpose_16x16_ps()
{
    __attribute__((aligned(64)))
    float a[16][16];
    for (int i = 0; i < 16; i++)
    {
        for (int j = 0; j < 16; j++)
        {
            a[i][j] = i * 100 + j; // value encodes (row, column)
        }
    }
    // One zmm register per 16-float row.
    __m512 _r0 = _mm512_load_ps(a[0]);
    __m512 _r1 = _mm512_load_ps(a[1]);
    __m512 _r2 = _mm512_load_ps(a[2]);
    __m512 _r3 = _mm512_load_ps(a[3]);
    __m512 _r4 = _mm512_load_ps(a[4]);
    __m512 _r5 = _mm512_load_ps(a[5]);
    __m512 _r6 = _mm512_load_ps(a[6]);
    __m512 _r7 = _mm512_load_ps(a[7]);
    __m512 _r8 = _mm512_load_ps(a[8]);
    __m512 _r9 = _mm512_load_ps(a[9]);
    __m512 _ra = _mm512_load_ps(a[10]);
    __m512 _rb = _mm512_load_ps(a[11]);
    __m512 _rc = _mm512_load_ps(a[12]);
    __m512 _rd = _mm512_load_ps(a[13]);
    __m512 _re = _mm512_load_ps(a[14]);
    __m512 _rf = _mm512_load_ps(a[15]);
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    print(_r8);
    print(_r9);
    print(_ra);
    print(_rb);
    print(_rc);
    print(_rd);
    print(_re);
    print(_rf);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
    // Stage 1: interleave adjacent row pairs within each 128-bit lane.
    __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
    __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);
    __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3);
    __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3);
    __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5);
    __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5);
    __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7);
    __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7);
    __m512 _tmp8 = _mm512_unpacklo_ps(_r8, _r9);
    __m512 _tmp9 = _mm512_unpackhi_ps(_r8, _r9);
    __m512 _tmpa = _mm512_unpacklo_ps(_ra, _rb);
    __m512 _tmpb = _mm512_unpackhi_ps(_ra, _rb);
    __m512 _tmpc = _mm512_unpacklo_ps(_rc, _rd);
    __m512 _tmpd = _mm512_unpackhi_ps(_rc, _rd);
    __m512 _tmpe = _mm512_unpacklo_ps(_re, _rf);
    __m512 _tmpf = _mm512_unpackhi_ps(_re, _rf);
    // Stage 2: complete the 4x4 transpose inside each 128-bit lane.
    __m512 _tmpg = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmph = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpi = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpj = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpk = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpl = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpm = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpn = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpo = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpp = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpq = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpr = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmps = _mm512_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpt = _mm512_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpu = _mm512_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpv = _mm512_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(3, 2, 3, 2));
    // Stage 3: gather even (2,0,2,0) / odd (3,1,3,1) 128-bit lanes.
    _tmp0 = _mm512_shuffle_f32x4(_tmpg, _tmpk, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp1 = _mm512_shuffle_f32x4(_tmpo, _tmps, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp2 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp3 = _mm512_shuffle_f32x4(_tmpp, _tmpt, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp4 = _mm512_shuffle_f32x4(_tmpi, _tmpm, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp5 = _mm512_shuffle_f32x4(_tmpq, _tmpu, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp6 = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp7 = _mm512_shuffle_f32x4(_tmpr, _tmpv, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp8 = _mm512_shuffle_f32x4(_tmpg, _tmpk, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp9 = _mm512_shuffle_f32x4(_tmpo, _tmps, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpa = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpb = _mm512_shuffle_f32x4(_tmpp, _tmpt, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpc = _mm512_shuffle_f32x4(_tmpi, _tmpm, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpd = _mm512_shuffle_f32x4(_tmpq, _tmpu, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpe = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpf = _mm512_shuffle_f32x4(_tmpr, _tmpv, _MM_SHUFFLE(3, 1, 3, 1));
    // Stage 4: second lane gather yields the transposed rows.
    _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
    _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
    _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0));
    _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0));
    _r4 = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(2, 0, 2, 0));
    _r5 = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(2, 0, 2, 0));
    _r6 = _mm512_shuffle_f32x4(_tmpc, _tmpd, _MM_SHUFFLE(2, 0, 2, 0));
    _r7 = _mm512_shuffle_f32x4(_tmpe, _tmpf, _MM_SHUFFLE(2, 0, 2, 0));
    _r8 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    _r9 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
    _ra = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1));
    _rb = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1));
    _rc = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(3, 1, 3, 1));
    _rd = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(3, 1, 3, 1));
    _re = _mm512_shuffle_f32x4(_tmpc, _tmpd, _MM_SHUFFLE(3, 1, 3, 1));
    _rf = _mm512_shuffle_f32x4(_tmpe, _tmpf, _MM_SHUFFLE(3, 1, 3, 1));
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    print(_r8);
    print(_r9);
    print(_ra);
    print(_rb);
    print(_rc);
    print(_rd);
    print(_re);
    print(_rf);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: in-register transpose of a 12x16 float matrix using AVX-512.
// Fills a[i][j] = i*100 + j, prints the 12 input rows, transposes in zmm
// registers, then prints 12 output registers (each holding 16 consecutive
// elements of the 16x12 result).
// NOTE(review): the stage-3 pairings below are deliberately asymmetric
// (e.g. (_tmpk,_tmpd), (_tmpm,_tmpf)) — do not "normalize" them.
void transpose_16x12_ps()
{
    __attribute__((aligned(64)))
    float a[12][16];
    for (int i = 0; i < 12; i++)
    {
        for (int j = 0; j < 16; j++)
        {
            a[i][j] = i * 100 + j; // value encodes (row, column)
        }
    }
    // One zmm register per 16-float row.
    __m512 _r0 = _mm512_load_ps(a[0]);
    __m512 _r1 = _mm512_load_ps(a[1]);
    __m512 _r2 = _mm512_load_ps(a[2]);
    __m512 _r3 = _mm512_load_ps(a[3]);
    __m512 _r4 = _mm512_load_ps(a[4]);
    __m512 _r5 = _mm512_load_ps(a[5]);
    __m512 _r6 = _mm512_load_ps(a[6]);
    __m512 _r7 = _mm512_load_ps(a[7]);
    __m512 _r8 = _mm512_load_ps(a[8]);
    __m512 _r9 = _mm512_load_ps(a[9]);
    __m512 _ra = _mm512_load_ps(a[10]);
    __m512 _rb = _mm512_load_ps(a[11]);
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    print(_r8);
    print(_r9);
    print(_ra);
    print(_rb);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
    // Stage 1: interleave adjacent row pairs within each 128-bit lane.
    __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
    __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);
    __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3);
    __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3);
    __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5);
    __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5);
    __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7);
    __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7);
    __m512 _tmp8 = _mm512_unpacklo_ps(_r8, _r9);
    __m512 _tmp9 = _mm512_unpackhi_ps(_r8, _r9);
    __m512 _tmpa = _mm512_unpacklo_ps(_ra, _rb);
    __m512 _tmpb = _mm512_unpackhi_ps(_ra, _rb);
    // Stage 2: complete the 4x4 transpose inside each 128-bit lane.
    __m512 _tmpc = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpd = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpe = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpf = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpg = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmph = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpi = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpj = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpk = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpl = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpm = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpn = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2));
    // Stage 3: gather even (2,0,2,0) / odd (3,1,3,1) 128-bit lanes across
    // register pairs; pairings are chosen for the 12-row output layout.
    _tmp0 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp1 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp2 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp3 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp4 = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp5 = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp6 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp7 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp8 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp9 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpa = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpb = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(3, 1, 3, 1));
    // Stage 4: second lane gather produces the final 12 output registers.
    _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
    _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
    _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0));
    _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0));
    _r4 = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(2, 0, 2, 0));
    _r5 = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(2, 0, 2, 0));
    _r6 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    _r7 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
    _r8 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1));
    _r9 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1));
    _ra = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(3, 1, 3, 1));
    _rb = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(3, 1, 3, 1));
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    print(_r8);
    print(_r9);
    print(_ra);
    print(_rb);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Demo: in-register transpose of an 8x16 float matrix using AVX-512.
// Fills a[i][j] = i*100 + j, prints the 8 input rows, transposes in zmm
// registers, then prints 8 output registers (each holding 16 consecutive
// elements of the 16x8 result).
void transpose_16x8_ps()
{
    __attribute__((aligned(64)))
    float a[8][16];
    for (int i = 0; i < 8; i++)
    {
        for (int j = 0; j < 16; j++)
        {
            a[i][j] = i * 100 + j; // value encodes (row, column)
        }
    }
    // One zmm register per 16-float row.
    __m512 _r0 = _mm512_load_ps(a[0]);
    __m512 _r1 = _mm512_load_ps(a[1]);
    __m512 _r2 = _mm512_load_ps(a[2]);
    __m512 _r3 = _mm512_load_ps(a[3]);
    __m512 _r4 = _mm512_load_ps(a[4]);
    __m512 _r5 = _mm512_load_ps(a[5]);
    __m512 _r6 = _mm512_load_ps(a[6]);
    __m512 _r7 = _mm512_load_ps(a[7]);
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
    // Stage 1: interleave adjacent row pairs within each 128-bit lane.
    __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
    __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);
    __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3);
    __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3);
    __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5);
    __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5);
    __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7);
    __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7);
    // Stage 2: complete the 4x4 transpose inside each 128-bit lane.
    __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));
    // Stage 3: gather even (2,0,2,0) / odd (3,1,3,1) 128-bit lanes.
    _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1));
    // Stage 4: second lane gather yields the final 8 output registers.
    _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
    _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
    _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0));
    _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0));
    _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
    _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1));
    _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1));
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
void transpose_16x4_ps() | |
{ | |
__attribute__((aligned(64))) | |
float a[4][16]; | |
for (int i = 0; i < 4; i++) | |
{ | |
for (int j = 0; j < 16; j++) | |
{ | |
a[i][j] = i * 100 + j; | |
} | |
} | |
__m512 _r0 = _mm512_load_ps(a[0]); | |
__m512 _r1 = _mm512_load_ps(a[1]); | |
__m512 _r2 = _mm512_load_ps(a[2]); | |
__m512 _r3 = _mm512_load_ps(a[3]); | |
print(_r0); | |
print(_r1); | |
print(_r2); | |
print(_r3); | |
fprintf(stderr, "-----------------------------------------------------------------------------------\n"); | |
__m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); | |
__m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); | |
__m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); | |
__m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); | |
__m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); | |
__m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); | |
__m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); | |
__m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); | |
_tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); | |
_tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); | |
_tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); | |
_tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); | |
_r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); | |
_r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); | |
_r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); | |
_r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); | |
print(_r0); | |
print(_r1); | |
print(_r2); | |
print(_r3); | |
fprintf(stderr, "-----------------------------------------------------------------------------------\n"); | |
} | |
void transpose_16x2_ps() | |
{ | |
__attribute__((aligned(64))) | |
float a[2][16]; | |
for (int i = 0; i < 2; i++) | |
{ | |
for (int j = 0; j < 16; j++) | |
{ | |
a[i][j] = i * 100 + j; | |
} | |
} | |
__m512 _r0 = _mm512_load_ps(a[0]); | |
__m512 _r1 = _mm512_load_ps(a[1]); | |
print(_r0); | |
print(_r1); | |
fprintf(stderr, "-----------------------------------------------------------------------------------\n"); | |
__m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); | |
__m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); | |
__m512 _tmp2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); | |
__m512 _tmp3 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); | |
_r0 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); | |
_r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); | |
print(_r0); | |
print(_r1); | |
fprintf(stderr, "-----------------------------------------------------------------------------------\n"); | |
} | |
// Demo: in-register transpose of a 24x8 float matrix using AVX.
// Fills a[i][j] = i*100 + j, prints the 24 input rows, transposes them in
// ymm registers, then prints 24 output registers (each holding 8
// consecutive elements of the 8x24 result).
// Same network as the AVX-512 versions, but since a ymm register has only
// two 128-bit lanes, a single _mm256_permute2f128_ps stage replaces the
// two shuffle_f32x4 stages (imm 0x20 = low halves, 0x31 = high halves).
void transpose_8x24_ps()
{
    __attribute__((aligned(32)))
    float a[24][8];
    for (int i = 0; i < 24; i++)
    {
        for (int j = 0; j < 8; j++)
        {
            a[i][j] = i * 100 + j; // value encodes (row, column)
        }
    }
    // One ymm register per 8-float row: rows 0..23 -> _r0.._rn.
    __m256 _r0 = _mm256_load_ps(a[0]);
    __m256 _r1 = _mm256_load_ps(a[1]);
    __m256 _r2 = _mm256_load_ps(a[2]);
    __m256 _r3 = _mm256_load_ps(a[3]);
    __m256 _r4 = _mm256_load_ps(a[4]);
    __m256 _r5 = _mm256_load_ps(a[5]);
    __m256 _r6 = _mm256_load_ps(a[6]);
    __m256 _r7 = _mm256_load_ps(a[7]);
    __m256 _r8 = _mm256_load_ps(a[8]);
    __m256 _r9 = _mm256_load_ps(a[9]);
    __m256 _ra = _mm256_load_ps(a[10]);
    __m256 _rb = _mm256_load_ps(a[11]);
    __m256 _rc = _mm256_load_ps(a[12]);
    __m256 _rd = _mm256_load_ps(a[13]);
    __m256 _re = _mm256_load_ps(a[14]);
    __m256 _rf = _mm256_load_ps(a[15]);
    __m256 _rg = _mm256_load_ps(a[16]);
    __m256 _rh = _mm256_load_ps(a[17]);
    __m256 _ri = _mm256_load_ps(a[18]);
    __m256 _rj = _mm256_load_ps(a[19]);
    __m256 _rk = _mm256_load_ps(a[20]);
    __m256 _rl = _mm256_load_ps(a[21]);
    __m256 _rm = _mm256_load_ps(a[22]);
    __m256 _rn = _mm256_load_ps(a[23]);
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    print(_r8);
    print(_r9);
    print(_ra);
    print(_rb);
    print(_rc);
    print(_rd);
    print(_re);
    print(_rf);
    print(_rg);
    print(_rh);
    print(_ri);
    print(_rj);
    print(_rk);
    print(_rl);
    print(_rm);
    print(_rn);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
    // Stage 1: interleave adjacent row pairs within each 128-bit lane.
    __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1);
    __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1);
    __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3);
    __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3);
    __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5);
    __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5);
    __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7);
    __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7);
    __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9);
    __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9);
    __m256 _tmpa = _mm256_unpacklo_ps(_ra, _rb);
    __m256 _tmpb = _mm256_unpackhi_ps(_ra, _rb);
    __m256 _tmpc = _mm256_unpacklo_ps(_rc, _rd);
    __m256 _tmpd = _mm256_unpackhi_ps(_rc, _rd);
    __m256 _tmpe = _mm256_unpacklo_ps(_re, _rf);
    __m256 _tmpf = _mm256_unpackhi_ps(_re, _rf);
    __m256 _tmpg = _mm256_unpacklo_ps(_rg, _rh);
    __m256 _tmph = _mm256_unpackhi_ps(_rg, _rh);
    __m256 _tmpi = _mm256_unpacklo_ps(_ri, _rj);
    __m256 _tmpj = _mm256_unpackhi_ps(_ri, _rj);
    __m256 _tmpk = _mm256_unpacklo_ps(_rk, _rl);
    __m256 _tmpl = _mm256_unpackhi_ps(_rk, _rl);
    __m256 _tmpm = _mm256_unpacklo_ps(_rm, _rn);
    __m256 _tmpn = _mm256_unpackhi_ps(_rm, _rn);
    // Stage 2: complete the 4x4 transpose inside each 128-bit lane.
    __m256 _tmpo = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpp = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpq = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpr = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmps = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpt = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpu = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpv = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpw = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpx = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpy = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpz = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpA = _mm256_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpB = _mm256_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpC = _mm256_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpD = _mm256_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpE = _mm256_shuffle_ps(_tmpg, _tmpi, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpF = _mm256_shuffle_ps(_tmpg, _tmpi, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpG = _mm256_shuffle_ps(_tmph, _tmpj, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpH = _mm256_shuffle_ps(_tmph, _tmpj, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpI = _mm256_shuffle_ps(_tmpk, _tmpm, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpJ = _mm256_shuffle_ps(_tmpk, _tmpm, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpK = _mm256_shuffle_ps(_tmpl, _tmpn, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpL = _mm256_shuffle_ps(_tmpl, _tmpn, _MM_SHUFFLE(3, 2, 3, 2));
    // Stage 3: combine 128-bit halves across register pairs —
    // imm 0x20 (low half of each source), imm 0x31 (high half of each).
    _r0 = _mm256_permute2f128_ps(_tmpo, _tmps, _MM_SHUFFLE(0, 2, 0, 0));
    _r1 = _mm256_permute2f128_ps(_tmpw, _tmpA, _MM_SHUFFLE(0, 2, 0, 0));
    _r2 = _mm256_permute2f128_ps(_tmpE, _tmpI, _MM_SHUFFLE(0, 2, 0, 0));
    _r3 = _mm256_permute2f128_ps(_tmpp, _tmpt, _MM_SHUFFLE(0, 2, 0, 0));
    _r4 = _mm256_permute2f128_ps(_tmpx, _tmpB, _MM_SHUFFLE(0, 2, 0, 0));
    _r5 = _mm256_permute2f128_ps(_tmpF, _tmpJ, _MM_SHUFFLE(0, 2, 0, 0));
    _r6 = _mm256_permute2f128_ps(_tmpq, _tmpu, _MM_SHUFFLE(0, 2, 0, 0));
    _r7 = _mm256_permute2f128_ps(_tmpy, _tmpC, _MM_SHUFFLE(0, 2, 0, 0));
    _r8 = _mm256_permute2f128_ps(_tmpG, _tmpK, _MM_SHUFFLE(0, 2, 0, 0));
    _r9 = _mm256_permute2f128_ps(_tmpr, _tmpv, _MM_SHUFFLE(0, 2, 0, 0));
    _ra = _mm256_permute2f128_ps(_tmpz, _tmpD, _MM_SHUFFLE(0, 2, 0, 0));
    _rb = _mm256_permute2f128_ps(_tmpH, _tmpL, _MM_SHUFFLE(0, 2, 0, 0));
    _rc = _mm256_permute2f128_ps(_tmpo, _tmps, _MM_SHUFFLE(0, 3, 0, 1));
    _rd = _mm256_permute2f128_ps(_tmpw, _tmpA, _MM_SHUFFLE(0, 3, 0, 1));
    _re = _mm256_permute2f128_ps(_tmpE, _tmpI, _MM_SHUFFLE(0, 3, 0, 1));
    _rf = _mm256_permute2f128_ps(_tmpp, _tmpt, _MM_SHUFFLE(0, 3, 0, 1));
    _rg = _mm256_permute2f128_ps(_tmpx, _tmpB, _MM_SHUFFLE(0, 3, 0, 1));
    _rh = _mm256_permute2f128_ps(_tmpF, _tmpJ, _MM_SHUFFLE(0, 3, 0, 1));
    _ri = _mm256_permute2f128_ps(_tmpq, _tmpu, _MM_SHUFFLE(0, 3, 0, 1));
    _rj = _mm256_permute2f128_ps(_tmpy, _tmpC, _MM_SHUFFLE(0, 3, 0, 1));
    _rk = _mm256_permute2f128_ps(_tmpG, _tmpK, _MM_SHUFFLE(0, 3, 0, 1));
    _rl = _mm256_permute2f128_ps(_tmpr, _tmpv, _MM_SHUFFLE(0, 3, 0, 1));
    _rm = _mm256_permute2f128_ps(_tmpz, _tmpD, _MM_SHUFFLE(0, 3, 0, 1));
    _rn = _mm256_permute2f128_ps(_tmpH, _tmpL, _MM_SHUFFLE(0, 3, 0, 1));
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    print(_r8);
    print(_r9);
    print(_ra);
    print(_rb);
    print(_rc);
    print(_rd);
    print(_re);
    print(_rf);
    print(_rg);
    print(_rh);
    print(_ri);
    print(_rj);
    print(_rk);
    print(_rl);
    print(_rm);
    print(_rn);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Transpose a 16x8 float matrix (sixteen __m256 row registers, 8 floats each)
// into its 8x16 transpose using the AVX unpack/shuffle/permute ladder.
// Output layout: the transposed matrix is stored row-major across the
// sixteen result registers, two registers per 16-wide output row —
// e.g. _r0 = column 0 of input rows 0-7, _r1 = column 0 of input rows 8-15.
// Input and output are dumped to stderr for visual verification.
void transpose_8x16_ps()
{
    // Test pattern: element (i, j) = i * 100 + j encodes its source position,
    // so the printed output makes the transpose easy to check by eye.
    __attribute__((aligned(32)))
    float a[16][8];
    for (int i = 0; i < 16; i++)
    {
        for (int j = 0; j < 8; j++)
        {
            a[i][j] = i * 100 + j;
        }
    }
    // One aligned 256-bit load per source row.
    __m256 _r0 = _mm256_load_ps(a[0]);
    __m256 _r1 = _mm256_load_ps(a[1]);
    __m256 _r2 = _mm256_load_ps(a[2]);
    __m256 _r3 = _mm256_load_ps(a[3]);
    __m256 _r4 = _mm256_load_ps(a[4]);
    __m256 _r5 = _mm256_load_ps(a[5]);
    __m256 _r6 = _mm256_load_ps(a[6]);
    __m256 _r7 = _mm256_load_ps(a[7]);
    __m256 _r8 = _mm256_load_ps(a[8]);
    __m256 _r9 = _mm256_load_ps(a[9]);
    __m256 _ra = _mm256_load_ps(a[10]);
    __m256 _rb = _mm256_load_ps(a[11]);
    __m256 _rc = _mm256_load_ps(a[12]);
    __m256 _rd = _mm256_load_ps(a[13]);
    __m256 _re = _mm256_load_ps(a[14]);
    __m256 _rf = _mm256_load_ps(a[15]);
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    print(_r8);
    print(_r9);
    print(_ra);
    print(_rb);
    print(_rc);
    print(_rd);
    print(_re);
    print(_rf);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
    // Stage 1: interleave adjacent row pairs at 32-bit granularity
    // (within each 128-bit lane).
    __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1);
    __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1);
    __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3);
    __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3);
    __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5);
    __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5);
    __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7);
    __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7);
    __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9);
    __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9);
    __m256 _tmpa = _mm256_unpacklo_ps(_ra, _rb);
    __m256 _tmpb = _mm256_unpackhi_ps(_ra, _rb);
    __m256 _tmpc = _mm256_unpacklo_ps(_rc, _rd);
    __m256 _tmpd = _mm256_unpackhi_ps(_rc, _rd);
    __m256 _tmpe = _mm256_unpacklo_ps(_re, _rf);
    __m256 _tmpf = _mm256_unpackhi_ps(_re, _rf);
    // Stage 2: gather 64-bit pairs across each group of four rows, so every
    // 128-bit lane now holds one column fragment from four consecutive rows.
    __m256 _tmpg = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmph = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpi = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpj = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpk = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpl = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpm = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpn = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpo = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpp = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpq = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpr = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmps = _mm256_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpt = _mm256_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpu = _mm256_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpv = _mm256_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(3, 2, 3, 2));
    // Stage 3: stitch 128-bit halves — 0x20 selects both low lanes
    // (columns 0-3), 0x31 selects both high lanes (columns 4-7).
    _r0 = _mm256_permute2f128_ps(_tmpg, _tmpk, _MM_SHUFFLE(0, 2, 0, 0));
    _r1 = _mm256_permute2f128_ps(_tmpo, _tmps, _MM_SHUFFLE(0, 2, 0, 0));
    _r2 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 2, 0, 0));
    _r3 = _mm256_permute2f128_ps(_tmpp, _tmpt, _MM_SHUFFLE(0, 2, 0, 0));
    _r4 = _mm256_permute2f128_ps(_tmpi, _tmpm, _MM_SHUFFLE(0, 2, 0, 0));
    _r5 = _mm256_permute2f128_ps(_tmpq, _tmpu, _MM_SHUFFLE(0, 2, 0, 0));
    _r6 = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 2, 0, 0));
    _r7 = _mm256_permute2f128_ps(_tmpr, _tmpv, _MM_SHUFFLE(0, 2, 0, 0));
    _r8 = _mm256_permute2f128_ps(_tmpg, _tmpk, _MM_SHUFFLE(0, 3, 0, 1));
    _r9 = _mm256_permute2f128_ps(_tmpo, _tmps, _MM_SHUFFLE(0, 3, 0, 1));
    _ra = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 3, 0, 1));
    _rb = _mm256_permute2f128_ps(_tmpp, _tmpt, _MM_SHUFFLE(0, 3, 0, 1));
    _rc = _mm256_permute2f128_ps(_tmpi, _tmpm, _MM_SHUFFLE(0, 3, 0, 1));
    _rd = _mm256_permute2f128_ps(_tmpq, _tmpu, _MM_SHUFFLE(0, 3, 0, 1));
    _re = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 3, 0, 1));
    _rf = _mm256_permute2f128_ps(_tmpr, _tmpv, _MM_SHUFFLE(0, 3, 0, 1));
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    print(_r8);
    print(_r9);
    print(_ra);
    print(_rb);
    print(_rc);
    print(_rd);
    print(_re);
    print(_rf);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Transpose a 12x8 float matrix (twelve __m256 row registers) into its
// 8x12 transpose. The result is stored row-major and packed contiguously
// across the twelve output registers: each 12-float output row spans one
// and a half registers (e.g. _r0 = column 0 of input rows 0-7, _r1 =
// column 0 of rows 8-11 followed by column 1 of rows 0-3). The unusual
// operand pairings in the final permute stage implement that packing.
// Input and output are dumped to stderr for visual verification.
void transpose_8x12_ps()
{
    // Test pattern: element (i, j) = i * 100 + j encodes its source position.
    __attribute__((aligned(32)))
    float a[12][8];
    for (int i = 0; i < 12; i++)
    {
        for (int j = 0; j < 8; j++)
        {
            a[i][j] = i * 100 + j;
        }
    }
    __m256 _r0 = _mm256_load_ps(a[0]);
    __m256 _r1 = _mm256_load_ps(a[1]);
    __m256 _r2 = _mm256_load_ps(a[2]);
    __m256 _r3 = _mm256_load_ps(a[3]);
    __m256 _r4 = _mm256_load_ps(a[4]);
    __m256 _r5 = _mm256_load_ps(a[5]);
    __m256 _r6 = _mm256_load_ps(a[6]);
    __m256 _r7 = _mm256_load_ps(a[7]);
    __m256 _r8 = _mm256_load_ps(a[8]);
    __m256 _r9 = _mm256_load_ps(a[9]);
    __m256 _ra = _mm256_load_ps(a[10]);
    __m256 _rb = _mm256_load_ps(a[11]);
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    print(_r8);
    print(_r9);
    print(_ra);
    print(_rb);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
    // Stage 1: interleave adjacent row pairs at 32-bit granularity.
    __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1);
    __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1);
    __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3);
    __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3);
    __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5);
    __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5);
    __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7);
    __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7);
    __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9);
    __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9);
    __m256 _tmpa = _mm256_unpacklo_ps(_ra, _rb);
    __m256 _tmpb = _mm256_unpackhi_ps(_ra, _rb);
    // Stage 2: gather 64-bit pairs across each group of four rows; every
    // 128-bit lane now holds one column fragment from four consecutive rows.
    __m256 _tmpc = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpd = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpe = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpf = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpg = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmph = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpi = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpj = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpk = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpl = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 _tmpm = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 _tmpn = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2));
    // Stage 3: stitch 128-bit halves; the operand pairings interleave the
    // rows-8..11 fragments with the rows-0..7 fragments so that consecutive
    // registers form contiguous 12-wide transposed rows.
    _r0 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 2, 0, 0));
    _r1 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 2, 0, 0));
    _r2 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 2, 0, 0));
    _r3 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 2, 0, 0));
    _r4 = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 2, 0, 0));
    _r5 = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 2, 0, 0));
    _r6 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 3, 0, 1));
    _r7 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 3, 0, 1));
    _r8 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 3, 0, 1));
    _r9 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 3, 0, 1));
    _ra = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 3, 0, 1));
    _rb = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 3, 0, 1));
    print(_r0);
    print(_r1);
    print(_r2);
    print(_r3);
    print(_r4);
    print(_r5);
    print(_r6);
    print(_r7);
    print(_r8);
    print(_r9);
    print(_ra);
    print(_rb);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
void transpose_8x8_ps() | |
{ | |
__attribute__((aligned(32))) | |
float a[8][8]; | |
for (int i = 0; i < 8; i++) | |
{ | |
for (int j = 0; j < 8; j++) | |
{ | |
a[i][j] = i * 100 + j; | |
} | |
} | |
__m256 _r0 = _mm256_load_ps(a[0]); | |
__m256 _r1 = _mm256_load_ps(a[1]); | |
__m256 _r2 = _mm256_load_ps(a[2]); | |
__m256 _r3 = _mm256_load_ps(a[3]); | |
__m256 _r4 = _mm256_load_ps(a[4]); | |
__m256 _r5 = _mm256_load_ps(a[5]); | |
__m256 _r6 = _mm256_load_ps(a[6]); | |
__m256 _r7 = _mm256_load_ps(a[7]); | |
print(_r0); | |
print(_r1); | |
print(_r2); | |
print(_r3); | |
print(_r4); | |
print(_r5); | |
print(_r6); | |
print(_r7); | |
fprintf(stderr, "-----------------------------------------------------------------------------------\n"); | |
__m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); | |
__m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); | |
__m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); | |
__m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); | |
__m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); | |
__m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); | |
__m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); | |
__m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); | |
__m256 _tmp8 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); | |
__m256 _tmp9 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); | |
__m256 _tmpa = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); | |
__m256 _tmpb = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); | |
__m256 _tmpc = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); | |
__m256 _tmpd = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); | |
__m256 _tmpe = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); | |
__m256 _tmpf = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); | |
_r0 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); | |
_r1 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); | |
_r2 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); | |
_r3 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); | |
_r4 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); | |
_r5 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); | |
_r6 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); | |
_r7 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); | |
print(_r0); | |
print(_r1); | |
print(_r2); | |
print(_r3); | |
print(_r4); | |
print(_r5); | |
print(_r6); | |
print(_r7); | |
fprintf(stderr, "-----------------------------------------------------------------------------------\n"); | |
} | |
void transpose_8x4_ps() | |
{ | |
__attribute__((aligned(32))) | |
float a[4][8]; | |
for (int i = 0; i < 4; i++) | |
{ | |
for (int j = 0; j < 8; j++) | |
{ | |
a[i][j] = i * 100 + j; | |
} | |
} | |
__m256 _r0 = _mm256_load_ps(a[0]); | |
__m256 _r1 = _mm256_load_ps(a[1]); | |
__m256 _r2 = _mm256_load_ps(a[2]); | |
__m256 _r3 = _mm256_load_ps(a[3]); | |
print(_r0); | |
print(_r1); | |
print(_r2); | |
print(_r3); | |
fprintf(stderr, "-----------------------------------------------------------------------------------\n"); | |
__m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); | |
__m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); | |
__m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); | |
__m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); | |
__m256 _tmp4 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); | |
__m256 _tmp5 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); | |
__m256 _tmp6 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); | |
__m256 _tmp7 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); | |
_r0 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); | |
_r1 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); | |
_r2 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); | |
_r3 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); | |
print(_r0); | |
print(_r1); | |
print(_r2); | |
print(_r3); | |
fprintf(stderr, "-----------------------------------------------------------------------------------\n"); | |
} | |
void transpose_8x2_ps() | |
{ | |
__attribute__((aligned(32))) | |
float a[2][8]; | |
for (int i = 0; i < 2; i++) | |
{ | |
for (int j = 0; j < 8; j++) | |
{ | |
a[i][j] = i * 100 + j; | |
} | |
} | |
__m256 _r0 = _mm256_load_ps(a[0]); | |
__m256 _r1 = _mm256_load_ps(a[1]); | |
print(_r0); | |
print(_r1); | |
fprintf(stderr, "-----------------------------------------------------------------------------------\n"); | |
__m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); | |
__m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); | |
_r0 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); | |
_r1 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); | |
print(_r0); | |
print(_r1); | |
fprintf(stderr, "-----------------------------------------------------------------------------------\n"); | |
} | |
// Transpose a 16x16 matrix of 16-bit integers held in sixteen __m256i row
// registers, via the unpack ladder (16-bit -> 32-bit -> 64-bit interleave)
// followed by a 128-bit lane stitch. After the final permutes, _r0.._r7
// hold transposed rows 0-7 (input columns 0-7) and _r8.._rf hold rows 8-15.
// Input and output are dumped to stderr for visual verification.
void transpose_16x16_epi16()
{
    // Test pattern: element (i, j) = i * 100 + j (max 1515, fits in short).
    __attribute__((aligned(32)))
    short a[16][16];
    for (int i = 0; i < 16; i++)
    {
        for (int j = 0; j < 16; j++)
        {
            a[i][j] = i * 100 + j;
        }
    }
    __m256i _r0 = _mm256_load_si256((const __m256i*)a[0]);
    __m256i _r1 = _mm256_load_si256((const __m256i*)a[1]);
    __m256i _r2 = _mm256_load_si256((const __m256i*)a[2]);
    __m256i _r3 = _mm256_load_si256((const __m256i*)a[3]);
    __m256i _r4 = _mm256_load_si256((const __m256i*)a[4]);
    __m256i _r5 = _mm256_load_si256((const __m256i*)a[5]);
    __m256i _r6 = _mm256_load_si256((const __m256i*)a[6]);
    __m256i _r7 = _mm256_load_si256((const __m256i*)a[7]);
    __m256i _r8 = _mm256_load_si256((const __m256i*)a[8]);
    __m256i _r9 = _mm256_load_si256((const __m256i*)a[9]);
    __m256i _ra = _mm256_load_si256((const __m256i*)a[10]);
    __m256i _rb = _mm256_load_si256((const __m256i*)a[11]);
    __m256i _rc = _mm256_load_si256((const __m256i*)a[12]);
    __m256i _rd = _mm256_load_si256((const __m256i*)a[13]);
    __m256i _re = _mm256_load_si256((const __m256i*)a[14]);
    __m256i _rf = _mm256_load_si256((const __m256i*)a[15]);
    print_epi16(_r0);
    print_epi16(_r1);
    print_epi16(_r2);
    print_epi16(_r3);
    print_epi16(_r4);
    print_epi16(_r5);
    print_epi16(_r6);
    print_epi16(_r7);
    print_epi16(_r8);
    print_epi16(_r9);
    print_epi16(_ra);
    print_epi16(_rb);
    print_epi16(_rc);
    print_epi16(_rd);
    print_epi16(_re);
    print_epi16(_rf);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
    // Stage 1: 16-bit interleave of adjacent row pairs (per 128-bit lane).
    __m256i _tmp0 = _mm256_unpacklo_epi16(_r0, _r1);
    __m256i _tmp1 = _mm256_unpackhi_epi16(_r0, _r1);
    __m256i _tmp2 = _mm256_unpacklo_epi16(_r2, _r3);
    __m256i _tmp3 = _mm256_unpackhi_epi16(_r2, _r3);
    __m256i _tmp4 = _mm256_unpacklo_epi16(_r4, _r5);
    __m256i _tmp5 = _mm256_unpackhi_epi16(_r4, _r5);
    __m256i _tmp6 = _mm256_unpacklo_epi16(_r6, _r7);
    __m256i _tmp7 = _mm256_unpackhi_epi16(_r6, _r7);
    __m256i _tmp8 = _mm256_unpacklo_epi16(_r8, _r9);
    __m256i _tmp9 = _mm256_unpackhi_epi16(_r8, _r9);
    __m256i _tmpa = _mm256_unpacklo_epi16(_ra, _rb);
    __m256i _tmpb = _mm256_unpackhi_epi16(_ra, _rb);
    __m256i _tmpc = _mm256_unpacklo_epi16(_rc, _rd);
    __m256i _tmpd = _mm256_unpackhi_epi16(_rc, _rd);
    __m256i _tmpe = _mm256_unpacklo_epi16(_re, _rf);
    __m256i _tmpf = _mm256_unpackhi_epi16(_re, _rf);
    // Stage 2: 32-bit interleave across groups of four rows.
    __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2);
    __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2);
    __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3);
    __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3);
    __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6);
    __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6);
    __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7);
    __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7);
    __m256i _tmpo = _mm256_unpacklo_epi32(_tmp8, _tmpa);
    __m256i _tmpp = _mm256_unpackhi_epi32(_tmp8, _tmpa);
    __m256i _tmpq = _mm256_unpacklo_epi32(_tmp9, _tmpb);
    __m256i _tmpr = _mm256_unpackhi_epi32(_tmp9, _tmpb);
    __m256i _tmps = _mm256_unpacklo_epi32(_tmpc, _tmpe);
    __m256i _tmpt = _mm256_unpackhi_epi32(_tmpc, _tmpe);
    __m256i _tmpu = _mm256_unpacklo_epi32(_tmpd, _tmpf);
    __m256i _tmpv = _mm256_unpackhi_epi32(_tmpd, _tmpf);
    // Stage 3: 64-bit interleave across groups of eight rows; each 128-bit
    // lane now holds one complete column fragment of eight elements.
    _tmp0 = _mm256_unpacklo_epi64(_tmpg, _tmpk);
    _tmp1 = _mm256_unpackhi_epi64(_tmpg, _tmpk);
    _tmp2 = _mm256_unpacklo_epi64(_tmph, _tmpl);
    _tmp3 = _mm256_unpackhi_epi64(_tmph, _tmpl);
    _tmp4 = _mm256_unpacklo_epi64(_tmpi, _tmpm);
    _tmp5 = _mm256_unpackhi_epi64(_tmpi, _tmpm);
    _tmp6 = _mm256_unpacklo_epi64(_tmpj, _tmpn);
    _tmp7 = _mm256_unpackhi_epi64(_tmpj, _tmpn);
    _tmp8 = _mm256_unpacklo_epi64(_tmpo, _tmps);
    _tmp9 = _mm256_unpackhi_epi64(_tmpo, _tmps);
    _tmpa = _mm256_unpacklo_epi64(_tmpp, _tmpt);
    _tmpb = _mm256_unpackhi_epi64(_tmpp, _tmpt);
    _tmpc = _mm256_unpacklo_epi64(_tmpq, _tmpu);
    _tmpd = _mm256_unpackhi_epi64(_tmpq, _tmpu);
    _tmpe = _mm256_unpacklo_epi64(_tmpr, _tmpv);
    _tmpf = _mm256_unpackhi_epi64(_tmpr, _tmpv);
    // Stage 4: stitch 128-bit lanes — 0x20 combines the low lanes
    // (columns 0-7), 0x31 the high lanes (columns 8-15).
    _r0 = _mm256_permute2x128_si256(_tmp0, _tmp8, _MM_SHUFFLE(0, 2, 0, 0));
    _r1 = _mm256_permute2x128_si256(_tmp1, _tmp9, _MM_SHUFFLE(0, 2, 0, 0));
    _r2 = _mm256_permute2x128_si256(_tmp2, _tmpa, _MM_SHUFFLE(0, 2, 0, 0));
    _r3 = _mm256_permute2x128_si256(_tmp3, _tmpb, _MM_SHUFFLE(0, 2, 0, 0));
    _r4 = _mm256_permute2x128_si256(_tmp4, _tmpc, _MM_SHUFFLE(0, 2, 0, 0));
    _r5 = _mm256_permute2x128_si256(_tmp5, _tmpd, _MM_SHUFFLE(0, 2, 0, 0));
    _r6 = _mm256_permute2x128_si256(_tmp6, _tmpe, _MM_SHUFFLE(0, 2, 0, 0));
    _r7 = _mm256_permute2x128_si256(_tmp7, _tmpf, _MM_SHUFFLE(0, 2, 0, 0));
    _r8 = _mm256_permute2x128_si256(_tmp0, _tmp8, _MM_SHUFFLE(0, 3, 0, 1));
    _r9 = _mm256_permute2x128_si256(_tmp1, _tmp9, _MM_SHUFFLE(0, 3, 0, 1));
    _ra = _mm256_permute2x128_si256(_tmp2, _tmpa, _MM_SHUFFLE(0, 3, 0, 1));
    _rb = _mm256_permute2x128_si256(_tmp3, _tmpb, _MM_SHUFFLE(0, 3, 0, 1));
    _rc = _mm256_permute2x128_si256(_tmp4, _tmpc, _MM_SHUFFLE(0, 3, 0, 1));
    _rd = _mm256_permute2x128_si256(_tmp5, _tmpd, _MM_SHUFFLE(0, 3, 0, 1));
    _re = _mm256_permute2x128_si256(_tmp6, _tmpe, _MM_SHUFFLE(0, 3, 0, 1));
    _rf = _mm256_permute2x128_si256(_tmp7, _tmpf, _MM_SHUFFLE(0, 3, 0, 1));
    print_epi16(_r0);
    print_epi16(_r1);
    print_epi16(_r2);
    print_epi16(_r3);
    print_epi16(_r4);
    print_epi16(_r5);
    print_epi16(_r6);
    print_epi16(_r7);
    print_epi16(_r8);
    print_epi16(_r9);
    print_epi16(_ra);
    print_epi16(_rb);
    print_epi16(_rc);
    print_epi16(_rd);
    print_epi16(_re);
    print_epi16(_rf);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
// Transpose a 16x8 matrix of 16-bit integers (sixteen __m128i rows of 8
// shorts) into its 8x16 transpose. Two implementations:
//  - AVX2 path: rows i and i+8 are packed into one __m256i (row i in the
//    low lane, row i+8 in the high lane); the per-lane unpack ladder then
//    yields eight __m256i results, each one complete 16-wide output row.
//  - SSE fallback: pure __m128i unpack ladder producing sixteen registers,
//    where the pair (_r0,_r1) is output row 0, (_r2,_r3) row 1, and so on.
// Input and output are dumped to stderr for visual verification.
void transpose_16x8_epi16()
{
    // Test pattern: element (i, j) = i * 100 + j (max 1507, fits in short).
    __attribute__((aligned(32)))
    short a[16][8];
    for (int i = 0; i < 16; i++)
    {
        for (int j = 0; j < 8; j++)
        {
            a[i][j] = i * 100 + j;
        }
    }
    __m128i _r0 = _mm_load_si128((const __m128i*)a[0]);
    __m128i _r1 = _mm_load_si128((const __m128i*)a[1]);
    __m128i _r2 = _mm_load_si128((const __m128i*)a[2]);
    __m128i _r3 = _mm_load_si128((const __m128i*)a[3]);
    __m128i _r4 = _mm_load_si128((const __m128i*)a[4]);
    __m128i _r5 = _mm_load_si128((const __m128i*)a[5]);
    __m128i _r6 = _mm_load_si128((const __m128i*)a[6]);
    __m128i _r7 = _mm_load_si128((const __m128i*)a[7]);
    __m128i _r8 = _mm_load_si128((const __m128i*)a[8]);
    __m128i _r9 = _mm_load_si128((const __m128i*)a[9]);
    __m128i _ra = _mm_load_si128((const __m128i*)a[10]);
    __m128i _rb = _mm_load_si128((const __m128i*)a[11]);
    __m128i _rc = _mm_load_si128((const __m128i*)a[12]);
    __m128i _rd = _mm_load_si128((const __m128i*)a[13]);
    __m128i _re = _mm_load_si128((const __m128i*)a[14]);
    __m128i _rf = _mm_load_si128((const __m128i*)a[15]);
    print_epi16(_r0);
    print_epi16(_r1);
    print_epi16(_r2);
    print_epi16(_r3);
    print_epi16(_r4);
    print_epi16(_r5);
    print_epi16(_r6);
    print_epi16(_r7);
    print_epi16(_r8);
    print_epi16(_r9);
    print_epi16(_ra);
    print_epi16(_rb);
    print_epi16(_rc);
    print_epi16(_rd);
    print_epi16(_re);
    print_epi16(_rf);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
#if __AVX2__
    // Pair rows (i, i+8) into single 256-bit registers: low lane = row i,
    // high lane = row i+8. All following unpacks operate per 128-bit lane,
    // so both halves of the matrix are transposed simultaneously.
    __m256i _r08 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r0), _r8, 1);
    __m256i _r19 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r1), _r9, 1);
    __m256i _r2a = _mm256_inserti128_si256(_mm256_castsi128_si256(_r2), _ra, 1);
    __m256i _r3b = _mm256_inserti128_si256(_mm256_castsi128_si256(_r3), _rb, 1);
    __m256i _r4c = _mm256_inserti128_si256(_mm256_castsi128_si256(_r4), _rc, 1);
    __m256i _r5d = _mm256_inserti128_si256(_mm256_castsi128_si256(_r5), _rd, 1);
    __m256i _r6e = _mm256_inserti128_si256(_mm256_castsi128_si256(_r6), _re, 1);
    __m256i _r7f = _mm256_inserti128_si256(_mm256_castsi128_si256(_r7), _rf, 1);
    // 16-bit interleave of adjacent row pairs.
    __m256i _tmp0 = _mm256_unpacklo_epi16(_r08, _r19);
    __m256i _tmp1 = _mm256_unpackhi_epi16(_r08, _r19);
    __m256i _tmp2 = _mm256_unpacklo_epi16(_r2a, _r3b);
    __m256i _tmp3 = _mm256_unpackhi_epi16(_r2a, _r3b);
    __m256i _tmp4 = _mm256_unpacklo_epi16(_r4c, _r5d);
    __m256i _tmp5 = _mm256_unpackhi_epi16(_r4c, _r5d);
    __m256i _tmp6 = _mm256_unpacklo_epi16(_r6e, _r7f);
    __m256i _tmp7 = _mm256_unpackhi_epi16(_r6e, _r7f);
    // 32-bit interleave across four-row groups.
    __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2);
    __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2);
    __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3);
    __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3);
    __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6);
    __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6);
    __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7);
    __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7);
    // 64-bit interleave finishes the transpose: _r08 = output row 0
    // (column 0 of all 16 input rows), _r19 = row 1, etc.
    _r08 = _mm256_unpacklo_epi64(_tmpg, _tmpk);
    _r19 = _mm256_unpackhi_epi64(_tmpg, _tmpk);
    _r2a = _mm256_unpacklo_epi64(_tmph, _tmpl);
    _r3b = _mm256_unpackhi_epi64(_tmph, _tmpl);
    _r4c = _mm256_unpacklo_epi64(_tmpi, _tmpm);
    _r5d = _mm256_unpackhi_epi64(_tmpi, _tmpm);
    _r6e = _mm256_unpacklo_epi64(_tmpj, _tmpn);
    _r7f = _mm256_unpackhi_epi64(_tmpj, _tmpn);
    print_epi16(_r08);
    print_epi16(_r19);
    print_epi16(_r2a);
    print_epi16(_r3b);
    print_epi16(_r4c);
    print_epi16(_r5d);
    print_epi16(_r6e);
    print_epi16(_r7f);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
#else // __AVX2__
    // SSE fallback: same 16->32->64-bit unpack ladder on 128-bit registers,
    // run separately over rows 0-7 and rows 8-15.
    __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1);
    __m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1);
    __m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3);
    __m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3);
    __m128i _tmp4 = _mm_unpacklo_epi16(_r4, _r5);
    __m128i _tmp5 = _mm_unpackhi_epi16(_r4, _r5);
    __m128i _tmp6 = _mm_unpacklo_epi16(_r6, _r7);
    __m128i _tmp7 = _mm_unpackhi_epi16(_r6, _r7);
    __m128i _tmp8 = _mm_unpacklo_epi16(_r8, _r9);
    __m128i _tmp9 = _mm_unpackhi_epi16(_r8, _r9);
    __m128i _tmpa = _mm_unpacklo_epi16(_ra, _rb);
    __m128i _tmpb = _mm_unpackhi_epi16(_ra, _rb);
    __m128i _tmpc = _mm_unpacklo_epi16(_rc, _rd);
    __m128i _tmpd = _mm_unpackhi_epi16(_rc, _rd);
    __m128i _tmpe = _mm_unpacklo_epi16(_re, _rf);
    __m128i _tmpf = _mm_unpackhi_epi16(_re, _rf);
    __m128i _tmpg = _mm_unpacklo_epi32(_tmp0, _tmp2);
    __m128i _tmph = _mm_unpackhi_epi32(_tmp0, _tmp2);
    __m128i _tmpi = _mm_unpacklo_epi32(_tmp1, _tmp3);
    __m128i _tmpj = _mm_unpackhi_epi32(_tmp1, _tmp3);
    __m128i _tmpk = _mm_unpacklo_epi32(_tmp4, _tmp6);
    __m128i _tmpl = _mm_unpackhi_epi32(_tmp4, _tmp6);
    __m128i _tmpm = _mm_unpacklo_epi32(_tmp5, _tmp7);
    __m128i _tmpn = _mm_unpackhi_epi32(_tmp5, _tmp7);
    __m128i _tmpo = _mm_unpacklo_epi32(_tmp8, _tmpa);
    __m128i _tmpp = _mm_unpackhi_epi32(_tmp8, _tmpa);
    __m128i _tmpq = _mm_unpacklo_epi32(_tmp9, _tmpb);
    __m128i _tmpr = _mm_unpackhi_epi32(_tmp9, _tmpb);
    __m128i _tmps = _mm_unpacklo_epi32(_tmpc, _tmpe);
    __m128i _tmpt = _mm_unpackhi_epi32(_tmpc, _tmpe);
    __m128i _tmpu = _mm_unpacklo_epi32(_tmpd, _tmpf);
    __m128i _tmpv = _mm_unpackhi_epi32(_tmpd, _tmpf);
    // Result registers come out in pairs: (_r0,_r1) is transposed row 0
    // (column 0 of rows 0-7 then rows 8-15), (_r2,_r3) is row 1, etc.
    _r0 = _mm_unpacklo_epi64(_tmpg, _tmpk);
    _r1 = _mm_unpacklo_epi64(_tmpo, _tmps);
    _r2 = _mm_unpackhi_epi64(_tmpg, _tmpk);
    _r3 = _mm_unpackhi_epi64(_tmpo, _tmps);
    _r4 = _mm_unpacklo_epi64(_tmph, _tmpl);
    _r5 = _mm_unpacklo_epi64(_tmpp, _tmpt);
    _r6 = _mm_unpackhi_epi64(_tmph, _tmpl);
    _r7 = _mm_unpackhi_epi64(_tmpp, _tmpt);
    _r8 = _mm_unpacklo_epi64(_tmpi, _tmpm);
    _r9 = _mm_unpacklo_epi64(_tmpq, _tmpu);
    _ra = _mm_unpackhi_epi64(_tmpi, _tmpm);
    _rb = _mm_unpackhi_epi64(_tmpq, _tmpu);
    _rc = _mm_unpacklo_epi64(_tmpj, _tmpn);
    _rd = _mm_unpacklo_epi64(_tmpr, _tmpv);
    _re = _mm_unpackhi_epi64(_tmpj, _tmpn);
    _rf = _mm_unpackhi_epi64(_tmpr, _tmpv);
    print_epi16(_r0);
    print_epi16(_r1);
    print_epi16(_r2);
    print_epi16(_r3);
    print_epi16(_r4);
    print_epi16(_r5);
    print_epi16(_r6);
    print_epi16(_r7);
    print_epi16(_r8);
    print_epi16(_r9);
    print_epi16(_ra);
    print_epi16(_rb);
    print_epi16(_rc);
    print_epi16(_rd);
    print_epi16(_re);
    print_epi16(_rf);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
#endif // __AVX2__
}
// Transpose an 8x16 matrix of 16-bit integers (eight __m256i rows of 16
// shorts) into its 16x8 transpose, stored row-major across the eight
// result registers (two 8-wide transposed rows per register):
// _r0 = columns 0 and 1 of the input, ... _r3 = columns 6 and 7,
// _r4 = columns 8 and 9, ... _r7 = columns 14 and 15.
// Input and output are dumped to stderr for visual verification.
void transpose_8x16_epi16()
{
    // Test pattern: element (i, j) = i * 100 + j (max 715, fits in short).
    __attribute__((aligned(32)))
    short a[8][16];
    for (int i = 0; i < 8; i++)
    {
        for (int j = 0; j < 16; j++)
        {
            a[i][j] = i * 100 + j;
        }
    }
    __m256i _r0 = _mm256_load_si256((const __m256i*)a[0]);
    __m256i _r1 = _mm256_load_si256((const __m256i*)a[1]);
    __m256i _r2 = _mm256_load_si256((const __m256i*)a[2]);
    __m256i _r3 = _mm256_load_si256((const __m256i*)a[3]);
    __m256i _r4 = _mm256_load_si256((const __m256i*)a[4]);
    __m256i _r5 = _mm256_load_si256((const __m256i*)a[5]);
    __m256i _r6 = _mm256_load_si256((const __m256i*)a[6]);
    __m256i _r7 = _mm256_load_si256((const __m256i*)a[7]);
    print_epi16(_r0);
    print_epi16(_r1);
    print_epi16(_r2);
    print_epi16(_r3);
    print_epi16(_r4);
    print_epi16(_r5);
    print_epi16(_r6);
    print_epi16(_r7);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
    // Stage 1: 16-bit interleave of adjacent row pairs (per 128-bit lane).
    __m256i _tmp0 = _mm256_unpacklo_epi16(_r0, _r1);
    __m256i _tmp1 = _mm256_unpackhi_epi16(_r0, _r1);
    __m256i _tmp2 = _mm256_unpacklo_epi16(_r2, _r3);
    __m256i _tmp3 = _mm256_unpackhi_epi16(_r2, _r3);
    __m256i _tmp4 = _mm256_unpacklo_epi16(_r4, _r5);
    __m256i _tmp5 = _mm256_unpackhi_epi16(_r4, _r5);
    __m256i _tmp6 = _mm256_unpacklo_epi16(_r6, _r7);
    __m256i _tmp7 = _mm256_unpackhi_epi16(_r6, _r7);
    // Stage 2: 32-bit interleave across four-row groups.
    __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2);
    __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2);
    __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3);
    __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3);
    __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6);
    __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6);
    __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7);
    __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7);
    // Stage 3: 64-bit interleave; each 128-bit lane now holds one full
    // 8-element column (low lanes = columns 0-7, high lanes = columns 8-15).
    _tmp0 = _mm256_unpacklo_epi64(_tmpg, _tmpk);
    _tmp1 = _mm256_unpackhi_epi64(_tmpg, _tmpk);
    _tmp2 = _mm256_unpacklo_epi64(_tmph, _tmpl);
    _tmp3 = _mm256_unpackhi_epi64(_tmph, _tmpl);
    _tmp4 = _mm256_unpacklo_epi64(_tmpi, _tmpm);
    _tmp5 = _mm256_unpackhi_epi64(_tmpi, _tmpm);
    _tmp6 = _mm256_unpacklo_epi64(_tmpj, _tmpn);
    _tmp7 = _mm256_unpackhi_epi64(_tmpj, _tmpn);
    // Stage 4: stitch lanes — 0x20 pairs the two low lanes (adjacent
    // columns from 0-7), 0x31 pairs the high lanes (columns 8-15).
    _r0 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0));
    _r1 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 2, 0, 0));
    _r2 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0));
    _r3 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0));
    _r4 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1));
    _r5 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 3, 0, 1));
    _r6 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1));
    _r7 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1));
    print_epi16(_r0);
    print_epi16(_r1);
    print_epi16(_r2);
    print_epi16(_r3);
    print_epi16(_r4);
    print_epi16(_r5);
    print_epi16(_r6);
    print_epi16(_r7);
    fprintf(stderr, "-----------------------------------------------------------------------------------\n");
}
void transpose_8x8_epi16() | |
{ | |
__attribute__((aligned(16))) | |
short a[8][8]; | |
for (int i = 0; i < 8; i++) | |
{ | |
for (int j = 0; j < 8; j++) | |
{ | |
a[i][j] = i * 100 + j; | |
} | |
} | |
__m128i _r0 = _mm_load_si128((const __m128i*)a[0]); | |
__m128i _r1 = _mm_load_si128((const __m128i*)a[1]); | |
__m128i _r2 = _mm_load_si128((const __m128i*)a[2]); | |
__m128i _r3 = _mm_load_si128((const __m128i*)a[3]); | |
__m128i _r4 = _mm_load_si128((const __m128i*)a[4]); | |
__m128i _r5 = _mm_load_si128((const __m128i*)a[5]); | |
__m128i _r6 = _mm_load_si128((const __m128i*)a[6]); | |
__m128i _r7 = _mm_load_si128((const __m128i*)a[7]); | |
print_epi16(_r0); | |
print_epi16(_r1); | |
print_epi16(_r2); | |
print_epi16(_r3); | |
print_epi16(_r4); | |
print_epi16(_r5); | |
print_epi16(_r6); | |
print_epi16(_r7); | |
fprintf(stderr, "-----------------------------------------------------------------------------------\n"); | |
__m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1); | |
__m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1); | |
__m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3); | |
__m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3); | |
__m128i _tmp4 = _mm_unpacklo_epi16(_r4, _r5); | |
__m128i _tmp5 = _mm_unpackhi_epi16(_r4, _r5); | |
__m128i _tmp6 = _mm_unpacklo_epi16(_r6, _r7); | |
__m128i _tmp7 = _mm_unpackhi_epi16(_r6, _r7); | |
__m128i _tmp8 = _mm_unpacklo_epi32(_tmp0, _tmp2); | |
__m128i _tmp9 = _mm_unpackhi_epi32(_tmp0, _tmp2); | |
__m128i _tmpa = _mm_unpacklo_epi32(_tmp1, _tmp3); | |
__m128i _tmpb = _mm_unpackhi_epi32(_tmp1, _tmp3); | |
__m128i _tmpc = _mm_unpacklo_epi32(_tmp4, _tmp6); | |
__m128i _tmpd = _mm_unpackhi_epi32(_tmp4, _tmp6); | |
__m128i _tmpe = _mm_unpacklo_epi32(_tmp5, _tmp7); | |
__m128i _tmpf = _mm_unpackhi_epi32(_tmp5, _tmp7); | |
_r0 = _mm_unpacklo_epi64(_tmp8, _tmpc); | |
_r1 = _mm_unpackhi_epi64(_tmp8, _tmpc); | |
_r2 = _mm_unpacklo_epi64(_tmp9, _tmpd); | |
_r3 = _mm_unpackhi_epi64(_tmp9, _tmpd); | |
_r4 = _mm_unpacklo_epi64(_tmpa, _tmpe); | |
_r5 = _mm_unpackhi_epi64(_tmpa, _tmpe); | |
_r6 = _mm_unpacklo_epi64(_tmpb, _tmpf); | |
_r7 = _mm_unpackhi_epi64(_tmpb, _tmpf); | |
print_epi16(_r0); | |
print_epi16(_r1); | |
print_epi16(_r2); | |
print_epi16(_r3); | |
print_epi16(_r4); | |
print_epi16(_r5); | |
print_epi16(_r6); | |
print_epi16(_r7); | |
fprintf(stderr, "-----------------------------------------------------------------------------------\n"); | |
} | |
int main() | |
{ | |
fprintf(stderr, "hello!\n"); | |
// transpose_16x24_ps(); | |
// transpose_16x16_ps(); | |
// transpose_16x12_ps(); | |
// transpose_16x8_ps(); | |
// transpose_16x4_ps(); | |
// transpose_16x2_ps(); | |
// transpose_8x24_ps(); | |
// transpose_8x16_ps(); | |
// transpose_8x12_ps(); | |
// transpose_8x8_ps(); | |
// transpose_8x4_ps(); | |
// transpose_8x2_ps(); | |
// transpose_16x16_epi16(); | |
transpose_16x8_epi16(); | |
// transpose_8x16_epi16(); | |
// transpose_8x8_epi16(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment