Skip to content

Instantly share code, notes, and snippets.

@LYP951018
Created September 27, 2019 12:21
Show Gist options
  • Save LYP951018/01b359395240a1d4c3a0da1dac97f881 to your computer and use it in GitHub Desktop.
Save LYP951018/01b359395240a1d4c3a0da1dac97f881 to your computer and use it in GitHub Desktop.
ARM transpose blocking
// Rotates a 32-bit-per-pixel image by 270 degrees clockwise (90 degrees
// counter-clockwise), walking the source in 8x8 tiles for cache locality.
//
// The pixel at column x, row y of the source is written to
//     output[y + (width - 1 - x) * height]
// i.e. the destination is a `height`-wide, `width`-tall image.
//
// input  - source pixels, `width * height` 4-byte pixels, row-major.
// width  - source width in pixels.
// height - source height in pixels.
// output - destination buffer with room for `width * height` 4-byte pixels.
//          Must not overlap `input` (both pointers are __restrict__).
//
// Non-positive dimensions are a no-op.
// NOTE(review): both buffers are accessed through std::uint32_t*, so they are
// assumed to be suitably aligned for 32-bit access - confirm with callers.
void Rotate2701(const unsigned char* __restrict__ input, int width, int height,
                unsigned char* __restrict__ output)
{
    if (width <= 0 || height <= 0)
    {
        return;
    }

    const std::uint32_t* inputPixels =
        reinterpret_cast<const std::uint32_t*>(input);
    std::uint32_t* outputPixels = reinterpret_cast<std::uint32_t*>(output);

    // Everything is kept in signed int: mixing int with an unsigned block
    // size made `height - kBlockPixelSize` wrap around for small images.
    constexpr int kBlockPixelSize = 8;
    // Largest multiples of the block size that fit in each dimension.
    const int blockedWidth = width - width % kBlockPixelSize;
    const int blockedHeight = height - height % kBlockPixelSize;

    // Copies the source rectangle [xBegin, xEnd) x [yBegin, yEnd) to its
    // rotated position in the destination.
    const auto rotateRegion = [&](int xBegin, int xEnd, int yBegin, int yEnd) {
        for (int y = yBegin; y < yEnd; ++y)
        {
            for (int x = xBegin; x < xEnd; ++x)
            {
                outputPixels[y + (width - x - 1) * height] =
                    inputPixels[y * width + x];
            }
        }
    };

    // Full 8x8 tiles first.
    for (int i = 0; i < blockedHeight; i += kBlockPixelSize)
    {
        for (int j = 0; j < blockedWidth; j += kBlockPixelSize)
        {
            rotateRegion(j, j + kBlockPixelSize, i, i + kBlockPixelSize);
        }
    }

    // Leftover pixels: the right-hand strip over the full height, then the
    // bottom strip under the tiled area. Together with the tiles this covers
    // every source pixel exactly once.
    rotateRegion(blockedWidth, width, 0, height);
    rotateRegion(0, blockedWidth, blockedHeight, height);
}
// In-place 4x4 transpose of four float32x4_t vectors (ARM NEON).
// The name presumably mirrors the SSE macro `_MM_TRANSPOSE4_PS`.
//
// On entry row0..row3 hold the four rows of a 4x4 matrix; on exit rowK holds
// column K of that matrix, i.e. row0 = [a0 b0 c0 d0] for input rows a, b, c, d.
// Each argument must be a modifiable lvalue of type float32x4_t.
//
// The transpose is two rounds of "unzip" (vuzpq_f32):
//   round 1: (a, b) -> [a0 a2 b0 b2], [a1 a3 b1 b3]   (same for c, d)
//   round 2: evens of round 1 -> [a0 b0 c0 d0], [a2 b2 c2 d2]
//            odds  of round 1 -> [a1 b1 c1 d1], [a3 b3 c3 d3]
// The previous vcombine + single-unzip version stopped after the equivalent
// of round 1 and therefore never mixed rows 0/1 with rows 2/3.
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                             \
    {                                                                         \
        /* Round 1: split each row pair into even / odd source columns. */    \
        const auto mmT4Unzip01 = vuzpq_f32((row0), (row1));                   \
        const auto mmT4Unzip23 = vuzpq_f32((row2), (row3));                   \
        /* Round 2: gather one source column per output vector. */            \
        const auto mmT4Even =                                                 \
            vuzpq_f32(mmT4Unzip01.val[0], mmT4Unzip23.val[0]);                \
        const auto mmT4Odd =                                                  \
            vuzpq_f32(mmT4Unzip01.val[1], mmT4Unzip23.val[1]);                \
        (row0) = mmT4Even.val[0];                                             \
        (row1) = mmT4Odd.val[0];                                              \
        (row2) = mmT4Even.val[1];                                             \
        (row3) = mmT4Odd.val[1];                                              \
    }
// Rotates a 32-bit-per-pixel image by 270 degrees clockwise (90 degrees
// counter-clockwise) using ARM NEON on 8x8 tiles.
//
// The pixel at column x, row y of the source is written to
//     output[y + (width - 1 - x) * height]
// i.e. the destination is a `height`-wide, `width`-tall image.
//
// input  - source pixels, `width * height` 4-byte pixels, row-major.
// width  - source width in pixels.
// height - source height in pixels.
// output - destination buffer with room for `width * height` 4-byte pixels.
//          Must not overlap `input` (both pointers are __restrict__).
//
// Non-positive dimensions are a no-op.
// NOTE(review): both buffers are accessed through std::uint32_t*, so they are
// assumed to be suitably aligned for 32-bit access - confirm with callers.
void Rotate2703(const unsigned char* __restrict__ input, int width, int height,
                unsigned char* __restrict__ output)
{
    if (width <= 0 || height <= 0)
    {
        return;
    }

    const std::uint32_t* inputPixels =
        reinterpret_cast<const std::uint32_t*>(input);
    std::uint32_t* outputPixels = reinterpret_cast<std::uint32_t*>(output);

    // Signed on purpose: an unsigned block size made `height - block` wrap
    // around for images smaller than one tile.
    constexpr int kBlockPixelSize = 8;
    const int blockedWidth = width - width % kBlockPixelSize;
    const int blockedHeight = height - height % kBlockPixelSize;

    // Scalar fallback for the source rectangle [xBegin, xEnd) x [yBegin, yEnd).
    const auto rotateRegion = [&](int xBegin, int xEnd, int yBegin, int yEnd) {
        for (int y = yBegin; y < yEnd; ++y)
        {
            for (int x = xBegin; x < xEnd; ++x)
            {
                outputPixels[y + (width - x - 1) * height] =
                    inputPixels[y * width + x];
            }
        }
    };

    // In-place 4x4 transpose via two rounds of unzip:
    //   round 1: (a, b) -> [a0 a2 b0 b2], [a1 a3 b1 b3]   (same for c, d)
    //   round 2: evens -> [a0 b0 c0 d0], [a2 b2 c2 d2]
    //            odds  -> [a1 b1 c1 d1], [a3 b3 c3 d3]
    const auto transpose4x4 = [](uint32x4_t& r0, uint32x4_t& r1,
                                 uint32x4_t& r2, uint32x4_t& r3) {
        const uint32x4x2_t unzip01 = vuzpq_u32(r0, r1);
        const uint32x4x2_t unzip23 = vuzpq_u32(r2, r3);
        const uint32x4x2_t even = vuzpq_u32(unzip01.val[0], unzip23.val[0]);
        const uint32x4x2_t odd = vuzpq_u32(unzip01.val[1], unzip23.val[1]);
        r0 = even.val[0];
        r1 = odd.val[0];
        r2 = even.val[1];
        r3 = odd.val[1];
    };

    // Full 8x8 tiles.
    for (int i = 0; i < blockedHeight; i += kBlockPixelSize)
    {
        for (int j = 0; j < blockedWidth; j += kBlockPixelSize)
        {
            const std::uint32_t* start = inputPixels + i * width + j;
            // Destination of source column j; column j + c lives c
            // destination rows (c * height pixels) earlier.
            std::uint32_t* dest = outputPixels + i + (width - j - 1) * height;

            // vld2q de-interleaves: val[0] = columns j, j+2, j+4, j+6 and
            // val[1] = columns j+1, j+3, j+5, j+7 of the loaded source row.
            uint32x4x2_t rows[kBlockPixelSize];
            for (int r = 0; r < kBlockPixelSize; ++r)
            {
                rows[r] = vld2q_u32(start + r * width);
            }

            // Transpose even source rows (0,2,4,6) and odd source rows
            // (1,3,5,7) separately, for each column parity. Afterwards
            //   rows[2k].val[p]     = column j+2k+p at rows i+0, i+2, i+4, i+6
            //   rows[2k + 1].val[p] = column j+2k+p at rows i+1, i+3, i+5, i+7
            for (int p = 0; p < 2; ++p)
            {
                transpose4x4(rows[0].val[p], rows[2].val[p], rows[4].val[p],
                             rows[6].val[p]);
                transpose4x4(rows[1].val[p], rows[3].val[p], rows[5].val[p],
                             rows[7].val[p]);
            }

            // vst2q interleaves its two vectors, which zips the even-row and
            // odd-row halves back into source rows i .. i+7 in order.
            for (int k = 0; k < 4; ++k)
            {
                for (int p = 0; p < 2; ++p)
                {
                    uint32x4x2_t column;
                    column.val[0] = rows[2 * k].val[p];
                    column.val[1] = rows[2 * k + 1].val[p];
                    vst2q_u32(dest - (2 * k + p) * height, column);
                }
            }
        }
    }

    // Leftover pixels: the right-hand strip over the full height, then the
    // bottom strip under the tiled area.
    rotateRegion(blockedWidth, width, 0, height);
    rotateRegion(0, blockedWidth, blockedHeight, height);
}
// Rotates a 32-bit-per-pixel image by 270 degrees clockwise (90 degrees
// counter-clockwise) using ARM NEON on 4x4 tiles.
//
// The pixel at column x, row y of the source is written to
//     output[y + (width - 1 - x) * height]
// i.e. the destination is a `height`-wide, `width`-tall image.
//
// input  - source pixels, `width * height` 4-byte pixels, row-major.
// width  - source width in pixels.
// height - source height in pixels.
// output - destination buffer with room for `width * height` 4-byte pixels.
//          Must not overlap `input` (both pointers are __restrict__).
//
// Non-positive dimensions are a no-op.
// NOTE(review): both buffers are accessed through std::uint32_t*, so they are
// assumed to be suitably aligned for 32-bit access - confirm with callers.
void Rotate2704(const unsigned char* __restrict__ input, int width, int height,
                unsigned char* __restrict__ output)
{
    if (width <= 0 || height <= 0)
    {
        return;
    }

    const std::uint32_t* inputPixels =
        reinterpret_cast<const std::uint32_t*>(input);
    std::uint32_t* outputPixels = reinterpret_cast<std::uint32_t*>(output);

    // Signed on purpose: an unsigned block size made `height - block` wrap
    // around for images smaller than one tile.
    constexpr int kBlockPixelSize = 4;
    const int blockedWidth = width - width % kBlockPixelSize;
    const int blockedHeight = height - height % kBlockPixelSize;

    // Scalar fallback for the source rectangle [xBegin, xEnd) x [yBegin, yEnd).
    const auto rotateRegion = [&](int xBegin, int xEnd, int yBegin, int yEnd) {
        for (int y = yBegin; y < yEnd; ++y)
        {
            for (int x = xBegin; x < xEnd; ++x)
            {
                outputPixels[y + (width - x - 1) * height] =
                    inputPixels[y * width + x];
            }
        }
    };

    // Full 4x4 tiles.
    for (int i = 0; i < blockedHeight; i += kBlockPixelSize)
    {
        for (int j = 0; j < blockedWidth; j += kBlockPixelSize)
        {
            const std::uint32_t* start = inputPixels + i * width + j;
            // Destination of source column j; column j + c lives c
            // destination rows (c * height pixels) earlier.
            std::uint32_t* dest = outputPixels + i + (width - j - 1) * height;

            const uint32x4_t row0 = vld1q_u32(start);
            const uint32x4_t row1 = vld1q_u32(start + width);
            const uint32x4_t row2 = vld1q_u32(start + width * 2);
            const uint32x4_t row3 = vld1q_u32(start + width * 3);

            // 4x4 transpose via two rounds of unzip:
            //   round 1: (a, b) -> [a0 a2 b0 b2], [a1 a3 b1 b3]
            //   round 2: evens -> columns 0 and 2, odds -> columns 1 and 3
            const uint32x4x2_t unzip01 = vuzpq_u32(row0, row1);
            const uint32x4x2_t unzip23 = vuzpq_u32(row2, row3);
            const uint32x4x2_t evenColumns =
                vuzpq_u32(unzip01.val[0], unzip23.val[0]);
            const uint32x4x2_t oddColumns =
                vuzpq_u32(unzip01.val[1], unzip23.val[1]);

            vst1q_u32(dest, evenColumns.val[0]);               // column j
            vst1q_u32(dest - height, oddColumns.val[0]);       // column j + 1
            vst1q_u32(dest - height * 2, evenColumns.val[1]);  // column j + 2
            vst1q_u32(dest - height * 3, oddColumns.val[1]);   // column j + 3
        }
    }

    // Leftover pixels: the right-hand strip over the full height, then the
    // bottom strip under the tiled area.
    rotateRegion(blockedWidth, width, 0, height);
    rotateRegion(0, blockedWidth, blockedHeight, height);
}
// Straightforward (untiled, scalar) 270-degree clockwise rotation of a
// 32-bit-per-pixel image; serves as the baseline for the blocked variants.
//
// Source pixel (column, row) is copied to
//     output[row + (width - 1 - column) * height]
// visiting the source row by row, left to right.
//
// input  - source pixels, read as `width * height` 4-byte values.
// width  - source width in pixels.
// height - source height in pixels.
// output - destination buffer, written as 4-byte values.
void Rotate2702(const unsigned char* input, int width, int height,
                unsigned char* output)
{
    const auto* src = reinterpret_cast<const std::uint32_t*>(input);
    auto* dst = reinterpret_cast<std::uint32_t*>(output);

    for (int row = 0; row < height; ++row)
    {
        for (int column = 0; column < width; ++column)
        {
            const int srcIndex = row * width + column;
            const int dstIndex = row + (width - column - 1) * height;
            dst[dstIndex] = src[srcIndex];
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment