Created
September 27, 2019 12:21
-
-
Save LYP951018/01b359395240a1d4c3a0da1dac97f881 to your computer and use it in GitHub Desktop.
ARM transpose blocking
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Rotates a 32-bit-per-pixel image by a quarter turn, walking the source in
// 8x8 tiles so that reads and writes stay close together in memory.
//
// Mapping: the source pixel at column x, row y is written to output row
// (width - 1 - x), column y. The output therefore has `width` rows of
// `height` pixels each (270 degrees clockwise when row 0 is the top row).
//
// input  - source pixels, row-major, `width` pixels per row, `height` rows.
//          Accessed as std::uint32_t (assumes the buffer is suitably aligned
//          for 32-bit access - TODO confirm with callers).
// width  - source row length in pixels; nothing is copied when <= 0.
// height - number of source rows; nothing is copied when <= 0.
// output - destination buffer with room for width * height pixels. It must
//          not overlap `input` (both pointers are declared __restrict__).
void Rotate2701(const unsigned char* __restrict__ input, int width, int height,
                unsigned char* __restrict__ output)
{
    const std::uint32_t* inputPixels =
        reinterpret_cast<const std::uint32_t*>(input);
    std::uint32_t* outputPixels = reinterpret_cast<std::uint32_t*>(output);

    // Signed on purpose: mixing an unsigned tile size into `height - size`
    // would wrap around for images smaller than one tile.
    constexpr int kBlockPixelSize = 8;

    for (int blockY = 0; blockY < height; blockY += kBlockPixelSize)
    {
        // Clamp the tile at the bottom edge so partial tiles are still copied.
        const int yEnd = (height - blockY < kBlockPixelSize)
                             ? height
                             : blockY + kBlockPixelSize;
        for (int blockX = 0; blockX < width; blockX += kBlockPixelSize)
        {
            // Same clamping at the right edge.
            const int xEnd = (width - blockX < kBlockPixelSize)
                                 ? width
                                 : blockX + kBlockPixelSize;
            for (int y = blockY; y < yEnd; ++y)
            {
                for (int x = blockX; x < xEnd; ++x)
                {
                    outputPixels[y + (width - x - 1) * height] =
                        inputPixels[y * width + x];
                }
            }
        }
    }
}
// NEON counterpart of the SSE _MM_TRANSPOSE4_PS helper: transposes, in place,
// the 4x4 matrix whose rows are the four float32x4_t lvalues row0..row3.
//
// With row0 = [a0 a1 a2 a3], row1 = [b0 ..], row2 = [c0 ..], row3 = [d0 ..]:
//   vtrnq_f32(row0, row1) -> { [a0 b0 a2 b2], [a1 b1 a3 b3] }
//   vtrnq_f32(row2, row3) -> { [c0 d0 c2 d2], [c1 d1 c3 d3] }
// Joining the low halves gives columns 0 and 1, the high halves columns 2 and
// 3, so afterwards rowN holds original column N: [aN bN cN dN].
//
// Only lane permutations are used, so the bit patterns of the lanes are moved
// unchanged. Each argument is evaluated more than once and must be an
// assignable expression without side effects. The do/while(0) wrapper makes
// the macro behave as a single statement; terminate the call with ';'.
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                            \
    do                                                                       \
    {                                                                        \
        const float32x4x2_t mmTransposeT01_ = vtrnq_f32((row0), (row1));     \
        const float32x4x2_t mmTransposeT23_ = vtrnq_f32((row2), (row3));     \
        (row0) = vcombine_f32(vget_low_f32(mmTransposeT01_.val[0]),          \
                              vget_low_f32(mmTransposeT23_.val[0]));         \
        (row1) = vcombine_f32(vget_low_f32(mmTransposeT01_.val[1]),          \
                              vget_low_f32(mmTransposeT23_.val[1]));         \
        (row2) = vcombine_f32(vget_high_f32(mmTransposeT01_.val[0]),         \
                              vget_high_f32(mmTransposeT23_.val[0]));        \
        (row3) = vcombine_f32(vget_high_f32(mmTransposeT01_.val[1]),         \
                              vget_high_f32(mmTransposeT23_.val[1]));        \
    } while (0)
// Rotates a 32-bit-per-pixel image by a quarter turn, processing 8x8 blocks
// with NEON when it is available.
//
// Mapping: the source pixel at column x, row y is written to output row
// (width - 1 - x), column y. The output therefore has `width` rows of
// `height` pixels each (270 degrees clockwise when row 0 is the top row).
//
// Each full 8x8 block is handled as four 4x4 register transposes on
// uint32x4_t values; pixels are never reinterpreted as float. Columns and
// rows that do not fill a whole block are copied one pixel at a time. When
// the translation unit is built without NEON (__ARM_NEON undefined) the whole
// image goes through that scalar path, so the function still compiles and
// produces the same output.
//
// input  - source pixels, row-major, `width` pixels per row, `height` rows.
//          Accessed as std::uint32_t (assumes the buffer is suitably aligned
//          for 32-bit access - TODO confirm with callers).
// width  - source row length in pixels; nothing is copied when <= 0.
// height - number of source rows; nothing is copied when <= 0.
// output - destination buffer with room for width * height pixels. It must
//          not overlap `input` (both pointers are declared __restrict__).
void Rotate2703(const unsigned char* __restrict__ input, int width, int height,
                unsigned char* __restrict__ output)
{
    const std::uint32_t* inputPixels =
        reinterpret_cast<const std::uint32_t*>(input);
    std::uint32_t* outputPixels = reinterpret_cast<std::uint32_t*>(output);

    // Scalar copy of a single pixel to its rotated position.
    const auto copyPixel = [&](int x, int y) {
        outputPixels[y + (width - x - 1) * height] = inputPixels[y * width + x];
    };

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    // Signed on purpose: an unsigned block size would make `dim - size` wrap
    // around for images smaller than one block.
    constexpr int kBlockPixelSize = 8;
    constexpr int kTilePixelSize = 4;

    // Extent of the area covered by whole 8x8 blocks.
    const int blockedWidth = width > 0 ? width - width % kBlockPixelSize : 0;
    const int blockedHeight =
        height > 0 ? height - height % kBlockPixelSize : 0;

    // Rotates the 4x4 tile whose top-left source pixel is (x0, y0).
    const auto rotateTile4x4 = [&](int x0, int y0) {
        const std::uint32_t* src = inputPixels + y0 * width + x0;
        // Output row for source column x0; column x0 + c lives c rows above.
        std::uint32_t* dst = outputPixels + y0 + (width - x0 - 1) * height;

        // vtrnq pairs up lanes of two rows: {[a0 b0 a2 b2], [a1 b1 a3 b3]}.
        const uint32x4x2_t t01 =
            vtrnq_u32(vld1q_u32(src), vld1q_u32(src + width));
        const uint32x4x2_t t23 =
            vtrnq_u32(vld1q_u32(src + width * 2), vld1q_u32(src + width * 3));

        // Low halves form source columns 0 and 1, high halves columns 2 and 3.
        vst1q_u32(dst, vcombine_u32(vget_low_u32(t01.val[0]),
                                    vget_low_u32(t23.val[0])));
        vst1q_u32(dst - height, vcombine_u32(vget_low_u32(t01.val[1]),
                                             vget_low_u32(t23.val[1])));
        vst1q_u32(dst - height * 2, vcombine_u32(vget_high_u32(t01.val[0]),
                                                 vget_high_u32(t23.val[0])));
        vst1q_u32(dst - height * 3, vcombine_u32(vget_high_u32(t01.val[1]),
                                                 vget_high_u32(t23.val[1])));
    };

    for (int i = 0; i < blockedHeight; i += kBlockPixelSize)
    {
        for (int j = 0; j < blockedWidth; j += kBlockPixelSize)
        {
            // Left half of the block first: the two tiles together complete
            // four 8-pixel output rows, then the right half does the same.
            rotateTile4x4(j, i);
            rotateTile4x4(j, i + kTilePixelSize);
            rotateTile4x4(j + kTilePixelSize, i);
            rotateTile4x4(j + kTilePixelSize, i + kTilePixelSize);
        }
    }
#else
    // No NEON: leave the blocked area empty so the scalar loops cover it all.
    const int blockedWidth = 0;
    const int blockedHeight = 0;
#endif

    // Columns to the right of the blocked area, for the blocked rows.
    for (int i = 0; i < blockedHeight; ++i)
    {
        for (int j = blockedWidth; j < width; ++j)
        {
            copyPixel(j, i);
        }
    }
    // Rows below the blocked area, across the full width.
    for (int i = blockedHeight; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            copyPixel(j, i);
        }
    }
}
// Rotates a 32-bit-per-pixel image by a quarter turn, processing 4x4 blocks
// with NEON when it is available.
//
// Mapping: the source pixel at column x, row y is written to output row
// (width - 1 - x), column y. The output therefore has `width` rows of
// `height` pixels each (270 degrees clockwise when row 0 is the top row).
//
// Each full 4x4 block is loaded as four uint32x4_t rows, transposed in
// registers and stored as four output rows; pixels are never reinterpreted
// as float. Columns and rows that do not fill a whole block are copied one
// pixel at a time. When the translation unit is built without NEON
// (__ARM_NEON undefined) the whole image goes through that scalar path, so
// the function still compiles and produces the same output.
//
// input  - source pixels, row-major, `width` pixels per row, `height` rows.
//          Accessed as std::uint32_t (assumes the buffer is suitably aligned
//          for 32-bit access - TODO confirm with callers).
// width  - source row length in pixels; nothing is copied when <= 0.
// height - number of source rows; nothing is copied when <= 0.
// output - destination buffer with room for width * height pixels. It must
//          not overlap `input` (both pointers are declared __restrict__).
void Rotate2704(const unsigned char* __restrict__ input, int width, int height,
                unsigned char* __restrict__ output)
{
    const std::uint32_t* inputPixels =
        reinterpret_cast<const std::uint32_t*>(input);
    std::uint32_t* outputPixels = reinterpret_cast<std::uint32_t*>(output);

    // Scalar copy of a single pixel to its rotated position.
    const auto copyPixel = [&](int x, int y) {
        outputPixels[y + (width - x - 1) * height] = inputPixels[y * width + x];
    };

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    // Signed on purpose: an unsigned block size would make `dim - size` wrap
    // around for images smaller than one block.
    constexpr int kBlockPixelSize = 4;

    // Extent of the area covered by whole 4x4 blocks.
    const int blockedWidth = width > 0 ? width - width % kBlockPixelSize : 0;
    const int blockedHeight =
        height > 0 ? height - height % kBlockPixelSize : 0;

    for (int i = 0; i < blockedHeight; i += kBlockPixelSize)
    {
        for (int j = 0; j < blockedWidth; j += kBlockPixelSize)
        {
            const std::uint32_t* start = inputPixels + i * width + j;
            // Output row for source column j; column j + c lives c rows above.
            std::uint32_t* dest = outputPixels + i + (width - j - 1) * height;

            // vtrnq pairs up lanes of two rows:
            // {[a0 b0 a2 b2], [a1 b1 a3 b3]}.
            const uint32x4x2_t t01 =
                vtrnq_u32(vld1q_u32(start), vld1q_u32(start + width));
            const uint32x4x2_t t23 = vtrnq_u32(vld1q_u32(start + width * 2),
                                               vld1q_u32(start + width * 3));

            // Low halves form source columns 0 and 1, high halves 2 and 3.
            vst1q_u32(dest, vcombine_u32(vget_low_u32(t01.val[0]),
                                         vget_low_u32(t23.val[0])));
            vst1q_u32(dest - height, vcombine_u32(vget_low_u32(t01.val[1]),
                                                  vget_low_u32(t23.val[1])));
            vst1q_u32(dest - height * 2,
                      vcombine_u32(vget_high_u32(t01.val[0]),
                                   vget_high_u32(t23.val[0])));
            vst1q_u32(dest - height * 3,
                      vcombine_u32(vget_high_u32(t01.val[1]),
                                   vget_high_u32(t23.val[1])));
        }
    }
#else
    // No NEON: leave the blocked area empty so the scalar loops cover it all.
    const int blockedWidth = 0;
    const int blockedHeight = 0;
#endif

    // Columns to the right of the blocked area, for the blocked rows.
    for (int i = 0; i < blockedHeight; ++i)
    {
        for (int j = blockedWidth; j < width; ++j)
        {
            copyPixel(j, i);
        }
    }
    // Rows below the blocked area, across the full width.
    for (int i = blockedHeight; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            copyPixel(j, i);
        }
    }
}
// Straightforward scalar quarter-turn rotation of a 32-bit-per-pixel image.
//
// Walks the source row by row and writes the pixel at column `col`, row `row`
// to output row (width - 1 - col), column `row`; the output therefore has
// `width` rows of `height` pixels each.
//
// input  - source pixels, row-major, `width` pixels per row, `height` rows
//          (read as std::uint32_t).
// width  - source row length in pixels.
// height - number of source rows.
// output - destination buffer with room for width * height pixels.
void Rotate2702(const unsigned char* input, int width, int height,
                unsigned char* output)
{
    const auto* src = reinterpret_cast<const std::uint32_t*>(input);
    auto* dst = reinterpret_cast<std::uint32_t*>(output);

    for (int row = 0; row < height; ++row)
    {
        const int srcRowStart = row * width;
        for (int col = 0; col < width; ++col)
        {
            // Source columns map to output rows in reverse order.
            const int dstRow = width - col - 1;
            dst[row + dstRow * height] = src[srcRowStart + col];
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment