LYP951018 · September 27, 2019 12:21
diff --git a/Main.cpp b/Main.cpp
 // Scalar, cache-blocked rotation of an image made of 32-bit pixels.
 //
 // Input pixel (x, y) of the row-major `width` x `height` image is written to
 // output[(width - 1 - x) * height + y]; the output is therefore a row-major
 // image that is `height` pixels wide and `width` pixels tall (a 90 degree
 // counter-clockwise turn when y grows downward).
 //
 // input  - source pixels, read as width * height std::uint32_t values.
 // width  - number of pixels per input row.
 // height - number of input rows.
 // output - destination, written as width * height std::uint32_t values; must
 //          not overlap `input` (both pointers are __restrict__).
 // NOTE(review): both buffers are accessed through std::uint32_t*, so they are
 // assumed to be suitably aligned for that type - confirm against callers.
 void Rotate2701(const unsigned char* __restrict__ input, int width, int height,
                 unsigned char* __restrict__ output)
 {
    if (width <= 0 || height <= 0)
    {
        return;
    }

    const std::uint32_t* inputPixels =
        reinterpret_cast<const std::uint32_t*>(input);
    std::uint32_t* outputPixels = reinterpret_cast<std::uint32_t*>(output);

    // Kept signed on purpose: mixing `int` with an unsigned block size makes
    // expressions such as `height - blockSize` wrap around for small images.
    constexpr int kBlockPixelSize = 8;

    // Extent of the area that is covered by complete blocks.
    const int blockedWidth = width - width % kBlockPixelSize;
    const int blockedHeight = height - height % kBlockPixelSize;

    const auto transpose = [&](int x, int y) {
        outputPixels[y + (width - x - 1) * height] = inputPixels[y * width + x];
    };

    // Complete blocks first.
    for (int i = 0; i < blockedHeight; i += kBlockPixelSize)
    {
        for (int j = 0; j < blockedWidth; j += kBlockPixelSize)
        {
            for (int k = 0; k < kBlockPixelSize; ++k)
            {
                const int y = i + k;
                for (int m = 0; m < kBlockPixelSize; ++m)
                {
                    transpose(j + m, y);
                }
            }
        }
    }

    // Right strip: columns past the last complete block, blocked rows only.
    for (int i = 0; i < blockedHeight; ++i)
    {
        for (int j = blockedWidth; j < width; ++j)
        {
            transpose(j, i);
        }
    }

    // Bottom strip: rows past the last complete block, every column (this
    // also covers the bottom-right corner).
    for (int i = blockedHeight; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            transpose(j, i);
        }
    }
 }

 // NEON counterpart of the SSE _MM_TRANSPOSE4_PS helper: transposes, in place,
 // the 4x4 matrix whose rows are the four float32x4_t lvalues passed in.
 //
 // With row0 = [a0 a1 a2 a3], row1 = [b0 ..], row2 = [c0 ..], row3 = [d0 ..]:
 //   tmp0 = [a0 a1 b0 b1]   tmp1 = [a2 a3 b2 b3]
 //   tmp2 = [c0 c1 d0 d1]   tmp3 = [c2 c3 d2 d3]
 // vuzpq_f32(x, y) returns the even lanes of x:y in val[0] and the odd lanes
 // in val[1], so the low halves (tmp0, tmp2) must be paired together to get
 // [a0 b0 c0 d0] / [a1 b1 c1 d1], and the high halves (tmp1, tmp3) to get
 // [a2 b2 c2 d2] / [a3 b3 c3 d3]. Pairing tmp0 with tmp1 would only yield
 // [a0 b0 a2 b2], which is not a transpose.
 //
 // Every argument is evaluated more than once and is assigned to, so pass
 // plain lvalues without side effects.
 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
    { \
        float32x4_t tmp0 = \
            vcombine_f32(vget_low_f32(row0), vget_low_f32(row1)); \
        float32x4_t tmp1 = \
            vcombine_f32(vget_high_f32(row0), vget_high_f32(row1)); \
        float32x4_t tmp2 = \
            vcombine_f32(vget_low_f32(row2), vget_low_f32(row3)); \
        float32x4_t tmp3 = \
            vcombine_f32(vget_high_f32(row2), vget_high_f32(row3)); \
        auto tmp4 = vuzpq_f32(tmp0, tmp2); \
        auto tmp5 = vuzpq_f32(tmp1, tmp3); \
        row0 = tmp4.val[0]; \
        row1 = tmp4.val[1]; \
        row2 = tmp5.val[0]; \
        row3 = tmp5.val[1]; \
    }

 void Rotate2703(const unsigned char* __restrict__ input, int width, int height,
                unsigned char* __restrict__ output)
 {
    const std::uint32_t* inputPixels =
        reinterpret_cast<const std::uint32_t*>(input);
    std::uint32_t* outputPixels = reinterpret_cast<std::uint32_t*>(output);

    const std::uint32_t kBlockPixelSize = 8;

    const std::uint32_t rowBlockCount = width / kBlockPixelSize;
    const std::uint32_t colBlockCount = height / kBlockPixelSize;

    const auto transpose = [&](int x, int y) {
        outputPixels[y + (width - x - 1) * height] = inputPixels[y * width + x];
    };

    // transpose blocks first
    for (int i = 0; i < height - kBlockPixelSize; i += kBlockPixelSize)
    {
        for (int j = 0; j < width - kBlockPixelSize; j += kBlockPixelSize)
        {
            const float* inputPixelsFp =
                reinterpret_cast<const float*>(inputPixels);
            float* ouputPixelsFp = reinterpret_cast<float*>(outputPixels);
            const float* start = inputPixelsFp + i * width + j;
            float* dest = ouputPixelsFp + i + (width - j - 1) * height;
            float32x4x2_t row0 = vld2q_f32(start);
            float32x4x2_t row1 = vld2q_f32(start + width);
            float32x4x2_t row2 = vld2q_f32(start + width * 2);
            float32x4x2_t row3 = vld2q_f32(start + width * 3);
            float32x4x2_t row4 = vld2q_f32(start + width * 4);
            float32x4x2_t row5 = vld2q_f32(start + width * 5);
            float32x4x2_t row6 = vld2q_f32(start + width * 6);
            float32x4x2_t row7 = vld2q_f32(start + width * 7);
            _MM_TRANSPOSE4_PS(row0.val[0], row1.val[0], row2.val[0],
                              row3.val[0]);
            _MM_TRANSPOSE4_PS(row4.val[0], row5.val[0], row6.val[0],
                              row7.val[0]);
            _MM_TRANSPOSE4_PS(row0.val[1], row1.val[1], row2.val[1],
                              row3.val[1]);
            _MM_TRANSPOSE4_PS(row4.val[1], row5.val[1], row6.val[1],
                              row7.val[1]);
            vst2q_f32(dest, (float32x4x2_t{row0.val[0], row4.val[0]}));
            vst2q_f32(dest - height, (float32x4x2_t{row1.val[0], row5.val[0]}));
            vst2q_f32(dest - height * 2,
                      (float32x4x2_t{row2.val[0], row6.val[0]}));
            vst2q_f32(dest - height * 3,
                      (float32x4x2_t{row3.val[0], row7.val[0]}));
            vst2q_f32(dest - height * 4,
                      (float32x4x2_t{row0.val[1], row4.val[1]}));
            vst2q_f32(dest - height * 5,
                      (float32x4x2_t{row1.val[1], row5.val[1]}));
            vst2q_f32(dest - height * 6,
                      (float32x4x2_t{row2.val[1], row6.val[1]}));
            vst2q_f32(dest - height * 7,
                      (float32x4x2_t{row3.val[1], row7.val[1]}));
        }
    }

    // pixels left
    int leftXStart = static_cast<int>(rowBlockCount * kBlockPixelSize);
    int leftYStart = static_cast<int>(colBlockCount * kBlockPixelSize);

    for (int i = leftYStart; i < height; ++i)
    {
        for (int j = leftXStart; j < width; ++j)
        {
            transpose(j, i);
        }
    }
 }

 void Rotate2704(const unsigned char* __restrict__ input, int width, int height,
                unsigned char* __restrict__ output)
 {
    const std::uint32_t* inputPixels =
        reinterpret_cast<const std::uint32_t*>(input);
    std::uint32_t* outputPixels = reinterpret_cast<std::uint32_t*>(output);

    const std::uint32_t kBlockPixelSize = 4;

    const std::uint32_t rowBlockCount = width / kBlockPixelSize;
    const std::uint32_t colBlockCount = height / kBlockPixelSize;

    const auto transpose = [&](int x, int y) {
        outputPixels[y + (width - x - 1) * height] = inputPixels[y * width + x];
    };

    // transpose blocks first
    for (int i = 0; i < height - kBlockPixelSize; i += kBlockPixelSize)
    {
        for (int j = 0; j < width - kBlockPixelSize; j += kBlockPixelSize)
        {
            const float* inputPixelsFp =
                reinterpret_cast<const float*>(inputPixels);
            float* ouputPixelsFp = reinterpret_cast<float*>(outputPixels);
            const float* start = inputPixelsFp + i * width + j;
            float* dest = ouputPixelsFp + i + (width - j - 1) * height;
            float32x4_t row0 = vld1q_f32(start);
            float32x4_t row1 = vld1q_f32(start + width);
            float32x4_t row2 = vld1q_f32(start + width * 2);
            float32x4_t row3 = vld1q_f32(start + width * 3);
            _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
            vst1q_f32(dest, row0);
            vst1q_f32(dest - height, row1);
            vst1q_f32(dest - height * 2, row2);
            vst1q_f32(dest - height * 3, row3);
        }
    }

    // pixels left
    int leftXStart = static_cast<int>(rowBlockCount * kBlockPixelSize);
    int leftYStart = static_cast<int>(colBlockCount * kBlockPixelSize);

    for (int i = leftYStart; i < height; ++i)
    {
        for (int j = leftXStart; j < width; ++j)
        {
            transpose(j, i);
        }
    }
 }

 // Straightforward per-pixel rotation of an image made of 32-bit pixels.
 //
 // Walks the input in row-major order and copies pixel (x, y) of the
 // `width` x `height` image to output[(width - 1 - x) * height + y].
 // The pointers are not __restrict__, so the traversal order is part of the
 // observable behaviour and is kept row-major over the input.
 void Rotate2702(const unsigned char* input, int width, int height,
                 unsigned char* output)
 {
    const auto* source = reinterpret_cast<const std::uint32_t*>(input);
    auto* destination = reinterpret_cast<std::uint32_t*>(output);

    for (int row = 0; row < height; ++row)
    {
        for (int col = 0; col < width; ++col)
        {
            // Input column `col` becomes output row `width - col - 1`;
            // input row `row` becomes the position inside that output row.
            const int destinationIndex = row + (width - col - 1) * height;
            const int sourceIndex = row * width + col;
            destination[destinationIndex] = source[sourceIndex];
        }
    }
 }
	// Scalar, cache-blocked rotation of an image made of 32-bit pixels.
	//
	// Input pixel (x, y) of the row-major `width` x `height` image is written
	// to output[(width - 1 - x) * height + y]; the output is therefore a
	// row-major image that is `height` pixels wide and `width` pixels tall
	// (a 90 degree counter-clockwise turn when y grows downward).
	//
	// input  - source pixels, read as width * height std::uint32_t values.
	// width  - number of pixels per input row.
	// height - number of input rows.
	// output - destination, written as width * height std::uint32_t values;
	//          must not overlap `input` (both pointers are __restrict__).
	// NOTE(review): both buffers are accessed through std::uint32_t*, so they
	// are assumed to be suitably aligned for that type - confirm with callers.
	void Rotate2701(const unsigned char* __restrict__ input, int width, int height,
	                unsigned char* __restrict__ output)
	{
	    if (width <= 0 || height <= 0)
	    {
	        return;
	    }

	    const std::uint32_t* inputPixels =
	        reinterpret_cast<const std::uint32_t*>(input);
	    std::uint32_t* outputPixels = reinterpret_cast<std::uint32_t*>(output);

	    // Kept signed on purpose: mixing `int` with an unsigned block size
	    // makes `height - blockSize` wrap around for small images.
	    constexpr int kBlockPixelSize = 8;

	    // Extent of the area that is covered by complete blocks.
	    const int blockedWidth = width - width % kBlockPixelSize;
	    const int blockedHeight = height - height % kBlockPixelSize;

	    const auto transpose = [&](int x, int y) {
	        outputPixels[y + (width - x - 1) * height] = inputPixels[y * width + x];
	    };

	    // Complete blocks first.
	    for (int i = 0; i < blockedHeight; i += kBlockPixelSize)
	    {
	        for (int j = 0; j < blockedWidth; j += kBlockPixelSize)
	        {
	            for (int k = 0; k < kBlockPixelSize; ++k)
	            {
	                const int y = i + k;
	                for (int m = 0; m < kBlockPixelSize; ++m)
	                {
	                    transpose(j + m, y);
	                }
	            }
	        }
	    }

	    // Right strip: columns past the last complete block, blocked rows only.
	    for (int i = 0; i < blockedHeight; ++i)
	    {
	        for (int j = blockedWidth; j < width; ++j)
	        {
	            transpose(j, i);
	        }
	    }

	    // Bottom strip: rows past the last complete block, every column (this
	    // also covers the bottom-right corner).
	    for (int i = blockedHeight; i < height; ++i)
	    {
	        for (int j = 0; j < width; ++j)
	        {
	            transpose(j, i);
	        }
	    }
	}

	// NEON counterpart of the SSE _MM_TRANSPOSE4_PS helper: transposes, in
	// place, the 4x4 matrix whose rows are the four float32x4_t lvalues given.
	//
	// With row0 = [a0 a1 a2 a3], row1 = [b0 ..], row2 = [c0 ..], row3 = [d0 ..]:
	//   tmp0 = [a0 a1 b0 b1]   tmp1 = [a2 a3 b2 b3]
	//   tmp2 = [c0 c1 d0 d1]   tmp3 = [c2 c3 d2 d3]
	// vuzpq_f32(x, y) returns the even lanes of x:y in val[0] and the odd lanes
	// in val[1], so the low halves (tmp0, tmp2) are paired to obtain
	// [a0 b0 c0 d0] / [a1 b1 c1 d1] and the high halves (tmp1, tmp3) to obtain
	// [a2 b2 c2 d2] / [a3 b3 c3 d3]. Pairing tmp0 with tmp1 would only yield
	// [a0 b0 a2 b2], which is not a transpose.
	//
	// Every argument is evaluated more than once and is assigned to, so pass
	// plain lvalues without side effects.
	#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
	    { \
	        float32x4_t tmp0 = \
	            vcombine_f32(vget_low_f32(row0), vget_low_f32(row1)); \
	        float32x4_t tmp1 = \
	            vcombine_f32(vget_high_f32(row0), vget_high_f32(row1)); \
	        float32x4_t tmp2 = \
	            vcombine_f32(vget_low_f32(row2), vget_low_f32(row3)); \
	        float32x4_t tmp3 = \
	            vcombine_f32(vget_high_f32(row2), vget_high_f32(row3)); \
	        auto tmp4 = vuzpq_f32(tmp0, tmp2); \
	        auto tmp5 = vuzpq_f32(tmp1, tmp3); \
	        row0 = tmp4.val[0]; \
	        row1 = tmp4.val[1]; \
	        row2 = tmp5.val[0]; \
	        row3 = tmp5.val[1]; \
	    }

	void Rotate2703(const unsigned char* __restrict__ input, int width, int height,
	unsigned char* __restrict__ output)
	{
	const std::uint32_t* inputPixels =
	reinterpret_cast<const std::uint32_t*>(input);
	std::uint32_t* outputPixels = reinterpret_cast<std::uint32_t*>(output);

	const std::uint32_t kBlockPixelSize = 8;

	const std::uint32_t rowBlockCount = width / kBlockPixelSize;
	const std::uint32_t colBlockCount = height / kBlockPixelSize;

	const auto transpose = [&](int x, int y) {
	outputPixels[y + (width - x - 1) * height] = inputPixels[y * width + x];
	};

	// transpose blocks first
	for (int i = 0; i < height - kBlockPixelSize; i += kBlockPixelSize)
	{
	for (int j = 0; j < width - kBlockPixelSize; j += kBlockPixelSize)
	{
	const float* inputPixelsFp =
	reinterpret_cast<const float*>(inputPixels);
	float* ouputPixelsFp = reinterpret_cast<float*>(outputPixels);
	const float* start = inputPixelsFp + i * width + j;
	float* dest = ouputPixelsFp + i + (width - j - 1) * height;
	float32x4x2_t row0 = vld2q_f32(start);
	float32x4x2_t row1 = vld2q_f32(start + width);
	float32x4x2_t row2 = vld2q_f32(start + width * 2);
	float32x4x2_t row3 = vld2q_f32(start + width * 3);
	float32x4x2_t row4 = vld2q_f32(start + width * 4);
	float32x4x2_t row5 = vld2q_f32(start + width * 5);
	float32x4x2_t row6 = vld2q_f32(start + width * 6);
	float32x4x2_t row7 = vld2q_f32(start + width * 7);
	_MM_TRANSPOSE4_PS(row0.val[0], row1.val[0], row2.val[0],
	row3.val[0]);
	_MM_TRANSPOSE4_PS(row4.val[0], row5.val[0], row6.val[0],
	row7.val[0]);
	_MM_TRANSPOSE4_PS(row0.val[1], row1.val[1], row2.val[1],
	row3.val[1]);
	_MM_TRANSPOSE4_PS(row4.val[1], row5.val[1], row6.val[1],
	row7.val[1]);
	vst2q_f32(dest, (float32x4x2_t{row0.val[0], row4.val[0]}));
	vst2q_f32(dest - height, (float32x4x2_t{row1.val[0], row5.val[0]}));
	vst2q_f32(dest - height * 2,
	(float32x4x2_t{row2.val[0], row6.val[0]}));
	vst2q_f32(dest - height * 3,
	(float32x4x2_t{row3.val[0], row7.val[0]}));
	vst2q_f32(dest - height * 4,
	(float32x4x2_t{row0.val[1], row4.val[1]}));
	vst2q_f32(dest - height * 5,
	(float32x4x2_t{row1.val[1], row5.val[1]}));
	vst2q_f32(dest - height * 6,
	(float32x4x2_t{row2.val[1], row6.val[1]}));
	vst2q_f32(dest - height * 7,
	(float32x4x2_t{row3.val[1], row7.val[1]}));
	}
	}

	// pixels left
	int leftXStart = static_cast<int>(rowBlockCount * kBlockPixelSize);
	int leftYStart = static_cast<int>(colBlockCount * kBlockPixelSize);

	for (int i = leftYStart; i < height; ++i)
	{
	for (int j = leftXStart; j < width; ++j)
	{
	transpose(j, i);
	}
	}
	}

	void Rotate2704(const unsigned char* __restrict__ input, int width, int height,
	unsigned char* __restrict__ output)
	{
	const std::uint32_t* inputPixels =
	reinterpret_cast<const std::uint32_t*>(input);
	std::uint32_t* outputPixels = reinterpret_cast<std::uint32_t*>(output);

	const std::uint32_t kBlockPixelSize = 4;

	const std::uint32_t rowBlockCount = width / kBlockPixelSize;
	const std::uint32_t colBlockCount = height / kBlockPixelSize;

	const auto transpose = [&](int x, int y) {
	outputPixels[y + (width - x - 1) * height] = inputPixels[y * width + x];
	};

	// transpose blocks first
	for (int i = 0; i < height - kBlockPixelSize; i += kBlockPixelSize)
	{
	for (int j = 0; j < width - kBlockPixelSize; j += kBlockPixelSize)
	{
	const float* inputPixelsFp =
	reinterpret_cast<const float*>(inputPixels);
	float* ouputPixelsFp = reinterpret_cast<float*>(outputPixels);
	const float* start = inputPixelsFp + i * width + j;
	float* dest = ouputPixelsFp + i + (width - j - 1) * height;
	float32x4_t row0 = vld1q_f32(start);
	float32x4_t row1 = vld1q_f32(start + width);
	float32x4_t row2 = vld1q_f32(start + width * 2);
	float32x4_t row3 = vld1q_f32(start + width * 3);
	_MM_TRANSPOSE4_PS(row0, row1, row2, row3);
	vst1q_f32(dest, row0);
	vst1q_f32(dest - height, row1);
	vst1q_f32(dest - height * 2, row2);
	vst1q_f32(dest - height * 3, row3);
	}
	}

	// pixels left
	int leftXStart = static_cast<int>(rowBlockCount * kBlockPixelSize);
	int leftYStart = static_cast<int>(colBlockCount * kBlockPixelSize);

	for (int i = leftYStart; i < height; ++i)
	{
	for (int j = leftXStart; j < width; ++j)
	{
	transpose(j, i);
	}
	}
	}

	// Straightforward per-pixel rotation of an image made of 32-bit pixels.
	//
	// Walks the input in row-major order and copies pixel (x, y) of the
	// `width` x `height` image to output[(width - 1 - x) * height + y].
	// The pointers are not __restrict__, so the traversal order is part of
	// the observable behaviour and is kept row-major over the input.
	void Rotate2702(const unsigned char* input, int width, int height,
	                unsigned char* output)
	{
	    const auto* source = reinterpret_cast<const std::uint32_t*>(input);
	    auto* destination = reinterpret_cast<std::uint32_t*>(output);

	    for (int row = 0; row < height; ++row)
	    {
	        for (int col = 0; col < width; ++col)
	        {
	            // Input column `col` becomes output row `width - col - 1`;
	            // input row `row` becomes the position inside that output row.
	            const int destinationIndex = row + (width - col - 1) * height;
	            const int sourceIndex = row * width + col;
	            destination[destinationIndex] = source[sourceIndex];
	        }
	    }
	}