This gist will be to place steps and numbers for optimization of msd radix sort.
Benchmakring for Travis' radix_sort7
LSD radix sort implementation:
CPU Caches:
L1 Data 32K (x28)
L1 Instruction 32K (x28)
L2 Unified 1024K (x28)
L3 Unified 19712K (x2)
Load Average: 0.47, 0.44, 0.63
--------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------------------------------------
SortingBmk_random_wholeRange/StdStableSort/10000 537 us 535 us 5230
SortingBmk_random_wholeRange/StdStableSort/60000 3921 us 3912 us 716
SortingBmk_random_wholeRange/StdStableSort/110000 7686 us 7668 us 365
SortingBmk_random_wholeRange/StdStableSort/160000 11455 us 11428 us 245
SortingBmk_random_wholeRange/StdStableSort/210000 15475 us 15438 us 181
SortingBmk_random_wholeRange/StdStableSort/260000 19690 us 19643 us 143
SortingBmk_random_wholeRange/StdStableSort/310000 23834 us 23778 us 118
SortingBmk_random_wholeRange/StdStableSort/360000 28007 us 27941 us 100
SortingBmk_random_wholeRange/StdStableSort/410000 32287 us 32211 us 87
SortingBmk_random_wholeRange/StdStableSort/460000 36361 us 36273 us 77
SortingBmk_random_wholeRange/StdStableSort/510000 40226 us 40130 us 70
SortingBmk_random_wholeRange/StdStableSort/560000 44344 us 44239 us 63
CPU Caches:
L1 Data 32K (x28)
L1 Instruction 32K (x28)
L2 Unified 1024K (x28)
L3 Unified 19712K (x2)
Benchmark Time CPU Iterations
-------------------------------------------------------------------------------------------
SortingBmk_random_wholeRange/MSDRadixSort/10000 195 us 195 us 14394
SortingBmk_random_wholeRange/MSDRadixSort/60000 1212 us 1209 us 2316
SortingBmk_random_wholeRange/MSDRadixSort/110000 2439 us 2433 us 1155
SortingBmk_random_wholeRange/MSDRadixSort/160000 3531 us 3522 us 795
SortingBmk_random_wholeRange/MSDRadixSort/210000 4624 us 4613 us 608
SortingBmk_random_wholeRange/MSDRadixSort/260000 5732 us 5718 us 491
SortingBmk_random_wholeRange/MSDRadixSort/310000 6789 us 6773 us 413
SortingBmk_random_wholeRange/MSDRadixSort/360000 7929 us 7910 us 353
SortingBmk_random_wholeRange/MSDRadixSort/410000 9034 us 9012 us 311
SortingBmk_random_wholeRange/MSDRadixSort/460000 10086 us 10062 us 278
SortingBmk_random_wholeRange/MSDRadixSort/510000 11157 us 11131 us 252
SortingBmk_random_wholeRange/MSDRadixSort/560000 12223 us 12194 us 230
CPU Caches:
L1 Data 32K (x28)
L1 Instruction 32K (x28)
L2 Unified 1024K (x28)
L3 Unified 19712K (x2)
Load Average: 0.99, 0.63, 0.68
-------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------------------------
SortingBmk_random_wholeRange/LSDRadixSort/10000 82.2 us 82.1 us 34223
SortingBmk_random_wholeRange/LSDRadixSort/60000 552 us 550 us 5092
SortingBmk_random_wholeRange/LSDRadixSort/110000 1442 us 1438 us 1941
SortingBmk_random_wholeRange/LSDRadixSort/160000 2103 us 2099 us 1337
SortingBmk_random_wholeRange/LSDRadixSort/210000 2781 us 2775 us 1009
SortingBmk_random_wholeRange/LSDRadixSort/260000 3461 us 3453 us 811
SortingBmk_random_wholeRange/LSDRadixSort/310000 4143 us 4133 us 678
SortingBmk_random_wholeRange/LSDRadixSort/360000 4835 us 4823 us 580
SortingBmk_random_wholeRange/LSDRadixSort/410000 5543 us 5530 us 505
SortingBmk_random_wholeRange/LSDRadixSort/460000 6233 us 6219 us 452
SortingBmk_random_wholeRange/LSDRadixSort/510000 6912 us 6895 us 406
SortingBmk_random_wholeRange/LSDRadixSort/560000 7599 us 7581 us 369
========================================================= The first version of MSD is below.
Code
#include <bits/stdc++.h>
using namespace std;
using T = uint64_t;
const size_t RADIX_BITS = 1;
const size_t RADIX_SIZE = (size_t)1 << RADIX_BITS;
const size_t RADIX_LEVELS = (63 / RADIX_BITS) + 1;
const size_t DO_FINAL_SWAP = RADIX_LEVELS & 1;
const T RADIX_MASK = RADIX_SIZE - 1;
constexpr auto partFunc = [](T v, size_t i) -> int { return (v >> i) & RADIX_MASK; }; // it looks to be inlined
// pass <- [7, 0]
void radix_msd_rec(vector<T>& from, vector<T>& to, size_t lo, size_t hi, size_t pass) {
//cout << "ITER " << lo << " " << hi << ", " << pass << endl;
size_t shift = pass * RADIX_BITS;
size_t freq[RADIX_SIZE] = {};
for (size_t i = lo; i < hi; ++i) {
T value = from[i];
++freq[partFunc(value, shift)];
}
T* queue_ptrs[RADIX_SIZE];
queue_ptrs[0] = &to[lo];
for (size_t i = 1; i < RADIX_SIZE; ++i)
queue_ptrs[i] = queue_ptrs[i-1] + freq[i-1];
for (size_t i = lo; i < hi; ++i) {
T value = from[i];
size_t index = partFunc(value, shift);
*queue_ptrs[index] = value;
++queue_ptrs[index];
}
auto newLo = lo;
auto newHi = lo + freq[0];
for (size_t i = 1; i < RADIX_SIZE; ++i) {
if (newHi - newLo > 1 && pass != 0) // at least one element to sort
radix_msd_rec(to, from, newLo, newHi, pass - 1);
newLo = newHi;
newHi += freq[i];
}
if (newHi - newLo > 1 && pass != 0)
radix_msd_rec(to, from, newLo, newHi, pass - 1);
}
void radix_sort_msd(vector<T>& data)
{
vector<T> buf(data.size());
radix_msd_rec(data, buf, 0, data.size(), 2);
if (DO_FINAL_SWAP)
swap(data, buf);
}
int main()
{
vector<T> data =
{0b11,
0b10,
0b01,
0b00};
radix_sort_msd(data);
for (auto& v : data) {
cout << bitset<2>(v) << endl;
}
for (auto& v : data) {
cout << v << " ";
}
cout << endl;
return 0;
}
Next steps:
1. test properly on random numbers2. benchmark3. check out known optimizations:
3.1 Skip trivial3.2 precompute histograms? --> looks impossible
3.4 prefetch?3.5 Avoid recursion?
3.6 check out asm in godbolt quickly?3.7 perf with bmks?3.8 Try MSD until size of the range is not in L1 where we use LSD link