Created
September 4, 2018 20:19
-
-
Save llogiq/5763578ef8f60f7646798ed59b9e7404 to your computer and use it in GitHub Desktop.
single-threaded fannkuch_redux stdsimd version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::arch::x86_64::*; | |
#[derive(Copy, Clone, Debug)] | |
pub struct u8x16(__m128i); | |
impl Default for u8x16 { | |
fn default() -> Self { u8x16(unsafe { _mm_setzero_si128() }) } | |
} | |
impl u8x16 { | |
pub fn from_slice_unaligned(s: &[u8]) -> u8x16 { | |
u8x16(unsafe { _mm_loadu_si128(s.as_ptr() as *const __m128i) }) | |
} | |
pub fn write_to_slice_unaligned(self, s: &mut [u8]) { | |
unsafe { _mm_storeu_si128(s.as_mut_ptr() as *mut __m128i, self.0) } | |
} | |
pub fn extract(self, _idx: usize) -> u8 { | |
unsafe { _mm_extract_epi8(self.0, 0i32) as u8 } | |
} | |
pub fn permute_dyn(self, indices: Self) -> Self { | |
u8x16(unsafe { _mm_shuffle_epi8(self.0, indices.0) }) | |
} | |
} | |
#[derive(Default)] | |
struct State { | |
s: [u8; 16], | |
flip_masks: [u8x16; 16], | |
rotate_masks: [u8x16; 16], | |
maxflips: i32, | |
odd: u16, | |
checksum: i32, | |
} | |
impl State { | |
fn rotate_sisd(&mut self, n: usize) { | |
let c = self.s[0]; | |
for i in 1..(n + 1) { | |
self.s[i - 1] = self.s[i]; | |
} | |
self.s[n] = c; | |
} | |
fn popmasks(&mut self) { | |
let mut mask = [0_u8; 16]; | |
for i in 0..16 { | |
for (j, m) in mask.iter_mut().enumerate() { | |
*m = j as u8; | |
} | |
for x in 0..(i + 1) / 2 { | |
mask.swap(x, i - x); | |
} | |
self.flip_masks[i] = u8x16::from_slice_unaligned(&mask); | |
for (j, s) in self.s.iter_mut().enumerate() { | |
*s = j as u8; | |
} | |
self.rotate_sisd(i); | |
self.rotate_masks[i] = self.load_s(); | |
} | |
} | |
fn rotate(&mut self, n: usize) { | |
self.load_s() | |
.permute_dyn(self.rotate_masks[n]) | |
.write_to_slice_unaligned(&mut self.s) | |
} | |
fn load_s(&self) -> u8x16 { | |
u8x16::from_slice_unaligned(&self.s) | |
} | |
fn tk(&mut self, n: usize) { | |
#[derive(Copy, Clone, Debug, Default)] | |
struct Perm { | |
perm: u8x16, | |
start: u8, | |
odd: u16, | |
} | |
let mut perms = [Perm::default(); 60]; | |
let mut i = 0; | |
let mut c = [0_u8; 16]; | |
let mut perm_max = 0; | |
while i < n { | |
while i < n && perm_max < 60 { | |
self.rotate(i); | |
if c[i] as usize >= i { | |
c[i] = 0; | |
i += 1; | |
continue; | |
} | |
c[i] += 1; | |
i = 1; | |
self.odd = !self.odd; | |
if self.s[0] != 0 { | |
if self.s[self.s[0] as usize] == 0 { | |
if self.maxflips == 0 { | |
self.maxflips = 1 | |
} | |
self.checksum += if self.odd == 0 { 1 } else { -1 }; | |
} else { | |
perms[perm_max].perm = self.load_s(); | |
perms[perm_max].start = self.s[0]; | |
perms[perm_max].odd = self.odd; | |
perm_max += 1; | |
} | |
} | |
} | |
let mut k = 0; | |
while k < std::cmp::max(1, perm_max) - 1 { | |
let pk = &perms[k]; | |
let pk1 = &perms[k + 1]; | |
let mut perm1 = pk.perm; | |
let mut perm2 = pk1.perm; | |
let mut f1 = 0; | |
let mut f2 = 0; | |
let mut toterm1 = pk.start; | |
let mut toterm2 = pk1.start; | |
while toterm1 != 0 && toterm2 != 0 { | |
perm1 = | |
perm1.permute_dyn(self.flip_masks[toterm1 as usize]); | |
perm2 = | |
perm2.permute_dyn(self.flip_masks[toterm2 as usize]); | |
toterm1 = perm1.extract(0); | |
toterm2 = perm2.extract(0); | |
f1 += 1; | |
f2 += 1; | |
} | |
while toterm1 != 0 { | |
perm1 = | |
perm1.permute_dyn(self.flip_masks[toterm1 as usize]); | |
toterm1 = perm1.extract(0); | |
f1 += 1; | |
} | |
while toterm2 != 0 { | |
perm2 = | |
perm2.permute_dyn(self.flip_masks[toterm2 as usize]); | |
toterm2 = perm2.extract(0); | |
f2 += 1; | |
} | |
if f1 > self.maxflips { | |
self.maxflips = f1 | |
} | |
if f2 > self.maxflips { | |
self.maxflips = f2 | |
} | |
self.checksum += if pk.odd == 0 { f1 } else { -f1 }; | |
self.checksum += if pk1.odd == 0 { f2 } else { -f2 }; | |
k += 2; | |
} | |
while k < perm_max { | |
let pk = &perms[k]; | |
let mut perm = pk.perm; | |
let mut f = 0; | |
let mut toterm = pk.start; | |
while toterm != 0 { | |
perm = perm.permute_dyn(self.flip_masks[toterm as usize]); | |
toterm = perm.extract(0); | |
f += 1; | |
} | |
if f > self.maxflips { | |
self.maxflips = f | |
} | |
self.checksum += if pk.odd == 0 { f } else { -f }; | |
k += 1 | |
} | |
perm_max = 0; | |
} | |
} | |
} | |
fn main() { | |
let n: usize = std::env::args().nth(1).and_then(|s| s.parse().ok()) | |
.unwrap_or(7); | |
let mut state = State::default(); | |
state.popmasks(); | |
for i in 0..n { | |
state.s[i] = i as u8 | |
} | |
state.tk(n); | |
let (checksum, maxflips) = (state.checksum, state.maxflips); | |
println!("{}\nPfannkuchen({}) = {}", checksum, n, maxflips); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment