Created
September 27, 2019 20:19
-
-
Save thomasdenney/f6f8d938ec0459162ed3cb7a2a2d1412 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <cstdint> | |
#include <cstdio> | |
#include <xmmintrin.h> | |
typedef int32_t v4 __attribute__ ((vector_size(sizeof(int32_t) * 4))); | |
inline bool v4equal(v4 a, v4 b) | |
{ | |
__m128i v1 = _mm_load_si128((__m128i*)&a); | |
__m128i v2 = _mm_load_si128((__m128i*)&b); | |
__m128i vcmp = _mm_cmpeq_epi32(v1, v2); | |
uint16_t mask = _mm_movemask_epi8(vcmp); | |
return mask == 0xffff; | |
} | |
inline bool v4anyGreater(v4 a, v4 b) | |
{ | |
__m128i v1 = _mm_load_si128((__m128i*)&a); | |
__m128i v2 = _mm_load_si128((__m128i*)&b); | |
__m128i vcmp = _mm_cmpgt_epi32(v1, v2); | |
uint16_t mask = _mm_movemask_epi8(vcmp); | |
return mask != 0; | |
} | |
class Sujiko | |
{ | |
private: | |
uint16_t _usedSet; | |
uint16_t _usedCells; | |
uint8_t _cells[9]; | |
v4 _sums; | |
uint8_t _diag1, _diag2; | |
v4 _clues; | |
inline bool filled(); | |
inline bool used(uint8_t v); | |
inline void unset(size_t cell, uint8_t value); | |
inline bool cantBeValid(); | |
inline bool valid(); | |
public: | |
Sujiko(uint8_t clue1, uint8_t clue2, uint8_t clue3, uint8_t clue4) | |
: _usedSet(0), | |
_usedCells(0), | |
_cells {0}, | |
_sums {0,0,0,0}, | |
_diag1(clue1 + clue3), | |
_diag2(clue2 + clue4), | |
_clues { clue4, clue3, clue2, clue1 } | |
{} | |
inline void set(size_t cell, uint8_t value); | |
bool solve(); | |
void print(); | |
}; | |
static const v4 multipliers[] = { | |
{ 0, 0, 0, 1 }, | |
{ 0, 0, 1, 1 }, | |
{ 0, 0, 1, 0 }, | |
{ 0, 1, 0, 1 }, | |
{ 1, 1, 1, 1 }, | |
{ 1, 0, 1, 0 }, | |
{ 0, 1, 0, 0 }, | |
{ 1, 1, 0, 0 }, | |
{ 1, 0, 0, 0 }, | |
}; | |
void Sujiko::set(size_t cell, uint8_t value) | |
{ | |
_sums += multipliers[cell] * value; | |
_cells[cell] = value; | |
_usedSet |= 1 << value; | |
_usedCells |= 1 << cell; | |
} | |
void Sujiko::unset(size_t cell, uint8_t value) | |
{ | |
_usedSet ^= 1 << value; | |
_usedCells ^= 1 << cell; | |
_cells[cell] = 0; | |
_sums -= multipliers[cell] * value; | |
} | |
bool Sujiko::filled() | |
{ | |
return _usedSet == 0b1111111110; | |
} | |
bool Sujiko::valid() | |
{ | |
return filled() && v4equal(_sums, _clues); | |
} | |
bool Sujiko::cantBeValid() | |
{ | |
return | |
// These identities are from Wikipedia and allow us to bail earlier | |
(_cells[0] + _cells[8]) + _diag2 > 45 + _cells[4] || | |
(_cells[2] + _cells[6]) + _diag1 > 45 + _cells[4] || | |
// General properties | |
v4anyGreater(_sums, _clues); | |
} | |
bool Sujiko::used(uint8_t v) | |
{ | |
return (_usedSet & (1 << v)) != 0; | |
} | |
// Precondition: not filled | |
bool Sujiko::solve() | |
{ | |
int firstUnusedCell = 4; | |
// This loop gets unrolled, experiments with running from a higher known | |
// index didn't make it faster. This static order works from most | |
// constrained to least constrained. | |
static const size_t constrainedOrder[] = { 4, 1, 5, 7, 3, 0, 2, 6, 8 }; | |
for (int i = 0; i < 9; i++) { | |
// The compiler inlines the test values from above | |
if ((_usedCells & (1 << constrainedOrder[i])) == 0) { | |
firstUnusedCell = constrainedOrder[i]; | |
break; | |
} | |
} | |
v4 diff = _clues - _sums; | |
if (firstUnusedCell == 0 || firstUnusedCell == 2 || firstUnusedCell == 6 || firstUnusedCell == 8) { | |
uint16_t toUse = (((1 << diff[0]) | (1 << diff[1])) | ((1 << diff[2]) | (1 << diff[3]))) & 0b1111111110; | |
if ((_usedSet ^ toUse) != 0b1111111110) { | |
// Means we would need to duplicate a value | |
return false; | |
} else { | |
if ((_usedCells & (1 << 0)) == 0) | |
set(0, diff[3]); | |
if ((_usedCells & (1 << 2)) == 0) | |
set(2, diff[2]); | |
if ((_usedCells & (1 << 6)) == 0) | |
set(6, diff[1]); | |
if ((_usedCells & (1 << 8)) == 0) | |
set(8, diff[0]); | |
return true; | |
} | |
} | |
uint8_t highestUnused = 63 - __builtin_clzl(~_usedSet & 0b1111111111); | |
const uint16_t originalSet = _usedSet; | |
for (uint8_t testValue = highestUnused; testValue != 0; --testValue) { | |
// Testing against the original set is a bit faster because there is no | |
// longer a data dependency | |
if ((originalSet & (1 << testValue)) == 0) { | |
set(firstUnusedCell, testValue); | |
if (!cantBeValid()) { | |
// This is the same as not filled | |
if (firstUnusedCell != 8) { | |
if (solve()) { | |
return true; | |
} | |
} | |
if (valid()) { | |
return true; | |
} | |
} | |
unset(firstUnusedCell, testValue); | |
} | |
} | |
return false; | |
} | |
void Sujiko::print() | |
{ | |
printf("%d,%d,%d,%d\n%d%d%d\n%d%d%d\n%d%d%d\n", | |
_clues[3], _clues[2], _clues[1], _clues[0], | |
_cells[0], _cells[1], _cells[2], | |
_cells[3], _cells[4], _cells[5], | |
_cells[6], _cells[7], _cells[8]); | |
} | |
static __inline__ unsigned long long rdtsc(void) | |
{ | |
unsigned hi, lo; | |
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); | |
return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); | |
} | |
static const int ATTEMPTS = 300000; | |
int main(int argc, char* argv[]) | |
{ | |
Sujiko board(22, 21, 24, 24); | |
board.set(1, 4); | |
board.set(2, 3); | |
board.solve(); | |
unsigned long long start = rdtsc(); | |
for (int i = 0; i < ATTEMPTS; i++) { | |
Sujiko board(22, 21, 24, 24); | |
board.set(1, 4); | |
board.set(2, 3); | |
board.solve(); | |
} | |
unsigned long long end = rdtsc(); | |
unsigned long long cycles = end - start; | |
board.print(); | |
printf("Average time is %f cycles, %f us\n", (double)cycles / ATTEMPTS, (double)cycles / 2.8e9 * 1e6 / (double)ATTEMPTS); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment