nissy-core

The "engine" of nissy, including the H48 optimal solver.
git clone https://git.tronto.net/nissy-core
Download | Log | Files | Refs | README | LICENSE

commit 38014615238a51f1c0c1f2f96d37cbfbf765520a
parent 1d93860c6f6d3fdbdddabdcd870ef0f8b6e47c81
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Tue, 29 Jul 2025 10:00:56 +0200

Optimized some coordinates

Diffstat:
Msrc/arch/arch.h | 3---
Msrc/arch/avx2.h | 84+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
Msrc/arch/common.h | 5+++++
Dsrc/arch/coordinates_unoptimized.h | 53-----------------------------------------------------
Msrc/arch/neon.h | 86++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Msrc/arch/portable.h | 44++++++++++++++++++++++++++++++++++++++++++++
Msrc/utils/math.h | 22++++++++++++----------
7 files changed, 224 insertions(+), 73 deletions(-)

diff --git a/src/arch/arch.h b/src/arch/arch.h @@ -7,7 +7,6 @@ typedef __m256i cube_t; #if !defined(TEST_H) #include "common.h" #include "avx2.h" -#include "coordinates_unoptimized.h" #endif #elif defined(NEON) @@ -22,7 +21,6 @@ typedef struct { #if !defined(TEST_H) #include "common.h" #include "neon.h" -#include "coordinates_unoptimized.h" #endif #else @@ -35,7 +33,6 @@ typedef struct { #if !defined(TEST_H) #include "common.h" #include "portable.h" -#include "coordinates_unoptimized.h" #endif #endif diff --git a/src/arch/avx2.h b/src/arch/avx2.h @@ -12,6 +12,9 @@ #define CARRY_AVX2 _mm256_set_epi64x(INT64_C(0x20202020), \ INT64_C(0x2020202020202020), 0, INT64_C(0x6060606060606060)) +#define SOLVED_L INT64_C(0x0706050403020100) +#define SOLVED_H INT64_C(0x0B0A0908) + #define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \ e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \ _mm256_set_epi8(0, 0, 0, 0, e_br, e_bl, e_fl, e_fr, \ @@ -19,8 +22,11 @@ 0, 0, 0, 0, 0, 0, 0, 0, \ c_dbl, c_dfr, c_ubr, c_ufl, c_dbr, c_dfl, c_ubl, c_ufr) #define ZERO_CUBE _mm256_set_epi64x(0, 0, 0, 0) -#define SOLVED_CUBE STATIC_CUBE( \ - 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) +#define SOLVED_CUBE _mm256_set_epi64x(SOLVED_H, SOLVED_L, 0, SOLVED_L) + + +STATIC_INLINE int64_t permtoindex_8x8(int64_t); +STATIC_INLINE int64_t indextoperm_8x8(int64_t); STATIC_INLINE int popcount_u32(uint32_t x) @@ -287,3 +293,77 @@ set_eo(cube_t cube[static 1], int64_t eo) *cube = _mm256_andnot_si256(EO_AVX2, *cube); *cube = _mm256_or_si256(*cube, veo); } + +STATIC_INLINE int64_t +permtoindex_8x8(int64_t a) +{ + int64_t i, c, ret; + __m64 cmp; + + for (i = 0, ret = 0; i < 8; i++) { + cmp = _mm_set1_pi8(a & INT64_C(0xFF)); + a = (a >> INT64_C(8)) | INT64_C(0x0F00000000000000); + cmp = _mm_cmpgt_pi8(cmp, _mm_cvtsi64_m64(a)); + c = _mm_popcnt_u64(_mm_cvtm64_si64(cmp)) >> INT64_C(3); + ret += c * factorial[7-i]; + } + + return ret; +} + +STATIC_INLINE int64_t +indextoperm_8x8(int64_t p) +{ + int used; + int64_t c, k, i, j, ret; + + for (i = 0, ret = 0, used = 0; i < 8; i++) { + k = p / factorial[7-i]; + + /* Find k-th unused number */ + for (j = 0, c = 0; c <= k; j++) + c += 1 - ((used & (1 << j)) >> j); + + ret |= (j-1) << (8*i); + used |= 1 << (j-1); + p %= factorial[7-i]; + } + + return ret; +} + +STATIC_INLINE int64_t +coord_cp(cube_t cube) +{ + cube_t cp; + int64_t aux[4]; + + cp = _mm256_and_si256(cube, CP_AVX2); + _mm256_storeu_si256((__m256i_u *)aux, cp); + + return permtoindex_8x8(aux[0]); +} + +STATIC_INLINE cube_t +invcoord_cp(int64_t i) +{ + return _mm256_set_epi64x(SOLVED_H, SOLVED_L, 0, indextoperm_8x8(i)); +} + +STATIC_INLINE int64_t +coord_epud(cube_t cube) +{ + cube_t ep; + int64_t aux[4]; + + ep = _mm256_and_si256(cube, EP_AVX2); + _mm256_storeu_si256((__m256i_u *)aux, ep); + + return permtoindex_8x8(aux[2]); +} + +STATIC_INLINE cube_t +invcoord_epud(int64_t i) +{ + return _mm256_set_epi64x(SOLVED_H, indextoperm_8x8(i), 0, SOLVED_L); +} diff --git a/src/arch/common.h b/src/arch/common.h @@ -37,6 +37,11 @@ STATIC_INLINE void set_eo(cube_t [static 1], int64_t); STATIC_INLINE void invcoord_esep_array(int64_t, int64_t, uint8_t[static 12]); STATIC_INLINE cube_t invcoord_eoesep(int64_t); +STATIC_INLINE int64_t coord_cp(cube_t); +STATIC_INLINE cube_t invcoord_cp(int64_t); +STATIC_INLINE int64_t coord_epud(cube_t); +STATIC_INLINE cube_t invcoord_epud(int64_t); + STATIC_INLINE void invcoord_esep_array(int64_t set1, int64_t set2, uint8_t mem[static 12]) { diff --git a/src/arch/coordinates_unoptimized.h b/src/arch/coordinates_unoptimized.h @@ -1,53 +0,0 @@ -STATIC_INLINE int64_t coord_cp(cube_t); -STATIC_INLINE cube_t invcoord_cp(int64_t); -STATIC_INLINE int64_t coord_epud(cube_t); -STATIC_INLINE cube_t invcoord_epud(int64_t); - -STATIC_INLINE int64_t -coord_cp(cube_t cube) -{ - int i; - uint8_t c[8], e[12]; - - pieces(&cube, c, e); - for (i = 0; i < 8; i++) - c[i] &= PBITS; - - return permtoindex(8, c); -} - -STATIC_INLINE cube_t -invcoord_cp(int64_t i) -{ - uint8_t c[8]; - - indextoperm(i, 8, c); - - return STATIC_CUBE(c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); -} - -STATIC_INLINE int64_t -coord_epud(cube_t cube) -{ - int i; - uint8_t c[8], e[12]; - - pieces(&cube, c, e); - - for (i = 0; i < 8; i++) - e[i] &= PBITS; - - return permtoindex(8, e); -} - -STATIC_INLINE cube_t -invcoord_epud(int64_t i) -{ - uint8_t e[8]; - - indextoperm(i, 8, e); - - return STATIC_CUBE(0, 1, 2, 3, 4, 5, 6, 7, - e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], 8, 9, 10, 11); -} diff --git a/src/arch/neon.h b/src/arch/neon.h @@ -1,8 +1,6 @@ #define CO2_NEON vdup_n_u8(0x60) #define COCW_NEON vdup_n_u8(0x20) -#define CP_NEON vdup_n_u8(0x07) -#define EP_NEON vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F)) -#define EO_NEON vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10)) +#define PBITS8_NEON vdup_n_u8(0x07) STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t); STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t); @@ -20,16 +18,20 @@ STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t); e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0 \ } \ }) - #define ZERO_CUBE \ ((cube_t){ \ .corner = vdup_n_u8(0), \ .edge = vdupq_n_u8(0) \ }) - #define SOLVED_CUBE STATIC_CUBE( \ 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) +const uint8_t SOLVED_L[8] = {0, 1, 2, 3, 4, 5, 6, 7}; +const uint8_t SOLVED_H[8] = {8, 9, 10, 11, 0, 0, 0}; + +STATIC_INLINE int64_t permtoindex_8x8(uint8x8_t); +STATIC_INLINE uint8x8_t indextoperm_8x8(int64_t); + STATIC_INLINE int popcount_u32(uint32_t x) { @@ -361,3 +363,77 @@ invcoord_esep(int64_t esep) return ret; } + +STATIC_INLINE int64_t +permtoindex_8x8(uint8x8_t a) +{ + int64_t i, c, ret; + uint8x8_t cmp; + uint64x1_t anum; + uint8_t or[8] = {0, 0, 0, 0, 0, 0, 0, 0x0F}; + + for (i = 0, ret = 0; i < 8; i++) { + cmp = vdup_lane_u8(a, 0); + anum = vreinterpret_u64_u8(a); + anum = vshr_n_u64(anum, 8); + a = vreinterpret_u8_u64(anum); + a = vorr_u8(a, vld1_u8(or)); + cmp = vcgt_u8(cmp, a); + c = vaddv_u8(vshr_n_u8(cmp, 7)); + ret += c * factorial[7-i]; + } + + return ret; +} + +STATIC_INLINE uint8x8_t +indextoperm_8x8(int64_t p) +{ + int used; + int64_t c, k, i, j; + uint8_t ret[8]; + + for (i = 0, used = 0; i < 8; i++) { + k = p / factorial[7-i]; + + /* Find k-th unused number */ + for (j = 0, c = 0; c <= k; j++) + c += 1 - ((used & (1 << j)) >> j); + + ret[i] = j-1; + used |= 1 << (j-1); + p %= factorial[7-i]; + } + + return vld1_u8(ret); +} + +STATIC_INLINE int64_t +coord_cp(cube_t cube) +{ + return permtoindex_8x8(vand_u8(cube.corner, PBITS8_NEON)); +} + +STATIC_INLINE cube_t +invcoord_cp(int64_t i) +{ + return (cube_t) { + .corner = indextoperm_8x8(i), + .edge = vcombine_u8(vld1_u8(SOLVED_L), vld1_u8(SOLVED_H)) + }; +} + +STATIC_INLINE int64_t +coord_epud(cube_t cube) +{ + return permtoindex_8x8(vand_u8(vget_low_u8(cube.edge), PBITS8_NEON)); +} + +STATIC_INLINE cube_t +invcoord_epud(int64_t i) +{ + return (cube_t) { + .corner = vld1_u8(SOLVED_L), + .edge = vcombine_u8(indextoperm_8x8(i), vld1_u8(SOLVED_H)) + }; +} diff --git a/src/arch/portable.h b/src/arch/portable.h @@ -290,3 +290,47 @@ set_eo(cube_t cube[static 1], int64_t eo) } cube->edge[0] = (cube->edge[0] & ~EOBIT) | (EOBIT * (sum % 2)); } + +STATIC_INLINE int64_t +coord_cp(cube_t cube) +{ + int i; + + for (i = 0; i < 8; i++) + cube.corner[i] &= PBITS; + + return permtoindex(8, cube.corner); +} + +STATIC_INLINE cube_t +invcoord_cp(int64_t i) +{ + uint8_t c[8]; + + indextoperm(i, 8, c); + + return STATIC_CUBE(c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); +} + +STATIC_INLINE int64_t +coord_epud(cube_t cube) +{ + int i; + + for (i = 0; i < 8; i++) + cube.edge[i] &= PBITS; + + return permtoindex(8, cube.edge); +} + +STATIC_INLINE cube_t +invcoord_epud(int64_t i) +{ + uint8_t e[8]; + + indextoperm(i, 8, e); + + return STATIC_CUBE(0, 1, 2, 3, 4, 5, 6, 7, + e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], 8, 9, 10, 11); +} diff --git a/src/utils/math.h b/src/utils/math.h @@ -49,7 +49,7 @@ permtoindex(size_t n, const uint8_t *a) for (i = 0, ret = 0; i < n; i++) { for (j = i+1, c = 0; j < n; j++) - c += (a[i] > a[j]) ? 1 : 0; + c += a[i] > a[j]; ret += factorial[n-i-1] * c; } @@ -59,21 +59,23 @@ permtoindex(size_t n, const uint8_t *a) STATIC void indextoperm(int64_t p, size_t n, uint8_t *r) { - int64_t c; - size_t i, j; - uint8_t a[FACTORIAL_MAX+1]; + int64_t c, k; + size_t i, j, used; DBG_ASSERT(n <= FACTORIAL_MAX, "Error: cannot compute indextoperm() " "for set of size %zu > %" PRId64 "\n", n, FACTORIAL_MAX); DBG_ASSERT(p >= 0 && p < factorial[n], "Error: invalid permutation " "index %" PRId64 " for set of size %zu\n", p, n); - memset(a, 0, n); - for (i = 0; i < n; i++) { - for (j = 0, c = 0; c <= p / factorial[n-i-1]; j++) - c += a[j] ? 0 : 1; + for (i = 0, used = 0; i < n; i++) { + k = p / factorial[n-i-1]; + + /* Find k-th unused number */ + for (j = 0, c = 0; c <= k; j++) + c += 1 - ((used & (1<<j)) >> j); + r[i] = j-1; - a[j-1] = 1; + used |= 1 << (j-1); p %= factorial[n-i-1]; } @@ -87,7 +89,7 @@ permsign(size_t n, const uint8_t *a) for (i = 0, ret = 0; i < n; i++) for (j = i+1; j < n; j++) - ret += a[i] > a[j] ? 1 : 0; + ret += a[i] > a[j]; return ret % 2; }