commit 38014615238a51f1c0c1f2f96d37cbfbf765520a
parent 1d93860c6f6d3fdbdddabdcd870ef0f8b6e47c81
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date: Tue, 29 Jul 2025 10:00:56 +0200
Optimized some coordinates
Diffstat:
7 files changed, 224 insertions(+), 73 deletions(-)
diff --git a/src/arch/arch.h b/src/arch/arch.h
@@ -7,7 +7,6 @@ typedef __m256i cube_t;
#if !defined(TEST_H)
#include "common.h"
#include "avx2.h"
-#include "coordinates_unoptimized.h"
#endif
#elif defined(NEON)
@@ -22,7 +21,6 @@ typedef struct {
#if !defined(TEST_H)
#include "common.h"
#include "neon.h"
-#include "coordinates_unoptimized.h"
#endif
#else
@@ -35,7 +33,6 @@ typedef struct {
#if !defined(TEST_H)
#include "common.h"
#include "portable.h"
-#include "coordinates_unoptimized.h"
#endif
#endif
diff --git a/src/arch/avx2.h b/src/arch/avx2.h
@@ -12,6 +12,9 @@
#define CARRY_AVX2 _mm256_set_epi64x(INT64_C(0x20202020), \
INT64_C(0x2020202020202020), 0, INT64_C(0x6060606060606060))
+#define SOLVED_L INT64_C(0x0706050403020100)
+#define SOLVED_H INT64_C(0x0B0A0908)
+
#define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
_mm256_set_epi8(0, 0, 0, 0, e_br, e_bl, e_fl, e_fr, \
@@ -19,8 +22,11 @@
0, 0, 0, 0, 0, 0, 0, 0, \
c_dbl, c_dfr, c_ubr, c_ufl, c_dbr, c_dfl, c_ubl, c_ufr)
#define ZERO_CUBE _mm256_set_epi64x(0, 0, 0, 0)
-#define SOLVED_CUBE STATIC_CUBE( \
- 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
+#define SOLVED_CUBE _mm256_set_epi64x(SOLVED_H, SOLVED_L, 0, SOLVED_L)
+
+
+STATIC_INLINE int64_t permtoindex_8x8(int64_t);
+STATIC_INLINE int64_t indextoperm_8x8(int64_t);
STATIC_INLINE int
popcount_u32(uint32_t x)
@@ -287,3 +293,77 @@ set_eo(cube_t cube[static 1], int64_t eo)
*cube = _mm256_andnot_si256(EO_AVX2, *cube);
*cube = _mm256_or_si256(*cube, veo);
}
+
+STATIC_INLINE int64_t
+permtoindex_8x8(int64_t a)
+{
+ int64_t i, c, ret;
+ __m64 cmp;
+
+ for (i = 0, ret = 0; i < 8; i++) {
+ cmp = _mm_set1_pi8(a & INT64_C(0xFF));
+ a = (a >> INT64_C(8)) | INT64_C(0x0F00000000000000);
+ cmp = _mm_cmpgt_pi8(cmp, _mm_cvtsi64_m64(a));
+ c = _mm_popcnt_u64(_mm_cvtm64_si64(cmp)) >> INT64_C(3);
+ ret += c * factorial[7-i];
+ }
+
+ return ret;
+}
+
+STATIC_INLINE int64_t
+indextoperm_8x8(int64_t p)
+{
+ int used;
+ int64_t c, k, i, j, ret;
+
+ for (i = 0, ret = 0, used = 0; i < 8; i++) {
+ k = p / factorial[7-i];
+
+ /* Find k-th unused number */
+ for (j = 0, c = 0; c <= k; j++)
+ c += 1 - ((used & (1 << j)) >> j);
+
+ ret |= (j-1) << (8*i);
+ used |= 1 << (j-1);
+ p %= factorial[7-i];
+ }
+
+ return ret;
+}
+
+STATIC_INLINE int64_t
+coord_cp(cube_t cube)
+{
+ cube_t cp;
+ int64_t aux[4];
+
+ cp = _mm256_and_si256(cube, CP_AVX2);
+ _mm256_storeu_si256((__m256i_u *)aux, cp);
+
+ return permtoindex_8x8(aux[0]);
+}
+
+STATIC_INLINE cube_t
+invcoord_cp(int64_t i)
+{
+ return _mm256_set_epi64x(SOLVED_H, SOLVED_L, 0, indextoperm_8x8(i));
+}
+
+STATIC_INLINE int64_t
+coord_epud(cube_t cube)
+{
+ cube_t ep;
+ int64_t aux[4];
+
+ ep = _mm256_and_si256(cube, EP_AVX2);
+ _mm256_storeu_si256((__m256i_u *)aux, ep);
+
+ return permtoindex_8x8(aux[2]);
+}
+
+STATIC_INLINE cube_t
+invcoord_epud(int64_t i)
+{
+ return _mm256_set_epi64x(SOLVED_H, indextoperm_8x8(i), 0, SOLVED_L);
+}
diff --git a/src/arch/common.h b/src/arch/common.h
@@ -37,6 +37,11 @@ STATIC_INLINE void set_eo(cube_t [static 1], int64_t);
STATIC_INLINE void invcoord_esep_array(int64_t, int64_t, uint8_t[static 12]);
STATIC_INLINE cube_t invcoord_eoesep(int64_t);
+STATIC_INLINE int64_t coord_cp(cube_t);
+STATIC_INLINE cube_t invcoord_cp(int64_t);
+STATIC_INLINE int64_t coord_epud(cube_t);
+STATIC_INLINE cube_t invcoord_epud(int64_t);
+
STATIC_INLINE void
invcoord_esep_array(int64_t set1, int64_t set2, uint8_t mem[static 12])
{
diff --git a/src/arch/coordinates_unoptimized.h b/src/arch/coordinates_unoptimized.h
@@ -1,53 +0,0 @@
-STATIC_INLINE int64_t coord_cp(cube_t);
-STATIC_INLINE cube_t invcoord_cp(int64_t);
-STATIC_INLINE int64_t coord_epud(cube_t);
-STATIC_INLINE cube_t invcoord_epud(int64_t);
-
-STATIC_INLINE int64_t
-coord_cp(cube_t cube)
-{
- int i;
- uint8_t c[8], e[12];
-
- pieces(&cube, c, e);
- for (i = 0; i < 8; i++)
- c[i] &= PBITS;
-
- return permtoindex(8, c);
-}
-
-STATIC_INLINE cube_t
-invcoord_cp(int64_t i)
-{
- uint8_t c[8];
-
- indextoperm(i, 8, c);
-
- return STATIC_CUBE(c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7],
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
-}
-
-STATIC_INLINE int64_t
-coord_epud(cube_t cube)
-{
- int i;
- uint8_t c[8], e[12];
-
- pieces(&cube, c, e);
-
- for (i = 0; i < 8; i++)
- e[i] &= PBITS;
-
- return permtoindex(8, e);
-}
-
-STATIC_INLINE cube_t
-invcoord_epud(int64_t i)
-{
- uint8_t e[8];
-
- indextoperm(i, 8, e);
-
- return STATIC_CUBE(0, 1, 2, 3, 4, 5, 6, 7,
- e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], 8, 9, 10, 11);
-}
diff --git a/src/arch/neon.h b/src/arch/neon.h
@@ -1,8 +1,6 @@
#define CO2_NEON vdup_n_u8(0x60)
#define COCW_NEON vdup_n_u8(0x20)
-#define CP_NEON vdup_n_u8(0x07)
-#define EP_NEON vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F))
-#define EO_NEON vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10))
+#define PBITS8_NEON vdup_n_u8(0x07)
STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t);
STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t);
@@ -20,16 +18,20 @@ STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t);
e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0 \
} \
})
-
#define ZERO_CUBE \
((cube_t){ \
.corner = vdup_n_u8(0), \
.edge = vdupq_n_u8(0) \
})
-
#define SOLVED_CUBE STATIC_CUBE( \
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
+const uint8_t SOLVED_L[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+const uint8_t SOLVED_H[8] = {8, 9, 10, 11, 0, 0, 0};
+
+STATIC_INLINE int64_t permtoindex_8x8(uint8x8_t);
+STATIC_INLINE uint8x8_t indextoperm_8x8(int64_t);
+
STATIC_INLINE int
popcount_u32(uint32_t x)
{
@@ -361,3 +363,77 @@ invcoord_esep(int64_t esep)
return ret;
}
+
+STATIC_INLINE int64_t
+permtoindex_8x8(uint8x8_t a)
+{
+ int64_t i, c, ret;
+ uint8x8_t cmp;
+ uint64x1_t anum;
+ uint8_t or[8] = {0, 0, 0, 0, 0, 0, 0, 0x0F};
+
+ for (i = 0, ret = 0; i < 8; i++) {
+ cmp = vdup_lane_u8(a, 0);
+ anum = vreinterpret_u64_u8(a);
+ anum = vshr_n_u64(anum, 8);
+ a = vreinterpret_u8_u64(anum);
+ a = vorr_u8(a, vld1_u8(or));
+ cmp = vcgt_u8(cmp, a);
+ c = vaddv_u8(vshr_n_u8(cmp, 7));
+ ret += c * factorial[7-i];
+ }
+
+ return ret;
+}
+
+STATIC_INLINE uint8x8_t
+indextoperm_8x8(int64_t p)
+{
+ int used;
+ int64_t c, k, i, j;
+ uint8_t ret[8];
+
+ for (i = 0, used = 0; i < 8; i++) {
+ k = p / factorial[7-i];
+
+ /* Find k-th unused number */
+ for (j = 0, c = 0; c <= k; j++)
+ c += 1 - ((used & (1 << j)) >> j);
+
+ ret[i] = j-1;
+ used |= 1 << (j-1);
+ p %= factorial[7-i];
+ }
+
+ return vld1_u8(ret);
+}
+
+STATIC_INLINE int64_t
+coord_cp(cube_t cube)
+{
+ return permtoindex_8x8(vand_u8(cube.corner, PBITS8_NEON));
+}
+
+STATIC_INLINE cube_t
+invcoord_cp(int64_t i)
+{
+ return (cube_t) {
+ .corner = indextoperm_8x8(i),
+ .edge = vcombine_u8(vld1_u8(SOLVED_L), vld1_u8(SOLVED_H))
+ };
+}
+
+STATIC_INLINE int64_t
+coord_epud(cube_t cube)
+{
+ return permtoindex_8x8(vand_u8(vget_low_u8(cube.edge), PBITS8_NEON));
+}
+
+STATIC_INLINE cube_t
+invcoord_epud(int64_t i)
+{
+ return (cube_t) {
+ .corner = vld1_u8(SOLVED_L),
+ .edge = vcombine_u8(indextoperm_8x8(i), vld1_u8(SOLVED_H))
+ };
+}
diff --git a/src/arch/portable.h b/src/arch/portable.h
@@ -290,3 +290,47 @@ set_eo(cube_t cube[static 1], int64_t eo)
}
cube->edge[0] = (cube->edge[0] & ~EOBIT) | (EOBIT * (sum % 2));
}
+
+STATIC_INLINE int64_t
+coord_cp(cube_t cube)
+{
+ int i;
+
+ for (i = 0; i < 8; i++)
+ cube.corner[i] &= PBITS;
+
+ return permtoindex(8, cube.corner);
+}
+
+STATIC_INLINE cube_t
+invcoord_cp(int64_t i)
+{
+ uint8_t c[8];
+
+ indextoperm(i, 8, c);
+
+ return STATIC_CUBE(c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7],
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+}
+
+STATIC_INLINE int64_t
+coord_epud(cube_t cube)
+{
+ int i;
+
+ for (i = 0; i < 8; i++)
+ cube.edge[i] &= PBITS;
+
+ return permtoindex(8, cube.edge);
+}
+
+STATIC_INLINE cube_t
+invcoord_epud(int64_t i)
+{
+ uint8_t e[8];
+
+ indextoperm(i, 8, e);
+
+ return STATIC_CUBE(0, 1, 2, 3, 4, 5, 6, 7,
+ e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], 8, 9, 10, 11);
+}
diff --git a/src/utils/math.h b/src/utils/math.h
@@ -49,7 +49,7 @@ permtoindex(size_t n, const uint8_t *a)
for (i = 0, ret = 0; i < n; i++) {
for (j = i+1, c = 0; j < n; j++)
- c += (a[i] > a[j]) ? 1 : 0;
+ c += a[i] > a[j];
ret += factorial[n-i-1] * c;
}
@@ -59,21 +59,23 @@ permtoindex(size_t n, const uint8_t *a)
STATIC void
indextoperm(int64_t p, size_t n, uint8_t *r)
{
- int64_t c;
- size_t i, j;
- uint8_t a[FACTORIAL_MAX+1];
+ int64_t c, k;
+ size_t i, j, used;
DBG_ASSERT(n <= FACTORIAL_MAX, "Error: cannot compute indextoperm() "
"for set of size %zu > %" PRId64 "\n", n, FACTORIAL_MAX);
DBG_ASSERT(p >= 0 && p < factorial[n], "Error: invalid permutation "
"index %" PRId64 " for set of size %zu\n", p, n);
- memset(a, 0, n);
- for (i = 0; i < n; i++) {
- for (j = 0, c = 0; c <= p / factorial[n-i-1]; j++)
- c += a[j] ? 0 : 1;
+ for (i = 0, used = 0; i < n; i++) {
+ k = p / factorial[n-i-1];
+
+ /* Find k-th unused number */
+ for (j = 0, c = 0; c <= k; j++)
+ c += 1 - ((used & (1<<j)) >> j);
+
r[i] = j-1;
- a[j-1] = 1;
+ used |= 1 << (j-1);
p %= factorial[n-i-1];
}
@@ -87,7 +89,7 @@ permsign(size_t n, const uint8_t *a)
for (i = 0, ret = 0; i < n; i++)
for (j = i+1; j < n; j++)
- ret += a[i] > a[j] ? 1 : 0;
+ ret += a[i] > a[j];
return ret % 2;
}