h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48
Download | Log | Files | Refs | README | LICENSE

commit 73a99f18e7940f424287250f780a18d861aef231
parent b0053277e385bee23336d1fe6b69a12f49f9172f
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Tue,  1 Apr 2025 14:01:49 +0200

Some simplification to AVX2 code

Some simplification after discussion with Arhan Chaudhary (inspired by vcube).
Performance looks unchanged, but at least the code is cleaner.

Diffstat:
Msrc/arch/avx2.h | 88++++++++++++++++++++++++-------------------------------------------------------
Msrc/core/cube.h | 6+++++-
2 files changed, 32 insertions(+), 62 deletions(-)

diff --git a/src/arch/avx2.h b/src/arch/avx2.h @@ -1,10 +1,16 @@ #define CO2_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x6060606060606060)) #define COCW_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x2020202020202020)) #define CP_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x0707070707070707)) -#define EP_AVX2 \ - _mm256_set_epi64x(INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0) -#define EO_AVX2 \ - _mm256_set_epi64x(INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0) +#define EP_AVX2 _mm256_set_epi64x(\ + INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0) +#define EO_AVX2 _mm256_set_epi64x(\ + INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0) +#define ORIENT_AVX2 _mm256_set_epi64x(INT64_C(0x10101010), \ + INT64_C(0x1010101010101010), 0, INT64_C(0x6060606060606060)) +#define USED_AVX2 _mm256_set_epi64x(INT64_C(0x00000000FFFFFFFF), \ + INT64_C(0xFFFFFFFFFFFFFFFF), 0, INT64_C(0xFFFFFFFFFFFFFFFF)) +#define CARRY_AVX2 _mm256_set_epi64x(INT64_C(0x20202020), \ + INT64_C(0x2020202020202020), 0, INT64_C(0x6060606060606060)) #define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \ e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \ @@ -61,76 +67,36 @@ invertco(cube_t c) } STATIC_INLINE cube_t -compose_epcpeo(cube_t c1, cube_t c2) -{ - cube_t b, s, eo2; - - /* Permute and clean unused bits */ - s = _mm256_shuffle_epi8(c1, c2); - b = _mm256_set_epi8( - ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0 - ); - s = _mm256_andnot_si256(b, s); - - /* Change EO */ - eo2 = _mm256_and_si256(c2, EO_AVX2); - s = _mm256_xor_si256(s, eo2); - - return s; -} - -STATIC_INLINE cube_t compose_edges(cube_t c1, cube_t c2) { - return compose_epcpeo(c1, c2); + return compose(c1, c2); } STATIC_INLINE cube_t compose_corners(cube_t c1, cube_t c2) { - /* - * We do a full compose. Minor optimizations are possible, like - * saving one instruction by not doing EO, but it should not - * be significant. - */ return compose(c1, c2); } STATIC_INLINE cube_t compose(cube_t c1, cube_t c2) { - cube_t s, co1, co2, aux, auy1, auy2, auz1, auz2; - - s = compose_epcpeo(c1, c2); - - /* Change CO */ - co1 = _mm256_and_si256(s, CO2_AVX2); - co2 = _mm256_and_si256(c2, CO2_AVX2); - aux = _mm256_add_epi8(co1, co2); - auy1 = _mm256_add_epi8(aux, COCW_AVX2); - auy2 = _mm256_srli_epi32(auy1, 2); - auz1 = _mm256_add_epi8(aux, auy2); - auz2 = _mm256_and_si256(auz1, CO2_AVX2); - - /* Put together */ - s = _mm256_andnot_si256(CO2_AVX2, s); - s = _mm256_or_si256(s, auz2); - - return s; -} + /* + * Method taken from Andrew Skalski's vcube (thanks to Arhan Chaudhary + * for pointing this out) + */ + cube_t ss, so, su; -STATIC_INLINE cube_t -cleanaftershuffle(cube_t c) -{ - __m256i b; + /* Permute */ + ss = _mm256_shuffle_epi8(c1, c2); - b = _mm256_set_epi8( - ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0 - ); + /* Orient */ + so = _mm256_and_si256(c2, ORIENT_AVX2); + ss = _mm256_add_epi8(ss, so); + su = _mm256_sub_epi8(ss, CARRY_AVX2); + ss = _mm256_min_epu8(ss, su); - return _mm256_andnot_si256(b, c); + return _mm256_and_si256(ss, USED_AVX2); } STATIC_INLINE cube_t @@ -163,11 +129,11 @@ inverse(cube_t c) vi = _mm256_shuffle_epi8(vi, vi); vi = _mm256_shuffle_epi8(vi, c); - vo = _mm256_and_si256(c, _mm256_or_si256(EO_AVX2, CO2_AVX2)); + vo = _mm256_and_si256(c, ORIENT_AVX2); vo = _mm256_shuffle_epi8(vo, vi); - vp = _mm256_andnot_si256(_mm256_or_si256(EO_AVX2, CO2_AVX2), vi); + vp = _mm256_andnot_si256(ORIENT_AVX2, vi); ret = _mm256_or_si256(vp, vo); - ret = cleanaftershuffle(ret); + ret = _mm256_and_si256(ret, USED_AVX2); return invertco(ret); } diff --git a/src/core/cube.h b/src/core/cube.h @@ -109,9 +109,13 @@ issolvable(cube_t cube) return true; issolvable_parity: + LOG("There is parity\n"); + return false; issolvable_eo: + LOG("EO is not solvable\n"); + return false; issolvable_co: - /* We used to do more logging here, hence the 3 different labels */ + LOG("CO is not solvable\n"); return false; }