h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48
Download | Log | Files | Refs | README | LICENSE

commit c5fb4d1c0f0ecc30673a46e517dc82c5af2af5ef
parent 15ceee3a7c72d174305fff9b4fe9886cb3f7204d
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Sun, 19 Nov 2023 10:53:08 +0100

Fixed AVX2 build, cleaned up compose_fast

Diffstat:
M cube.c | 43 +++++++++++++++++++------------------------
1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/cube.c b/cube.c
@@ -501,7 +501,7 @@ typedef __m256i cube_fast_t;
 #define _cocw_avx2 _mm256_set_epi64x(0, 0, 0, 0x2020202020202020)
 #define _eo_avx2 _mm256_set_epi64x(0x10101010, 0x1010101010101010, 0, 0)
 
-_static inline cube_fast_t fastcube(
+_static_inline cube_fast_t fastcube(
     uint8_t, uint8_t, uint8_t, uint8_t, uint8_t,
     uint8_t, uint8_t, uint8_t, uint8_t, uint8_t,
     uint8_t, uint8_t, uint8_t, uint8_t, uint8_t,
@@ -512,11 +512,10 @@ _static cube_t fasttocube(cube_fast_t);
 _static_inline bool equal_fast(cube_fast_t, cube_fast_t);
 _static_inline bool issolved_fast(cube_fast_t);
 _static_inline cube_fast_t invertco_fast(cube_fast_t);
-_static_inline cube_fast_t cleanaftershuffle(cube_fast_t);
 _static_inline cube_fast_t compose_fast(cube_fast_t, cube_fast_t);
 _static_inline int64_t coord_fast_eo(cube_fast_t);
 
-_static inline cube_fast_t
+_static_inline cube_fast_t
 fastcube(
 	uint8_t c_ufr,
 	uint8_t c_ubl,
@@ -609,29 +608,23 @@ invertco_fast(cube_fast_t c)
 }
 
 _static_inline cube_fast_t
-cleanaftershuffle(cube_fast_t c)
+compose_fast(cube_fast_t c1, cube_fast_t c2)
 {
-	__m256i b;
+	cube_fast_t s, b, eo2, co1, co2, aux, auy1, auy2, auz1, auz2;
 
+	/* Permute and clean unused bits */
+	s = _mm256_shuffle_epi8(c1, c2);
 	b = _mm256_set_epi8(
 		~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
 	);
+	s = _mm256_andnot_si256(b, s);
 
-	return _mm256_andnot_si256(b, c);
-}
-
-_static_inline cube_fast_t
-compose_fast(cube_fast_t c1, cube_fast_t c2)
-{
-	cube_fast_t ret;
-
-	cube_fast_t s, eo2, ed, co1, co2, aux, auy1, auy2, auz1, auz2, coclean;
-
+	/* Change EO */
 	eo2 = _mm256_and_si256(c2, _eo_avx2);
-	s = _mm256_shuffle_epi8(c1, c2);
-	s = cleanaftershuffle(s);
-	ed = _mm256_xor_si256(s, eo2);
+	s = _mm256_xor_si256(s, eo2);
+
+	/* Change CO */
 	co1 = _mm256_and_si256(s, _co2_avx2);
 	co2 = _mm256_and_si256(c2, _co2_avx2);
 	aux = _mm256_add_epi8(co1, co2);
@@ -639,10 +632,12 @@ compose_fast(cube_fast_t c1, cube_fast_t c2)
 	auy2 = _mm256_srli_epi32(auy1, 2);
 	auz1 = _mm256_add_epi8(aux, auy2);
 	auz2 = _mm256_and_si256(auz1, _co2_avx2);
-	coclean = _mm256_andnot_si256(_co2_avx2, ed);
-	ret = _mm256_or_si256(coclean, auz2);
 
-	return ret;
+	/* Put together */
+	s = _mm256_andnot_si256(_co2_avx2, s);
+	s = _mm256_or_si256(s, auz2);
+
+	return s;
 }
 
 _static_inline int64_t
@@ -671,7 +666,7 @@ in the previous section(s) for unsupported architectures.
 
 typedef cube_t cube_fast_t;
 
-_static cube_fast_t fastcube(
+_static_inline cube_fast_t fastcube(
     uint8_t, uint8_t, uint8_t, uint8_t, uint8_t,
     uint8_t, uint8_t, uint8_t, uint8_t, uint8_t,
     uint8_t, uint8_t, uint8_t, uint8_t, uint8_t,
@@ -685,7 +680,7 @@ _static_inline cube_fast_t invertco_fast(cube_fast_t);
 _static_inline cube_fast_t compose_fast(cube_fast_t, cube_fast_t);
 _static_inline int64_t coord_fast_eo(cube_fast_t);
 
-_static inline cube_fast_t
+_static_inline cube_fast_t
 fastcube(
 	uint8_t c_ufr,
 	uint8_t c_ubl,
@@ -1018,7 +1013,7 @@ inverse(cube_t cube)
 	DBG_ASSERT(isconsistent(cube), zero,
 	    "inverse error: inconsistent cube\n");
 
-	ret = zero_fast;
+	ret = zero;
 
 	for (i = 0; i < 12; i++) {
 		piece = cube.edge[i];