h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48
Download | Log | Files | Refs | README | LICENSE

commit 3051b2342d67dcba2fdc0010fb26f5850964c4d9
parent ade2ed050a2d472b4dd01e30666f5e7ec5030724
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Sat,  4 Nov 2023 10:25:13 +0100

avx2 fully works

Diffstat:
M README.md | 4 ----
M src/_trans_avx2.c | 13 +++++++------
M src/cube.c | 21 +++++++++++++--------
3 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md @@ -13,10 +13,6 @@ $ make test ## TODO: -### Make AVX2 work - -* fix inverse, flipallcorners - ### Cleanup / refactor * see planner diff --git a/src/_trans_avx2.c b/src/_trans_avx2.c @@ -1,14 +1,15 @@ static inline cube_t flipallcorners(cube_t c) { - cube_t shleft, shright, summed, newco, cleanco, ret; + cube_t co, shleft, shright, summed, newco, cleanco, ret; - shleft = _mm256_slli_si256(c, 1); - shright = _mm256_srli_si256(c, 1); + co = _mm256_and_si256(c, _co2_avx2); + shleft = _mm256_slli_epi32(co, 1); + shright = _mm256_srli_epi32(co, 1); summed = _mm256_or_si256(shleft, shright); - newco = _mm256_and_si256(summed, _co_avx2); - cleanco = _mm256_andnot_si256(c, _co_avx2); - ret = _mm256_and_si256(cleanco, newco); + newco = _mm256_and_si256(summed, _co2_avx2); + cleanco = _mm256_xor_si256(c, co); + ret = _mm256_or_si256(cleanco, newco); return ret; } diff --git a/src/cube.c b/src/cube.c @@ -130,10 +130,14 @@ cube_arr_t zerocube_arr = { .e = {0}, .c = {0} }; #define _co_avx2 _mm256_set_epi8( \ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 0, 0, 0, 0, 0, 0, 0, 0, \ - 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70) + 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0) +#define _co2_avx2 _mm256_set_epi8( \ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0, \ + 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60) #define _eo_avx2 _mm256_set_epi8( \ - 0, 0, 0, 0, 0x70, 0x70, 0x70, 0x70, \ - 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, \ + 0, 0, 0, 0, 0x10, 0x10, 0x10, 0x10, \ + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, \ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) #define setsolved(cube) cube = _mm256_loadu_si256((__m256i_u *)&solvedcube_arr) #define setzero(cube) cube = _mm256_setzero_si256() @@ -959,6 +963,7 @@ inverse(cube_t c) vi = _mm256_shuffle_epi8(vi, vi); vi = _mm256_shuffle_epi8(vi, vi); vi = _mm256_shuffle_epi8(vi, vi); + vi = _mm256_shuffle_epi8(vi, vi); vi = _mm256_shuffle_epi8(vi, c); 
vi = _mm256_shuffle_epi8(vi, vi); vi = _mm256_shuffle_epi8(vi, vi); @@ -969,9 +974,9 @@ inverse(cube_t c) vi = _mm256_shuffle_epi8(vi, vi); vi = _mm256_shuffle_epi8(vi, c); - vo = _mm256_and_si256(c, _mm256_or_si256(_eo_avx2, _co_avx2)); + vo = _mm256_and_si256(c, _mm256_or_si256(_eo_avx2, _co2_avx2)); vo = _mm256_shuffle_epi8(vo, vi); - vp = _mm256_andnot_si256(_mm256_or_si256(_eo_avx2, _co_avx2), vi); + vp = _mm256_andnot_si256(_mm256_or_si256(_eo_avx2, _co2_avx2), vi); ret = _mm256_or_si256(vp, vo); return flipallcorners(ret); @@ -1016,8 +1021,8 @@ inline_compose(cube_t c1, cube_t c2) eo2 = _mm256_and_si256(c2, _eo_avx2); s = _mm256_shuffle_epi8(c1, c2); ed = _mm256_xor_si256(s, eo2); - co1 = _mm256_and_si256(s, _co_avx2); - co2 = _mm256_and_si256(c2, _co_avx2); + co1 = _mm256_and_si256(s, _co2_avx2); + co2 = _mm256_and_si256(c2, _co2_avx2); aux = _mm256_add_epi8(co1, co2); cw = _mm256_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -1033,7 +1038,7 @@ inline_compose(cube_t c1, cube_t c2) 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60 ); auz2 = _mm256_and_si256(auz1, cwccw); - coclean = _mm256_andnot_si256(_co_avx2, ed); + coclean = _mm256_andnot_si256(_co2_avx2, ed); ret = _mm256_or_si256(coclean, auz2); #else uint8_t i, piece1, piece2, p, orien, aux, auy;