h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48
Download | Log | Files | Refs | README | LICENSE

commit a61382e9dec22ea1ff3fea2d59d3c605c0aa2aac
parent a2b8556e3cdc59bbb4a89c2779d2094db975eaa5
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Thu,  2 Nov 2023 20:05:29 +0100

Optimized compose for avx2

Diffstat:
MREADME.md | 6++----
Msrc/cube.c | 26++++++++++++++++++++++----
2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md @@ -17,7 +17,8 @@ $ make test * inline moves for avx2 * fix base get_ and set_ macros (constant arguments?) -* optimize things that use get_ and set_ +* optimize inverse for avx2 +* other things to optimize? ### Documentation and interface @@ -38,12 +39,9 @@ $ make test ### Things I need to learn: -* Use AVX2 instructions, in particular - [_mm256_shuffle_epi8](https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/mm256-shuffle-epi8.html)) * Inspect compiled assembly * Use valgrind tool cachegrind and other profiling tools - ## Internal representation of the cube The plan (TODO) is to have multiple implementations: some that diff --git a/src/cube.c b/src/cube.c @@ -939,19 +939,36 @@ inverse(cube_t c) cube_t compose(cube_t c1, cube_t c2) { - /* TODO: optimize for avx2 */ - uint8_t i, piece1, piece2, p, orien, aux, auy; cube_t ret; - setzero(ret); - #ifdef DEBUG if (!isconsistent(c1) || !isconsistent(c2)) { fprintf(stderr, "compose error, inconsistent cube\n"); + setzero(ret); return ret; } #endif +#ifdef CUBE_AVX2 + cube_t shuf, eo, eodone, co2, aux, auy1, auy2, cw, auz1, auz2, coclean; + + co2 = _mm256_and_si256(c2, _co_avx2); + eo = _mm256_and_si256(c2, _eo_avx2); + shufd = _mm256_shuffle_epi8(c1, c2); + eodone = _mm256_(shufd, eo); + aux = _mm256_add_epi8(c1, co2); + cw = _mm256_set_epi64x(0, 0x2020202020202020, 0, 0); + auy1 = _mm256_add_epi8(aux, cw); + auy2 = _mm256_srli_si256(auy1, 2); + auz1 = _mm256_add_epi8(aux, auy2); + auz2 = _mm256_and_si256(auz1, _co_avx2); + coclean = _mm256_andnot_si256(eodone, _co_avx2); + ret = _mm256_or_si256(coclean, auz2); +#else + uint8_t i, piece1, piece2, p, orien, aux, auy; + + setzero(ret); + for (i = 0; i < 12; i++) { piece2 = get_edge(c2, i); p = piece2 & _pbits; @@ -969,6 +986,7 @@ compose(cube_t c1, cube_t c2) orien = (aux + auy) & _cobits2; set_corner(ret, i, (piece1 & _pbits) | orien); } +#endif return ret; }