h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48
Download | Log | Files | Refs | README | LICENSE

commit 9b1c9371333e20611ec5422e660ebec7e66c4261
parent a61382e9dec22ea1ff3fea2d59d3c605c0aa2aac
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Thu,  2 Nov 2023 22:17:14 +0100

Added avx2 moves

Diffstat:
MREADME.md | 12++++++------
Msrc/_moves_avx2.c | 199++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Msrc/cube.c | 49++++++++++++++++++++++++++++++++++++++++++++-----
Autils/genmovecode.sh | 19+++++++++++++++++++
Mutils/gentranscode.sh | 1-
5 files changed, 266 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md @@ -15,10 +15,7 @@ $ make test ### Make AVX2 work -* inline moves for avx2 * fix base get_ and set_ macros (constant arguments?) -* optimize inverse for avx2 -* other things to optimize? ### Documentation and interface @@ -37,10 +34,13 @@ $ make test * Takes as parameters the amount of memory to use and a FILE for the tables * Use multi-move (up to 4/5 moves at once) -### Things I need to learn: +### Future optimizations -* Inspect compiled assembly -* Use valgrind tool cachegrind and other profiling tools +* CO is the worst part of moving, transforming and inverting. Try basing + everything on representing the cube without CO and apply it only at the + end to check that it is actually solved. +* see if vcube's method to flip all corners is better +* find a better way for computing the inverse? ## Internal representation of the cube diff --git a/src/_moves_avx2.c b/src/_moves_avx2.c @@ -1,2 +1,197 @@ -/* TODO: return compose(c, (some avx garbage)); -*/ +static inline cube_t +inline_move_U(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 0, 1, 3, 2, 5, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 1, 0, 3, 2, 4, 5 + ); + + return _mm256_shuffle_epi8(c, m); +} + +static inline cube_t +inline_move_U2(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 4, 5, 3, 2, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 4, 5, 3, 2, 0, 1 + ); + + return _mm256_shuffle_epi8(c, m); +} + +static inline cube_t +inline_move_U3(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 1, 0, 3, 2, 4, 5, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 0, 1, 3, 2, 5, 4 + ); + + return _mm256_shuffle_epi8(c, m); +} + +static inline cube_t +inline_move_D(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 11, 10, 9, 8, 3, 2, 5, 4, 6, 7, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 5, 4, 6, 7, 1, 0 + ); + + return _mm256_shuffle_epi8(c, m); +} + +static inline cube_t +inline_move_D2(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 11, 10, 9, 8, 6, 7, 5, 4, 2, 3, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 5, 4, 2, 3, 1, 0 + ); + + return _mm256_shuffle_epi8(c, m); +} + +static inline cube_t +inline_move_D3(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 11, 10, 9, 8, 2, 3, 5, 4, 7, 6, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 5, 4, 7, 6, 1, 0 + ); + + return _mm256_shuffle_epi8(c, m); +} + +static inline cube_t +inline_move_R(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 4, 10, 9, 7, 11, 6, 5, 8, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 35, 32, 4, 69, 2, 1, 70 + ); + + return compose(c, m); +} + +static inline cube_t +inline_move_R2(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 8, 10, 9, 11, 4, 6, 5, 7, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 5, 6, 4, 0, 2, 1, 3 + ); + + return _mm256_shuffle_epi8(c, m); +} + +static inline cube_t +inline_move_R3(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 7, 10, 9, 4, 8, 6, 5, 11, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 32, 35, 4, 70, 2, 1, 69 + ); + + return compose(c, m); +} + +static inline cube_t +inline_move_L(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 11, 6, 5, 8, 7, 9, 10, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 34, 6, 5, 33, 3, 68, 71, 0 + ); + + return compose(c, m); +} + +static inline cube_t +inline_move_L2(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 11, 9, 10, 8, 7, 5, 6, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 4, 6, 5, 7, 3, 1, 2, 0 + ); + + return _mm256_shuffle_epi8(c, m); +} + +static inline cube_t +inline_move_L3(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 11, 5, 6, 8, 7, 10, 9, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 33, 6, 5, 34, 3, 71, 68, 0 + ); + + return compose(c, m); +} + +static inline cube_t +inline_move_F(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 11, 10, 19, 16, 7, 6, 5, 4, 24, 2, 1, 25, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 64, 5, 66, 3, 38, 1, 36 + ); + + return compose(c, m); +} + +static inline cube_t +inline_move_F2(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 11, 10, 8, 9, 7, 6, 5, 4, 0, 2, 1, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 4, 5, 6, 3, 0, 1, 2 + ); + + return _mm256_shuffle_epi8(c, m); +} + +static inline cube_t +inline_move_F3(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 11, 10, 16, 19, 7, 6, 5, 4, 25, 2, 1, 24, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 66, 5, 64, 3, 36, 1, 38 + ); + + return compose(c, m); +} + +static inline cube_t +inline_move_B(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 18, 17, 9, 8, 7, 6, 5, 4, 3, 26, 27, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 65, 6, 67, 4, 39, 2, 37, 0 + ); + + return compose(c, m); +} + +static inline cube_t +inline_move_B2(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 10, 11, 9, 8, 7, 6, 5, 4, 3, 1, 2, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, 4, 1, 2, 3, 0 + ); + + return _mm256_shuffle_epi8(c, m); +} + +static inline cube_t +inline_move_B3(cube_t c) +{ + cube_t m = _mm256_set_epi8( + 0, 0, 0, 0, 17, 18, 9, 8, 7, 6, 5, 4, 3, 27, 26, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 67, 6, 65, 4, 37, 2, 39, 0 + ); + + return compose(c, m); +} diff --git a/src/cube.c b/src/cube.c @@ -269,6 +269,7 @@ static char *transstr[] = { [BLm] = "mirrored BL", }; +static inline cube_t inline_compose(cube_t, cube_t); static bool isconsistent(cube_t); static cube_t flipallcorners(cube_t); static uint8_t readco(char *); @@ -909,18 +910,49 @@ cube_t inverse(cube_t c) { /* TODO: optimize for avx2 */ - uint8_t i, piece, orien; cube_t ret; - setzero(ret); - #ifdef DEBUG if (!isconsistent(c)) { fprintf(stderr, "inverse error, inconsistent cube\n"); + setzero(ret); return ret; } #endif +#ifdef CUBE_AVX2 + /* Method taken from Andrew Skalski's vcube[1]. The addition sequence + * was generated using [2]. + * [1] https://github.com/Voltara/vcube + * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html + */ + cube_t v3, vi; + + v3 = _mm256_shuffle_epi8(c, c); + v3 = _mm256_shuffle_epi8(v3, c); + vi = _mm256_shuffle_epi8(v3, v3); + vi = _mm256_shuffle_epi8(vi, vi); + vi = _mm256_shuffle_epi8(vi, vi); + vi = _mm256_shuffle_epi8(vi, v3); + vi = _mm256_shuffle_epi8(vi, vi); + vi = _mm256_shuffle_epi8(vi, vi); + vi = _mm256_shuffle_epi8(vi, vi); + vi = _mm256_shuffle_epi8(vi, c); + vi = _mm256_shuffle_epi8(vi, vi); + vi = _mm256_shuffle_epi8(vi, vi); + vi = _mm256_shuffle_epi8(vi, vi); + vi = _mm256_shuffle_epi8(vi, vi); + vi = _mm256_shuffle_epi8(vi, vi); + vi = _mm256_shuffle_epi8(vi, v3); + vi = _mm256_shuffle_epi8(vi, vi); + ret = _mm256_shuffle_epi8(vi, c); + + return flipallcorners(ret); +#else + uint8_t i, piece, orien; + + setzero(ret); + for (i = 0; i < 12; i++) { piece = get_edge(c, i); orien = piece & _eobit; @@ -932,12 +964,13 @@ inverse(cube_t c) orien = ((piece << 1) | (piece >> 1)) & _cobits2; set_corner(ret, piece & _pbits, i | orien); } +#endif return ret; } -cube_t -compose(cube_t c1, cube_t c2) +static inline cube_t +inline_compose(cube_t c1, cube_t c2) { cube_t ret; @@ -992,6 +1025,12 @@ compose(cube_t c1, cube_t c2) } cube_t +compose(cube_t c1, cube_t c2) +{ + return inline_compose(c1, c2); +} + +cube_t transform(cube_t c, trans_t t) { cube_t ret; diff --git a/utils/genmovecode.sh b/utils/genmovecode.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +type="${1:-src}" + +gcc -DDEBUG h48_to_"$type".c ../src/cube.c -o h48_to_"$type" + +genfuncs() { + for f in move_??_*.txt; do + move="$(echo $f | sed 's/.*_// ; s/\.txt//')" + printf 'static inline cube_t\ninline_move_%s' "$move" + printf '(cube_t c)\n{\n' + printf '\tcube_t m = ' + ./h48_to_"$type" <"$f" | sed '2,4s/^/\t/' + printf ';\n\n\treturn compose(c, m);\n}\n\n' + done +} + +genfuncs +rm -f h48_to_"$type" invert diff --git a/utils/gentranscode.sh b/utils/gentranscode.sh @@ -23,7 +23,6 @@ genfuncs() { for f in transform_??_???.txt; do trans="$(echo $f | sed 's/.*_// ; s/\.txt//')" printf 'static inline cube_t\ninline_trans_%s' "$trans" - [ "$1" = "-i" ] && printf '_inverse' printf '(cube_t c)\n{\n' printf '\tcube_t ret;\n\n' printf '\tcube_t tn = '