h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48
Download | Log | Files | Refs | README | LICENSE

commit ade2ed050a2d472b4dd01e30666f5e7ec5030724
parent 9b1c9371333e20611ec5422e660ebec7e66c4261
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Fri,  3 Nov 2023 23:13:05 +0100

Moves work with avx2, fixed some stuff

Diffstat:
MMakefile | 4++--
MREADME.md | 9++++++++-
Mbenchmark/bench.sh | 7++++++-
Msrc/_trans_avx2.c | 2+-
Msrc/cube.c | 146++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
Msrc/cube.h | 1+
Mtest/00_basic/basic_tests.c | 1+
Mtest/test.sh | 8++++++--
8 files changed, 117 insertions(+), 61 deletions(-)

diff --git a/Makefile b/Makefile @@ -12,9 +12,9 @@ clean: rm -rf *.o test: debugcube.o - ./test/test.sh + CUBETYPE=${CUBETYPE} ./test/test.sh benchmark: cube.o - ./benchmark/bench.sh + CUBETYPE=${CUBETYPE} ./benchmark/bench.sh .PHONY: all clean test benchmark diff --git a/README.md b/README.md @@ -15,7 +15,13 @@ $ make test ### Make AVX2 work -* fix base get_ and set_ macros (constant arguments?) +* fix inverse, flipallcorners + +### Cleanup / refactor + +* see planner +* change all set_epi to setr_epi +* change epi8 to epi64x (shorter!) ### Documentation and interface @@ -41,6 +47,7 @@ $ make test end to check that it is actually solved. * see if vcube's method to flip all corners is better * find a better way for computing the inverse? +* Improve avx2 instructions in general ## Internal representation of the cube diff --git a/benchmark/bench.sh b/benchmark/bench.sh @@ -1,10 +1,15 @@ #!/bin/sh +CC="cc -std=c99 -pthread -O3 -D$CUBETYPE" +if [ "$CUBETYPE" = "CUBE_AVX2" ]; then + CC="$CC -mavx2" +fi + BENCHBIN="benchmark/run" BENCHDIR="benchmark/results" CUBEOBJ="cube.o" -cc -std=c99 -pthread -O3 -o $BENCHBIN benchmark/bench.c $CUBEOBJ || exit 1; +$CC -o $BENCHBIN benchmark/bench.c $CUBEOBJ || exit 1; d="$(date +'%Y-%m-%d-%H-%M-%S')" mkdir -p "$BENCHDIR" diff --git a/src/_trans_avx2.c b/src/_trans_avx2.c @@ -7,7 +7,7 @@ flipallcorners(cube_t c) shright = _mm256_srli_si256(c, 1); summed = _mm256_or_si256(shleft, shright); newco = _mm256_and_si256(summed, _co_avx2); - cleanco = _mm256_andnot_si256_(c, _co_avx2); + cleanco = _mm256_andnot_si256(c, _co_avx2); ret = _mm256_and_si256(cleanco, newco); return ret; diff --git a/src/cube.c b/src/cube.c @@ -6,10 +6,6 @@ #include <stdio.h> #endif -#ifdef AVX2 -#include <immintrin.h> -#endif - #include "cube.h" #define U 0U @@ -118,6 +114,11 @@ #define _eflip 0x10U #define _error 0xFFU +#define get_edge(cube, i) (cube).e[(i)] +#define get_corner(cube, i) (cube).c[(i)] +#define set_edge(cube, i, p) (cube).e[(i)] = (p) +#define set_corner(cube, i, p) (cube).c[(i)] = (p) + cube_arr_t solvedcube_arr = { .c = {0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0}, .e = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0} @@ -126,40 +127,54 @@ cube_arr_t zerocube_arr = { .e = {0}, .c = {0} }; #ifdef CUBE_AVX2 -#define _co_avx2 _mm256_set_epi64x(0, 0xF0F0F0F0F0F0F0F0, 0, 0) -#define _eo_avx2 _mm256_set_epi64x(0, 0, 0xF0F0F0F0, 0xF0F0F0F0F0F0F0F0) -#define _c _mm256_set_epi64x(0, 0, 0, 0xFF) -#define _e _mm256_set_epi64x(0, 0xFF, 0, 0) +#define _co_avx2 _mm256_set_epi8( \ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0, \ + 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70) +#define _eo_avx2 _mm256_set_epi8( \ + 0, 0, 0, 0, 0x70, 0x70, 0x70, 0x70, \ + 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, \ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) #define setsolved(cube) cube = _mm256_loadu_si256((__m256i_u *)&solvedcube_arr) #define setzero(cube) cube = _mm256_setzero_si256() -/* TODO: in the next 4 macros, all 0 should be i, but must be constant! */ -#define get_edge(cube, i) _mm256_extract_epi8(cube, 0+16) -#define get_corner(cube, i) _mm256_extract_epi8(cube, 0) -#define set_edge(cube, i, p) _mm256_or_si256( \ - _mm256_andnot_si256(cube, _mm256_slli_si256(_e, 0)), \ - _mm256_and_si256(_mm256_set1_epi8(p), _mm256_slli_si256(_e, 0))) -#define set_corner(cube, i, p) _mm256_or_si256( \ - _mm256_andnot_si256(cube, _mm256_slli_si256(_c, 0)), \ - _mm256_and_si256(_mm256_set1_epi8(p), _mm256_slli_si256(_c, 0))) #include "_moves_avx2.c" #include "_trans_avx2.c" +static cube_t +arrtocube(cube_arr_t a) +{ + return _mm256_loadu_si256((__m256i_u *)&a); +} + +static void +cubetoarr(cube_t c, cube_arr_t *a) +{ + _mm256_storeu_si256((__m256i_u *)a, c); +} + #else #define setsolved(cube) cube = solvedcube_arr #define setzero(cube) cube = zerocube_arr -#define get_edge(cube, i) (cube).e[(i)] -#define get_corner(cube, i) (cube).c[(i)] -#define set_edge(cube, i, p) (cube).e[(i)] = (p) -#define set_corner(cube, i, p) (cube).c[(i)] = (p) + +static cube_t +arrtocube(cube_arr_t a) +{ + return a; +} + +static void +cubetoarr(cube_t c, cube_arr_t *a) +{ + memcpy(a, &c, sizeof(cube_t)); +} #include "_moves_arr.c" #include "_trans_arr.c" #endif - static char *cornerstr[] = { [_c_ufr] = "UFR", [_c_ubl] = "UBL", @@ -279,10 +294,10 @@ static uint8_t readep(char *); static uint8_t readmove(char); static uint8_t readmodifier(char); static cube_t readcube_H48(char *); -static void writecube_AVX(cube_t, char *); -static void writecube_H48(cube_t, char *); +static void writecube_AVX(cube_arr_t, char *); +static void writecube_H48(cube_arr_t, char *); static int writepiece_SRC(uint8_t, char *); -static void writecube_SRC(cube_t, char *); +static void writecube_SRC(cube_arr_t, char *); static int permsign(uint8_t *, int); cube_t @@ -367,10 +382,10 @@ readcube_H48(char *buf) { int i; uint8_t piece, orient; - cube_t ret, err; + cube_arr_t ret = {0}; + cube_t err; char *b; - setzero(ret); setzero(err); b = buf; @@ -397,7 +412,7 @@ readcube_H48(char *buf) set_corner(ret, i, piece | orient); } - return ret; + return arrtocube(ret); } cube_t @@ -424,7 +439,7 @@ readcube(format_t format, char *buf) } static void -writecube_AVX(cube_t cube, char *buf) +writecube_AVX(cube_arr_t cube, char *buf) { int i, ptr; uint8_t piece; @@ -448,9 +463,8 @@ writecube_AVX(cube_t cube, char *buf) memcpy(buf+ptr-2, "\n)\0", 3); } - static void -writecube_H48(cube_t cube, char *buf) +writecube_H48(cube_arr_t cube, char *buf) { uint8_t piece, perm, orient; int i; @@ -502,7 +516,7 @@ writepiece_SRC(uint8_t piece, char *buf) } static void -writecube_SRC(cube_t cube, char *buf) +writecube_SRC(cube_arr_t cube, char *buf) { int i, ptr; uint8_t piece; @@ -529,6 +543,7 @@ writecube_SRC(cube_t cube, char *buf) void writecube(format_t format, cube_t cube, char *buf) { + cube_arr_t a; char *errormsg; size_t len; @@ -537,15 +552,17 @@ writecube(format_t format, cube_t cube, char *buf) goto writecube_error; } + cubetoarr(cube, &a); + switch (format) { case AVX: - writecube_AVX(cube, buf); + writecube_AVX(a, buf); break; case H48: - writecube_H48(cube, buf); + writecube_H48(a, buf); break; case SRC: - writecube_SRC(cube, buf); + writecube_SRC(a, buf); break; default: errormsg = "ERROR: cannot write cube in the given format"; @@ -687,11 +704,14 @@ permsign(uint8_t *a, int n) } static bool -isconsistent(cube_t c) +isconsistent(cube_t cube) { + cube_arr_t c; uint8_t i, p, e, piece; bool found[12]; + cubetoarr(cube, &c); + for (i = 0; i < 12; i++) found[i] = false; for (i = 0; i < 12; i++) { @@ -751,7 +771,8 @@ inconsistent_co: bool issolvable(cube_t cube) { - uint8_t i, eo, co, piece, e[12], c[8]; + cube_arr_t c; + uint8_t i, eo, co, piece, edges[12], corners[8]; #ifdef DEBUG if (!isconsistent(cube)) { @@ -760,17 +781,19 @@ issolvable(cube_t cube) } #endif + cubetoarr(cube, &c); + for (i = 0; i < 12; i++) - e[i] = get_edge(cube, i) & _pbits; + edges[i] = get_edge(c, i) & _pbits; for (i = 0; i < 8; i++) - c[i] = get_corner(cube, i) & _pbits; + corners[i] = get_corner(c, i) & _pbits; - if (permsign(e, 12) != permsign(c, 8)) + if (permsign(edges, 12) != permsign(corners, 8)) goto issolvable_parity; eo = 0; for (i = 0; i < 12; i++) { - piece = get_edge(cube, i); + piece = get_edge(c, i); eo += (piece & _eobit) >> _eoshift; } if (eo % 2 != 0) @@ -778,7 +801,7 @@ issolvable(cube_t cube) co = 0; for (i = 0; i < 8; i++) { - piece = get_corner(cube, i); + piece = get_corner(c, i); co += (piece & _cobits) >> _coshift; } if (co % 3 != 0) @@ -909,7 +932,6 @@ move_error: cube_t inverse(cube_t c) { - /* TODO: optimize for avx2 */ cube_t ret; #ifdef DEBUG @@ -926,7 +948,7 @@ inverse(cube_t c) * [1] https://github.com/Voltara/vcube * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html */ - cube_t v3, vi; + cube_t v3, vi, vo, vp; v3 = _mm256_shuffle_epi8(c, c); v3 = _mm256_shuffle_epi8(v3, c); @@ -945,7 +967,12 @@ inverse(cube_t c) vi = _mm256_shuffle_epi8(vi, vi); vi = _mm256_shuffle_epi8(vi, v3); vi = _mm256_shuffle_epi8(vi, vi); - ret = _mm256_shuffle_epi8(vi, c); + vi = _mm256_shuffle_epi8(vi, c); + + vo = _mm256_and_si256(c, _mm256_or_si256(_eo_avx2, _co_avx2)); + vo = _mm256_shuffle_epi8(vo, vi); + vp = _mm256_andnot_si256(_mm256_or_si256(_eo_avx2, _co_avx2), vi); + ret = _mm256_or_si256(vp, vo); return flipallcorners(ret); #else @@ -983,19 +1010,30 @@ inline_compose(cube_t c1, cube_t c2) #endif #ifdef CUBE_AVX2 - cube_t shuf, eo, eodone, co2, aux, auy1, auy2, cw, auz1, auz2, coclean; + cube_t s, eo2, ed, co1, co2, aux, auy1, auy2, cw, cwccw, auz1, auz2, + coclean; + eo2 = _mm256_and_si256(c2, _eo_avx2); + s = _mm256_shuffle_epi8(c1, c2); + ed = _mm256_xor_si256(s, eo2); + co1 = _mm256_and_si256(s, _co_avx2); co2 = _mm256_and_si256(c2, _co_avx2); - eo = _mm256_and_si256(c2, _eo_avx2); - shufd = _mm256_shuffle_epi8(c1, c2); - eodone = _mm256_(shufd, eo); - aux = _mm256_add_epi8(c1, co2); - cw = _mm256_set_epi64x(0, 0x2020202020202020, 0, 0); + aux = _mm256_add_epi8(co1, co2); + cw = _mm256_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20 + ); auy1 = _mm256_add_epi8(aux, cw); - auy2 = _mm256_srli_si256(auy1, 2); + auy2 = _mm256_srli_epi32(auy1, 2); auz1 = _mm256_add_epi8(aux, auy2); - auz2 = _mm256_and_si256(auz1, _co_avx2); - coclean = _mm256_andnot_si256(eodone, _co_avx2); + cwccw = _mm256_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60 + ); + auz2 = _mm256_and_si256(auz1, cwccw); + coclean = _mm256_andnot_si256(_co_avx2, ed); ret = _mm256_or_si256(coclean, auz2); #else uint8_t i, piece1, piece2, p, orien, aux, auy; diff --git a/src/cube.h b/src/cube.h @@ -3,6 +3,7 @@ typedef struct { uint8_t e[16]; } cube_arr_t; #ifdef CUBE_AVX2 +#include <immintrin.h> typedef __m256i cube_t; #else typedef cube_arr_t cube_t; diff --git a/test/00_basic/basic_tests.c b/test/00_basic/basic_tests.c @@ -14,6 +14,7 @@ check(cube_t cube, char *name) void check2(cube_t cube1, char *name1, cube_t cube2, char *name2) { +fprintf(stderr, "check2 %s %s\n", name1, name2); printf("%s and %s are%s equal\n", name1, name2, equal(cube1, cube2) ? "" : " NOT"); } diff --git a/test/test.sh b/test/test.sh @@ -1,10 +1,14 @@ #!/bin/sh CC="cc -DDEBUG -std=c99 -pthread -pedantic -Wall -Wextra \ - -Wno-unused-parameter -Wno-unused-function -g3" -if [ $(uname) != "OpenBSD" ]; then + -Wno-unused-parameter -Wno-unused-function -g3 -D$CUBETYPE" +if [ "$CUBETYPE" = "CUBE_AVX2" ]; then + CC="$CC -mavx2" +fi +if [ "$(uname)" != "OpenBSD" ]; then CC="$CC -fsanitize=address -fsanitize=undefined" fi + TESTBIN="test/run" TESTOUT="test/last.out" TESTERR="test/last.err"