commit ade2ed050a2d472b4dd01e30666f5e7ec5030724
parent 9b1c9371333e20611ec5422e660ebec7e66c4261
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date: Fri, 3 Nov 2023 23:13:05 +0100
Moves work with avx2, fixed some stuff
Diffstat:
8 files changed, 117 insertions(+), 61 deletions(-)
diff --git a/Makefile b/Makefile
@@ -12,9 +12,9 @@ clean:
rm -rf *.o
test: debugcube.o
- ./test/test.sh
+ CUBETYPE=${CUBETYPE} ./test/test.sh
benchmark: cube.o
- ./benchmark/bench.sh
+ CUBETYPE=${CUBETYPE} ./benchmark/bench.sh
.PHONY: all clean test benchmark
diff --git a/README.md b/README.md
@@ -15,7 +15,13 @@ $ make test
### Make AVX2 work
-* fix base get_ and set_ macros (constant arguments?)
+* fix inverse, flipallcorners
+
+### Cleanup / refactor
+
+* see planner
+* change all set_epi to setr_epi
+* change epi8 to epi64x (shorter!)
### Documentation and interface
@@ -41,6 +47,7 @@ $ make test
end to check that it is actually solved.
* see if vcube's method to flip all corners is better
* find a better way for computing the inverse?
+* Improve avx2 instructions in general
## Internal representation of the cube
diff --git a/benchmark/bench.sh b/benchmark/bench.sh
@@ -1,10 +1,15 @@
#!/bin/sh
+CC="cc -std=c99 -pthread -O3 -D$CUBETYPE"
+if [ "$CUBETYPE" = "CUBE_AVX2" ]; then
+ CC="$CC -mavx2"
+fi
+
BENCHBIN="benchmark/run"
BENCHDIR="benchmark/results"
CUBEOBJ="cube.o"
-cc -std=c99 -pthread -O3 -o $BENCHBIN benchmark/bench.c $CUBEOBJ || exit 1;
+$CC -o $BENCHBIN benchmark/bench.c $CUBEOBJ || exit 1;
d="$(date +'%Y-%m-%d-%H-%M-%S')"
mkdir -p "$BENCHDIR"
diff --git a/src/_trans_avx2.c b/src/_trans_avx2.c
@@ -7,7 +7,7 @@ flipallcorners(cube_t c)
shright = _mm256_srli_si256(c, 1);
summed = _mm256_or_si256(shleft, shright);
newco = _mm256_and_si256(summed, _co_avx2);
- cleanco = _mm256_andnot_si256_(c, _co_avx2);
+ cleanco = _mm256_andnot_si256(c, _co_avx2);
ret = _mm256_and_si256(cleanco, newco);
return ret;
diff --git a/src/cube.c b/src/cube.c
@@ -6,10 +6,6 @@
#include <stdio.h>
#endif
-#ifdef AVX2
-#include <immintrin.h>
-#endif
-
#include "cube.h"
#define U 0U
@@ -118,6 +114,11 @@
#define _eflip 0x10U
#define _error 0xFFU
+#define get_edge(cube, i) (cube).e[(i)]
+#define get_corner(cube, i) (cube).c[(i)]
+#define set_edge(cube, i, p) (cube).e[(i)] = (p)
+#define set_corner(cube, i, p) (cube).c[(i)] = (p)
+
cube_arr_t solvedcube_arr = {
.c = {0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0},
.e = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0}
@@ -126,40 +127,54 @@ cube_arr_t zerocube_arr = { .e = {0}, .c = {0} };
#ifdef CUBE_AVX2
-#define _co_avx2 _mm256_set_epi64x(0, 0xF0F0F0F0F0F0F0F0, 0, 0)
-#define _eo_avx2 _mm256_set_epi64x(0, 0, 0xF0F0F0F0, 0xF0F0F0F0F0F0F0F0)
-#define _c _mm256_set_epi64x(0, 0, 0, 0xFF)
-#define _e _mm256_set_epi64x(0, 0xFF, 0, 0)
+#define _co_avx2 _mm256_set_epi8( \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70)
+#define _eo_avx2 _mm256_set_epi8( \
+ 0, 0, 0, 0, 0x70, 0x70, 0x70, 0x70, \
+ 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
#define setsolved(cube) cube = _mm256_loadu_si256((__m256i_u *)&solvedcube_arr)
#define setzero(cube) cube = _mm256_setzero_si256()
-/* TODO: in the next 4 macros, all 0 should be i, but must be constant! */
-#define get_edge(cube, i) _mm256_extract_epi8(cube, 0+16)
-#define get_corner(cube, i) _mm256_extract_epi8(cube, 0)
-#define set_edge(cube, i, p) _mm256_or_si256( \
- _mm256_andnot_si256(cube, _mm256_slli_si256(_e, 0)), \
- _mm256_and_si256(_mm256_set1_epi8(p), _mm256_slli_si256(_e, 0)))
-#define set_corner(cube, i, p) _mm256_or_si256( \
- _mm256_andnot_si256(cube, _mm256_slli_si256(_c, 0)), \
- _mm256_and_si256(_mm256_set1_epi8(p), _mm256_slli_si256(_c, 0)))
#include "_moves_avx2.c"
#include "_trans_avx2.c"
+static cube_t
+arrtocube(cube_arr_t a)
+{
+ return _mm256_loadu_si256((__m256i_u *)&a);
+}
+
+static void
+cubetoarr(cube_t c, cube_arr_t *a)
+{
+ _mm256_storeu_si256((__m256i_u *)a, c);
+}
+
#else
#define setsolved(cube) cube = solvedcube_arr
#define setzero(cube) cube = zerocube_arr
-#define get_edge(cube, i) (cube).e[(i)]
-#define get_corner(cube, i) (cube).c[(i)]
-#define set_edge(cube, i, p) (cube).e[(i)] = (p)
-#define set_corner(cube, i, p) (cube).c[(i)] = (p)
+
+static cube_t
+arrtocube(cube_arr_t a)
+{
+ return a;
+}
+
+static void
+cubetoarr(cube_t c, cube_arr_t *a)
+{
+ memcpy(a, &c, sizeof(cube_t));
+}
#include "_moves_arr.c"
#include "_trans_arr.c"
#endif
-
static char *cornerstr[] = {
[_c_ufr] = "UFR",
[_c_ubl] = "UBL",
@@ -279,10 +294,10 @@ static uint8_t readep(char *);
static uint8_t readmove(char);
static uint8_t readmodifier(char);
static cube_t readcube_H48(char *);
-static void writecube_AVX(cube_t, char *);
-static void writecube_H48(cube_t, char *);
+static void writecube_AVX(cube_arr_t, char *);
+static void writecube_H48(cube_arr_t, char *);
static int writepiece_SRC(uint8_t, char *);
-static void writecube_SRC(cube_t, char *);
+static void writecube_SRC(cube_arr_t, char *);
static int permsign(uint8_t *, int);
cube_t
@@ -367,10 +382,10 @@ readcube_H48(char *buf)
{
int i;
uint8_t piece, orient;
- cube_t ret, err;
+ cube_arr_t ret = {0};
+ cube_t err;
char *b;
- setzero(ret);
setzero(err);
b = buf;
@@ -397,7 +412,7 @@ readcube_H48(char *buf)
set_corner(ret, i, piece | orient);
}
- return ret;
+ return arrtocube(ret);
}
cube_t
@@ -424,7 +439,7 @@ readcube(format_t format, char *buf)
}
static void
-writecube_AVX(cube_t cube, char *buf)
+writecube_AVX(cube_arr_t cube, char *buf)
{
int i, ptr;
uint8_t piece;
@@ -448,9 +463,8 @@ writecube_AVX(cube_t cube, char *buf)
memcpy(buf+ptr-2, "\n)\0", 3);
}
-
static void
-writecube_H48(cube_t cube, char *buf)
+writecube_H48(cube_arr_t cube, char *buf)
{
uint8_t piece, perm, orient;
int i;
@@ -502,7 +516,7 @@ writepiece_SRC(uint8_t piece, char *buf)
}
static void
-writecube_SRC(cube_t cube, char *buf)
+writecube_SRC(cube_arr_t cube, char *buf)
{
int i, ptr;
uint8_t piece;
@@ -529,6 +543,7 @@ writecube_SRC(cube_t cube, char *buf)
void
writecube(format_t format, cube_t cube, char *buf)
{
+ cube_arr_t a;
char *errormsg;
size_t len;
@@ -537,15 +552,17 @@ writecube(format_t format, cube_t cube, char *buf)
goto writecube_error;
}
+ cubetoarr(cube, &a);
+
switch (format) {
case AVX:
- writecube_AVX(cube, buf);
+ writecube_AVX(a, buf);
break;
case H48:
- writecube_H48(cube, buf);
+ writecube_H48(a, buf);
break;
case SRC:
- writecube_SRC(cube, buf);
+ writecube_SRC(a, buf);
break;
default:
errormsg = "ERROR: cannot write cube in the given format";
@@ -687,11 +704,14 @@ permsign(uint8_t *a, int n)
}
static bool
-isconsistent(cube_t c)
+isconsistent(cube_t cube)
{
+ cube_arr_t c;
uint8_t i, p, e, piece;
bool found[12];
+ cubetoarr(cube, &c);
+
for (i = 0; i < 12; i++)
found[i] = false;
for (i = 0; i < 12; i++) {
@@ -751,7 +771,8 @@ inconsistent_co:
bool
issolvable(cube_t cube)
{
- uint8_t i, eo, co, piece, e[12], c[8];
+ cube_arr_t c;
+ uint8_t i, eo, co, piece, edges[12], corners[8];
#ifdef DEBUG
if (!isconsistent(cube)) {
@@ -760,17 +781,19 @@ issolvable(cube_t cube)
}
#endif
+ cubetoarr(cube, &c);
+
for (i = 0; i < 12; i++)
- e[i] = get_edge(cube, i) & _pbits;
+ edges[i] = get_edge(c, i) & _pbits;
for (i = 0; i < 8; i++)
- c[i] = get_corner(cube, i) & _pbits;
+ corners[i] = get_corner(c, i) & _pbits;
- if (permsign(e, 12) != permsign(c, 8))
+ if (permsign(edges, 12) != permsign(corners, 8))
goto issolvable_parity;
eo = 0;
for (i = 0; i < 12; i++) {
- piece = get_edge(cube, i);
+ piece = get_edge(c, i);
eo += (piece & _eobit) >> _eoshift;
}
if (eo % 2 != 0)
@@ -778,7 +801,7 @@ issolvable(cube_t cube)
co = 0;
for (i = 0; i < 8; i++) {
- piece = get_corner(cube, i);
+ piece = get_corner(c, i);
co += (piece & _cobits) >> _coshift;
}
if (co % 3 != 0)
@@ -909,7 +932,6 @@ move_error:
cube_t
inverse(cube_t c)
{
- /* TODO: optimize for avx2 */
cube_t ret;
#ifdef DEBUG
@@ -926,7 +948,7 @@ inverse(cube_t c)
* [1] https://github.com/Voltara/vcube
* [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
*/
- cube_t v3, vi;
+ cube_t v3, vi, vo, vp;
v3 = _mm256_shuffle_epi8(c, c);
v3 = _mm256_shuffle_epi8(v3, c);
@@ -945,7 +967,12 @@ inverse(cube_t c)
vi = _mm256_shuffle_epi8(vi, vi);
vi = _mm256_shuffle_epi8(vi, v3);
vi = _mm256_shuffle_epi8(vi, vi);
- ret = _mm256_shuffle_epi8(vi, c);
+ vi = _mm256_shuffle_epi8(vi, c);
+
+ vo = _mm256_and_si256(c, _mm256_or_si256(_eo_avx2, _co_avx2));
+ vo = _mm256_shuffle_epi8(vo, vi);
+ vp = _mm256_andnot_si256(_mm256_or_si256(_eo_avx2, _co_avx2), vi);
+ ret = _mm256_or_si256(vp, vo);
return flipallcorners(ret);
#else
@@ -983,19 +1010,30 @@ inline_compose(cube_t c1, cube_t c2)
#endif
#ifdef CUBE_AVX2
- cube_t shuf, eo, eodone, co2, aux, auy1, auy2, cw, auz1, auz2, coclean;
+ cube_t s, eo2, ed, co1, co2, aux, auy1, auy2, cw, cwccw, auz1, auz2,
+ coclean;
+ eo2 = _mm256_and_si256(c2, _eo_avx2);
+ s = _mm256_shuffle_epi8(c1, c2);
+ ed = _mm256_xor_si256(s, eo2);
+ co1 = _mm256_and_si256(s, _co_avx2);
co2 = _mm256_and_si256(c2, _co_avx2);
- eo = _mm256_and_si256(c2, _eo_avx2);
- shufd = _mm256_shuffle_epi8(c1, c2);
- eodone = _mm256_(shufd, eo);
- aux = _mm256_add_epi8(c1, co2);
- cw = _mm256_set_epi64x(0, 0x2020202020202020, 0, 0);
+ aux = _mm256_add_epi8(co1, co2);
+ cw = _mm256_set_epi8(
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20
+ );
auy1 = _mm256_add_epi8(aux, cw);
- auy2 = _mm256_srli_si256(auy1, 2);
+ auy2 = _mm256_srli_epi32(auy1, 2);
auz1 = _mm256_add_epi8(aux, auy2);
- auz2 = _mm256_and_si256(auz1, _co_avx2);
- coclean = _mm256_andnot_si256(eodone, _co_avx2);
+ cwccw = _mm256_set_epi8(
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60
+ );
+ auz2 = _mm256_and_si256(auz1, cwccw);
+ coclean = _mm256_andnot_si256(_co_avx2, ed);
ret = _mm256_or_si256(coclean, auz2);
#else
uint8_t i, piece1, piece2, p, orien, aux, auy;
diff --git a/src/cube.h b/src/cube.h
@@ -3,6 +3,7 @@ typedef struct {
uint8_t e[16];
} cube_arr_t;
#ifdef CUBE_AVX2
+#include <immintrin.h>
typedef __m256i cube_t;
#else
typedef cube_arr_t cube_t;
diff --git a/test/00_basic/basic_tests.c b/test/00_basic/basic_tests.c
@@ -14,6 +14,7 @@ check(cube_t cube, char *name)
void
check2(cube_t cube1, char *name1, cube_t cube2, char *name2)
{
+fprintf(stderr, "check2 %s %s\n", name1, name2);
printf("%s and %s are%s equal\n", name1, name2,
equal(cube1, cube2) ? "" : " NOT");
}
diff --git a/test/test.sh b/test/test.sh
@@ -1,10 +1,14 @@
#!/bin/sh
CC="cc -DDEBUG -std=c99 -pthread -pedantic -Wall -Wextra \
- -Wno-unused-parameter -Wno-unused-function -g3"
-if [ $(uname) != "OpenBSD" ]; then
+ -Wno-unused-parameter -Wno-unused-function -g3 -D$CUBETYPE"
+if [ "$CUBETYPE" = "CUBE_AVX2" ]; then
+ CC="$CC -mavx2"
+fi
+if [ "$(uname)" != "OpenBSD" ]; then
CC="$CC -fsanitize=address -fsanitize=undefined"
fi
+
TESTBIN="test/run"
TESTOUT="test/last.out"
TESTERR="test/last.err"