Added avx2 moves - h48 - A prototype for an optimal Rubik's cube solver, work in progress.

commit 9b1c9371333e20611ec5422e660ebec7e66c4261
parent a61382e9dec22ea1ff3fea2d59d3c605c0aa2aac
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Thu,  2 Nov 2023 22:17:14 +0100

Added avx2 moves

Diffstat:
M README.md  | 12 ++++++------
M src/_moves_avx2.c  | 199 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M src/cube.c  | 49 ++++++++++++++++++++++++++++++++++++++++++++-----
A utils/genmovecode.sh  | 19 +++++++++++++++++++
M utils/gentranscode.sh  | 1 -

5 files changed, 266 insertions(+), 14 deletions(-)
diff --git a/README.md b/README.md
@@ -15,10 +15,7 @@ $ make test
 
 ### Make AVX2 work
 
-* inline moves for avx2
 * fix base get_ and set_ macros (constant arguments?)
-* optimize inverse for avx2
-* other things to optimize?
 
 ### Documentation and interface
 
@@ -37,10 +34,13 @@ $ make test
 * Takes as parameters the amount of memory to use and a FILE for the tables
 * Use multi-move (up to 4/5 moves at once)
 
-### Things I need to learn:
+### Future optimizations
 
-* Inspect compiled assembly
-* Use valgrind tool cachegrind and other profiling tools
+* CO is the worst part of moving, transforming and inverting. Try basing
+  everything on representing the cube without CO and apply it only at the
+  end to check that it is actually solved.
+* see if vcube's method to flip all corners is better
+* find a better way for computing the inverse?
 
 ## Internal representation of the cube
 
diff --git a/src/_moves_avx2.c b/src/_moves_avx2.c
@@ -1,2 +1,197 @@
-/* TODO: return compose(c, (some avx garbage));
-*/
+static inline cube_t
+inline_move_U(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 0, 1, 3, 2, 5, 4,
+		0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 1, 0, 3, 2, 4, 5
+	);
+
+	return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_U2(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 4, 5, 3, 2, 0, 1,
+		0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 4, 5, 3, 2, 0, 1
+	);
+
+	return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_U3(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 1, 0, 3, 2, 4, 5,
+		0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 0, 1, 3, 2, 5, 4
+	);
+
+	return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_D(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 11, 10, 9, 8, 3, 2, 5, 4, 6, 7, 1, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 5, 4, 6, 7, 1, 0
+	);
+
+	return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_D2(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 11, 10, 9, 8, 6, 7, 5, 4, 2, 3, 1, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 5, 4, 2, 3, 1, 0
+	);
+
+	return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_D3(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 11, 10, 9, 8, 2, 3, 5, 4, 7, 6, 1, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 5, 4, 7, 6, 1, 0
+	);
+
+	return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_R(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 4, 10, 9, 7, 11, 6, 5, 8, 3, 2, 1, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 7, 35, 32, 4, 69, 2, 1, 70
+	);
+
+	return compose(c, m);
+}
+
+static inline cube_t
+inline_move_R2(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 8, 10, 9, 11, 4, 6, 5, 7, 3, 2, 1, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 7, 5, 6, 4, 0, 2, 1, 3
+	);
+
+	return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_R3(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 7, 10, 9, 4, 8, 6, 5, 11, 3, 2, 1, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 7, 32, 35, 4, 70, 2, 1, 69
+	);
+
+	return compose(c, m);
+}
+
+static inline cube_t
+inline_move_L(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 11, 6, 5, 8, 7, 9, 10, 4, 3, 2, 1, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 34, 6, 5, 33, 3, 68, 71, 0
+	);
+
+	return compose(c, m);
+}
+
+static inline cube_t
+inline_move_L2(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 11, 9, 10, 8, 7, 5, 6, 4, 3, 2, 1, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 4, 6, 5, 7, 3, 1, 2, 0
+	);
+
+	return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_L3(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 11, 5, 6, 8, 7, 10, 9, 4, 3, 2, 1, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 33, 6, 5, 34, 3, 71, 68, 0
+	);
+
+	return compose(c, m);
+}
+
+static inline cube_t
+inline_move_F(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 11, 10, 19, 16, 7, 6, 5, 4, 24, 2, 1, 25,
+		0, 0, 0, 0, 0, 0, 0, 0, 7, 64, 5, 66, 3, 38, 1, 36
+	);
+
+	return compose(c, m);
+}
+
+static inline cube_t
+inline_move_F2(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 11, 10, 8, 9, 7, 6, 5, 4, 0, 2, 1, 3,
+		0, 0, 0, 0, 0, 0, 0, 0, 7, 4, 5, 6, 3, 0, 1, 2
+	);
+
+	return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_F3(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 11, 10, 16, 19, 7, 6, 5, 4, 25, 2, 1, 24,
+		0, 0, 0, 0, 0, 0, 0, 0, 7, 66, 5, 64, 3, 36, 1, 38
+	);
+
+	return compose(c, m);
+}
+
+static inline cube_t
+inline_move_B(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 18, 17, 9, 8, 7, 6, 5, 4, 3, 26, 27, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 65, 6, 67, 4, 39, 2, 37, 0
+	);
+
+	return compose(c, m);
+}
+
+static inline cube_t
+inline_move_B2(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 10, 11, 9, 8, 7, 6, 5, 4, 3, 1, 2, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, 4, 1, 2, 3, 0
+	);
+
+	return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_B3(cube_t c)
+{
+	cube_t m = _mm256_set_epi8(
+		0, 0, 0, 0, 17, 18, 9, 8, 7, 6, 5, 4, 3, 27, 26, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 67, 6, 65, 4, 37, 2, 39, 0
+	);
+
+	return compose(c, m);
+}
diff --git a/src/cube.c b/src/cube.c
@@ -269,6 +269,7 @@ static char *transstr[] = {
 	[BLm] = "mirrored BL",
 };
 
+static inline cube_t inline_compose(cube_t, cube_t);
 static bool isconsistent(cube_t);
 static cube_t flipallcorners(cube_t);
 static uint8_t readco(char *);
@@ -909,18 +910,49 @@ cube_t
 inverse(cube_t c)
 {
 	/* TODO: optimize for avx2 */
-	uint8_t i, piece, orien;
 	cube_t ret;
 
-	setzero(ret);
-
 #ifdef DEBUG
 	if (!isconsistent(c)) {
 		fprintf(stderr, "inverse error, inconsistent cube\n");
+		setzero(ret);
 		return ret;
 	}
 #endif
 
+#ifdef CUBE_AVX2
+	/* Method taken from Andrew Skalski's vcube[1]. The addition sequence
+	 * was generated using [2].
+	 * [1] https://github.com/Voltara/vcube
+	 * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
+	 */
+	cube_t v3, vi;
+
+	v3 = _mm256_shuffle_epi8(c, c);
+	v3 = _mm256_shuffle_epi8(v3, c);
+	vi = _mm256_shuffle_epi8(v3, v3);
+	vi = _mm256_shuffle_epi8(vi, vi);
+	vi = _mm256_shuffle_epi8(vi, vi);
+	vi = _mm256_shuffle_epi8(vi, v3);
+	vi = _mm256_shuffle_epi8(vi, vi);
+	vi = _mm256_shuffle_epi8(vi, vi);
+	vi = _mm256_shuffle_epi8(vi, vi);
+	vi = _mm256_shuffle_epi8(vi, c);
+	vi = _mm256_shuffle_epi8(vi, vi);
+	vi = _mm256_shuffle_epi8(vi, vi);
+	vi = _mm256_shuffle_epi8(vi, vi);
+	vi = _mm256_shuffle_epi8(vi, vi);
+	vi = _mm256_shuffle_epi8(vi, vi);
+	vi = _mm256_shuffle_epi8(vi, v3);
+	vi = _mm256_shuffle_epi8(vi, vi);
+	ret = _mm256_shuffle_epi8(vi, c);
+	
+	return flipallcorners(ret);
+#else
+	uint8_t i, piece, orien;
+
+	setzero(ret);
+
 	for (i = 0; i < 12; i++) {
 		piece = get_edge(c, i);
 		orien = piece & _eobit;
@@ -932,12 +964,13 @@ inverse(cube_t c)
 		orien = ((piece << 1) | (piece >> 1)) & _cobits2;
 		set_corner(ret, piece & _pbits, i | orien);
 	}
+#endif
 
 	return ret;
 }
 
-cube_t
-compose(cube_t c1, cube_t c2)
+static inline cube_t
+inline_compose(cube_t c1, cube_t c2)
 {
 	cube_t ret;
 
@@ -992,6 +1025,12 @@ compose(cube_t c1, cube_t c2)
 }
 
 cube_t
+compose(cube_t c1, cube_t c2)
+{
+	return inline_compose(c1, c2);
+}
+
+cube_t
 transform(cube_t c, trans_t t)
 {
 	cube_t ret;
diff --git a/utils/genmovecode.sh b/utils/genmovecode.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+type="${1:-src}"
+
+gcc -DDEBUG h48_to_"$type".c ../src/cube.c -o h48_to_"$type"
+
+genfuncs() {
+	for f in move_??_*.txt; do
+		move="$(echo $f | sed 's/.*_// ; s/\.txt//')"
+		printf 'static inline cube_t\ninline_move_%s' "$move"
+		printf '(cube_t c)\n{\n'
+		printf '\tcube_t m = '
+		./h48_to_"$type" <"$f" | sed '2,4s/^/\t/'
+		printf ';\n\n\treturn compose(c, m);\n}\n\n'
+	done
+}
+
+genfuncs
+rm -f h48_to_"$type" invert
diff --git a/utils/gentranscode.sh b/utils/gentranscode.sh
@@ -23,7 +23,6 @@ genfuncs() {
 	for f in transform_??_???.txt; do
 		trans="$(echo $f | sed 's/.*_// ; s/\.txt//')"
 		printf 'static inline cube_t\ninline_trans_%s' "$trans"
-		[ "$1" = "-i" ] && printf '_inverse'
 		printf '(cube_t c)\n{\n'
 		printf '\tcube_t ret;\n\n'
 		printf '\tcube_t tn = '

	h48 A prototype for an optimal Rubik's cube solver, work in progress.
	git clone https://git.tronto.net/h48
	Download \| Log \| Files \| Refs \| README \| LICENSE

M	README.md	\|	12	++++++------
M	src/_moves_avx2.c	\|	199	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M	src/cube.c	\|	49	++++++++++++++++++++++++++++++++++++++++++++-----
A	utils/genmovecode.sh	\|	19	+++++++++++++++++++
M	utils/gentranscode.sh	\|	1	-