commit 9b1c9371333e20611ec5422e660ebec7e66c4261
parent a61382e9dec22ea1ff3fea2d59d3c605c0aa2aac
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date: Thu, 2 Nov 2023 22:17:14 +0100
Added avx2 moves
Diffstat:
5 files changed, 266 insertions(+), 14 deletions(-)
diff --git a/README.md b/README.md
@@ -15,10 +15,7 @@ $ make test
### Make AVX2 work
-* inline moves for avx2
* fix base get_ and set_ macros (constant arguments?)
-* optimize inverse for avx2
-* other things to optimize?
### Documentation and interface
@@ -37,10 +34,13 @@ $ make test
* Takes as parameters the amount of memory to use and a FILE for the tables
* Use multi-move (up to 4/5 moves at once)
-### Things I need to learn:
+### Future optimizations
-* Inspect compiled assembly
-* Use valgrind tool cachegrind and other profiling tools
+* CO is the worst part of moving, transforming and inverting. Try basing
+ everything on representing the cube without CO and apply it only at the
+ end to check that it is actually solved.
+* see if vcube's method to flip all corners is better
+* find a better way for computing the inverse?
## Internal representation of the cube
diff --git a/src/_moves_avx2.c b/src/_moves_avx2.c
@@ -1,2 +1,197 @@
-/* TODO: return compose(c, (some avx garbage));
-*/
+static inline cube_t
+inline_move_U(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 0, 1, 3, 2, 5, 4,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 1, 0, 3, 2, 4, 5
+ );
+
+ return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_U2(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 4, 5, 3, 2, 0, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 4, 5, 3, 2, 0, 1
+ );
+
+ return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_U3(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 1, 0, 3, 2, 4, 5,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 0, 1, 3, 2, 5, 4
+ );
+
+ return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_D(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 11, 10, 9, 8, 3, 2, 5, 4, 6, 7, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 5, 4, 6, 7, 1, 0
+ );
+
+ return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_D2(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 11, 10, 9, 8, 6, 7, 5, 4, 2, 3, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 5, 4, 2, 3, 1, 0
+ );
+
+ return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_D3(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 11, 10, 9, 8, 2, 3, 5, 4, 7, 6, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 5, 4, 7, 6, 1, 0
+ );
+
+ return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_R(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 4, 10, 9, 7, 11, 6, 5, 8, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 35, 32, 4, 69, 2, 1, 70
+ );
+
+ return compose(c, m);
+}
+
+static inline cube_t
+inline_move_R2(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 8, 10, 9, 11, 4, 6, 5, 7, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 5, 6, 4, 0, 2, 1, 3
+ );
+
+ return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_R3(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 7, 10, 9, 4, 8, 6, 5, 11, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 32, 35, 4, 70, 2, 1, 69
+ );
+
+ return compose(c, m);
+}
+
+static inline cube_t
+inline_move_L(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 11, 6, 5, 8, 7, 9, 10, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 34, 6, 5, 33, 3, 68, 71, 0
+ );
+
+ return compose(c, m);
+}
+
+static inline cube_t
+inline_move_L2(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 11, 9, 10, 8, 7, 5, 6, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 4, 6, 5, 7, 3, 1, 2, 0
+ );
+
+ return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_L3(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 11, 5, 6, 8, 7, 10, 9, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 33, 6, 5, 34, 3, 71, 68, 0
+ );
+
+ return compose(c, m);
+}
+
+static inline cube_t
+inline_move_F(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 11, 10, 19, 16, 7, 6, 5, 4, 24, 2, 1, 25,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 64, 5, 66, 3, 38, 1, 36
+ );
+
+ return compose(c, m);
+}
+
+static inline cube_t
+inline_move_F2(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 11, 10, 8, 9, 7, 6, 5, 4, 0, 2, 1, 3,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 4, 5, 6, 3, 0, 1, 2
+ );
+
+ return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_F3(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 11, 10, 16, 19, 7, 6, 5, 4, 25, 2, 1, 24,
+ 0, 0, 0, 0, 0, 0, 0, 0, 7, 66, 5, 64, 3, 36, 1, 38
+ );
+
+ return compose(c, m);
+}
+
+static inline cube_t
+inline_move_B(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 18, 17, 9, 8, 7, 6, 5, 4, 3, 26, 27, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 65, 6, 67, 4, 39, 2, 37, 0
+ );
+
+ return compose(c, m);
+}
+
+static inline cube_t
+inline_move_B2(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 10, 11, 9, 8, 7, 6, 5, 4, 3, 1, 2, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, 4, 1, 2, 3, 0
+ );
+
+ return _mm256_shuffle_epi8(c, m);
+}
+
+static inline cube_t
+inline_move_B3(cube_t c)
+{
+ cube_t m = _mm256_set_epi8(
+ 0, 0, 0, 0, 17, 18, 9, 8, 7, 6, 5, 4, 3, 27, 26, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 67, 6, 65, 4, 37, 2, 39, 0
+ );
+
+ return compose(c, m);
+}
diff --git a/src/cube.c b/src/cube.c
@@ -269,6 +269,7 @@ static char *transstr[] = {
[BLm] = "mirrored BL",
};
+static inline cube_t inline_compose(cube_t, cube_t);
static bool isconsistent(cube_t);
static cube_t flipallcorners(cube_t);
static uint8_t readco(char *);
@@ -909,18 +910,49 @@ cube_t
inverse(cube_t c)
{
/* TODO: optimize for avx2 */
- uint8_t i, piece, orien;
cube_t ret;
- setzero(ret);
-
#ifdef DEBUG
if (!isconsistent(c)) {
fprintf(stderr, "inverse error, inconsistent cube\n");
+ setzero(ret);
return ret;
}
#endif
+#ifdef CUBE_AVX2
+ /* Method taken from Andrew Skalski's vcube[1]. The addition sequence
+ * was generated using [2].
+ * [1] https://github.com/Voltara/vcube
+ * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
+ */
+ cube_t v3, vi;
+
+ v3 = _mm256_shuffle_epi8(c, c);
+ v3 = _mm256_shuffle_epi8(v3, c);
+ vi = _mm256_shuffle_epi8(v3, v3);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, v3);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, c);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, v3);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ ret = _mm256_shuffle_epi8(vi, c);
+
+ return flipallcorners(ret);
+#else
+ uint8_t i, piece, orien;
+
+ setzero(ret);
+
for (i = 0; i < 12; i++) {
piece = get_edge(c, i);
orien = piece & _eobit;
@@ -932,12 +964,13 @@ inverse(cube_t c)
orien = ((piece << 1) | (piece >> 1)) & _cobits2;
set_corner(ret, piece & _pbits, i | orien);
}
+#endif
return ret;
}
-cube_t
-compose(cube_t c1, cube_t c2)
+static inline cube_t
+inline_compose(cube_t c1, cube_t c2)
{
cube_t ret;
@@ -992,6 +1025,12 @@ compose(cube_t c1, cube_t c2)
}
cube_t
+compose(cube_t c1, cube_t c2)
+{
+ return inline_compose(c1, c2);
+}
+
+cube_t
transform(cube_t c, trans_t t)
{
cube_t ret;
diff --git a/utils/genmovecode.sh b/utils/genmovecode.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+type="${1:-src}"
+
+gcc -DDEBUG h48_to_"$type".c ../src/cube.c -o h48_to_"$type"
+
+genfuncs() {
+ for f in move_??_*.txt; do
+ move="$(echo $f | sed 's/.*_// ; s/\.txt//')"
+ printf 'static inline cube_t\ninline_move_%s' "$move"
+ printf '(cube_t c)\n{\n'
+ printf '\tcube_t m = '
+ ./h48_to_"$type" <"$f" | sed '2,4s/^/\t/'
+ printf ';\n\n\treturn compose(c, m);\n}\n\n'
+ done
+}
+
+genfuncs
+rm -f h48_to_"$type" invert
diff --git a/utils/gentranscode.sh b/utils/gentranscode.sh
@@ -23,7 +23,6 @@ genfuncs() {
for f in transform_??_???.txt; do
trans="$(echo $f | sed 's/.*_// ; s/\.txt//')"
printf 'static inline cube_t\ninline_trans_%s' "$trans"
- [ "$1" = "-i" ] && printf '_inverse'
printf '(cube_t c)\n{\n'
printf '\tcube_t ret;\n\n'
printf '\tcube_t tn = '