Optimized compose for avx2 - h48 - A prototype for an optimal Rubik's cube solver, work in progress.

commit a61382e9dec22ea1ff3fea2d59d3c605c0aa2aac
parent a2b8556e3cdc59bbb4a89c2779d2094db975eaa5
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Thu,  2 Nov 2023 20:05:29 +0100

Optimized compose for avx2

Diffstat:
M README.md  | 6 ++----
M src/cube.c  | 26 ++++++++++++++++++++++----

2 files changed, 24 insertions(+), 8 deletions(-)
diff --git a/README.md b/README.md
@@ -17,7 +17,8 @@ $ make test
 
 * inline moves for avx2
 * fix base get_ and set_ macros (constant arguments?)
-* optimize things that use get_ and set_
+* optimize inverse for avx2
+* other things to optimize?
 
 ### Documentation and interface
 
@@ -38,12 +39,9 @@ $ make test
 
 ### Things I need to learn:
 
-* Use AVX2 instructions, in particular
-  [_mm256_shuffle_epi8](https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/mm256-shuffle-epi8.html))
 * Inspect compiled assembly
 * Use valgrind tool cachegrind and other profiling tools
 
-
 ## Internal representation of the cube
 
 The plan (TODO) is to have multiple implementations: some that
diff --git a/src/cube.c b/src/cube.c
@@ -939,19 +939,36 @@ inverse(cube_t c)
 cube_t
 compose(cube_t c1, cube_t c2)
 {
-	/* TODO: optimize for avx2 */
-	uint8_t i, piece1, piece2, p, orien, aux, auy;
 	cube_t ret;
 
-	setzero(ret);
-
 #ifdef DEBUG
 	if (!isconsistent(c1) || !isconsistent(c2)) {
 		fprintf(stderr, "compose error, inconsistent cube\n");
+		setzero(ret);
 		return ret;
 	}
 #endif
 
+#ifdef CUBE_AVX2
+	cube_t shuf, eo, eodone, co2, aux, auy1, auy2, cw, auz1, auz2, coclean;
+
+	co2 = _mm256_and_si256(c2, _co_avx2);
+	eo = _mm256_and_si256(c2, _eo_avx2);
+	shufd = _mm256_shuffle_epi8(c1, c2);
+	eodone = _mm256_(shufd, eo);
+	aux = _mm256_add_epi8(c1, co2);
+	cw = _mm256_set_epi64x(0, 0x2020202020202020, 0, 0);
+	auy1 = _mm256_add_epi8(aux, cw);
+	auy2 = _mm256_srli_si256(auy1, 2);
+	auz1 = _mm256_add_epi8(aux, auy2);
+	auz2 = _mm256_and_si256(auz1, _co_avx2);
+	coclean = _mm256_andnot_si256(eodone, _co_avx2);
+	ret = _mm256_or_si256(coclean, auz2);
+#else
+	uint8_t i, piece1, piece2, p, orien, aux, auy;
+
+	setzero(ret);
+
 	for (i = 0; i < 12; i++) {
 		piece2 = get_edge(c2, i);
 		p = piece2 & _pbits;
@@ -969,6 +986,7 @@ compose(cube_t c1, cube_t c2)
 		orien = (aux + auy) & _cobits2;
 		set_corner(ret, i, (piece1 & _pbits) | orien);
 	}
+#endif
 
 	return ret;
 }

	h48 A prototype for an optimal Rubik's cube solver, work in progress.
	git clone https://git.tronto.net/h48
	Download \| Log \| Files \| Refs \| README \| LICENSE