commit 73a99f18e7940f424287250f780a18d861aef231
parent b0053277e385bee23336d1fe6b69a12f49f9172f
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date: Tue, 1 Apr 2025 14:01:49 +0200
Some simplification to AVX2 code
Some simplification after discussion with Arhan Chaudhary (inspired by vcube).
Performance looks unchanged, but at least the code is cleaner.
Diffstat:
2 files changed, 32 insertions(+), 62 deletions(-)
diff --git a/src/arch/avx2.h b/src/arch/avx2.h
@@ -1,10 +1,16 @@
#define CO2_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x6060606060606060))
#define COCW_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x2020202020202020))
#define CP_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x0707070707070707))
-#define EP_AVX2 \
- _mm256_set_epi64x(INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0)
-#define EO_AVX2 \
- _mm256_set_epi64x(INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0)
+#define EP_AVX2 _mm256_set_epi64x(\
+ INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0)
+#define EO_AVX2 _mm256_set_epi64x(\
+ INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0)
+#define ORIENT_AVX2 _mm256_set_epi64x(INT64_C(0x10101010), \
+ INT64_C(0x1010101010101010), 0, INT64_C(0x6060606060606060))
+#define USED_AVX2 _mm256_set_epi64x(INT64_C(0x00000000FFFFFFFF), \
+ INT64_C(0xFFFFFFFFFFFFFFFF), 0, INT64_C(0xFFFFFFFFFFFFFFFF))
+#define CARRY_AVX2 _mm256_set_epi64x(INT64_C(0x20202020), \
+ INT64_C(0x2020202020202020), 0, INT64_C(0x6060606060606060))
#define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
@@ -61,76 +67,36 @@ invertco(cube_t c)
}
STATIC_INLINE cube_t
-compose_epcpeo(cube_t c1, cube_t c2)
-{
- cube_t b, s, eo2;
-
- /* Permute and clean unused bits */
- s = _mm256_shuffle_epi8(c1, c2);
- b = _mm256_set_epi8(
- ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
- );
- s = _mm256_andnot_si256(b, s);
-
- /* Change EO */
- eo2 = _mm256_and_si256(c2, EO_AVX2);
- s = _mm256_xor_si256(s, eo2);
-
- return s;
-}
-
-STATIC_INLINE cube_t
compose_edges(cube_t c1, cube_t c2)
{
- return compose_epcpeo(c1, c2);
+ return compose(c1, c2);
}
STATIC_INLINE cube_t
compose_corners(cube_t c1, cube_t c2)
{
- /*
- * We do a full compose. Minor optimizations are possible, like
- * saving one instruction by not doing EO, but it should not
- * be significant.
- */
return compose(c1, c2);
}
STATIC_INLINE cube_t
compose(cube_t c1, cube_t c2)
{
- cube_t s, co1, co2, aux, auy1, auy2, auz1, auz2;
-
- s = compose_epcpeo(c1, c2);
-
- /* Change CO */
- co1 = _mm256_and_si256(s, CO2_AVX2);
- co2 = _mm256_and_si256(c2, CO2_AVX2);
- aux = _mm256_add_epi8(co1, co2);
- auy1 = _mm256_add_epi8(aux, COCW_AVX2);
- auy2 = _mm256_srli_epi32(auy1, 2);
- auz1 = _mm256_add_epi8(aux, auy2);
- auz2 = _mm256_and_si256(auz1, CO2_AVX2);
-
- /* Put together */
- s = _mm256_andnot_si256(CO2_AVX2, s);
- s = _mm256_or_si256(s, auz2);
-
- return s;
-}
+ /*
+ * Method taken from Andrew Skalski's vcube (thanks to Arhan Chaudhary
+ * for pointing this out)
+ */
+ cube_t ss, so, su;
-STATIC_INLINE cube_t
-cleanaftershuffle(cube_t c)
-{
- __m256i b;
+ /* Permute */
+ ss = _mm256_shuffle_epi8(c1, c2);
- b = _mm256_set_epi8(
- ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
- );
+ /* Orient */
+ so = _mm256_and_si256(c2, ORIENT_AVX2);
+ ss = _mm256_add_epi8(ss, so);
+ su = _mm256_sub_epi8(ss, CARRY_AVX2);
+ ss = _mm256_min_epu8(ss, su);
- return _mm256_andnot_si256(b, c);
+ return _mm256_and_si256(ss, USED_AVX2);
}
STATIC_INLINE cube_t
@@ -163,11 +129,11 @@ inverse(cube_t c)
vi = _mm256_shuffle_epi8(vi, vi);
vi = _mm256_shuffle_epi8(vi, c);
- vo = _mm256_and_si256(c, _mm256_or_si256(EO_AVX2, CO2_AVX2));
+ vo = _mm256_and_si256(c, ORIENT_AVX2);
vo = _mm256_shuffle_epi8(vo, vi);
- vp = _mm256_andnot_si256(_mm256_or_si256(EO_AVX2, CO2_AVX2), vi);
+ vp = _mm256_andnot_si256(ORIENT_AVX2, vi);
ret = _mm256_or_si256(vp, vo);
- ret = cleanaftershuffle(ret);
+ ret = _mm256_and_si256(ret, USED_AVX2);
return invertco(ret);
}
diff --git a/src/core/cube.h b/src/core/cube.h
@@ -109,9 +109,13 @@ issolvable(cube_t cube)
return true;
issolvable_parity:
+ LOG("There is parity\n");
+ return false;
issolvable_eo:
+ LOG("EO is not solvable\n");
+ return false;
issolvable_co:
- /* We used to do more logging here, hence the 3 different labels */
+ LOG("CO is not solvable\n");
return false;
}