Merge pull request #1 from enricotenuti/master - h48 - A prototype for an optimal Rubik's cube solver, work in progress.

commit c0d829e588c5d25ee0fc0ff3212658c92c0fb0a8
parent f148aa65dfe587e7b5d2f48b5005be670788dbe6
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Sat, 10 Aug 2024 17:40:37 +0200

Merge pull request #1 from enricotenuti/master

cube_neon ver1
Diffstat:
M .gitignore  | 2 ++
M src/cube.c  | 2 ++
M src/cube_neon.h  | 377 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M test/104_cocsep_ttrep/cocsep_ttrep_tests.c  | 2 +-
M test/test.h  | 9 ++++++++-
M test/test.sh  | 33 ++++++++++++++++++---------------

6 files changed, 405 insertions(+), 20 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -7,7 +7,9 @@ run
 tables/*
 test/*/runtest
 test/run
+test/run.DSYM
 test/last.*
 tools/results
+.vscode
 *.o
 *.s
diff --git a/src/cube.c b/src/cube.c
@@ -27,6 +27,8 @@ void (*nissy_log)(const char *, ...);
 #include <immintrin.h>
 #include "cube_avx2.h"
 #elif defined(CUBE_NEON)
+#include <stdlib.h>
+#include <arm_neon.h>
 #include "cube_neon.h"
 #else
 #include <stdlib.h> /* TODO: check if can be removed */
diff --git a/src/cube_neon.h b/src/cube_neon.h
@@ -1,6 +1,377 @@
-/* TODO! */
-
-typedef struct {
+// cube_t
+typedef struct
+{
 	uint8x16_t corner;
 	uint8x16_t edge;
 } cube_t;
+
+#define _co2_neon vdupq_n_u8(0x60)
+#define _cocw_neon vdupq_n_u8(0x20)
+#define _cp_neon vdupq_n_u8(0x07)
+#define _ep_neon vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F))
+#define _eo_neon vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10))
+
+// static cube
+#define static_cube(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
+					e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
+	((cube_t){ \
+		.corner = {c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, 0, 0, 0, 0, 0, 0, 0, 0}, \
+		.edge = {e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0}})
+
+// zero cube
+#define zero \
+	(cube_t) \
+	{ \
+		.corner = vdupq_n_u8(0), \
+		.edge = vdupq_n_u8(0) \
+	}
+
+// solved cube
+#define solved static_cube(	\
+	0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
+
+// Functions
+_static void pieces(cube_t *, uint8_t[static 8], uint8_t[static 12]);
+_static_inline bool equal(cube_t, cube_t);
+_static_inline cube_t invertco(cube_t);
+_static_inline cube_t compose_edges(cube_t, cube_t);
+_static_inline cube_t compose_corners(cube_t, cube_t);
+_static_inline uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t);
+_static_inline uint8x16_t compose_corners_slim(uint8x16_t, uint8x16_t);
+_static_inline cube_t compose(cube_t, cube_t);
+_static_inline cube_t inverse(cube_t);
+
+_static_inline int64_t coord_co(cube_t);
+_static_inline int64_t coord_csep(cube_t);
+_static_inline int64_t coord_cocsep(cube_t);
+_static_inline int64_t coord_eo(cube_t);
+_static_inline int64_t coord_esep(cube_t);
+
+_static_inline void copy_corners(cube_t *, cube_t);
+_static_inline void copy_edges(cube_t *, cube_t);
+_static_inline void set_eo(cube_t *, int64_t);
+_static_inline cube_t invcoord_esep(int64_t);
+
+_static void
+pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
+{
+	// First 8 bytes of the corner vector are copied from the c array
+	vst1_u8(c, vget_low_u8(cube->corner));
+
+	// 12 bytes of the edge vector are copied from the e array
+	// First 8 bytes
+	vst1_u8(e, vget_low_u8(cube->edge));
+	// Next 4 bytes
+	vst1_lane_u32((uint32_t *)(e + 8), vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0);
+}
+
+_static_inline bool
+equal(cube_t c1, cube_t c2)
+{
+	uint8x16_t cmp_corner, cmp_edge;
+	uint64x2_t cmp_corner_u64, cmp_edge_u64;
+	uint64x2_t cmp_result;
+
+	// compare the corner vectors
+	cmp_corner = vceqq_u8(c1.corner, c2.corner);
+	// compare the edge vectors
+	cmp_edge = vceqq_u8(c1.edge, c2.edge);
+
+	// convert the comparison vectors to 64-bit vectors
+	cmp_corner_u64 = vreinterpretq_u64_u8(cmp_corner);
+	cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge);
+
+	// combine the comparison vectors
+	cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64);
+
+	// check if all the bits are set
+	return vgetq_lane_u64(cmp_result, 0) == ~0ULL && vgetq_lane_u64(cmp_result, 1) == ~0ULL;
+}
+
+_static_inline cube_t
+invertco(cube_t c)
+{
+	cube_t ret;
+	uint8x16_t co, shleft, shright, summed, newco, cleanco;
+
+	co = vandq_u8(c.corner, _co2_neon);
+	shleft = vshlq_n_u8(co, 1);
+	shright = vshrq_n_u8(co, 1);
+	summed = vorrq_u8(shleft, shright);
+	newco = vandq_u8(summed, _co2_neon);
+	cleanco = veorq_u8(c.corner, co);
+	ret.corner = vorrq_u8(cleanco, newco);
+	ret.edge = c.edge;
+	
+	return ret;
+}
+
+_static_inline cube_t
+compose_edges(cube_t c1, cube_t c2)
+{
+	cube_t ret = {0};
+	ret.edge = compose_edges_slim(c1.edge, c2.edge);
+	return ret;
+}
+
+_static_inline cube_t 
+compose_corners(cube_t c1, cube_t c2)
+{
+	cube_t ret = {0};
+	ret.corner = compose_corners_slim(c1.corner, c2.corner);
+	return ret;
+}
+
+_static_inline uint8x16_t 
+compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2)
+{
+	// Masks
+	uint8x16_t p_bits = vdupq_n_u8(_pbits);
+	uint8x16_t eo_bit = vdupq_n_u8(_eobit);
+
+	// Find the index and permutation
+	uint8x16_t p = vandq_u8(edge2, p_bits);
+	uint8x16_t piece1 = vqtbl1q_u8(edge1, p);
+
+	// Calculate the orientation through XOR
+	uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit);
+
+	// Combine the results
+	uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
+
+	// Mask to clear the last 32 bits of the result
+	uint8x16_t mask_last_32 = vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3);
+	ret = vreinterpretq_u8_u32(mask_last_32);
+
+	return ret;
+}
+
+_static_inline uint8x16_t 
+compose_corners_slim(uint8x16_t corner1, uint8x16_t corner2)
+{
+	// Masks
+	uint8x16_t p_bits = vdupq_n_u8(_pbits);
+	uint8x16_t cobits = vdupq_n_u8(_cobits);
+	uint8x16_t cobits2 = vdupq_n_u8(_cobits2);
+	uint8x16_t twist_cw = vdupq_n_u8(_ctwist_cw);
+
+	// Find the index and permutation
+	uint8x16_t p = vandq_u8(corner2, p_bits);
+	uint8x16_t piece1 = vqtbl1q_u8(corner1, p);
+
+	// Calculate the orientation
+	uint8x16_t aux = vaddq_u8(vandq_u8(corner2, cobits), vandq_u8(piece1, cobits));
+	uint8x16_t auy = vshrq_n_u8(vaddq_u8(aux, twist_cw), 2);
+	uint8x16_t orien = vandq_u8(vaddq_u8(aux, auy), cobits2);
+
+	// Combine the results
+	uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
+
+	// Mask to clear the last 64 bits of the result
+	uint8x16_t mask_last_64 = vsetq_lane_u64(0, vreinterpretq_u64_u8(ret), 1);
+	ret = vreinterpretq_u8_u64(mask_last_64);
+
+	return ret;
+}
+
+_static_inline cube_t 
+compose(cube_t c1, cube_t c2)
+{
+	cube_t ret = {0};
+
+	ret.edge = compose_edges_slim(c1.edge, c2.edge);
+	ret.corner = compose_corners_slim(c1.corner, c2.corner);
+
+	return ret;
+}
+
+_static_inline cube_t 
+inverse(cube_t cube)
+{
+	uint8_t i, piece, orien;
+	cube_t ret;
+
+	// Temp arrays to store the NEON vectors
+	uint8_t edges[16];
+	uint8_t corners[16];
+
+	// Copy the NEON vectors to the arrays
+	vst1q_u8(edges, cube.edge);
+	vst1q_u8(corners, cube.corner);
+
+	uint8_t edge_result[16] = {0};
+	uint8_t corner_result[16] = {0};
+
+	// Process the edges
+	for (i = 0; i < 12; i++)
+	{
+		piece = edges[i];
+		orien = piece & _eobit;
+		edge_result[piece & _pbits] = i | orien;
+	}
+
+	// Process the corners
+	for (i = 0; i < 8; i++)
+	{
+		piece = corners[i];
+		orien = ((piece << 1) | (piece >> 1)) & _cobits2;
+		corner_result[piece & _pbits] = i | orien;
+	}
+
+	// Copy the results back to the NEON vectors
+	ret.edge = vld1q_u8(edge_result);
+	ret.corner = vld1q_u8(corner_result);
+
+	return ret;
+}
+
+_static_inline int64_t 
+coord_co(cube_t c)
+{
+	// Temp array to store the NEON vector
+	uint8_t mem[16];
+	vst1q_u8(mem, c.corner);
+
+	int i, p;
+	int64_t ret;
+
+	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3)
+		ret += p * (mem[i] >> _coshift);
+
+	return ret;
+}
+
+_static_inline int64_t
+coord_csep(cube_t c)
+{
+	// Temp array to store the NEON vector
+	uint8_t mem[16];
+	vst1q_u8(mem, c.corner);
+
+	int64_t ret = 0;
+	int i, p;
+	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2)
+		ret += p * ((mem[i] & _csepbit) >> 2);
+
+	return ret;
+	return 0;
+}
+
+_static_inline int64_t
+coord_cocsep(cube_t c)
+{
+	return (coord_co(c) << 7) + coord_csep(c);
+}
+
+_static_inline int64_t 
+coord_eo(cube_t c)
+{
+	int64_t ret = 0;
+	int64_t p = 1;
+
+	// Temp array to store the NEON vector
+	uint8_t mem[16];
+	vst1q_u8(mem, c.edge);
+
+	for (int i = 1; i < 12; i++, p *= 2)
+	{
+		ret += p * (mem[i] >> _eoshift);
+	}
+
+	return ret;
+}
+
+_static_inline int64_t 
+coord_esep(cube_t c)
+{
+	int64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
+
+	// Temp array to store the NEON vector
+	uint8_t mem[16];
+	vst1q_u8(mem, c.edge);
+
+	for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++)
+	{
+		bit1 = (mem[i] & _esepbit1) >> 2;
+		bit2 = (mem[i] & _esepbit2) >> 3;
+		is1 = (1 - bit2) * bit1;
+
+		ret1 += bit2 * binomial[11 - i][k];
+		k -= bit2;
+
+		jj = j < 8;
+		ret2 += jj * is1 * binomial[7 - (j * jj)][l];
+		l -= is1;
+		j += (1 - bit2);
+	}
+
+	return ret1 * 70 + ret2;
+}
+
+_static_inline void
+copy_corners(cube_t *dst, cube_t src)
+{
+	dst->corner = src.corner;
+}
+
+_static_inline void
+copy_edges(cube_t *dst, cube_t src)
+{
+	dst->edge = src.edge;
+}
+
+_static_inline void
+set_eo(cube_t *cube, int64_t eo)
+{
+	// Temp array to store the NEON vector
+	uint8_t mem[16];
+	vst1q_u8(mem, cube->edge);
+	uint8_t i, sum, flip;
+
+	for (sum = 0, i = 1; i < 12; i++, eo >>= 1)
+	{
+		flip = eo % 2;
+		sum += flip;
+		mem[i] = (mem[i] & ~_eobit) | (_eobit * flip);
+	}
+	mem[0] = (mem[0] & ~_eobit) | (_eobit * (sum % 2));
+
+	// Copy the results back to the NEON vector
+	cube->edge = vld1q_u8(mem);
+	return;
+}
+
+_static_inline cube_t
+invcoord_esep(int64_t esep)
+{
+	cube_t ret;
+	int64_t bit1, bit2, i, j, jj, k, l, s, v, w, is1, set1, set2;
+	uint8_t slice[3] = {0};
+
+	ret = solved;
+	uint8_t mem[16];
+	set1 = esep % 70;
+	set2 = esep / 70;
+
+	for (i = 0, j = 0, k = 4, l = 4; i < 12; i++)
+	{
+		v = binomial[11 - i][k];
+		jj = j < 8;
+		w = jj * binomial[7 - (j * jj)][l];
+		bit2 = set2 >= v;
+		bit1 = set1 >= w;
+		is1 = (1 - bit2) * bit1;
+
+		set2 -= bit2 * v;
+		k -= bit2;
+		set1 -= is1 * w;
+		l -= is1;
+		j += (1 - bit2);
+		s = 2 * bit2 + (1 - bit2) * bit1;
+
+		mem[i] = (slice[s]++) | (uint8_t)(s << 2);
+	}
+
+	ret.edge = vld1q_u8(mem);
+	return ret;
+}
diff --git a/test/104_cocsep_ttrep/cocsep_ttrep_tests.c b/test/104_cocsep_ttrep/cocsep_ttrep_tests.c
@@ -29,7 +29,7 @@ void run(void) {
 			l = coord_cocsep(c);
 			if (k != l)
 				printf("cocsep %" PRId64 " (%" PRId64 "): "
-				    "%" PRId64 " ttrep %" PRIu8
+				    "%" PRId64 " ttrep %" PRIu32
 				    " ->  %" PRId64 "\n", i, l, j, tt, k);
 		}
 	}
diff --git a/test/test.h b/test/test.h
@@ -7,9 +7,16 @@
 
 #define STRLENMAX 10000
 
-#ifdef CUBE_AVX2
+#if defined(CUBE_AVX2)
 #include <immintrin.h>
 typedef __m256i cube_t;
+#elif defined(CUBE_NEON)
+#include <stdlib.h>
+#include <arm_neon.h>
+typedef struct {
+	uint8x16_t corner;
+	uint8x16_t edge;
+} cube_t;
 #else
 typedef struct {
 	uint8_t corner[8];
diff --git a/test/test.sh b/test/test.sh
@@ -11,21 +11,24 @@ for t in test/*; do
 	if [ -n "$re" ] && [ -z "$(echo "$t" | grep "$re")" ]; then
 		continue
 	fi
-	if [ ! -d $t ]; then continue; fi
-	$CC -o $TESTBIN $t/*.c $CUBEOBJ || exit 1;
-	for cin in $t/*.in; do
-		c=$(echo "$cin" | sed 's/\.in//')
-		cout=$c.out
-		printf "$c: "
-		$TESTBIN < "$cin" > $TESTOUT 2> $TESTERR
-		if diff $cout $TESTOUT; then
-			printf "OK\n"
-		else
-			printf "Test failed! stderr:\n"
-			cat $TESTERR
-			exit 1
-		fi
-	done
+	
+	# Verify if $t is a directory and if its name starts with three digits
+	if [ -d "$t" ] && echo "$(basename "$t")" | grep -Eq '^[0-9]{3}'; then
+		$CC -o $TESTBIN $t/*.c $CUBEOBJ || exit 1
+		for cin in $t/*.in; do
+			c=$(echo "$cin" | sed 's/\.in//')
+			cout=$c.out
+			printf "$c: "
+			$TESTBIN < "$cin" > $TESTOUT 2> $TESTERR
+			if diff $cout $TESTOUT; then
+				printf "OK\n"
+			else
+				printf "Test failed! stderr:\n"
+				cat $TESTERR
+				exit 1
+			fi
+		done
+	fi
 done
 
 echo "All tests passed!"

	h48 A prototype for an optimal Rubik's cube solver, work in progress.
	git clone https://git.tronto.net/h48
	Download \| Log \| Files \| Refs \| README \| LICENSE

M	.gitignore	\|	2	++
M	src/cube.c	\|	2	++
M	src/cube_neon.h	\|	377	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M	test/104_cocsep_ttrep/cocsep_ttrep_tests.c	\|	2	+-
M	test/test.h	\|	9	++++++++-
M	test/test.sh	\|	33	++++++++++++++++++---------------