h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48

commit 9198aede8761d6573f2ec3739b48e86f83947b56
parent 3c12def93ed667548bcd9c534c31ec31bfefb8e0
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Tue, 15 Oct 2024 20:01:46 +0200

Merge pull request #4 from enricotenuti/master

Neon uint8x8_t corners and minor fixes
Diffstat:
M src/arch/arch.h         |  2 +-
M src/arch/neon.h         | 95 +++++++++++++++++++++++++++++++++++++------------------------------------------
M src/solvers/h48/solve.h |  4 ++--
M src/utils/constants.h   |  8 ++++----
4 files changed, 51 insertions(+), 58 deletions(-)
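
The gist of the change, shown here as a stand-alone sketch rather than the repository's code: a cube has only 8 corners, so the corner state fits in a 64-bit uint8x8_t NEON register instead of a half-empty 128-bit uint8x16_t, and the narrower intrinsics (vdup_n_u8, vand_u8, vtbl1_u8, ...) replace their "q" counterparts. The names cube_example_t, compose_corner_perm_example and PBITS_EXAMPLE below are illustrative only, and the snippet assumes an AArch64/NEON target.

/* Minimal sketch, not part of the commit: corner state in a uint8x8_t. */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

#define PBITS_EXAMPLE 0x07	/* low 3 bits select one of the 8 corners */

typedef struct {
	uint8x8_t corner;	/* was uint8x16_t before this commit */
	uint8x16_t edge;	/* still 16 bytes: 12 edges + 4 bytes of padding */
} cube_example_t;

/* Permute the 8 corner bytes of corner1 by the permutation in corner2. */
static uint8x8_t
compose_corner_perm_example(uint8x8_t corner1, uint8x8_t corner2)
{
	uint8x8_t idx = vand_u8(corner2, vdup_n_u8(PBITS_EXAMPLE));
	return vtbl1_u8(corner1, idx);	/* 8-byte table lookup */
}

int main(void)
{
	uint8_t id[8] = {0, 1, 2, 3, 4, 5, 6, 7};
	uint8_t perm[8] = {3, 0, 1, 2, 4, 5, 6, 7};
	uint8_t out[8];

	vst1_u8(out, compose_corner_perm_example(vld1_u8(id), vld1_u8(perm)));
	for (int i = 0; i < 8; i++)
		printf("%d ", out[i]);
	printf("\n");	/* prints: 3 0 1 2 4 5 6 7 */
	return 0;
}

The same lookup previously needed vqtbl1q_u8 on a 16-byte vector whose upper half was always zero, which is what the STATIC_CUBE and clearing code in the diff below removes.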

diff --git a/src/arch/arch.h b/src/arch/arch.h
@@ -15,7 +15,7 @@ typedef __m256i cube_t;
 #include <arm_neon.h>
 typedef struct {
-	uint8x16_t corner;
+	uint8x8_t corner;
 	uint8x16_t edge;
 } cube_t;
diff --git a/src/arch/neon.h b/src/arch/neon.h
@@ -1,24 +1,24 @@
-#define CO2_NEON vdupq_n_u8(0x60)
-#define COCW_NEON vdupq_n_u8(0x20)
-#define CP_NEON vdupq_n_u8(0x07)
+#define CO2_NEON vdup_n_u8(0x60)
+#define COCW_NEON vdup_n_u8(0x20)
+#define CP_NEON vdup_n_u8(0x07)
 #define EP_NEON vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F))
 #define EO_NEON vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10))

 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t);
-STATIC_INLINE uint8x16_t compose_corners_slim(uint8x16_t, uint8x16_t);
+STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t);

 // static cube
 #define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
     e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
     ((cube_t){ \
-	.corner = {c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, 0, 0, 0, 0, 0, 0, 0, 0}, \
+	.corner = {c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl}, \
 	.edge = {e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0}})

 // zero cube
 #define ZERO_CUBE \
 	(cube_t) \
 	{ \
-		.corner = vdupq_n_u8(0), \
+		.corner = vdup_n_u8(0), \
 		.edge = vdupq_n_u8(0) \
 	}
@@ -30,7 +30,7 @@ STATIC void
 pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
 {
 	// First 8 bytes of the corner vector are copied from the c array
-	vst1_u8(c, vget_low_u8(cube->corner));
+	vst1_u8(c, cube->corner);

 	// 12 bytes of the edge vector are copied from the e array
 	// First 8 bytes
@@ -42,20 +42,17 @@ pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
 STATIC_INLINE bool
 equal(cube_t c1, cube_t c2)
 {
-	uint8x16_t cmp_corner, cmp_edge;
-	uint64x2_t cmp_corner_u64, cmp_edge_u64;
-	uint64x2_t cmp_result;
+	uint8x8_t cmp_corner;
+	uint8x16_t cmp_edge;
+	uint64x2_t cmp_corner_u64, cmp_edge_u64, cmp_result;

-	// compare the corner vectors
-	cmp_corner = vceqq_u8(c1.corner, c2.corner);
-	// compare the edge vectors
+	// compare the corner vectors and the edge vectors
+	cmp_corner = vceq_u8(c1.corner, c2.corner);
 	cmp_edge = vceqq_u8(c1.edge, c2.edge);

-	// convert the comparison vectors to 64-bit vectors
-	cmp_corner_u64 = vreinterpretq_u64_u8(cmp_corner);
+	// convert the comparison vectors to 64-bit vectors and combine them
+	cmp_corner_u64 = vreinterpretq_u64_u8(vcombine_u64(cmp_corner, cmp_corner));
 	cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge);
-
-	// combine the comparison vectors
 	cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64);

 	// check if all the bits are set
@@ -66,15 +63,15 @@ STATIC_INLINE cube_t
 invertco(cube_t c)
 {
 	cube_t ret;
-	uint8x16_t co, shleft, shright, summed, newco, cleanco;
-
-	co = vandq_u8(c.corner, CO2_NEON);
-	shleft = vshlq_n_u8(co, 1);
-	shright = vshrq_n_u8(co, 1);
-	summed = vorrq_u8(shleft, shright);
-	newco = vandq_u8(summed, CO2_NEON);
-	cleanco = veorq_u8(c.corner, co);
-	ret.corner = vorrq_u8(cleanco, newco);
+	uint8x8_t co, shleft, shright, summed, newco, cleanco;
+
+	co = vand_u8(c.corner, CO2_NEON);
+	shleft = vshl_n_u8(co, 1);
+	shright = vshr_n_u8(co, 1);
+	summed = vorr_u8(shleft, shright);
+	newco = vand_u8(summed, CO2_NEON);
+	cleanco = veor_u8(c.corner, co);
+	ret.corner = vorr_u8(cleanco, newco);
 	ret.edge = c.edge;

 	return ret;
@@ -120,30 +117,25 @@ compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2)
 	return ret;
 }

-STATIC_INLINE uint8x16_t
-compose_corners_slim(uint8x16_t corner1, uint8x16_t corner2)
+STATIC_INLINE uint8x8_t
+compose_corners_slim(uint8x8_t corner1, uint8x8_t corner2)
 {
 	// Masks
-	uint8x16_t p_bits = vdupq_n_u8(PBITS);
-	uint8x16_t cobits = vdupq_n_u8(COBITS);
-	uint8x16_t cobits2 = vdupq_n_u8(COBITS_2);
-	uint8x16_t twist_cw = vdupq_n_u8(CTWIST_CW);
+	uint8x8_t p_bits = vdup_n_u8(PBITS);
+	uint8x8_t cobits = vdup_n_u8(COBITS);
+	uint8x8_t cobits2 = vdup_n_u8(COBITS_2);
+	uint8x8_t twist_cw = vdup_n_u8(CTWIST_CW);

 	// Find the index and permutation
-	uint8x16_t p = vandq_u8(corner2, p_bits);
-	uint8x16_t piece1 = vqtbl1q_u8(corner1, p);
+	uint8x8_t p = vand_u8(corner2, p_bits);
+	uint8x8_t piece1 = vtbl1_u8(corner1, p);

 	// Calculate the orientation
-	uint8x16_t aux = vaddq_u8(vandq_u8(corner2, cobits), vandq_u8(piece1, cobits));
-	uint8x16_t auy = vshrq_n_u8(vaddq_u8(aux, twist_cw), 2);
-	uint8x16_t orien = vandq_u8(vaddq_u8(aux, auy), cobits2);
-
-	// Combine the results
-	uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
+	uint8x8_t aux = vadd_u8(vand_u8(corner2, cobits), vand_u8(piece1, cobits));
+	uint8x8_t auy = vshr_n_u8(vadd_u8(aux, twist_cw), 2);
+	uint8x8_t orien = vand_u8(vadd_u8(aux, auy), cobits2);

-	// Mask to clear the last 64 bits of the result
-	uint8x16_t mask_last_64 = vsetq_lane_u64(0, vreinterpretq_u64_u8(ret), 1);
-	ret = vreinterpretq_u8_u64(mask_last_64);
+	uint8x8_t ret = vorr_u8(vand_u8(piece1, p_bits), orien);

 	return ret;
 }
@@ -167,14 +159,14 @@ inverse(cube_t cube)

 	// Temp arrays to store the NEON vectors
 	uint8_t edges[16];
-	uint8_t corners[16];
+	uint8_t corners[8];

 	// Copy the NEON vectors to the arrays
 	vst1q_u8(edges, cube.edge);
-	vst1q_u8(corners, cube.corner);
+	vst1_u8(corners, cube.corner);

 	uint8_t edge_result[16] = {0};
-	uint8_t corner_result[16] = {0};
+	uint8_t corner_result[8] = {0};

 	// Process the edges
 	for (i = 0; i < 12; i++)
@@ -194,7 +186,7 @@ inverse(cube_t cube)

 	// Copy the results back to the NEON vectors
 	ret.edge = vld1q_u8(edge_result);
-	ret.corner = vld1q_u8(corner_result);
+	ret.corner = vld1_u8(corner_result);

 	return ret;
 }
@@ -203,8 +195,8 @@ STATIC_INLINE int64_t
 coord_co(cube_t c)
 {
 	// Temp array to store the NEON vector
-	uint8_t mem[16];
-	vst1q_u8(mem, c.corner);
+	uint8_t mem[8];
+	vst1_u8(mem, c.corner);

 	int i, p;
 	int64_t ret;
@@ -219,8 +211,8 @@ STATIC_INLINE int64_t
 coord_csep(cube_t c)
 {
 	// Temp array to store the NEON vector
-	uint8_t mem[16];
-	vst1q_u8(mem, c.corner);
+	uint8_t mem[8];
+	vst1_u8(mem, c.corner);

 	int64_t ret = 0;
 	int i, p;
@@ -228,6 +220,7 @@ coord_csep(cube_t c)
 		ret += p * ((mem[i] & CSEPBIT) >> 2);

 	return ret;
+	return 0;
 }

 STATIC_INLINE int64_t
diff --git a/src/solvers/h48/solve.h b/src/solvers/h48/solve.h
@@ -19,7 +19,7 @@ typedef struct {
 	uint8_t premoves[MAXLEN];
 } dfsarg_solveh48_t;

-STATIC uint32_t allowednextmove_h48(uint8_t *, uint8_t, uint32_t);
+STATIC uint32_t allowednextmove_h48(uint8_t *, uint8_t, uint8_t);
 STATIC void solve_h48_appendsolution(dfsarg_solveh48_t *);
 STATIC_INLINE bool solve_h48_stop(dfsarg_solveh48_t *);
@@ -28,7 +28,7 @@ STATIC int64_t solve_h48(cube_t, int8_t, int8_t, int8_t, uint64_t, const void *, uint64_t, char *);

 STATIC uint32_t
-allowednextmove_h48(uint8_t *moves, uint8_t n, uint32_t h48branch)
+allowednextmove_h48(uint8_t *moves, uint8_t n, uint8_t h48branch)
 {
 	uint32_t result = MM_ALLMOVES;

 	if (h48branch & MM_NORMALBRANCH)
diff --git a/src/utils/constants.h b/src/utils/constants.h
@@ -93,10 +93,10 @@ STATIC int64_t binomial[12][12] = {
 #define TRANS_BDm UINT8_C(46)
 #define TRANS_BLm UINT8_C(47)

-#define MM_NORMAL UINT32_C(0x00)
-#define MM_INVERSE UINT32_C(0x01)
-#define MM_INVERSEBRANCH UINT32_C(0x03)
-#define MM_NORMALBRANCH UINT32_C(0x02)
+#define MM_NORMAL UINT8_C(0x00)
+#define MM_INVERSE UINT8_C(0x01)
+#define MM_INVERSEBRANCH UINT8_C(0x03)
+#define MM_NORMALBRANCH UINT8_C(0x02)

 #define MM_ALLMOVES UINT32_C(0x3FFFF)
 #define MM_NOHALFTURNS UINT32_C(0x2DB6D)
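
For the equality change above, here is a stand-alone sketch of one way to combine the narrower 8-byte corner comparison with the 16-byte edge comparison, assuming an AArch64 target. The names cube_example_t and cube_eq_example are illustrative only; the commit itself widens the corner result with vcombine and reinterprets both vectors as uint64x2_t before AND-ing them, rather than using vminvq_u8.

/* Illustrative sketch, not the repository's code. */
#include <arm_neon.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct {
	uint8x8_t corner;	/* 8 corners, one byte each */
	uint8x16_t edge;	/* 12 edges + 4 bytes of padding */
} cube_example_t;

bool
cube_eq_example(cube_example_t c1, cube_example_t c2)
{
	/* Per-byte compares: 0xFF where the lanes are equal, 0x00 otherwise */
	uint8x8_t cmp_corner = vceq_u8(c1.corner, c2.corner);
	uint8x16_t cmp_edge = vceqq_u8(c1.edge, c2.edge);

	/* Widen the 8-byte corner result to 16 bytes and AND with the edges */
	uint8x16_t combined = vandq_u8(vcombine_u8(cmp_corner, cmp_corner), cmp_edge);

	/* The cubes are equal iff every lane is still 0xFF */
	return vminvq_u8(combined) == 0xFF;
}

Either way, the point is the same: once the corners live in a uint8x8_t, their comparison result has to be widened back to 16 bytes before it can be combined with the edge comparison.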