neon.h - h48 - A prototype for an optimal Rubik's cube solver, work in progress.

neon.h (7784B)
      1 #define CO2_NEON vdup_n_u8(0x60)
      2 #define COCW_NEON vdup_n_u8(0x20)
      3 #define CP_NEON vdup_n_u8(0x07)
      4 #define EP_NEON vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F))
      5 #define EO_NEON vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10))
      6 
      7 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t);
      8 STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t);
      9 
     10 #define STATIC_CUBE( \
     11     c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
     12     e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
     13 	((cube_t){ \
     14 		.corner = { \
     15 			c_ufr, c_ubl, c_dfl, c_dbr, \
     16 			c_ufl, c_ubr, c_dfr, c_dbl \
     17 		}, \
     18 		.edge = { \
     19 			e_uf, e_ub, e_db, e_df, e_ur, e_ul, \
     20 			e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0 \
     21 		} \
     22 	})
     23 
     24 #define ZERO_CUBE \
     25 	((cube_t){ \
     26 		.corner = vdup_n_u8(0), \
     27 		.edge = vdupq_n_u8(0) \
     28 	})
     29 
     30 #define SOLVED_CUBE STATIC_CUBE( \
     31 	0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
     32 
     33 /* TODO: optimize this (use intrinsics?) */
     34 STATIC_INLINE int
     35 popcount_u32(uint32_t x)
     36 {
     37 	int ret;
     38 
     39 	for (ret = 0; x != 0; x >>= 1)
     40 		ret += x & 1;
     41 
     42 	return ret;
     43 }
     44 
     45 STATIC void
     46 pieces(cube_t cube[static 1], uint8_t c[static 8], uint8_t e[static 12])
     47 {
     48 	// First 8 bytes of the corner vector are copied from the c array
     49 	vst1_u8(c, cube->corner);
     50 
     51 	// 12 bytes of the edge vector are copied from the e array
     52 	// First 8 bytes
     53 	vst1_u8(e, vget_low_u8(cube->edge));
     54 	// Next 4 bytes
     55 	vst1_lane_u32((uint32_t *)(e + 8),
     56 	    vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0);
     57 }
     58 
     59 STATIC_INLINE bool
     60 equal(cube_t c1, cube_t c2)
     61 {
     62 	uint8x8_t cmp_corner;
     63 	uint8x16_t cmp_edge;
     64 	uint64x2_t cmp_corner_u64, cmp_edge_u64, cmp_result;
     65 
     66 	// compare the corner vectors and the edge vectors
     67 	cmp_corner = vceq_u8(c1.corner, c2.corner);
     68 	cmp_edge = vceqq_u8(c1.edge, c2.edge);
     69 
     70 	// convert the comparison vectors to 64-bit vectors and combine them
     71 	cmp_corner_u64 = vreinterpretq_u64_u8(
     72 	    vcombine_u8(cmp_corner, cmp_corner));
     73 	cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge);
     74 	cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64);
     75 
     76 	// check if all the bits are set
     77 	return vgetq_lane_u64(cmp_result, 0) == ~0ULL &&
     78 	    vgetq_lane_u64(cmp_result, 1) == ~0ULL;
     79 }
     80 
     81 STATIC_INLINE cube_t
     82 invertco(cube_t c)
     83 {
     84 	cube_t ret;
     85 	uint8x8_t co, shleft, shright, summed, newco, cleanco;
     86 
     87 	co = vand_u8(c.corner, CO2_NEON);
     88 	shleft = vshl_n_u8(co, 1);
     89 	shright = vshr_n_u8(co, 1);
     90 	summed = vorr_u8(shleft, shright);
     91 	newco = vand_u8(summed, CO2_NEON);
     92 	cleanco = veor_u8(c.corner, co);
     93 	ret.corner = vorr_u8(cleanco, newco);
     94 	ret.edge = c.edge;
     95 	
     96 	return ret;
     97 }
     98 
     99 STATIC_INLINE cube_t
    100 compose_edges(cube_t c1, cube_t c2)
    101 {
    102 	cube_t ret = {0};
    103 	ret.edge = compose_edges_slim(c1.edge, c2.edge);
    104 	return ret;
    105 }
    106 
    107 STATIC_INLINE cube_t
    108 compose_corners(cube_t c1, cube_t c2)
    109 {
    110 	cube_t ret = {0};
    111 	ret.corner = compose_corners_slim(c1.corner, c2.corner);
    112 	return ret;
    113 }
    114 
    115 STATIC_INLINE uint8x16_t
    116 compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2)
    117 {
    118 	// Masks
    119 	uint8x16_t p_bits = vdupq_n_u8(PBITS);
    120 	uint8x16_t eo_bit = vdupq_n_u8(EOBIT);
    121 
    122 	// Find the index and permutation
    123 	uint8x16_t p = vandq_u8(edge2, p_bits);
    124 	uint8x16_t piece1 = vqtbl1q_u8(edge1, p);
    125 
    126 	// Calculate the orientation through XOR
    127 	uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit);
    128 
    129 	// Combine the results
    130 	uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
    131 
    132 	// Mask to clear the last 32 bits of the result
    133 	uint32x4_t mask_last_32 =
    134 	    vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3);
    135 	ret = vreinterpretq_u8_u32(mask_last_32);
    136 
    137 	return ret;
    138 }
    139 
    140 STATIC_INLINE uint8x8_t
    141 compose_corners_slim(uint8x8_t corner1, uint8x8_t corner2)
    142 {
    143 	// Masks
    144 	uint8x8_t p_bits = vdup_n_u8(PBITS);
    145 	uint8x8_t cobits = vdup_n_u8(COBITS);
    146 	uint8x8_t cobits2 = vdup_n_u8(COBITS_2);
    147 	uint8x8_t twist_cw = vdup_n_u8(CTWIST_CW);
    148 
    149 	// Find the index and permutation
    150 	uint8x8_t p = vand_u8(corner2, p_bits);
    151 	uint8x8_t piece1 = vtbl1_u8(corner1, p);
    152 
    153 	// Calculate the orientation
    154 	uint8x8_t aux =
    155 	    vadd_u8(vand_u8(corner2, cobits), vand_u8(piece1, cobits));
    156 	uint8x8_t auy = vshr_n_u8(vadd_u8(aux, twist_cw), 2);
    157 	uint8x8_t orien = vand_u8(vadd_u8(aux, auy), cobits2);
    158 
    159 	uint8x8_t ret = vorr_u8(vand_u8(piece1, p_bits), orien);
    160 
    161 	return ret;
    162 }
    163 
    164 STATIC_INLINE cube_t
    165 compose(cube_t c1, cube_t c2)
    166 {
    167 	cube_t ret = {0};
    168 
    169 	ret.edge = compose_edges_slim(c1.edge, c2.edge);
    170 	ret.corner = compose_corners_slim(c1.corner, c2.corner);
    171 
    172 	return ret;
    173 }
    174 
    175 STATIC_INLINE cube_t
    176 inverse(cube_t cube)
    177 {
    178 	uint8_t i, piece, orien;
    179 	cube_t ret;
    180 
    181 	// Temp arrays to store the NEON vectors
    182 	uint8_t edges[16];
    183 	uint8_t corners[8];
    184 
    185 	// Copy the NEON vectors to the arrays
    186 	vst1q_u8(edges, cube.edge);
    187 	vst1_u8(corners, cube.corner);
    188 
    189 	uint8_t edge_result[16] = {0};
    190 	uint8_t corner_result[8] = {0};
    191 
    192 	// Process the edges
    193 	for (i = 0; i < 12; i++)
    194 	{
    195 		piece = edges[i];
    196 		orien = piece & EOBIT;
    197 		edge_result[piece & PBITS] = i | orien;
    198 	}
    199 
    200 	// Process the corners
    201 	for (i = 0; i < 8; i++)
    202 	{
    203 		piece = corners[i];
    204 		orien = ((piece << 1) | (piece >> 1)) & COBITS_2;
    205 		corner_result[piece & PBITS] = i | orien;
    206 	}
    207 
    208 	// Copy the results back to the NEON vectors
    209 	ret.edge = vld1q_u8(edge_result);
    210 	ret.corner = vld1_u8(corner_result);
    211 
    212 	return ret;
    213 }
    214 
    215 STATIC_INLINE int64_t
    216 coord_co(cube_t c)
    217 {
    218 	// Temp array to store the NEON vector
    219 	uint8_t mem[8];
    220 	vst1_u8(mem, c.corner);
    221 
    222 	int i, p;
    223 	int64_t ret;
    224 
    225 	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3)
    226 		ret += p * (mem[i] >> COSHIFT);
    227 
    228 	return ret;
    229 }
    230 
    231 STATIC_INLINE cube_t
    232 invcoord_co(int64_t coord)
    233 {
    234 	int64_t co, c, i, p;
    235 	uint8_t mem[8];
    236 	cube_t cube;
    237 
    238 	for (i = 0, p = 0, c = coord; i < 8; i++, c /= 3) {
    239 		co = i == 7 ? ((3 - (p % 3)) % 3) : (c % 3);
    240 		p += co;
    241 		mem[i] = i + (co << COSHIFT);
    242 	}
    243 
    244 	cube.corner = vld1_u8(mem);
    245 	cube.edge = SOLVED_CUBE.edge;
    246 
    247 	return cube;
    248 }
    249 
    250 STATIC_INLINE int64_t
    251 coord_csep(cube_t c)
    252 {
    253 	// Temp array to store the NEON vector
    254 	uint8_t mem[8];
    255 	vst1_u8(mem, c.corner);
    256 
    257 	int64_t ret = 0;
    258 	int i, p;
    259 	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2)
    260 		ret += p * ((mem[i] & CSEPBIT) >> 2);
    261 
    262 	return ret;
    263 	return 0;
    264 }
    265 
    266 STATIC_INLINE int64_t
    267 coord_cocsep(cube_t c)
    268 {
    269 	return (coord_co(c) << 7) + coord_csep(c);
    270 }
    271 
    272 STATIC_INLINE int64_t
    273 coord_eo(cube_t c)
    274 {
    275 	int64_t ret = 0;
    276 	int64_t p = 1;
    277 
    278 	// Temp array to store the NEON vector
    279 	uint8_t mem[16];
    280 	vst1q_u8(mem, c.edge);
    281 
    282 	for (int i = 1; i < 12; i++, p *= 2)
    283 	{
    284 		ret += p * (mem[i] >> EOSHIFT);
    285 	}
    286 
    287 	return ret;
    288 }
    289 
    290 STATIC_INLINE int64_t
    291 coord_esep(cube_t c)
    292 {
    293 	int64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
    294 
    295 	// Temp array to store the NEON vector
    296 	uint8_t mem[16];
    297 	vst1q_u8(mem, c.edge);
    298 
    299 	for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++)
    300 	{
    301 		bit1 = (mem[i] & ESEPBIT_1) >> 2;
    302 		bit2 = (mem[i] & ESEPBIT_2) >> 3;
    303 		is1 = (1 - bit2) * bit1;
    304 
    305 		ret1 += bit2 * binomial[11 - i][k];
    306 		k -= bit2;
    307 
    308 		jj = j < 8;
    309 		ret2 += jj * is1 * binomial[7 - (j * jj)][l];
    310 		l -= is1;
    311 		j += (1 - bit2);
    312 	}
    313 
    314 	return ret1 * 70 + ret2;
    315 }
    316 
    317 STATIC_INLINE void
    318 copy_corners(cube_t dst[static 1], cube_t src)
    319 {
    320 	dst->corner = src.corner;
    321 }
    322 
    323 STATIC_INLINE void
    324 copy_edges(cube_t dst[static 1], cube_t src)
    325 {
    326 	dst->edge = src.edge;
    327 }
    328 
    329 STATIC_INLINE void
    330 set_eo(cube_t cube[static 1], int64_t eo)
    331 {
    332 	// Temp array to store the NEON vector
    333 	uint8_t mem[16];
    334 	vst1q_u8(mem, cube->edge);
    335 	uint8_t i, sum, flip;
    336 
    337 	for (sum = 0, i = 1; i < 12; i++, eo >>= 1)
    338 	{
    339 		flip = eo % 2;
    340 		sum += flip;
    341 		mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip);
    342 	}
    343 	mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2));
    344 
    345 	// Copy the results back to the NEON vector
    346 	cube->edge = vld1q_u8(mem);
    347 	return;
    348 }
    349 
    350 STATIC_INLINE cube_t
    351 invcoord_esep(int64_t esep)
    352 {
    353 	cube_t ret;
    354 	uint8_t mem[16] = {0};
    355 
    356 	invcoord_esep_array(esep % 70, esep / 70, mem);
    357 
    358 	ret = SOLVED_CUBE;
    359 	ret.edge = vld1q_u8(mem);
    360 
    361 	return ret;
    362 }
	h48 A prototype for an optimal Rubik's cube solver, work in progress.
	git clone https://git.tronto.net/h48
	Download \| Log \| Files \| Refs \| README \| LICENSE