nissy-core

The "engine" of nissy, including the H48 optimal solver.
git clone https://git.tronto.net/nissy-core
Download | Log | Files | Refs | README | LICENSE

neon.h (11485B)


      1 #define CO2_NEON    vdup_n_u8(0x60)
      2 #define COCW_NEON   vdup_n_u8(0x20)
      3 #define EO_NEON     vdupq_n_u8(0x10)
      4 #define PBITS8_NEON vdup_n_u8(PBITS)
      5 
      6 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t);
      7 STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t);
      8 
      9 #define STATIC_CUBE( \
     10     c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
     11     e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
     12 	((cube_t){ \
     13 		.corner = { \
     14 			c_ufr, c_ubl, c_dfl, c_dbr, \
     15 			c_ufl, c_ubr, c_dfr, c_dbl \
     16 		}, \
     17 		.edge = { \
     18 			e_uf, e_ub, e_db, e_df, e_ur, e_ul, \
     19 			e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0 \
     20 		} \
     21 	})
     22 #define ZERO_CUBE \
     23 	((cube_t){ \
     24 		.corner = vdup_n_u8(0), \
     25 		.edge = vdupq_n_u8(0) \
     26 	})
     27 #define SOLVED_CUBE STATIC_CUBE( \
     28 	0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
     29 
     30 const uint8_t SOLVED_L[8] = {0, 1, 2, 3, 4, 5, 6, 7};
     31 const uint8_t SOLVED_H[8] = {8, 9, 10, 11, 0, 0, 0};
     32 
     33 STATIC_INLINE uint64_t permtoindex_Nx8(uint64_t, uint8x8_t);
     34 STATIC_INLINE uint8x8_t indextoperm_8x8(uint64_t);
     35 STATIC_INLINE uint8x8_t indextoperm_4x8(uint64_t);
     36 
     37 STATIC_INLINE int
     38 popcount_u32(uint32_t x)
     39 {
     40 	/* Same as the portable version */
     41 	x -= (x >> UINT32_C(1)) & UINT32_C(0x55555555);
     42 	x = (x & UINT32_C(0x33333333)) +
     43 	    ((x >> UINT32_C(2)) & UINT32_C(0x33333333));
     44 	x = (x + (x >> UINT32_C(4))) & UINT32_C(0x0F0F0F0F);
     45 	x = (x * UINT32_C(0x01010101)) >> UINT32_C(24);
     46 
     47 	return (int)x;
     48 }
     49 
     50 STATIC void
     51 pieces(cube_t cube[static 1], uint8_t c[static 8], uint8_t e[static 12])
     52 {
     53 	// First 8 bytes of the corner vector are copied from the c array
     54 	vst1_u8(c, cube->corner);
     55 
     56 	// 12 bytes of the edge vector are copied from the e array
     57 	// First 8 bytes
     58 	vst1_u8(e, vget_low_u8(cube->edge));
     59 	// Next 4 bytes
     60 	vst1_lane_u32((uint32_t *)(e + 8),
     61 	    vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0);
     62 }
     63 
     64 STATIC_INLINE bool
     65 equal(cube_t c1, cube_t c2)
     66 {
     67 	uint8x8_t cmp_corner;
     68 	uint8x16_t cmp_edge;
     69 	uint64x2_t cmp_corner_u64, cmp_edge_u64, cmp_result;
     70 
     71 	// compare the corner vectors and the edge vectors
     72 	cmp_corner = vceq_u8(c1.corner, c2.corner);
     73 	cmp_edge = vceqq_u8(c1.edge, c2.edge);
     74 
     75 	// convert the comparison vectors to 64-bit vectors and combine them
     76 	cmp_corner_u64 = vreinterpretq_u64_u8(
     77 	    vcombine_u8(cmp_corner, cmp_corner));
     78 	cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge);
     79 	cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64);
     80 
     81 	// check if all the bits are set
     82 	return vgetq_lane_u64(cmp_result, 0) == ~0ULL &&
     83 	    vgetq_lane_u64(cmp_result, 1) == ~0ULL;
     84 }
     85 
     86 STATIC_INLINE cube_t
     87 invertco(cube_t c)
     88 {
     89 	cube_t ret;
     90 	uint8x8_t co, shleft, shright, summed, newco, cleanco;
     91 
     92 	co = vand_u8(c.corner, CO2_NEON);
     93 	shleft = vshl_n_u8(co, 1);
     94 	shright = vshr_n_u8(co, 1);
     95 	summed = vorr_u8(shleft, shright);
     96 	newco = vand_u8(summed, CO2_NEON);
     97 	cleanco = veor_u8(c.corner, co);
     98 	ret.corner = vorr_u8(cleanco, newco);
     99 	ret.edge = c.edge;
    100 	
    101 	return ret;
    102 }
    103 
    104 STATIC_INLINE cube_t
    105 compose_edges(cube_t c1, cube_t c2)
    106 {
    107 	cube_t ret = {0};
    108 	ret.edge = compose_edges_slim(c1.edge, c2.edge);
    109 	return ret;
    110 }
    111 
    112 STATIC_INLINE cube_t
    113 compose_corners(cube_t c1, cube_t c2)
    114 {
    115 	cube_t ret = {0};
    116 	ret.corner = compose_corners_slim(c1.corner, c2.corner);
    117 	return ret;
    118 }
    119 
    120 STATIC_INLINE uint8x16_t
    121 compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2)
    122 {
    123 	// Masks
    124 	uint8x16_t p_bits = vdupq_n_u8(PBITS);
    125 	uint8x16_t eo_bit = vdupq_n_u8(EOBIT);
    126 
    127 	// Find the index and permutation
    128 	uint8x16_t p = vandq_u8(edge2, p_bits);
    129 	uint8x16_t piece1 = vqtbl1q_u8(edge1, p);
    130 
    131 	// Calculate the orientation through XOR
    132 	uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit);
    133 
    134 	// Combine the results
    135 	uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
    136 
    137 	// Mask to clear the last 32 bits of the result
    138 	uint32x4_t mask_last_32 =
    139 	    vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3);
    140 	ret = vreinterpretq_u8_u32(mask_last_32);
    141 
    142 	return ret;
    143 }
    144 
    145 STATIC_INLINE uint8x8_t
    146 compose_corners_slim(uint8x8_t corner1, uint8x8_t corner2)
    147 {
    148 	// Masks
    149 	uint8x8_t p_bits = vdup_n_u8(PBITS);
    150 	uint8x8_t cobits = vdup_n_u8(COBITS);
    151 	uint8x8_t cobits2 = vdup_n_u8(COBITS_2);
    152 	uint8x8_t twist_cw = vdup_n_u8(CTWIST_CW);
    153 
    154 	// Find the index and permutation
    155 	uint8x8_t p = vand_u8(corner2, p_bits);
    156 	uint8x8_t piece1 = vtbl1_u8(corner1, p);
    157 
    158 	// Calculate the orientation
    159 	uint8x8_t aux =
    160 	    vadd_u8(vand_u8(corner2, cobits), vand_u8(piece1, cobits));
    161 	uint8x8_t auy = vshr_n_u8(vadd_u8(aux, twist_cw), 2);
    162 	uint8x8_t orien = vand_u8(vadd_u8(aux, auy), cobits2);
    163 
    164 	uint8x8_t ret = vorr_u8(vand_u8(piece1, p_bits), orien);
    165 
    166 	return ret;
    167 }
    168 
    169 STATIC_INLINE cube_t
    170 compose(cube_t c1, cube_t c2)
    171 {
    172 	cube_t ret = {0};
    173 
    174 	ret.edge = compose_edges_slim(c1.edge, c2.edge);
    175 	ret.corner = compose_corners_slim(c1.corner, c2.corner);
    176 
    177 	return ret;
    178 }
    179 
    180 STATIC_INLINE cube_t
    181 inverse(cube_t cube)
    182 {
    183 	uint8_t i, piece, orien;
    184 	cube_t ret;
    185 
    186 	// Temp arrays to store the NEON vectors
    187 	uint8_t edges[16];
    188 	uint8_t corners[8];
    189 
    190 	// Copy the NEON vectors to the arrays
    191 	vst1q_u8(edges, cube.edge);
    192 	vst1_u8(corners, cube.corner);
    193 
    194 	uint8_t edge_result[16] = {0};
    195 	uint8_t corner_result[8] = {0};
    196 
    197 	// Process the edges
    198 	for (i = 0; i < 12; i++)
    199 	{
    200 		piece = edges[i];
    201 		orien = piece & EOBIT;
    202 		edge_result[piece & PBITS] = i | orien;
    203 	}
    204 
    205 	// Process the corners
    206 	for (i = 0; i < 8; i++)
    207 	{
    208 		piece = corners[i];
    209 		orien = ((piece << 1) | (piece >> 1)) & COBITS_2;
    210 		corner_result[piece & PBITS] = i | orien;
    211 	}
    212 
    213 	// Copy the results back to the NEON vectors
    214 	ret.edge = vld1q_u8(edge_result);
    215 	ret.corner = vld1_u8(corner_result);
    216 
    217 	return ret;
    218 }
    219 
    220 STATIC_INLINE uint64_t
    221 coord_co(cube_t c)
    222 {
    223 	uint64_t i, p, ret;
    224 
    225 	// Temp array to store the NEON vector
    226 	uint8_t mem[8];
    227 	vst1_u8(mem, c.corner);
    228 
    229 	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3)
    230 		ret += p * (mem[i] >> COSHIFT);
    231 
    232 	return ret;
    233 }
    234 
    235 STATIC_INLINE cube_t
    236 invcoord_co(uint64_t coord)
    237 {
    238 	uint64_t co, c, i, p;
    239 	uint8_t mem[8];
    240 	cube_t cube;
    241 
    242 	for (i = 0, p = 0, c = coord; i < 8; i++, c /= 3) {
    243 		co = i == 7 ? ((3 - (p % 3)) % 3) : (c % 3);
    244 		p += co;
    245 		mem[i] = i + (co << COSHIFT);
    246 	}
    247 
    248 	cube.corner = vld1_u8(mem);
    249 	cube.edge = SOLVED_CUBE.edge;
    250 
    251 	return cube;
    252 }
    253 
    254 STATIC_INLINE uint64_t
    255 coord_csep(cube_t c)
    256 {
    257 	uint64_t ret, i, p;
    258 
    259 	// Temp array to store the NEON vector
    260 	uint8_t mem[8];
    261 	vst1_u8(mem, c.corner);
    262 
    263 	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2)
    264 		ret += p * ((mem[i] & CSEPBIT) >> 2);
    265 
    266 	return ret;
    267 	return 0;
    268 }
    269 
    270 STATIC_INLINE uint64_t
    271 coord_cocsep(cube_t c)
    272 {
    273 	return (coord_co(c) << UINT64_C(7)) + coord_csep(c);
    274 }
    275 
    276 STATIC_INLINE uint64_t
    277 coord_eo(cube_t c)
    278 {
    279 	uint64_t ret, p;
    280 	int i;
    281 
    282 	// Temp array to store the NEON vector
    283 	uint8_t mem[16];
    284 	vst1q_u8(mem, c.edge);
    285 
    286 	for (i = 1, ret = 0, p = 1; i < 12; i++, p *= 2)
    287 	{
    288 		ret += p * (mem[i] >> EOSHIFT);
    289 	}
    290 
    291 	return ret;
    292 }
    293 
    294 STATIC_INLINE uint64_t
    295 coord_esep(cube_t c)
    296 {
    297 	uint64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
    298 
    299 	// Temp array to store the NEON vector
    300 	uint8_t mem[16];
    301 	vst1q_u8(mem, c.edge);
    302 
    303 	for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++)
    304 	{
    305 		bit1 = (mem[i] & ESEPBIT_1) >> 2;
    306 		bit2 = (mem[i] & ESEPBIT_2) >> 3;
    307 		is1 = (1 - bit2) * bit1;
    308 
    309 		ret1 += bit2 * binomial[11 - i][k];
    310 		k -= bit2;
    311 
    312 		jj = j < 8;
    313 		ret2 += jj * is1 * binomial[7 - (j * jj)][l];
    314 		l -= is1;
    315 		j += (1 - bit2);
    316 	}
    317 
    318 	return ret1 * 70 + ret2;
    319 }
    320 
    321 STATIC_INLINE void
    322 copy_corners(cube_t dst[static 1], cube_t src)
    323 {
    324 	dst->corner = src.corner;
    325 }
    326 
    327 STATIC_INLINE void
    328 copy_edges(cube_t dst[static 1], cube_t src)
    329 {
    330 	dst->edge = src.edge;
    331 }
    332 
    333 STATIC_INLINE void
    334 set_eo(cube_t cube[static 1], uint64_t eo)
    335 {
    336 	// Temp array to store the NEON vector
    337 	uint8_t mem[16];
    338 	vst1q_u8(mem, cube->edge);
    339 	uint8_t i, sum, flip;
    340 
    341 	for (sum = 0, i = 1; i < 12; i++, eo >>= 1)
    342 	{
    343 		flip = eo % 2;
    344 		sum += flip;
    345 		mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip);
    346 	}
    347 	mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2));
    348 
    349 	// Copy the results back to the NEON vector
    350 	cube->edge = vld1q_u8(mem);
    351 	return;
    352 }
    353 
    354 STATIC_INLINE cube_t
    355 invcoord_esep(uint64_t esep)
    356 {
    357 	cube_t ret;
    358 	uint8_t mem[16] = {0};
    359 
    360 	invcoord_esep_array(esep % UINT64_C(70), esep / UINT64_C(70), mem);
    361 
    362 	ret = SOLVED_CUBE;
    363 	ret.edge = vld1q_u8(mem);
    364 
    365 	return ret;
    366 }
    367 
    368 STATIC_INLINE uint64_t
    369 permtoindex_Nx8(uint64_t n, uint8x8_t a)
    370 {
    371 	uint64_t i, c, ret;
    372 	uint8x8_t cmp;
    373 	uint64x1_t anum;
    374 	uint8_t or[8] = {0, 0, 0, 0, 0, 0, 0, 0x0F};
    375 
    376 	for (i = 0, ret = 0; i < n; i++) {
    377 		cmp = vdup_lane_u8(a, 0);
    378 		anum = vreinterpret_u64_u8(a);
    379 		anum = vshr_n_u64(anum, 8);
    380 		a = vreinterpret_u8_u64(anum);
    381 		a = vorr_u8(a, vld1_u8(or));
    382 		cmp = vcgt_u8(cmp, a);
    383 		c = vaddv_u8(vshr_n_u8(cmp, 7));
    384 		ret += c * factorial[n-1-i];
    385 	}
    386 
    387 	return ret;
    388 }
    389 
    390 STATIC_INLINE uint8x8_t
    391 indextoperm_8x8(uint64_t p)
    392 {
    393 	int used;
    394 	uint64_t c, k, i, j;
    395 	uint8_t ret[8];
    396 
    397 	for (i = 0, used = 0; i < 8; i++) {
    398 		k = p / factorial[7-i];
    399 
    400 		/* Find k-th unused number */
    401 		for (j = 0, c = 0; c <= k; j++)
    402 			c += 1 - ((used & (1 << j)) >> j);
    403 
    404 		ret[i] = j-1;
    405 		used |= 1 << (j-1);
    406 		p %= factorial[7-i];
    407 	}
    408 
    409 	return vld1_u8(ret);
    410 }
    411 
    412 STATIC_INLINE uint8x8_t
    413 indextoperm_4x8(uint64_t p)
    414 {
    415 	static const int64_t A[FACT_4] = {
    416 		[0] = UINT64_C(0x03020100),
    417 		[1] = UINT64_C(0x02030100),
    418 		[2] = UINT64_C(0x03010200),
    419 		[3] = UINT64_C(0x01030200),
    420 		[4] = UINT64_C(0x02010300),
    421 		[5] = UINT64_C(0x01020300),
    422 		[6] = UINT64_C(0x03020001),
    423 		[7] = UINT64_C(0x02030001),
    424 		[8] = UINT64_C(0x03000201),
    425 		[9] = UINT64_C(0x00030201),
    426 		[10] = UINT64_C(0x02000301),
    427 		[11] = UINT64_C(0x00020301),
    428 		[12] = UINT64_C(0x03010002),
    429 		[13] = UINT64_C(0x01030002),
    430 		[14] = UINT64_C(0x03000102),
    431 		[15] = UINT64_C(0x00030102),
    432 		[16] = UINT64_C(0x01000302),
    433 		[17] = UINT64_C(0x00010302),
    434 		[18] = UINT64_C(0x02010003),
    435 		[19] = UINT64_C(0x01020003),
    436 		[20] = UINT64_C(0x02000103),
    437 		[21] = UINT64_C(0x00020103),
    438 		[22] = UINT64_C(0x01000203),
    439 		[23] = UINT64_C(0x00010203),
    440 	};
    441 
    442 	return vreinterpret_u8_u64(vdup_n_u64(A[p]));
    443 }
    444 
    445 STATIC_INLINE uint64_t
    446 coord_cp(cube_t cube)
    447 {
    448 	return permtoindex_Nx8(8, vand_u8(cube.corner, PBITS8_NEON));
    449 }
    450 
    451 STATIC_INLINE cube_t
    452 invcoord_cp(uint64_t i)
    453 {
    454 	return (cube_t) {
    455 		.corner = indextoperm_8x8(i),
    456 		.edge = vcombine_u8(vld1_u8(SOLVED_L), vld1_u8(SOLVED_H))
    457 	};
    458 }
    459 
    460 STATIC_INLINE uint64_t
    461 coord_epud(cube_t cube)
    462 {
    463 	uint8x8_t a;
    464 
    465 	a = vget_low_u8(cube.edge);
    466 	a = vand_u8(a, PBITS8_NEON);
    467 
    468 	return permtoindex_Nx8(8, a);
    469 }
    470 
    471 STATIC_INLINE cube_t
    472 invcoord_epud(uint64_t i)
    473 {
    474 	return (cube_t) {
    475 		.corner = vld1_u8(SOLVED_L),
    476 		.edge = vcombine_u8(indextoperm_8x8(i), vld1_u8(SOLVED_H))
    477 	};
    478 }
    479 
    480 STATIC_INLINE uint64_t
    481 coord_epe(cube_t cube)
    482 {
    483 	uint8x8_t a;
    484 
    485 	a = vget_high_u8(cube.edge);
    486 	a = vand_u8(a, PBITS8_NEON);
    487 	a = veor_u8(a, vdup_n_u8(8));
    488 
    489 	return permtoindex_Nx8(4, a);
    490 }
    491 
    492 STATIC_INLINE cube_t
    493 invcoord_epe(uint64_t i)
    494 {
    495 	uint8x8_t a;
    496 
    497 	a = indextoperm_4x8(i);
    498 	a = vadd_u8(a, vreinterpret_u8_u64(vdup_n_u64(UINT64_C(0x08080808))));
    499 
    500 	return (cube_t) {
    501 		.corner = vld1_u8(SOLVED_L),
    502 		.edge = vcombine_u8(vld1_u8(SOLVED_L), a)
    503 	};
    504 }
    505 
    506 STATIC_INLINE bool
    507 is_eo_even(cube_t cube)
    508 {
    509 	int8_t count;
    510 	uint8x16_t e;
    511 
    512 	e = vandq_u8(cube.edge, EO_NEON);
    513 	e = vshrq_n_u8(e, EOSHIFT);
    514 	count = vaddvq_u8(e);
    515 
    516 	return count % 2 == 0;
    517 }
    518 
    519 STATIC_INLINE uint64_t
    520 coord_epudsep(cube_t cube)
    521 {
    522 	uint8_t e[8];
    523 
    524 	vst1_u8(e, vget_low_u8(cube.edge));
    525 	return coord_epudsep_array(e);
    526 }
    527 
    528 STATIC_INLINE cube_t
    529 invcoord_epudsep(uint64_t i)
    530 {
    531 	uint8_t e[8];
    532 
    533 	invcoord_epudsep_array(i, e);
    534 	return (cube_t) {
    535 		.corner = vld1_u8(SOLVED_L),
    536 		.edge = vcombine_u8(vld1_u8(e), vld1_u8(SOLVED_H))
    537 	};
    538 }