nissy-core

The "engine" of nissy, including the H48 optimal solver.
git clone https://git.tronto.net/nissy-core
Download | Log | Files | Refs | README | LICENSE

neon.h (11912B)


      1 #define CO2_NEON    vdup_n_u8(0x60)
      2 #define COCW_NEON   vdup_n_u8(0x20)
      3 #define EO_NEON     vdupq_n_u8(0x10)
      4 #define PBITS8_NEON vdup_n_u8(PBITS)
      5 
      6 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t);
      7 STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t);
      8 
      9 #define STATIC_CUBE( \
     10     c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
     11     e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
     12 	((cube_t){ \
     13 		.corner = { \
     14 			c_ufr, c_ubl, c_dfl, c_dbr, \
     15 			c_ufl, c_ubr, c_dfr, c_dbl \
     16 		}, \
     17 		.edge = { \
     18 			e_uf, e_ub, e_db, e_df, e_ur, e_ul, \
     19 			e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0 \
     20 		} \
     21 	})
     22 #define ZERO_CUBE \
     23 	((cube_t){ \
     24 		.corner = vdup_n_u8(0), \
     25 		.edge = vdupq_n_u8(0) \
     26 	})
     27 #define SOLVED_CUBE STATIC_CUBE( \
     28 	0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
     29 
     30 const uint8_t SOLVED_L[8] = {0, 1, 2, 3, 4, 5, 6, 7};
     31 const uint8_t SOLVED_H[8] = {8, 9, 10, 11, 0, 0, 0};
     32 
     33 STATIC_INLINE uint64_t permtoindex_Nx8(uint64_t, uint8x8_t);
     34 STATIC_INLINE uint8x8_t indextoperm_8x8(uint64_t);
     35 STATIC_INLINE uint8x8_t indextoperm_4x8(uint64_t);
     36 
     37 STATIC_INLINE int
     38 popcount_u32(uint32_t x)
     39 {
     40 	/* Same as the portable version */
     41 	x -= (x >> UINT32_C(1)) & UINT32_C(0x55555555);
     42 	x = (x & UINT32_C(0x33333333)) +
     43 	    ((x >> UINT32_C(2)) & UINT32_C(0x33333333));
     44 	x = (x + (x >> UINT32_C(4))) & UINT32_C(0x0F0F0F0F);
     45 	x = (x * UINT32_C(0x01010101)) >> UINT32_C(24);
     46 
     47 	return (int)x;
     48 }
     49 
     50 STATIC_INLINE int
     51 popcount_u64(uint64_t x)
     52 {
     53 	int top, bottom;
     54 
     55 	bottom = popcount_u32((uint32_t)(x & UINT64_C(0xFFFFFFFF)));
     56 	top = popcount_u32((uint32_t)(x >> UINT64_C(32)));
     57 
     58 	return top + bottom;
     59 }
     60 
     61 STATIC void
     62 pieces(cube_t cube[NON_NULL], uint8_t c[SIZE(8)], uint8_t e[SIZE(12)])
     63 {
     64 	// First 8 bytes of the corner vector are copied from the c array
     65 	vst1_u8(c, cube->corner);
     66 
     67 	// 12 bytes of the edge vector are copied from the e array
     68 	// First 8 bytes
     69 	vst1_u8(e, vget_low_u8(cube->edge));
     70 	// Next 4 bytes
     71 	vst1_lane_u32((uint32_t *)(e + 8),
     72 	    vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0);
     73 }
     74 
     75 STATIC_INLINE bool
     76 equal(cube_t c1, cube_t c2)
     77 {
     78 	uint8x8_t cmp_corner;
     79 	uint8x16_t cmp_edge;
     80 	uint64x2_t cmp_corner_u64, cmp_edge_u64, cmp_result;
     81 
     82 	// compare the corner vectors and the edge vectors
     83 	cmp_corner = vceq_u8(c1.corner, c2.corner);
     84 	cmp_edge = vceqq_u8(c1.edge, c2.edge);
     85 
     86 	// convert the comparison vectors to 64-bit vectors and combine them
     87 	cmp_corner_u64 = vreinterpretq_u64_u8(
     88 	    vcombine_u8(cmp_corner, cmp_corner));
     89 	cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge);
     90 	cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64);
     91 
     92 	// check if all the bits are set
     93 	return vgetq_lane_u64(cmp_result, 0) == ~0ULL &&
     94 	    vgetq_lane_u64(cmp_result, 1) == ~0ULL;
     95 }
     96 
     97 STATIC_INLINE cube_t
     98 invertco(cube_t c)
     99 {
    100 	cube_t ret;
    101 	uint8x8_t co, shleft, shright, summed, newco, cleanco;
    102 
    103 	co = vand_u8(c.corner, CO2_NEON);
    104 	shleft = vshl_n_u8(co, 1);
    105 	shright = vshr_n_u8(co, 1);
    106 	summed = vorr_u8(shleft, shright);
    107 	newco = vand_u8(summed, CO2_NEON);
    108 	cleanco = veor_u8(c.corner, co);
    109 	ret.corner = vorr_u8(cleanco, newco);
    110 	ret.edge = c.edge;
    111 	
    112 	return ret;
    113 }
    114 
    115 STATIC_INLINE cube_t
    116 compose_edges(cube_t c1, cube_t c2)
    117 {
    118 	cube_t ret = {0};
    119 	ret.edge = compose_edges_slim(c1.edge, c2.edge);
    120 	return ret;
    121 }
    122 
    123 STATIC_INLINE cube_t
    124 compose_corners(cube_t c1, cube_t c2)
    125 {
    126 	cube_t ret = {0};
    127 	ret.corner = compose_corners_slim(c1.corner, c2.corner);
    128 	return ret;
    129 }
    130 
    131 STATIC_INLINE uint8x16_t
    132 compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2)
    133 {
    134 	// Masks
    135 	uint8x16_t p_bits = vdupq_n_u8(PBITS);
    136 	uint8x16_t eo_bit = vdupq_n_u8(EOBIT);
    137 
    138 	// Find the index and permutation
    139 	uint8x16_t p = vandq_u8(edge2, p_bits);
    140 	uint8x16_t piece1 = vqtbl1q_u8(edge1, p);
    141 
    142 	// Calculate the orientation through XOR
    143 	uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit);
    144 
    145 	// Combine the results
    146 	uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
    147 
    148 	// Mask to clear the last 32 bits of the result
    149 	uint32x4_t mask_last_32 =
    150 	    vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3);
    151 	ret = vreinterpretq_u8_u32(mask_last_32);
    152 
    153 	return ret;
    154 }
    155 
    156 STATIC_INLINE uint8x8_t
    157 compose_corners_slim(uint8x8_t corner1, uint8x8_t corner2)
    158 {
    159 	// Masks
    160 	uint8x8_t p_bits = vdup_n_u8(PBITS);
    161 	uint8x8_t cobits = vdup_n_u8(COBITS);
    162 	uint8x8_t cobits2 = vdup_n_u8(COBITS_2);
    163 	uint8x8_t twist_cw = vdup_n_u8(CTWIST_CW);
    164 
    165 	// Find the index and permutation
    166 	uint8x8_t p = vand_u8(corner2, p_bits);
    167 	uint8x8_t piece1 = vtbl1_u8(corner1, p);
    168 
    169 	// Calculate the orientation
    170 	uint8x8_t aux =
    171 	    vadd_u8(vand_u8(corner2, cobits), vand_u8(piece1, cobits));
    172 	uint8x8_t auy = vshr_n_u8(vadd_u8(aux, twist_cw), 2);
    173 	uint8x8_t orien = vand_u8(vadd_u8(aux, auy), cobits2);
    174 
    175 	uint8x8_t ret = vorr_u8(vand_u8(piece1, p_bits), orien);
    176 
    177 	return ret;
    178 }
    179 
    180 STATIC_INLINE cube_t
    181 compose(cube_t c1, cube_t c2)
    182 {
    183 	cube_t ret = {0};
    184 
    185 	ret.edge = compose_edges_slim(c1.edge, c2.edge);
    186 	ret.corner = compose_corners_slim(c1.corner, c2.corner);
    187 
    188 	return ret;
    189 }
    190 
    191 STATIC_INLINE cube_t
    192 inverse(cube_t cube)
    193 {
    194 	uint8_t i, piece, orien;
    195 	cube_t ret;
    196 
    197 	// Temp arrays to store the NEON vectors
    198 	uint8_t edges[16];
    199 	uint8_t corners[8];
    200 
    201 	// Copy the NEON vectors to the arrays
    202 	vst1q_u8(edges, cube.edge);
    203 	vst1_u8(corners, cube.corner);
    204 
    205 	uint8_t edge_result[16] = {0};
    206 	uint8_t corner_result[8] = {0};
    207 
    208 	// Process the edges
    209 	for (i = 0; i < 12; i++)
    210 	{
    211 		piece = edges[i];
    212 		orien = piece & EOBIT;
    213 		edge_result[piece & PBITS] = i | orien;
    214 	}
    215 
    216 	// Process the corners
    217 	for (i = 0; i < 8; i++)
    218 	{
    219 		piece = corners[i];
    220 		orien = ((piece << 1) | (piece >> 1)) & COBITS_2;
    221 		corner_result[piece & PBITS] = i | orien;
    222 	}
    223 
    224 	// Copy the results back to the NEON vectors
    225 	ret.edge = vld1q_u8(edge_result);
    226 	ret.corner = vld1_u8(corner_result);
    227 
    228 	return ret;
    229 }
    230 
    231 STATIC_INLINE uint64_t
    232 coord_co(cube_t c)
    233 {
    234 	uint64_t i, p, ret;
    235 
    236 	// Temp array to store the NEON vector
    237 	uint8_t mem[8];
    238 	vst1_u8(mem, c.corner);
    239 
    240 	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3)
    241 		ret += p * (mem[i] >> COSHIFT);
    242 
    243 	return ret;
    244 }
    245 
    246 STATIC_INLINE void
    247 copy_co(cube_t cube[NON_NULL], cube_t co)
    248 {
    249 	uint8x8_t coclean;
    250 
    251 	coclean = vand_u8(co.corner, CO2_NEON);
    252 	cube->corner = vbic_u8(cube->corner, CO2_NEON);
    253 	cube->corner = vorr_u8(cube->corner, coclean);
    254 }
    255 
    256 STATIC_INLINE cube_t
    257 invcoord_co(uint64_t coord)
    258 {
    259 	uint64_t co, c, i, p;
    260 	uint8_t mem[8];
    261 	cube_t cube;
    262 
    263 	for (i = 0, p = 0, c = coord; i < 8; i++, c /= 3) {
    264 		co = i == 7 ? ((3 - (p % 3)) % 3) : (c % 3);
    265 		p += co;
    266 		mem[i] = i + (co << COSHIFT);
    267 	}
    268 
    269 	cube.corner = vld1_u8(mem);
    270 	cube.edge = SOLVED_CUBE.edge;
    271 
    272 	return cube;
    273 }
    274 
    275 STATIC_INLINE uint64_t
    276 coord_csep(cube_t c)
    277 {
    278 	uint64_t ret, i, p;
    279 
    280 	// Temp array to store the NEON vector
    281 	uint8_t mem[8];
    282 	vst1_u8(mem, c.corner);
    283 
    284 	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2)
    285 		ret += p * ((mem[i] & CSEPBIT) >> 2);
    286 
    287 	return ret;
    288 	return 0;
    289 }
    290 
    291 STATIC_INLINE uint64_t
    292 coord_cocsep(cube_t c)
    293 {
    294 	return (coord_co(c) << UINT64_C(7)) + coord_csep(c);
    295 }
    296 
    297 STATIC_INLINE uint64_t
    298 coord_eo(cube_t c)
    299 {
    300 	uint64_t ret, p;
    301 	int i;
    302 
    303 	// Temp array to store the NEON vector
    304 	uint8_t mem[16];
    305 	vst1q_u8(mem, c.edge);
    306 
    307 	for (i = 1, ret = 0, p = 1; i < 12; i++, p *= 2)
    308 	{
    309 		ret += p * (mem[i] >> EOSHIFT);
    310 	}
    311 
    312 	return ret;
    313 }
    314 
    315 STATIC_INLINE uint64_t
    316 coord_esep(cube_t c)
    317 {
    318 	uint64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
    319 
    320 	// Temp array to store the NEON vector
    321 	uint8_t mem[16];
    322 	vst1q_u8(mem, c.edge);
    323 
    324 	for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++)
    325 	{
    326 		bit1 = (mem[i] & ESEPBIT_1) >> 2;
    327 		bit2 = (mem[i] & ESEPBIT_2) >> 3;
    328 		is1 = (1 - bit2) * bit1;
    329 
    330 		ret1 += bit2 * binomial[11 - i][k];
    331 		k -= bit2;
    332 
    333 		jj = j < 8;
    334 		ret2 += jj * is1 * binomial[7 - (j * jj)][l];
    335 		l -= is1;
    336 		j += (1 - bit2);
    337 	}
    338 
    339 	return ret1 * 70 + ret2;
    340 }
    341 
    342 STATIC_INLINE void
    343 copy_corners(cube_t dst[NON_NULL], cube_t src)
    344 {
    345 	dst->corner = src.corner;
    346 }
    347 
    348 STATIC_INLINE void
    349 copy_edges(cube_t dst[NON_NULL], cube_t src)
    350 {
    351 	dst->edge = src.edge;
    352 }
    353 
    354 STATIC_INLINE void
    355 set_eo(cube_t cube[NON_NULL], uint64_t eo)
    356 {
    357 	// Temp array to store the NEON vector
    358 	uint8_t mem[16];
    359 	vst1q_u8(mem, cube->edge);
    360 	uint8_t i, sum, flip;
    361 
    362 	for (sum = 0, i = 1; i < 12; i++, eo >>= 1)
    363 	{
    364 		flip = eo % 2;
    365 		sum += flip;
    366 		mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip);
    367 	}
    368 	mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2));
    369 
    370 	// Copy the results back to the NEON vector
    371 	cube->edge = vld1q_u8(mem);
    372 	return;
    373 }
    374 
    375 STATIC_INLINE cube_t
    376 invcoord_esep(uint64_t esep)
    377 {
    378 	cube_t ret;
    379 	uint8_t mem[16] = {0};
    380 
    381 	invcoord_esep_array(esep % UINT64_C(70), esep / UINT64_C(70), mem);
    382 
    383 	ret = SOLVED_CUBE;
    384 	ret.edge = vld1q_u8(mem);
    385 
    386 	return ret;
    387 }
    388 
    389 STATIC_INLINE uint64_t
    390 permtoindex_Nx8(uint64_t n, uint8x8_t a)
    391 {
    392 	uint64_t i, c, ret;
    393 	uint8x8_t cmp;
    394 	uint64x1_t anum;
    395 	uint8_t or[8] = {0, 0, 0, 0, 0, 0, 0, 0x0F};
    396 
    397 	for (i = 0, ret = 0; i < n; i++) {
    398 		cmp = vdup_lane_u8(a, 0);
    399 		anum = vreinterpret_u64_u8(a);
    400 		anum = vshr_n_u64(anum, 8);
    401 		a = vreinterpret_u8_u64(anum);
    402 		a = vorr_u8(a, vld1_u8(or));
    403 		cmp = vcgt_u8(cmp, a);
    404 		c = vaddv_u8(vshr_n_u8(cmp, 7));
    405 		ret += c * factorial[n-1-i];
    406 	}
    407 
    408 	return ret;
    409 }
    410 
    411 STATIC_INLINE uint8x8_t
    412 indextoperm_8x8(uint64_t p)
    413 {
    414 	int used;
    415 	uint64_t c, k, i, j;
    416 	uint8_t ret[8];
    417 
    418 	for (i = 0, used = 0; i < 8; i++) {
    419 		k = p / factorial[7-i];
    420 
    421 		/* Find k-th unused number */
    422 		for (j = 0, c = 0; c <= k; j++)
    423 			c += 1 - ((used & (1 << j)) >> j);
    424 
    425 		ret[i] = j-1;
    426 		used |= 1 << (j-1);
    427 		p %= factorial[7-i];
    428 	}
    429 
    430 	return vld1_u8(ret);
    431 }
    432 
    433 STATIC_INLINE uint8x8_t
    434 indextoperm_4x8(uint64_t p)
    435 {
    436 	static const int64_t A[FACT_4] = {
    437 		[0] = UINT64_C(0x03020100),
    438 		[1] = UINT64_C(0x02030100),
    439 		[2] = UINT64_C(0x03010200),
    440 		[3] = UINT64_C(0x01030200),
    441 		[4] = UINT64_C(0x02010300),
    442 		[5] = UINT64_C(0x01020300),
    443 		[6] = UINT64_C(0x03020001),
    444 		[7] = UINT64_C(0x02030001),
    445 		[8] = UINT64_C(0x03000201),
    446 		[9] = UINT64_C(0x00030201),
    447 		[10] = UINT64_C(0x02000301),
    448 		[11] = UINT64_C(0x00020301),
    449 		[12] = UINT64_C(0x03010002),
    450 		[13] = UINT64_C(0x01030002),
    451 		[14] = UINT64_C(0x03000102),
    452 		[15] = UINT64_C(0x00030102),
    453 		[16] = UINT64_C(0x01000302),
    454 		[17] = UINT64_C(0x00010302),
    455 		[18] = UINT64_C(0x02010003),
    456 		[19] = UINT64_C(0x01020003),
    457 		[20] = UINT64_C(0x02000103),
    458 		[21] = UINT64_C(0x00020103),
    459 		[22] = UINT64_C(0x01000203),
    460 		[23] = UINT64_C(0x00010203),
    461 	};
    462 
    463 	return vreinterpret_u8_u64(vdup_n_u64(A[p]));
    464 }
    465 
    466 STATIC_INLINE uint64_t
    467 coord_cp(cube_t cube)
    468 {
    469 	return permtoindex_Nx8(8, vand_u8(cube.corner, PBITS8_NEON));
    470 }
    471 
    472 STATIC_INLINE cube_t
    473 invcoord_cp(uint64_t i)
    474 {
    475 	return (cube_t) {
    476 		.corner = indextoperm_8x8(i),
    477 		.edge = vcombine_u8(vld1_u8(SOLVED_L), vld1_u8(SOLVED_H))
    478 	};
    479 }
    480 
    481 STATIC_INLINE uint64_t
    482 coord_epud(cube_t cube)
    483 {
    484 	uint8x8_t a;
    485 
    486 	a = vget_low_u8(cube.edge);
    487 	a = vand_u8(a, PBITS8_NEON);
    488 
    489 	return permtoindex_Nx8(8, a);
    490 }
    491 
    492 STATIC_INLINE cube_t
    493 invcoord_epud(uint64_t i)
    494 {
    495 	return (cube_t) {
    496 		.corner = vld1_u8(SOLVED_L),
    497 		.edge = vcombine_u8(indextoperm_8x8(i), vld1_u8(SOLVED_H))
    498 	};
    499 }
    500 
    501 STATIC_INLINE uint64_t
    502 coord_epe(cube_t cube)
    503 {
    504 	uint8x8_t a;
    505 
    506 	a = vget_high_u8(cube.edge);
    507 	a = vand_u8(a, PBITS8_NEON);
    508 	a = veor_u8(a, vdup_n_u8(8));
    509 
    510 	return permtoindex_Nx8(4, a);
    511 }
    512 
    513 STATIC_INLINE cube_t
    514 invcoord_epe(uint64_t i)
    515 {
    516 	uint8x8_t a;
    517 
    518 	a = indextoperm_4x8(i);
    519 	a = vadd_u8(a, vreinterpret_u8_u64(vdup_n_u64(UINT64_C(0x08080808))));
    520 
    521 	return (cube_t) {
    522 		.corner = vld1_u8(SOLVED_L),
    523 		.edge = vcombine_u8(vld1_u8(SOLVED_L), a)
    524 	};
    525 }
    526 
    527 STATIC_INLINE bool
    528 is_eo_even(cube_t cube)
    529 {
    530 	int8_t count;
    531 	uint8x16_t e;
    532 
    533 	e = vandq_u8(cube.edge, EO_NEON);
    534 	e = vshrq_n_u8(e, EOSHIFT);
    535 	count = vaddvq_u8(e);
    536 
    537 	return count % 2 == 0;
    538 }
    539 
    540 STATIC_INLINE uint64_t
    541 coord_epudsep(cube_t cube)
    542 {
    543 	uint8_t e[8];
    544 
    545 	vst1_u8(e, vget_low_u8(cube.edge));
    546 	return coord_epudsep_array(e);
    547 }
    548 
    549 STATIC_INLINE cube_t
    550 invcoord_epudsep(uint64_t i)
    551 {
    552 	uint8_t e[8];
    553 
    554 	invcoord_epudsep_array(i, e);
    555 	return (cube_t) {
    556 		.corner = vld1_u8(SOLVED_L),
    557 		.edge = vcombine_u8(vld1_u8(e), vld1_u8(SOLVED_H))
    558 	};
    559 }