neon.h - nissy-core - The "engine" of nissy, including the H48 optimal solver.

neon.h (11710B)
      1 #define CO2_NEON    vdup_n_u8(0x60)
      2 #define COCW_NEON   vdup_n_u8(0x20)
      3 #define EO_NEON     vdupq_n_u8(0x10)
      4 #define PBITS8_NEON vdup_n_u8(PBITS)
      5 
      6 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t);
      7 STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t);
      8 
      9 #define STATIC_CUBE( \
     10     c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
     11     e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
     12 	((cube_t){ \
     13 		.corner = { \
     14 			c_ufr, c_ubl, c_dfl, c_dbr, \
     15 			c_ufl, c_ubr, c_dfr, c_dbl \
     16 		}, \
     17 		.edge = { \
     18 			e_uf, e_ub, e_db, e_df, e_ur, e_ul, \
     19 			e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0 \
     20 		} \
     21 	})
     22 #define ZERO_CUBE \
     23 	((cube_t){ \
     24 		.corner = vdup_n_u8(0), \
     25 		.edge = vdupq_n_u8(0) \
     26 	})
     27 #define SOLVED_CUBE STATIC_CUBE( \
     28 	0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
     29 
     30 const uint8_t SOLVED_L[8] = {0, 1, 2, 3, 4, 5, 6, 7};
     31 const uint8_t SOLVED_H[8] = {8, 9, 10, 11, 0, 0, 0};
     32 
     33 STATIC_INLINE uint64_t permtoindex_Nx8(uint64_t, uint8x8_t);
     34 STATIC_INLINE uint8x8_t indextoperm_8x8(uint64_t);
     35 STATIC_INLINE uint8x8_t indextoperm_4x8(uint64_t);
     36 
     37 STATIC_INLINE int
     38 popcount_u32(uint32_t x)
     39 {
     40 	/* Same as the portable version */
     41 	x -= (x >> UINT32_C(1)) & UINT32_C(0x55555555);
     42 	x = (x & UINT32_C(0x33333333)) +
     43 	    ((x >> UINT32_C(2)) & UINT32_C(0x33333333));
     44 	x = (x + (x >> UINT32_C(4))) & UINT32_C(0x0F0F0F0F);
     45 	x = (x * UINT32_C(0x01010101)) >> UINT32_C(24);
     46 
     47 	return (int)x;
     48 }
     49 
     50 STATIC void
     51 pieces(cube_t cube[static 1], uint8_t c[static 8], uint8_t e[static 12])
     52 {
     53 	// First 8 bytes of the corner vector are copied from the c array
     54 	vst1_u8(c, cube->corner);
     55 
     56 	// 12 bytes of the edge vector are copied from the e array
     57 	// First 8 bytes
     58 	vst1_u8(e, vget_low_u8(cube->edge));
     59 	// Next 4 bytes
     60 	vst1_lane_u32((uint32_t *)(e + 8),
     61 	    vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0);
     62 }
     63 
     64 STATIC_INLINE bool
     65 equal(cube_t c1, cube_t c2)
     66 {
     67 	uint8x8_t cmp_corner;
     68 	uint8x16_t cmp_edge;
     69 	uint64x2_t cmp_corner_u64, cmp_edge_u64, cmp_result;
     70 
     71 	// compare the corner vectors and the edge vectors
     72 	cmp_corner = vceq_u8(c1.corner, c2.corner);
     73 	cmp_edge = vceqq_u8(c1.edge, c2.edge);
     74 
     75 	// convert the comparison vectors to 64-bit vectors and combine them
     76 	cmp_corner_u64 = vreinterpretq_u64_u8(
     77 	    vcombine_u8(cmp_corner, cmp_corner));
     78 	cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge);
     79 	cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64);
     80 
     81 	// check if all the bits are set
     82 	return vgetq_lane_u64(cmp_result, 0) == ~0ULL &&
     83 	    vgetq_lane_u64(cmp_result, 1) == ~0ULL;
     84 }
     85 
     86 STATIC_INLINE cube_t
     87 invertco(cube_t c)
     88 {
     89 	cube_t ret;
     90 	uint8x8_t co, shleft, shright, summed, newco, cleanco;
     91 
     92 	co = vand_u8(c.corner, CO2_NEON);
     93 	shleft = vshl_n_u8(co, 1);
     94 	shright = vshr_n_u8(co, 1);
     95 	summed = vorr_u8(shleft, shright);
     96 	newco = vand_u8(summed, CO2_NEON);
     97 	cleanco = veor_u8(c.corner, co);
     98 	ret.corner = vorr_u8(cleanco, newco);
     99 	ret.edge = c.edge;
    100 	
    101 	return ret;
    102 }
    103 
    104 STATIC_INLINE cube_t
    105 compose_edges(cube_t c1, cube_t c2)
    106 {
    107 	cube_t ret = {0};
    108 	ret.edge = compose_edges_slim(c1.edge, c2.edge);
    109 	return ret;
    110 }
    111 
    112 STATIC_INLINE cube_t
    113 compose_corners(cube_t c1, cube_t c2)
    114 {
    115 	cube_t ret = {0};
    116 	ret.corner = compose_corners_slim(c1.corner, c2.corner);
    117 	return ret;
    118 }
    119 
    120 STATIC_INLINE uint8x16_t
    121 compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2)
    122 {
    123 	// Masks
    124 	uint8x16_t p_bits = vdupq_n_u8(PBITS);
    125 	uint8x16_t eo_bit = vdupq_n_u8(EOBIT);
    126 
    127 	// Find the index and permutation
    128 	uint8x16_t p = vandq_u8(edge2, p_bits);
    129 	uint8x16_t piece1 = vqtbl1q_u8(edge1, p);
    130 
    131 	// Calculate the orientation through XOR
    132 	uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit);
    133 
    134 	// Combine the results
    135 	uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
    136 
    137 	// Mask to clear the last 32 bits of the result
    138 	uint32x4_t mask_last_32 =
    139 	    vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3);
    140 	ret = vreinterpretq_u8_u32(mask_last_32);
    141 
    142 	return ret;
    143 }
    144 
    145 STATIC_INLINE uint8x8_t
    146 compose_corners_slim(uint8x8_t corner1, uint8x8_t corner2)
    147 {
    148 	// Masks
    149 	uint8x8_t p_bits = vdup_n_u8(PBITS);
    150 	uint8x8_t cobits = vdup_n_u8(COBITS);
    151 	uint8x8_t cobits2 = vdup_n_u8(COBITS_2);
    152 	uint8x8_t twist_cw = vdup_n_u8(CTWIST_CW);
    153 
    154 	// Find the index and permutation
    155 	uint8x8_t p = vand_u8(corner2, p_bits);
    156 	uint8x8_t piece1 = vtbl1_u8(corner1, p);
    157 
    158 	// Calculate the orientation
    159 	uint8x8_t aux =
    160 	    vadd_u8(vand_u8(corner2, cobits), vand_u8(piece1, cobits));
    161 	uint8x8_t auy = vshr_n_u8(vadd_u8(aux, twist_cw), 2);
    162 	uint8x8_t orien = vand_u8(vadd_u8(aux, auy), cobits2);
    163 
    164 	uint8x8_t ret = vorr_u8(vand_u8(piece1, p_bits), orien);
    165 
    166 	return ret;
    167 }
    168 
    169 STATIC_INLINE cube_t
    170 compose(cube_t c1, cube_t c2)
    171 {
    172 	cube_t ret = {0};
    173 
    174 	ret.edge = compose_edges_slim(c1.edge, c2.edge);
    175 	ret.corner = compose_corners_slim(c1.corner, c2.corner);
    176 
    177 	return ret;
    178 }
    179 
    180 STATIC_INLINE cube_t
    181 inverse(cube_t cube)
    182 {
    183 	uint8_t i, piece, orien;
    184 	cube_t ret;
    185 
    186 	// Temp arrays to store the NEON vectors
    187 	uint8_t edges[16];
    188 	uint8_t corners[8];
    189 
    190 	// Copy the NEON vectors to the arrays
    191 	vst1q_u8(edges, cube.edge);
    192 	vst1_u8(corners, cube.corner);
    193 
    194 	uint8_t edge_result[16] = {0};
    195 	uint8_t corner_result[8] = {0};
    196 
    197 	// Process the edges
    198 	for (i = 0; i < 12; i++)
    199 	{
    200 		piece = edges[i];
    201 		orien = piece & EOBIT;
    202 		edge_result[piece & PBITS] = i | orien;
    203 	}
    204 
    205 	// Process the corners
    206 	for (i = 0; i < 8; i++)
    207 	{
    208 		piece = corners[i];
    209 		orien = ((piece << 1) | (piece >> 1)) & COBITS_2;
    210 		corner_result[piece & PBITS] = i | orien;
    211 	}
    212 
    213 	// Copy the results back to the NEON vectors
    214 	ret.edge = vld1q_u8(edge_result);
    215 	ret.corner = vld1_u8(corner_result);
    216 
    217 	return ret;
    218 }
    219 
    220 STATIC_INLINE uint64_t
    221 coord_co(cube_t c)
    222 {
    223 	uint64_t i, p, ret;
    224 
    225 	// Temp array to store the NEON vector
    226 	uint8_t mem[8];
    227 	vst1_u8(mem, c.corner);
    228 
    229 	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3)
    230 		ret += p * (mem[i] >> COSHIFT);
    231 
    232 	return ret;
    233 }
    234 
    235 STATIC_INLINE void
    236 copy_co(cube_t cube[static 1], cube_t co)
    237 {
    238 	uint8x8_t coclean;
    239 
    240 	coclean = vand_u8(co.corner, CO2_NEON);
    241 	cube->corner = vbic_u8(cube->corner, CO2_NEON);
    242 	cube->corner = vorr_u8(cube->corner, coclean);
    243 }
    244 
    245 STATIC_INLINE cube_t
    246 invcoord_co(uint64_t coord)
    247 {
    248 	uint64_t co, c, i, p;
    249 	uint8_t mem[8];
    250 	cube_t cube;
    251 
    252 	for (i = 0, p = 0, c = coord; i < 8; i++, c /= 3) {
    253 		co = i == 7 ? ((3 - (p % 3)) % 3) : (c % 3);
    254 		p += co;
    255 		mem[i] = i + (co << COSHIFT);
    256 	}
    257 
    258 	cube.corner = vld1_u8(mem);
    259 	cube.edge = SOLVED_CUBE.edge;
    260 
    261 	return cube;
    262 }
    263 
    264 STATIC_INLINE uint64_t
    265 coord_csep(cube_t c)
    266 {
    267 	uint64_t ret, i, p;
    268 
    269 	// Temp array to store the NEON vector
    270 	uint8_t mem[8];
    271 	vst1_u8(mem, c.corner);
    272 
    273 	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2)
    274 		ret += p * ((mem[i] & CSEPBIT) >> 2);
    275 
    276 	return ret;
    277 	return 0;
    278 }
    279 
    280 STATIC_INLINE uint64_t
    281 coord_cocsep(cube_t c)
    282 {
    283 	return (coord_co(c) << UINT64_C(7)) + coord_csep(c);
    284 }
    285 
    286 STATIC_INLINE uint64_t
    287 coord_eo(cube_t c)
    288 {
    289 	uint64_t ret, p;
    290 	int i;
    291 
    292 	// Temp array to store the NEON vector
    293 	uint8_t mem[16];
    294 	vst1q_u8(mem, c.edge);
    295 
    296 	for (i = 1, ret = 0, p = 1; i < 12; i++, p *= 2)
    297 	{
    298 		ret += p * (mem[i] >> EOSHIFT);
    299 	}
    300 
    301 	return ret;
    302 }
    303 
    304 STATIC_INLINE uint64_t
    305 coord_esep(cube_t c)
    306 {
    307 	uint64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
    308 
    309 	// Temp array to store the NEON vector
    310 	uint8_t mem[16];
    311 	vst1q_u8(mem, c.edge);
    312 
    313 	for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++)
    314 	{
    315 		bit1 = (mem[i] & ESEPBIT_1) >> 2;
    316 		bit2 = (mem[i] & ESEPBIT_2) >> 3;
    317 		is1 = (1 - bit2) * bit1;
    318 
    319 		ret1 += bit2 * binomial[11 - i][k];
    320 		k -= bit2;
    321 
    322 		jj = j < 8;
    323 		ret2 += jj * is1 * binomial[7 - (j * jj)][l];
    324 		l -= is1;
    325 		j += (1 - bit2);
    326 	}
    327 
    328 	return ret1 * 70 + ret2;
    329 }
    330 
    331 STATIC_INLINE void
    332 copy_corners(cube_t dst[static 1], cube_t src)
    333 {
    334 	dst->corner = src.corner;
    335 }
    336 
    337 STATIC_INLINE void
    338 copy_edges(cube_t dst[static 1], cube_t src)
    339 {
    340 	dst->edge = src.edge;
    341 }
    342 
    343 STATIC_INLINE void
    344 set_eo(cube_t cube[static 1], uint64_t eo)
    345 {
    346 	// Temp array to store the NEON vector
    347 	uint8_t mem[16];
    348 	vst1q_u8(mem, cube->edge);
    349 	uint8_t i, sum, flip;
    350 
    351 	for (sum = 0, i = 1; i < 12; i++, eo >>= 1)
    352 	{
    353 		flip = eo % 2;
    354 		sum += flip;
    355 		mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip);
    356 	}
    357 	mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2));
    358 
    359 	// Copy the results back to the NEON vector
    360 	cube->edge = vld1q_u8(mem);
    361 	return;
    362 }
    363 
    364 STATIC_INLINE cube_t
    365 invcoord_esep(uint64_t esep)
    366 {
    367 	cube_t ret;
    368 	uint8_t mem[16] = {0};
    369 
    370 	invcoord_esep_array(esep % UINT64_C(70), esep / UINT64_C(70), mem);
    371 
    372 	ret = SOLVED_CUBE;
    373 	ret.edge = vld1q_u8(mem);
    374 
    375 	return ret;
    376 }
    377 
    378 STATIC_INLINE uint64_t
    379 permtoindex_Nx8(uint64_t n, uint8x8_t a)
    380 {
    381 	uint64_t i, c, ret;
    382 	uint8x8_t cmp;
    383 	uint64x1_t anum;
    384 	uint8_t or[8] = {0, 0, 0, 0, 0, 0, 0, 0x0F};
    385 
    386 	for (i = 0, ret = 0; i < n; i++) {
    387 		cmp = vdup_lane_u8(a, 0);
    388 		anum = vreinterpret_u64_u8(a);
    389 		anum = vshr_n_u64(anum, 8);
    390 		a = vreinterpret_u8_u64(anum);
    391 		a = vorr_u8(a, vld1_u8(or));
    392 		cmp = vcgt_u8(cmp, a);
    393 		c = vaddv_u8(vshr_n_u8(cmp, 7));
    394 		ret += c * factorial[n-1-i];
    395 	}
    396 
    397 	return ret;
    398 }
    399 
    400 STATIC_INLINE uint8x8_t
    401 indextoperm_8x8(uint64_t p)
    402 {
    403 	int used;
    404 	uint64_t c, k, i, j;
    405 	uint8_t ret[8];
    406 
    407 	for (i = 0, used = 0; i < 8; i++) {
    408 		k = p / factorial[7-i];
    409 
    410 		/* Find k-th unused number */
    411 		for (j = 0, c = 0; c <= k; j++)
    412 			c += 1 - ((used & (1 << j)) >> j);
    413 
    414 		ret[i] = j-1;
    415 		used |= 1 << (j-1);
    416 		p %= factorial[7-i];
    417 	}
    418 
    419 	return vld1_u8(ret);
    420 }
    421 
    422 STATIC_INLINE uint8x8_t
    423 indextoperm_4x8(uint64_t p)
    424 {
    425 	static const int64_t A[FACT_4] = {
    426 		[0] = UINT64_C(0x03020100),
    427 		[1] = UINT64_C(0x02030100),
    428 		[2] = UINT64_C(0x03010200),
    429 		[3] = UINT64_C(0x01030200),
    430 		[4] = UINT64_C(0x02010300),
    431 		[5] = UINT64_C(0x01020300),
    432 		[6] = UINT64_C(0x03020001),
    433 		[7] = UINT64_C(0x02030001),
    434 		[8] = UINT64_C(0x03000201),
    435 		[9] = UINT64_C(0x00030201),
    436 		[10] = UINT64_C(0x02000301),
    437 		[11] = UINT64_C(0x00020301),
    438 		[12] = UINT64_C(0x03010002),
    439 		[13] = UINT64_C(0x01030002),
    440 		[14] = UINT64_C(0x03000102),
    441 		[15] = UINT64_C(0x00030102),
    442 		[16] = UINT64_C(0x01000302),
    443 		[17] = UINT64_C(0x00010302),
    444 		[18] = UINT64_C(0x02010003),
    445 		[19] = UINT64_C(0x01020003),
    446 		[20] = UINT64_C(0x02000103),
    447 		[21] = UINT64_C(0x00020103),
    448 		[22] = UINT64_C(0x01000203),
    449 		[23] = UINT64_C(0x00010203),
    450 	};
    451 
    452 	return vreinterpret_u8_u64(vdup_n_u64(A[p]));
    453 }
    454 
    455 STATIC_INLINE uint64_t
    456 coord_cp(cube_t cube)
    457 {
    458 	return permtoindex_Nx8(8, vand_u8(cube.corner, PBITS8_NEON));
    459 }
    460 
    461 STATIC_INLINE cube_t
    462 invcoord_cp(uint64_t i)
    463 {
    464 	return (cube_t) {
    465 		.corner = indextoperm_8x8(i),
    466 		.edge = vcombine_u8(vld1_u8(SOLVED_L), vld1_u8(SOLVED_H))
    467 	};
    468 }
    469 
    470 STATIC_INLINE uint64_t
    471 coord_epud(cube_t cube)
    472 {
    473 	uint8x8_t a;
    474 
    475 	a = vget_low_u8(cube.edge);
    476 	a = vand_u8(a, PBITS8_NEON);
    477 
    478 	return permtoindex_Nx8(8, a);
    479 }
    480 
    481 STATIC_INLINE cube_t
    482 invcoord_epud(uint64_t i)
    483 {
    484 	return (cube_t) {
    485 		.corner = vld1_u8(SOLVED_L),
    486 		.edge = vcombine_u8(indextoperm_8x8(i), vld1_u8(SOLVED_H))
    487 	};
    488 }
    489 
    490 STATIC_INLINE uint64_t
    491 coord_epe(cube_t cube)
    492 {
    493 	uint8x8_t a;
    494 
    495 	a = vget_high_u8(cube.edge);
    496 	a = vand_u8(a, PBITS8_NEON);
    497 	a = veor_u8(a, vdup_n_u8(8));
    498 
    499 	return permtoindex_Nx8(4, a);
    500 }
    501 
    502 STATIC_INLINE cube_t
    503 invcoord_epe(uint64_t i)
    504 {
    505 	uint8x8_t a;
    506 
    507 	a = indextoperm_4x8(i);
    508 	a = vadd_u8(a, vreinterpret_u8_u64(vdup_n_u64(UINT64_C(0x08080808))));
    509 
    510 	return (cube_t) {
    511 		.corner = vld1_u8(SOLVED_L),
    512 		.edge = vcombine_u8(vld1_u8(SOLVED_L), a)
    513 	};
    514 }
    515 
    516 STATIC_INLINE bool
    517 is_eo_even(cube_t cube)
    518 {
    519 	int8_t count;
    520 	uint8x16_t e;
    521 
    522 	e = vandq_u8(cube.edge, EO_NEON);
    523 	e = vshrq_n_u8(e, EOSHIFT);
    524 	count = vaddvq_u8(e);
    525 
    526 	return count % 2 == 0;
    527 }
    528 
    529 STATIC_INLINE uint64_t
    530 coord_epudsep(cube_t cube)
    531 {
    532 	uint8_t e[8];
    533 
    534 	vst1_u8(e, vget_low_u8(cube.edge));
    535 	return coord_epudsep_array(e);
    536 }
    537 
    538 STATIC_INLINE cube_t
    539 invcoord_epudsep(uint64_t i)
    540 {
    541 	uint8_t e[8];
    542 
    543 	invcoord_epudsep_array(i, e);
    544 	return (cube_t) {
    545 		.corner = vld1_u8(SOLVED_L),
    546 		.edge = vcombine_u8(vld1_u8(e), vld1_u8(SOLVED_H))
    547 	};
    548 }
	nissy-core The "engine" of nissy, including the H48 optimal solver.
	git clone https://git.tronto.net/nissy-core
	Download \| Log \| Files \| Refs \| README \| LICENSE