neon.h (11912B)
1 #define CO2_NEON vdup_n_u8(0x60) 2 #define COCW_NEON vdup_n_u8(0x20) 3 #define EO_NEON vdupq_n_u8(0x10) 4 #define PBITS8_NEON vdup_n_u8(PBITS) 5 6 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t); 7 STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t); 8 9 #define STATIC_CUBE( \ 10 c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \ 11 e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \ 12 ((cube_t){ \ 13 .corner = { \ 14 c_ufr, c_ubl, c_dfl, c_dbr, \ 15 c_ufl, c_ubr, c_dfr, c_dbl \ 16 }, \ 17 .edge = { \ 18 e_uf, e_ub, e_db, e_df, e_ur, e_ul, \ 19 e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0 \ 20 } \ 21 }) 22 #define ZERO_CUBE \ 23 ((cube_t){ \ 24 .corner = vdup_n_u8(0), \ 25 .edge = vdupq_n_u8(0) \ 26 }) 27 #define SOLVED_CUBE STATIC_CUBE( \ 28 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) 29 30 const uint8_t SOLVED_L[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 31 const uint8_t SOLVED_H[8] = {8, 9, 10, 11, 0, 0, 0}; 32 33 STATIC_INLINE uint64_t permtoindex_Nx8(uint64_t, uint8x8_t); 34 STATIC_INLINE uint8x8_t indextoperm_8x8(uint64_t); 35 STATIC_INLINE uint8x8_t indextoperm_4x8(uint64_t); 36 37 STATIC_INLINE int 38 popcount_u32(uint32_t x) 39 { 40 /* Same as the portable version */ 41 x -= (x >> UINT32_C(1)) & UINT32_C(0x55555555); 42 x = (x & UINT32_C(0x33333333)) + 43 ((x >> UINT32_C(2)) & UINT32_C(0x33333333)); 44 x = (x + (x >> UINT32_C(4))) & UINT32_C(0x0F0F0F0F); 45 x = (x * UINT32_C(0x01010101)) >> UINT32_C(24); 46 47 return (int)x; 48 } 49 50 STATIC_INLINE int 51 popcount_u64(uint64_t x) 52 { 53 int top, bottom; 54 55 bottom = popcount_u32((uint32_t)(x & UINT64_C(0xFFFFFFFF))); 56 top = popcount_u32((uint32_t)(x >> UINT64_C(32))); 57 58 return top + bottom; 59 } 60 61 STATIC void 62 pieces(cube_t cube[NON_NULL], uint8_t c[SIZE(8)], uint8_t e[SIZE(12)]) 63 { 64 // First 8 bytes of the corner vector are copied from the c array 65 vst1_u8(c, cube->corner); 66 67 // 12 bytes of the edge vector are copied from the e array 68 // First 8 bytes 69 vst1_u8(e, vget_low_u8(cube->edge)); 70 // Next 4 bytes 71 vst1_lane_u32((uint32_t *)(e + 8), 72 vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0); 73 } 74 75 STATIC_INLINE bool 76 equal(cube_t c1, cube_t c2) 77 { 78 uint8x8_t cmp_corner; 79 uint8x16_t cmp_edge; 80 uint64x2_t cmp_corner_u64, cmp_edge_u64, cmp_result; 81 82 // compare the corner vectors and the edge vectors 83 cmp_corner = vceq_u8(c1.corner, c2.corner); 84 cmp_edge = vceqq_u8(c1.edge, c2.edge); 85 86 // convert the comparison vectors to 64-bit vectors and combine them 87 cmp_corner_u64 = vreinterpretq_u64_u8( 88 vcombine_u8(cmp_corner, cmp_corner)); 89 cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge); 90 cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64); 91 92 // check if all the bits are set 93 return vgetq_lane_u64(cmp_result, 0) == ~0ULL && 94 vgetq_lane_u64(cmp_result, 1) == ~0ULL; 95 } 96 97 STATIC_INLINE cube_t 98 invertco(cube_t c) 99 { 100 cube_t ret; 101 uint8x8_t co, shleft, shright, summed, newco, cleanco; 102 103 co = vand_u8(c.corner, CO2_NEON); 104 shleft = vshl_n_u8(co, 1); 105 shright = vshr_n_u8(co, 1); 106 summed = vorr_u8(shleft, shright); 107 newco = vand_u8(summed, CO2_NEON); 108 cleanco = veor_u8(c.corner, co); 109 ret.corner = vorr_u8(cleanco, newco); 110 ret.edge = c.edge; 111 112 return ret; 113 } 114 115 STATIC_INLINE cube_t 116 compose_edges(cube_t c1, cube_t c2) 117 { 118 cube_t ret = {0}; 119 ret.edge = compose_edges_slim(c1.edge, c2.edge); 120 return ret; 121 } 122 123 STATIC_INLINE cube_t 124 compose_corners(cube_t c1, cube_t c2) 125 { 126 cube_t ret = {0}; 127 ret.corner = compose_corners_slim(c1.corner, c2.corner); 128 return ret; 129 } 130 131 STATIC_INLINE uint8x16_t 132 compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2) 133 { 134 // Masks 135 uint8x16_t p_bits = vdupq_n_u8(PBITS); 136 uint8x16_t eo_bit = vdupq_n_u8(EOBIT); 137 138 // Find the index and permutation 139 uint8x16_t p = vandq_u8(edge2, p_bits); 140 uint8x16_t piece1 = vqtbl1q_u8(edge1, p); 141 142 // Calculate the orientation through XOR 143 uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit); 144 145 // Combine the results 146 uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien); 147 148 // Mask to clear the last 32 bits of the result 149 uint32x4_t mask_last_32 = 150 vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3); 151 ret = vreinterpretq_u8_u32(mask_last_32); 152 153 return ret; 154 } 155 156 STATIC_INLINE uint8x8_t 157 compose_corners_slim(uint8x8_t corner1, uint8x8_t corner2) 158 { 159 // Masks 160 uint8x8_t p_bits = vdup_n_u8(PBITS); 161 uint8x8_t cobits = vdup_n_u8(COBITS); 162 uint8x8_t cobits2 = vdup_n_u8(COBITS_2); 163 uint8x8_t twist_cw = vdup_n_u8(CTWIST_CW); 164 165 // Find the index and permutation 166 uint8x8_t p = vand_u8(corner2, p_bits); 167 uint8x8_t piece1 = vtbl1_u8(corner1, p); 168 169 // Calculate the orientation 170 uint8x8_t aux = 171 vadd_u8(vand_u8(corner2, cobits), vand_u8(piece1, cobits)); 172 uint8x8_t auy = vshr_n_u8(vadd_u8(aux, twist_cw), 2); 173 uint8x8_t orien = vand_u8(vadd_u8(aux, auy), cobits2); 174 175 uint8x8_t ret = vorr_u8(vand_u8(piece1, p_bits), orien); 176 177 return ret; 178 } 179 180 STATIC_INLINE cube_t 181 compose(cube_t c1, cube_t c2) 182 { 183 cube_t ret = {0}; 184 185 ret.edge = compose_edges_slim(c1.edge, c2.edge); 186 ret.corner = compose_corners_slim(c1.corner, c2.corner); 187 188 return ret; 189 } 190 191 STATIC_INLINE cube_t 192 inverse(cube_t cube) 193 { 194 uint8_t i, piece, orien; 195 cube_t ret; 196 197 // Temp arrays to store the NEON vectors 198 uint8_t edges[16]; 199 uint8_t corners[8]; 200 201 // Copy the NEON vectors to the arrays 202 vst1q_u8(edges, cube.edge); 203 vst1_u8(corners, cube.corner); 204 205 uint8_t edge_result[16] = {0}; 206 uint8_t corner_result[8] = {0}; 207 208 // Process the edges 209 for (i = 0; i < 12; i++) 210 { 211 piece = edges[i]; 212 orien = piece & EOBIT; 213 edge_result[piece & PBITS] = i | orien; 214 } 215 216 // Process the corners 217 for (i = 0; i < 8; i++) 218 { 219 piece = corners[i]; 220 orien = ((piece << 1) | (piece >> 1)) & COBITS_2; 221 corner_result[piece & PBITS] = i | orien; 222 } 223 224 // Copy the results back to the NEON vectors 225 ret.edge = vld1q_u8(edge_result); 226 ret.corner = vld1_u8(corner_result); 227 228 return ret; 229 } 230 231 STATIC_INLINE uint64_t 232 coord_co(cube_t c) 233 { 234 uint64_t i, p, ret; 235 236 // Temp array to store the NEON vector 237 uint8_t mem[8]; 238 vst1_u8(mem, c.corner); 239 240 for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3) 241 ret += p * (mem[i] >> COSHIFT); 242 243 return ret; 244 } 245 246 STATIC_INLINE void 247 copy_co(cube_t cube[NON_NULL], cube_t co) 248 { 249 uint8x8_t coclean; 250 251 coclean = vand_u8(co.corner, CO2_NEON); 252 cube->corner = vbic_u8(cube->corner, CO2_NEON); 253 cube->corner = vorr_u8(cube->corner, coclean); 254 } 255 256 STATIC_INLINE cube_t 257 invcoord_co(uint64_t coord) 258 { 259 uint64_t co, c, i, p; 260 uint8_t mem[8]; 261 cube_t cube; 262 263 for (i = 0, p = 0, c = coord; i < 8; i++, c /= 3) { 264 co = i == 7 ? ((3 - (p % 3)) % 3) : (c % 3); 265 p += co; 266 mem[i] = i + (co << COSHIFT); 267 } 268 269 cube.corner = vld1_u8(mem); 270 cube.edge = SOLVED_CUBE.edge; 271 272 return cube; 273 } 274 275 STATIC_INLINE uint64_t 276 coord_csep(cube_t c) 277 { 278 uint64_t ret, i, p; 279 280 // Temp array to store the NEON vector 281 uint8_t mem[8]; 282 vst1_u8(mem, c.corner); 283 284 for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2) 285 ret += p * ((mem[i] & CSEPBIT) >> 2); 286 287 return ret; 288 return 0; 289 } 290 291 STATIC_INLINE uint64_t 292 coord_cocsep(cube_t c) 293 { 294 return (coord_co(c) << UINT64_C(7)) + coord_csep(c); 295 } 296 297 STATIC_INLINE uint64_t 298 coord_eo(cube_t c) 299 { 300 uint64_t ret, p; 301 int i; 302 303 // Temp array to store the NEON vector 304 uint8_t mem[16]; 305 vst1q_u8(mem, c.edge); 306 307 for (i = 1, ret = 0, p = 1; i < 12; i++, p *= 2) 308 { 309 ret += p * (mem[i] >> EOSHIFT); 310 } 311 312 return ret; 313 } 314 315 STATIC_INLINE uint64_t 316 coord_esep(cube_t c) 317 { 318 uint64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1; 319 320 // Temp array to store the NEON vector 321 uint8_t mem[16]; 322 vst1q_u8(mem, c.edge); 323 324 for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++) 325 { 326 bit1 = (mem[i] & ESEPBIT_1) >> 2; 327 bit2 = (mem[i] & ESEPBIT_2) >> 3; 328 is1 = (1 - bit2) * bit1; 329 330 ret1 += bit2 * binomial[11 - i][k]; 331 k -= bit2; 332 333 jj = j < 8; 334 ret2 += jj * is1 * binomial[7 - (j * jj)][l]; 335 l -= is1; 336 j += (1 - bit2); 337 } 338 339 return ret1 * 70 + ret2; 340 } 341 342 STATIC_INLINE void 343 copy_corners(cube_t dst[NON_NULL], cube_t src) 344 { 345 dst->corner = src.corner; 346 } 347 348 STATIC_INLINE void 349 copy_edges(cube_t dst[NON_NULL], cube_t src) 350 { 351 dst->edge = src.edge; 352 } 353 354 STATIC_INLINE void 355 set_eo(cube_t cube[NON_NULL], uint64_t eo) 356 { 357 // Temp array to store the NEON vector 358 uint8_t mem[16]; 359 vst1q_u8(mem, cube->edge); 360 uint8_t i, sum, flip; 361 362 for (sum = 0, i = 1; i < 12; i++, eo >>= 1) 363 { 364 flip = eo % 2; 365 sum += flip; 366 mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip); 367 } 368 mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2)); 369 370 // Copy the results back to the NEON vector 371 cube->edge = vld1q_u8(mem); 372 return; 373 } 374 375 STATIC_INLINE cube_t 376 invcoord_esep(uint64_t esep) 377 { 378 cube_t ret; 379 uint8_t mem[16] = {0}; 380 381 invcoord_esep_array(esep % UINT64_C(70), esep / UINT64_C(70), mem); 382 383 ret = SOLVED_CUBE; 384 ret.edge = vld1q_u8(mem); 385 386 return ret; 387 } 388 389 STATIC_INLINE uint64_t 390 permtoindex_Nx8(uint64_t n, uint8x8_t a) 391 { 392 uint64_t i, c, ret; 393 uint8x8_t cmp; 394 uint64x1_t anum; 395 uint8_t or[8] = {0, 0, 0, 0, 0, 0, 0, 0x0F}; 396 397 for (i = 0, ret = 0; i < n; i++) { 398 cmp = vdup_lane_u8(a, 0); 399 anum = vreinterpret_u64_u8(a); 400 anum = vshr_n_u64(anum, 8); 401 a = vreinterpret_u8_u64(anum); 402 a = vorr_u8(a, vld1_u8(or)); 403 cmp = vcgt_u8(cmp, a); 404 c = vaddv_u8(vshr_n_u8(cmp, 7)); 405 ret += c * factorial[n-1-i]; 406 } 407 408 return ret; 409 } 410 411 STATIC_INLINE uint8x8_t 412 indextoperm_8x8(uint64_t p) 413 { 414 int used; 415 uint64_t c, k, i, j; 416 uint8_t ret[8]; 417 418 for (i = 0, used = 0; i < 8; i++) { 419 k = p / factorial[7-i]; 420 421 /* Find k-th unused number */ 422 for (j = 0, c = 0; c <= k; j++) 423 c += 1 - ((used & (1 << j)) >> j); 424 425 ret[i] = j-1; 426 used |= 1 << (j-1); 427 p %= factorial[7-i]; 428 } 429 430 return vld1_u8(ret); 431 } 432 433 STATIC_INLINE uint8x8_t 434 indextoperm_4x8(uint64_t p) 435 { 436 static const int64_t A[FACT_4] = { 437 [0] = UINT64_C(0x03020100), 438 [1] = UINT64_C(0x02030100), 439 [2] = UINT64_C(0x03010200), 440 [3] = UINT64_C(0x01030200), 441 [4] = UINT64_C(0x02010300), 442 [5] = UINT64_C(0x01020300), 443 [6] = UINT64_C(0x03020001), 444 [7] = UINT64_C(0x02030001), 445 [8] = UINT64_C(0x03000201), 446 [9] = UINT64_C(0x00030201), 447 [10] = UINT64_C(0x02000301), 448 [11] = UINT64_C(0x00020301), 449 [12] = UINT64_C(0x03010002), 450 [13] = UINT64_C(0x01030002), 451 [14] = UINT64_C(0x03000102), 452 [15] = UINT64_C(0x00030102), 453 [16] = UINT64_C(0x01000302), 454 [17] = UINT64_C(0x00010302), 455 [18] = UINT64_C(0x02010003), 456 [19] = UINT64_C(0x01020003), 457 [20] = UINT64_C(0x02000103), 458 [21] = UINT64_C(0x00020103), 459 [22] = UINT64_C(0x01000203), 460 [23] = UINT64_C(0x00010203), 461 }; 462 463 return vreinterpret_u8_u64(vdup_n_u64(A[p])); 464 } 465 466 STATIC_INLINE uint64_t 467 coord_cp(cube_t cube) 468 { 469 return permtoindex_Nx8(8, vand_u8(cube.corner, PBITS8_NEON)); 470 } 471 472 STATIC_INLINE cube_t 473 invcoord_cp(uint64_t i) 474 { 475 return (cube_t) { 476 .corner = indextoperm_8x8(i), 477 .edge = vcombine_u8(vld1_u8(SOLVED_L), vld1_u8(SOLVED_H)) 478 }; 479 } 480 481 STATIC_INLINE uint64_t 482 coord_epud(cube_t cube) 483 { 484 uint8x8_t a; 485 486 a = vget_low_u8(cube.edge); 487 a = vand_u8(a, PBITS8_NEON); 488 489 return permtoindex_Nx8(8, a); 490 } 491 492 STATIC_INLINE cube_t 493 invcoord_epud(uint64_t i) 494 { 495 return (cube_t) { 496 .corner = vld1_u8(SOLVED_L), 497 .edge = vcombine_u8(indextoperm_8x8(i), vld1_u8(SOLVED_H)) 498 }; 499 } 500 501 STATIC_INLINE uint64_t 502 coord_epe(cube_t cube) 503 { 504 uint8x8_t a; 505 506 a = vget_high_u8(cube.edge); 507 a = vand_u8(a, PBITS8_NEON); 508 a = veor_u8(a, vdup_n_u8(8)); 509 510 return permtoindex_Nx8(4, a); 511 } 512 513 STATIC_INLINE cube_t 514 invcoord_epe(uint64_t i) 515 { 516 uint8x8_t a; 517 518 a = indextoperm_4x8(i); 519 a = vadd_u8(a, vreinterpret_u8_u64(vdup_n_u64(UINT64_C(0x08080808)))); 520 521 return (cube_t) { 522 .corner = vld1_u8(SOLVED_L), 523 .edge = vcombine_u8(vld1_u8(SOLVED_L), a) 524 }; 525 } 526 527 STATIC_INLINE bool 528 is_eo_even(cube_t cube) 529 { 530 int8_t count; 531 uint8x16_t e; 532 533 e = vandq_u8(cube.edge, EO_NEON); 534 e = vshrq_n_u8(e, EOSHIFT); 535 count = vaddvq_u8(e); 536 537 return count % 2 == 0; 538 } 539 540 STATIC_INLINE uint64_t 541 coord_epudsep(cube_t cube) 542 { 543 uint8_t e[8]; 544 545 vst1_u8(e, vget_low_u8(cube.edge)); 546 return coord_epudsep_array(e); 547 } 548 549 STATIC_INLINE cube_t 550 invcoord_epudsep(uint64_t i) 551 { 552 uint8_t e[8]; 553 554 invcoord_epudsep_array(i, e); 555 return (cube_t) { 556 .corner = vld1_u8(SOLVED_L), 557 .edge = vcombine_u8(vld1_u8(e), vld1_u8(SOLVED_H)) 558 }; 559 }