neon.h (11710B)
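/*
 * ARM NEON (AArch64) backend for the cube_t type: the 8 corners are kept in
 * a uint8x8_t and the 12 edges in a uint8x16_t whose last 4 bytes are unused.
 * The piece/orientation masks (PBITS, EOBIT, COBITS, ...), the binomial and
 * factorial tables, and helpers such as invcoord_esep_array() are expected to
 * be provided by the rest of the project; this file only contains the
 * vectorized implementations.
 */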
#define CO2_NEON vdup_n_u8(0x60)
#define COCW_NEON vdup_n_u8(0x20)
#define EO_NEON vdupq_n_u8(0x10)
#define PBITS8_NEON vdup_n_u8(PBITS)

STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t);
STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t);

#define STATIC_CUBE( \
	c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
	e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
	((cube_t){ \
		.corner = { \
			c_ufr, c_ubl, c_dfl, c_dbr, \
			c_ufl, c_ubr, c_dfr, c_dbl \
		}, \
		.edge = { \
			e_uf, e_ub, e_db, e_df, e_ur, e_ul, \
			e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0 \
		} \
	})
#define ZERO_CUBE \
	((cube_t){ \
		.corner = vdup_n_u8(0), \
		.edge = vdupq_n_u8(0) \
	})
#define SOLVED_CUBE STATIC_CUBE( \
	0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)

const uint8_t SOLVED_L[8] = {0, 1, 2, 3, 4, 5, 6, 7};
const uint8_t SOLVED_H[8] = {8, 9, 10, 11, 0, 0, 0, 0};

STATIC_INLINE uint64_t permtoindex_Nx8(uint64_t, uint8x8_t);
STATIC_INLINE uint8x8_t indextoperm_8x8(uint64_t);
STATIC_INLINE uint8x8_t indextoperm_4x8(uint64_t);

STATIC_INLINE int
popcount_u32(uint32_t x)
{
	/* Same as the portable version */
	x -= (x >> UINT32_C(1)) & UINT32_C(0x55555555);
	x = (x & UINT32_C(0x33333333)) +
	    ((x >> UINT32_C(2)) & UINT32_C(0x33333333));
	x = (x + (x >> UINT32_C(4))) & UINT32_C(0x0F0F0F0F);
	x = (x * UINT32_C(0x01010101)) >> UINT32_C(24);

	return (int)x;
}

STATIC void
pieces(cube_t cube[static 1], uint8_t c[static 8], uint8_t e[static 12])
{
	// The 8 bytes of the corner vector are copied into the c array
	vst1_u8(c, cube->corner);

	// The first 12 bytes of the edge vector are copied into the e array
	// First 8 bytes
	vst1_u8(e, vget_low_u8(cube->edge));
	// Next 4 bytes
	vst1_lane_u32((uint32_t *)(e + 8),
	    vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0);
}

STATIC_INLINE bool
equal(cube_t c1, cube_t c2)
{
	uint8x8_t cmp_corner;
	uint8x16_t cmp_edge;
	uint64x2_t cmp_corner_u64, cmp_edge_u64, cmp_result;

	// Compare the corner vectors and the edge vectors
	cmp_corner = vceq_u8(c1.corner, c2.corner);
	cmp_edge = vceqq_u8(c1.edge, c2.edge);

	// Convert the comparison vectors to 64-bit vectors and combine them
	cmp_corner_u64 = vreinterpretq_u64_u8(
	    vcombine_u8(cmp_corner, cmp_corner));
	cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge);
	cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64);

	// Check that all the bits are set
	return vgetq_lane_u64(cmp_result, 0) == ~0ULL &&
	    vgetq_lane_u64(cmp_result, 1) == ~0ULL;
}

STATIC_INLINE cube_t
invertco(cube_t c)
{
	cube_t ret;
	uint8x8_t co, shleft, shright, summed, newco, cleanco;

	co = vand_u8(c.corner, CO2_NEON);
	shleft = vshl_n_u8(co, 1);
	shright = vshr_n_u8(co, 1);
	summed = vorr_u8(shleft, shright);
	newco = vand_u8(summed, CO2_NEON);
	cleanco = veor_u8(c.corner, co);
	ret.corner = vorr_u8(cleanco, newco);
	ret.edge = c.edge;

	return ret;
}

STATIC_INLINE cube_t
compose_edges(cube_t c1, cube_t c2)
{
	cube_t ret = {0};
	ret.edge = compose_edges_slim(c1.edge, c2.edge);
	return ret;
}

STATIC_INLINE cube_t
compose_corners(cube_t c1, cube_t c2)
{
	cube_t ret = {0};
	ret.corner = compose_corners_slim(c1.corner, c2.corner);
	return ret;
}

STATIC_INLINE uint8x16_t
compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2)
{
	// Masks
	uint8x16_t p_bits = vdupq_n_u8(PBITS);
	uint8x16_t eo_bit = vdupq_n_u8(EOBIT);

	// Find the index and permutation
	uint8x16_t p = vandq_u8(edge2, p_bits);
	uint8x16_t piece1 = vqtbl1q_u8(edge1, p);

	// Calculate the orientation through XOR
	uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit);

	// Combine the results
	uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);

	// Mask to clear the last 32 bits of the result
	uint32x4_t mask_last_32 =
	    vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3);
	ret = vreinterpretq_u8_u32(mask_last_32);

	return ret;
}

STATIC_INLINE uint8x8_t
compose_corners_slim(uint8x8_t corner1, uint8x8_t corner2)
{
	// Masks
	uint8x8_t p_bits = vdup_n_u8(PBITS);
	uint8x8_t cobits = vdup_n_u8(COBITS);
	uint8x8_t cobits2 = vdup_n_u8(COBITS_2);
	uint8x8_t twist_cw = vdup_n_u8(CTWIST_CW);

	// Find the index and permutation
	uint8x8_t p = vand_u8(corner2, p_bits);
	uint8x8_t piece1 = vtbl1_u8(corner1, p);

	// Calculate the orientation
	uint8x8_t aux =
	    vadd_u8(vand_u8(corner2, cobits), vand_u8(piece1, cobits));
	uint8x8_t auy = vshr_n_u8(vadd_u8(aux, twist_cw), 2);
	uint8x8_t orien = vand_u8(vadd_u8(aux, auy), cobits2);

	uint8x8_t ret = vorr_u8(vand_u8(piece1, p_bits), orien);

	return ret;
}

STATIC_INLINE cube_t
compose(cube_t c1, cube_t c2)
{
	cube_t ret = {0};

	ret.edge = compose_edges_slim(c1.edge, c2.edge);
	ret.corner = compose_corners_slim(c1.corner, c2.corner);

	return ret;
}

STATIC_INLINE cube_t
inverse(cube_t cube)
{
	uint8_t i, piece, orien;
	cube_t ret;

	// Temp arrays to store the NEON vectors
	uint8_t edges[16];
	uint8_t corners[8];

	// Copy the NEON vectors to the arrays
	vst1q_u8(edges, cube.edge);
	vst1_u8(corners, cube.corner);

	uint8_t edge_result[16] = {0};
	uint8_t corner_result[8] = {0};

	// Process the edges
	for (i = 0; i < 12; i++) {
		piece = edges[i];
		orien = piece & EOBIT;
		edge_result[piece & PBITS] = i | orien;
	}

	// Process the corners
	for (i = 0; i < 8; i++) {
		piece = corners[i];
		orien = ((piece << 1) | (piece >> 1)) & COBITS_2;
		corner_result[piece & PBITS] = i | orien;
	}

	// Copy the results back to the NEON vectors
	ret.edge = vld1q_u8(edge_result);
	ret.corner = vld1_u8(corner_result);

	return ret;
}

STATIC_INLINE uint64_t
coord_co(cube_t c)
{
	uint64_t i, p, ret;

	// Temp array to store the NEON vector
	uint8_t mem[8];
	vst1_u8(mem, c.corner);

	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3)
		ret += p * (mem[i] >> COSHIFT);

	return ret;
}

STATIC_INLINE void
copy_co(cube_t cube[static 1], cube_t co)
{
	uint8x8_t coclean;

	coclean = vand_u8(co.corner, CO2_NEON);
	cube->corner = vbic_u8(cube->corner, CO2_NEON);
	cube->corner = vorr_u8(cube->corner, coclean);
}

STATIC_INLINE cube_t
invcoord_co(uint64_t coord)
{
	uint64_t co, c, i, p;
	uint8_t mem[8];
	cube_t cube;

	for (i = 0, p = 0, c = coord; i < 8; i++, c /= 3) {
		co = i == 7 ? ((3 - (p % 3)) % 3) : (c % 3);
		p += co;
		mem[i] = i + (co << COSHIFT);
	}

	cube.corner = vld1_u8(mem);
	cube.edge = SOLVED_CUBE.edge;

	return cube;
}

STATIC_INLINE uint64_t
coord_csep(cube_t c)
{
	uint64_t ret, i, p;

	// Temp array to store the NEON vector
	uint8_t mem[8];
	vst1_u8(mem, c.corner);

	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2)
		ret += p * ((mem[i] & CSEPBIT) >> 2);

	return ret;
}

STATIC_INLINE uint64_t
coord_cocsep(cube_t c)
{
	return (coord_co(c) << UINT64_C(7)) + coord_csep(c);
}

STATIC_INLINE uint64_t
coord_eo(cube_t c)
{
	uint64_t ret, p;
	int i;

	// Temp array to store the NEON vector
	uint8_t mem[16];
	vst1q_u8(mem, c.edge);

	for (i = 1, ret = 0, p = 1; i < 12; i++, p *= 2)
		ret += p * (mem[i] >> EOSHIFT);

	return ret;
}

STATIC_INLINE uint64_t
coord_esep(cube_t c)
{
	uint64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;

	// Temp array to store the NEON vector
	uint8_t mem[16];
	vst1q_u8(mem, c.edge);

	for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++) {
		bit1 = (mem[i] & ESEPBIT_1) >> 2;
		bit2 = (mem[i] & ESEPBIT_2) >> 3;
		is1 = (1 - bit2) * bit1;

		ret1 += bit2 * binomial[11 - i][k];
		k -= bit2;

		jj = j < 8;
		ret2 += jj * is1 * binomial[7 - (j * jj)][l];
		l -= is1;
		j += (1 - bit2);
	}

	return ret1 * 70 + ret2;
}

STATIC_INLINE void
copy_corners(cube_t dst[static 1], cube_t src)
{
	dst->corner = src.corner;
}

STATIC_INLINE void
copy_edges(cube_t dst[static 1], cube_t src)
{
	dst->edge = src.edge;
}

STATIC_INLINE void
set_eo(cube_t cube[static 1], uint64_t eo)
{
	// Temp array to store the NEON vector
	uint8_t mem[16];
	vst1q_u8(mem, cube->edge);
	uint8_t i, sum, flip;

	for (sum = 0, i = 1; i < 12; i++, eo >>= 1) {
		flip = eo % 2;
		sum += flip;
		mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip);
	}
	mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2));

	// Copy the result back to the NEON vector
	cube->edge = vld1q_u8(mem);
}

STATIC_INLINE cube_t
invcoord_esep(uint64_t esep)
{
	cube_t ret;
	uint8_t mem[16] = {0};

	invcoord_esep_array(esep % UINT64_C(70), esep / UINT64_C(70), mem);

	ret = SOLVED_CUBE;
	ret.edge = vld1q_u8(mem);

	return ret;
}

STATIC_INLINE uint64_t
permtoindex_Nx8(uint64_t n, uint8x8_t a)
{
	uint64_t i, c, ret;
	uint8x8_t cmp;
	uint64x1_t anum;
	uint8_t or[8] = {0, 0, 0, 0, 0, 0, 0, 0x0F};

	for (i = 0, ret = 0; i < n; i++) {
		cmp = vdup_lane_u8(a, 0);
		anum = vreinterpret_u64_u8(a);
		anum = vshr_n_u64(anum, 8);
		a = vreinterpret_u8_u64(anum);
		a = vorr_u8(a, vld1_u8(or));
		cmp = vcgt_u8(cmp, a);
		c = vaddv_u8(vshr_n_u8(cmp, 7));
		ret += c * factorial[n-1-i];
	}

	return ret;
}

STATIC_INLINE uint8x8_t
indextoperm_8x8(uint64_t p)
{
	int used;
	uint64_t c, k, i, j;
	uint8_t ret[8];

	for (i = 0, used = 0; i < 8; i++) {
		k = p / factorial[7-i];

		/* Find the k-th unused number */
		for (j = 0, c = 0; c <= k; j++)
			c += 1 - ((used & (1 << j)) >> j);

		ret[i] = j-1;
		used |= 1 << (j-1);
		p %= factorial[7-i];
	}

	return vld1_u8(ret);
}

STATIC_INLINE uint8x8_t
indextoperm_4x8(uint64_t p)
{
	static const uint64_t A[FACT_4] = {
		[0] = UINT64_C(0x03020100),
		[1] = UINT64_C(0x02030100),
		[2] = UINT64_C(0x03010200),
		[3] = UINT64_C(0x01030200),
		[4] = UINT64_C(0x02010300),
		[5] = UINT64_C(0x01020300),
		[6] = UINT64_C(0x03020001),
		[7] = UINT64_C(0x02030001),
		[8] = UINT64_C(0x03000201),
		[9] = UINT64_C(0x00030201),
		[10] = UINT64_C(0x02000301),
		[11] = UINT64_C(0x00020301),
		[12] = UINT64_C(0x03010002),
		[13] = UINT64_C(0x01030002),
		[14] = UINT64_C(0x03000102),
		[15] = UINT64_C(0x00030102),
		[16] = UINT64_C(0x01000302),
		[17] = UINT64_C(0x00010302),
		[18] = UINT64_C(0x02010003),
		[19] = UINT64_C(0x01020003),
		[20] = UINT64_C(0x02000103),
		[21] = UINT64_C(0x00020103),
		[22] = UINT64_C(0x01000203),
		[23] = UINT64_C(0x00010203),
	};

	return vreinterpret_u8_u64(vdup_n_u64(A[p]));
}

STATIC_INLINE uint64_t
coord_cp(cube_t cube)
{
	return permtoindex_Nx8(8, vand_u8(cube.corner, PBITS8_NEON));
}

STATIC_INLINE cube_t
invcoord_cp(uint64_t i)
{
	return (cube_t) {
		.corner = indextoperm_8x8(i),
		.edge = vcombine_u8(vld1_u8(SOLVED_L), vld1_u8(SOLVED_H))
	};
}

STATIC_INLINE uint64_t
coord_epud(cube_t cube)
{
	uint8x8_t a;

	a = vget_low_u8(cube.edge);
	a = vand_u8(a, PBITS8_NEON);

	return permtoindex_Nx8(8, a);
}

STATIC_INLINE cube_t
invcoord_epud(uint64_t i)
{
	return (cube_t) {
		.corner = vld1_u8(SOLVED_L),
		.edge = vcombine_u8(indextoperm_8x8(i), vld1_u8(SOLVED_H))
	};
}

STATIC_INLINE uint64_t
coord_epe(cube_t cube)
{
	uint8x8_t a;

	a = vget_high_u8(cube.edge);
	a = vand_u8(a, PBITS8_NEON);
	a = veor_u8(a, vdup_n_u8(8));

	return permtoindex_Nx8(4, a);
}

STATIC_INLINE cube_t
invcoord_epe(uint64_t i)
{
	uint8x8_t a;

	a = indextoperm_4x8(i);
	a = vadd_u8(a, vreinterpret_u8_u64(vdup_n_u64(UINT64_C(0x08080808))));

	return (cube_t) {
		.corner = vld1_u8(SOLVED_L),
		.edge = vcombine_u8(vld1_u8(SOLVED_L), a)
	};
}

STATIC_INLINE bool
is_eo_even(cube_t cube)
{
	int8_t count;
	uint8x16_t e;

	e = vandq_u8(cube.edge, EO_NEON);
	e = vshrq_n_u8(e, EOSHIFT);
	count = vaddvq_u8(e);

	return count % 2 == 0;
}

STATIC_INLINE uint64_t
coord_epudsep(cube_t cube)
{
	uint8_t e[8];

	vst1_u8(e, vget_low_u8(cube.edge));
	return coord_epudsep_array(e);
}

STATIC_INLINE cube_t
invcoord_epudsep(uint64_t i)
{
	uint8_t e[8];

	invcoord_epudsep_array(i, e);
	return (cube_t) {
		.corner = vld1_u8(SOLVED_L),
		.edge = vcombine_u8(vld1_u8(e), vld1_u8(SOLVED_H))
	};
}
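/*
 * Example (sketch): compose() is the group operation on cube states and
 * inverse() its group inverse, so for any states a and b we expect
 * inverse(compose(a, b)) to equal compose(inverse(b), inverse(a)):
 *
 *	cube_t a = SOLVED_CUBE, b = invcoord_co(42);
 *	assert(equal(inverse(compose(a, b)),
 *	    compose(inverse(b), inverse(a))));
 */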