neon.h (11485B)
1 #define CO2_NEON vdup_n_u8(0x60) 2 #define COCW_NEON vdup_n_u8(0x20) 3 #define EO_NEON vdupq_n_u8(0x10) 4 #define PBITS8_NEON vdup_n_u8(PBITS) 5 6 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t); 7 STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t); 8 9 #define STATIC_CUBE( \ 10 c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \ 11 e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \ 12 ((cube_t){ \ 13 .corner = { \ 14 c_ufr, c_ubl, c_dfl, c_dbr, \ 15 c_ufl, c_ubr, c_dfr, c_dbl \ 16 }, \ 17 .edge = { \ 18 e_uf, e_ub, e_db, e_df, e_ur, e_ul, \ 19 e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0 \ 20 } \ 21 }) 22 #define ZERO_CUBE \ 23 ((cube_t){ \ 24 .corner = vdup_n_u8(0), \ 25 .edge = vdupq_n_u8(0) \ 26 }) 27 #define SOLVED_CUBE STATIC_CUBE( \ 28 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) 29 30 const uint8_t SOLVED_L[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 31 const uint8_t SOLVED_H[8] = {8, 9, 10, 11, 0, 0, 0}; 32 33 STATIC_INLINE uint64_t permtoindex_Nx8(uint64_t, uint8x8_t); 34 STATIC_INLINE uint8x8_t indextoperm_8x8(uint64_t); 35 STATIC_INLINE uint8x8_t indextoperm_4x8(uint64_t); 36 37 STATIC_INLINE int 38 popcount_u32(uint32_t x) 39 { 40 /* Same as the portable version */ 41 x -= (x >> UINT32_C(1)) & UINT32_C(0x55555555); 42 x = (x & UINT32_C(0x33333333)) + 43 ((x >> UINT32_C(2)) & UINT32_C(0x33333333)); 44 x = (x + (x >> UINT32_C(4))) & UINT32_C(0x0F0F0F0F); 45 x = (x * UINT32_C(0x01010101)) >> UINT32_C(24); 46 47 return (int)x; 48 } 49 50 STATIC void 51 pieces(cube_t cube[static 1], uint8_t c[static 8], uint8_t e[static 12]) 52 { 53 // First 8 bytes of the corner vector are copied from the c array 54 vst1_u8(c, cube->corner); 55 56 // 12 bytes of the edge vector are copied from the e array 57 // First 8 bytes 58 vst1_u8(e, vget_low_u8(cube->edge)); 59 // Next 4 bytes 60 vst1_lane_u32((uint32_t *)(e + 8), 61 vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0); 62 } 63 64 STATIC_INLINE bool 65 equal(cube_t c1, cube_t c2) 66 { 67 uint8x8_t cmp_corner; 68 uint8x16_t cmp_edge; 69 uint64x2_t cmp_corner_u64, cmp_edge_u64, cmp_result; 70 71 // compare the corner vectors and the edge vectors 72 cmp_corner = vceq_u8(c1.corner, c2.corner); 73 cmp_edge = vceqq_u8(c1.edge, c2.edge); 74 75 // convert the comparison vectors to 64-bit vectors and combine them 76 cmp_corner_u64 = vreinterpretq_u64_u8( 77 vcombine_u8(cmp_corner, cmp_corner)); 78 cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge); 79 cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64); 80 81 // check if all the bits are set 82 return vgetq_lane_u64(cmp_result, 0) == ~0ULL && 83 vgetq_lane_u64(cmp_result, 1) == ~0ULL; 84 } 85 86 STATIC_INLINE cube_t 87 invertco(cube_t c) 88 { 89 cube_t ret; 90 uint8x8_t co, shleft, shright, summed, newco, cleanco; 91 92 co = vand_u8(c.corner, CO2_NEON); 93 shleft = vshl_n_u8(co, 1); 94 shright = vshr_n_u8(co, 1); 95 summed = vorr_u8(shleft, shright); 96 newco = vand_u8(summed, CO2_NEON); 97 cleanco = veor_u8(c.corner, co); 98 ret.corner = vorr_u8(cleanco, newco); 99 ret.edge = c.edge; 100 101 return ret; 102 } 103 104 STATIC_INLINE cube_t 105 compose_edges(cube_t c1, cube_t c2) 106 { 107 cube_t ret = {0}; 108 ret.edge = compose_edges_slim(c1.edge, c2.edge); 109 return ret; 110 } 111 112 STATIC_INLINE cube_t 113 compose_corners(cube_t c1, cube_t c2) 114 { 115 cube_t ret = {0}; 116 ret.corner = compose_corners_slim(c1.corner, c2.corner); 117 return ret; 118 } 119 120 STATIC_INLINE uint8x16_t 121 compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2) 122 { 123 // Masks 124 uint8x16_t p_bits = vdupq_n_u8(PBITS); 125 uint8x16_t eo_bit = vdupq_n_u8(EOBIT); 126 127 // Find the index and permutation 128 uint8x16_t p = vandq_u8(edge2, p_bits); 129 uint8x16_t piece1 = vqtbl1q_u8(edge1, p); 130 131 // Calculate the orientation through XOR 132 uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit); 133 134 // Combine the results 135 uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien); 136 137 // Mask to clear the last 32 bits of the result 138 uint32x4_t mask_last_32 = 139 vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3); 140 ret = vreinterpretq_u8_u32(mask_last_32); 141 142 return ret; 143 } 144 145 STATIC_INLINE uint8x8_t 146 compose_corners_slim(uint8x8_t corner1, uint8x8_t corner2) 147 { 148 // Masks 149 uint8x8_t p_bits = vdup_n_u8(PBITS); 150 uint8x8_t cobits = vdup_n_u8(COBITS); 151 uint8x8_t cobits2 = vdup_n_u8(COBITS_2); 152 uint8x8_t twist_cw = vdup_n_u8(CTWIST_CW); 153 154 // Find the index and permutation 155 uint8x8_t p = vand_u8(corner2, p_bits); 156 uint8x8_t piece1 = vtbl1_u8(corner1, p); 157 158 // Calculate the orientation 159 uint8x8_t aux = 160 vadd_u8(vand_u8(corner2, cobits), vand_u8(piece1, cobits)); 161 uint8x8_t auy = vshr_n_u8(vadd_u8(aux, twist_cw), 2); 162 uint8x8_t orien = vand_u8(vadd_u8(aux, auy), cobits2); 163 164 uint8x8_t ret = vorr_u8(vand_u8(piece1, p_bits), orien); 165 166 return ret; 167 } 168 169 STATIC_INLINE cube_t 170 compose(cube_t c1, cube_t c2) 171 { 172 cube_t ret = {0}; 173 174 ret.edge = compose_edges_slim(c1.edge, c2.edge); 175 ret.corner = compose_corners_slim(c1.corner, c2.corner); 176 177 return ret; 178 } 179 180 STATIC_INLINE cube_t 181 inverse(cube_t cube) 182 { 183 uint8_t i, piece, orien; 184 cube_t ret; 185 186 // Temp arrays to store the NEON vectors 187 uint8_t edges[16]; 188 uint8_t corners[8]; 189 190 // Copy the NEON vectors to the arrays 191 vst1q_u8(edges, cube.edge); 192 vst1_u8(corners, cube.corner); 193 194 uint8_t edge_result[16] = {0}; 195 uint8_t corner_result[8] = {0}; 196 197 // Process the edges 198 for (i = 0; i < 12; i++) 199 { 200 piece = edges[i]; 201 orien = piece & EOBIT; 202 edge_result[piece & PBITS] = i | orien; 203 } 204 205 // Process the corners 206 for (i = 0; i < 8; i++) 207 { 208 piece = corners[i]; 209 orien = ((piece << 1) | (piece >> 1)) & COBITS_2; 210 corner_result[piece & PBITS] = i | orien; 211 } 212 213 // Copy the results back to the NEON vectors 214 ret.edge = vld1q_u8(edge_result); 215 ret.corner = vld1_u8(corner_result); 216 217 return ret; 218 } 219 220 STATIC_INLINE uint64_t 221 coord_co(cube_t c) 222 { 223 uint64_t i, p, ret; 224 225 // Temp array to store the NEON vector 226 uint8_t mem[8]; 227 vst1_u8(mem, c.corner); 228 229 for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3) 230 ret += p * (mem[i] >> COSHIFT); 231 232 return ret; 233 } 234 235 STATIC_INLINE cube_t 236 invcoord_co(uint64_t coord) 237 { 238 uint64_t co, c, i, p; 239 uint8_t mem[8]; 240 cube_t cube; 241 242 for (i = 0, p = 0, c = coord; i < 8; i++, c /= 3) { 243 co = i == 7 ? ((3 - (p % 3)) % 3) : (c % 3); 244 p += co; 245 mem[i] = i + (co << COSHIFT); 246 } 247 248 cube.corner = vld1_u8(mem); 249 cube.edge = SOLVED_CUBE.edge; 250 251 return cube; 252 } 253 254 STATIC_INLINE uint64_t 255 coord_csep(cube_t c) 256 { 257 uint64_t ret, i, p; 258 259 // Temp array to store the NEON vector 260 uint8_t mem[8]; 261 vst1_u8(mem, c.corner); 262 263 for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2) 264 ret += p * ((mem[i] & CSEPBIT) >> 2); 265 266 return ret; 267 return 0; 268 } 269 270 STATIC_INLINE uint64_t 271 coord_cocsep(cube_t c) 272 { 273 return (coord_co(c) << UINT64_C(7)) + coord_csep(c); 274 } 275 276 STATIC_INLINE uint64_t 277 coord_eo(cube_t c) 278 { 279 uint64_t ret, p; 280 int i; 281 282 // Temp array to store the NEON vector 283 uint8_t mem[16]; 284 vst1q_u8(mem, c.edge); 285 286 for (i = 1, ret = 0, p = 1; i < 12; i++, p *= 2) 287 { 288 ret += p * (mem[i] >> EOSHIFT); 289 } 290 291 return ret; 292 } 293 294 STATIC_INLINE uint64_t 295 coord_esep(cube_t c) 296 { 297 uint64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1; 298 299 // Temp array to store the NEON vector 300 uint8_t mem[16]; 301 vst1q_u8(mem, c.edge); 302 303 for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++) 304 { 305 bit1 = (mem[i] & ESEPBIT_1) >> 2; 306 bit2 = (mem[i] & ESEPBIT_2) >> 3; 307 is1 = (1 - bit2) * bit1; 308 309 ret1 += bit2 * binomial[11 - i][k]; 310 k -= bit2; 311 312 jj = j < 8; 313 ret2 += jj * is1 * binomial[7 - (j * jj)][l]; 314 l -= is1; 315 j += (1 - bit2); 316 } 317 318 return ret1 * 70 + ret2; 319 } 320 321 STATIC_INLINE void 322 copy_corners(cube_t dst[static 1], cube_t src) 323 { 324 dst->corner = src.corner; 325 } 326 327 STATIC_INLINE void 328 copy_edges(cube_t dst[static 1], cube_t src) 329 { 330 dst->edge = src.edge; 331 } 332 333 STATIC_INLINE void 334 set_eo(cube_t cube[static 1], uint64_t eo) 335 { 336 // Temp array to store the NEON vector 337 uint8_t mem[16]; 338 vst1q_u8(mem, cube->edge); 339 uint8_t i, sum, flip; 340 341 for (sum = 0, i = 1; i < 12; i++, eo >>= 1) 342 { 343 flip = eo % 2; 344 sum += flip; 345 mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip); 346 } 347 mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2)); 348 349 // Copy the results back to the NEON vector 350 cube->edge = vld1q_u8(mem); 351 return; 352 } 353 354 STATIC_INLINE cube_t 355 invcoord_esep(uint64_t esep) 356 { 357 cube_t ret; 358 uint8_t mem[16] = {0}; 359 360 invcoord_esep_array(esep % UINT64_C(70), esep / UINT64_C(70), mem); 361 362 ret = SOLVED_CUBE; 363 ret.edge = vld1q_u8(mem); 364 365 return ret; 366 } 367 368 STATIC_INLINE uint64_t 369 permtoindex_Nx8(uint64_t n, uint8x8_t a) 370 { 371 uint64_t i, c, ret; 372 uint8x8_t cmp; 373 uint64x1_t anum; 374 uint8_t or[8] = {0, 0, 0, 0, 0, 0, 0, 0x0F}; 375 376 for (i = 0, ret = 0; i < n; i++) { 377 cmp = vdup_lane_u8(a, 0); 378 anum = vreinterpret_u64_u8(a); 379 anum = vshr_n_u64(anum, 8); 380 a = vreinterpret_u8_u64(anum); 381 a = vorr_u8(a, vld1_u8(or)); 382 cmp = vcgt_u8(cmp, a); 383 c = vaddv_u8(vshr_n_u8(cmp, 7)); 384 ret += c * factorial[n-1-i]; 385 } 386 387 return ret; 388 } 389 390 STATIC_INLINE uint8x8_t 391 indextoperm_8x8(uint64_t p) 392 { 393 int used; 394 uint64_t c, k, i, j; 395 uint8_t ret[8]; 396 397 for (i = 0, used = 0; i < 8; i++) { 398 k = p / factorial[7-i]; 399 400 /* Find k-th unused number */ 401 for (j = 0, c = 0; c <= k; j++) 402 c += 1 - ((used & (1 << j)) >> j); 403 404 ret[i] = j-1; 405 used |= 1 << (j-1); 406 p %= factorial[7-i]; 407 } 408 409 return vld1_u8(ret); 410 } 411 412 STATIC_INLINE uint8x8_t 413 indextoperm_4x8(uint64_t p) 414 { 415 static const int64_t A[FACT_4] = { 416 [0] = UINT64_C(0x03020100), 417 [1] = UINT64_C(0x02030100), 418 [2] = UINT64_C(0x03010200), 419 [3] = UINT64_C(0x01030200), 420 [4] = UINT64_C(0x02010300), 421 [5] = UINT64_C(0x01020300), 422 [6] = UINT64_C(0x03020001), 423 [7] = UINT64_C(0x02030001), 424 [8] = UINT64_C(0x03000201), 425 [9] = UINT64_C(0x00030201), 426 [10] = UINT64_C(0x02000301), 427 [11] = UINT64_C(0x00020301), 428 [12] = UINT64_C(0x03010002), 429 [13] = UINT64_C(0x01030002), 430 [14] = UINT64_C(0x03000102), 431 [15] = UINT64_C(0x00030102), 432 [16] = UINT64_C(0x01000302), 433 [17] = UINT64_C(0x00010302), 434 [18] = UINT64_C(0x02010003), 435 [19] = UINT64_C(0x01020003), 436 [20] = UINT64_C(0x02000103), 437 [21] = UINT64_C(0x00020103), 438 [22] = UINT64_C(0x01000203), 439 [23] = UINT64_C(0x00010203), 440 }; 441 442 return vreinterpret_u8_u64(vdup_n_u64(A[p])); 443 } 444 445 STATIC_INLINE uint64_t 446 coord_cp(cube_t cube) 447 { 448 return permtoindex_Nx8(8, vand_u8(cube.corner, PBITS8_NEON)); 449 } 450 451 STATIC_INLINE cube_t 452 invcoord_cp(uint64_t i) 453 { 454 return (cube_t) { 455 .corner = indextoperm_8x8(i), 456 .edge = vcombine_u8(vld1_u8(SOLVED_L), vld1_u8(SOLVED_H)) 457 }; 458 } 459 460 STATIC_INLINE uint64_t 461 coord_epud(cube_t cube) 462 { 463 uint8x8_t a; 464 465 a = vget_low_u8(cube.edge); 466 a = vand_u8(a, PBITS8_NEON); 467 468 return permtoindex_Nx8(8, a); 469 } 470 471 STATIC_INLINE cube_t 472 invcoord_epud(uint64_t i) 473 { 474 return (cube_t) { 475 .corner = vld1_u8(SOLVED_L), 476 .edge = vcombine_u8(indextoperm_8x8(i), vld1_u8(SOLVED_H)) 477 }; 478 } 479 480 STATIC_INLINE uint64_t 481 coord_epe(cube_t cube) 482 { 483 uint8x8_t a; 484 485 a = vget_high_u8(cube.edge); 486 a = vand_u8(a, PBITS8_NEON); 487 a = veor_u8(a, vdup_n_u8(8)); 488 489 return permtoindex_Nx8(4, a); 490 } 491 492 STATIC_INLINE cube_t 493 invcoord_epe(uint64_t i) 494 { 495 uint8x8_t a; 496 497 a = indextoperm_4x8(i); 498 a = vadd_u8(a, vreinterpret_u8_u64(vdup_n_u64(UINT64_C(0x08080808)))); 499 500 return (cube_t) { 501 .corner = vld1_u8(SOLVED_L), 502 .edge = vcombine_u8(vld1_u8(SOLVED_L), a) 503 }; 504 } 505 506 STATIC_INLINE bool 507 is_eo_even(cube_t cube) 508 { 509 int8_t count; 510 uint8x16_t e; 511 512 e = vandq_u8(cube.edge, EO_NEON); 513 e = vshrq_n_u8(e, EOSHIFT); 514 count = vaddvq_u8(e); 515 516 return count % 2 == 0; 517 } 518 519 STATIC_INLINE uint64_t 520 coord_epudsep(cube_t cube) 521 { 522 uint8_t e[8]; 523 524 vst1_u8(e, vget_low_u8(cube.edge)); 525 return coord_epudsep_array(e); 526 } 527 528 STATIC_INLINE cube_t 529 invcoord_epudsep(uint64_t i) 530 { 531 uint8_t e[8]; 532 533 invcoord_epudsep_array(i, e); 534 return (cube_t) { 535 .corner = vld1_u8(SOLVED_L), 536 .edge = vcombine_u8(vld1_u8(e), vld1_u8(SOLVED_H)) 537 }; 538 }