avx2.h (10725B)
#define CO2_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x6060606060606060))
#define COCW_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x2020202020202020))
#define CP_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x0707070707070707))
#define EP_AVX2 _mm256_set_epi64x(\
	INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0)
#define EO_AVX2 _mm256_set_epi64x(\
	INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0)
#define ORIENT_AVX2 _mm256_set_epi64x(INT64_C(0x10101010), \
	INT64_C(0x1010101010101010), 0, INT64_C(0x6060606060606060))
#define USED_AVX2 _mm256_set_epi64x(INT64_C(0x00000000FFFFFFFF), \
	INT64_C(0xFFFFFFFFFFFFFFFF), 0, INT64_C(0xFFFFFFFFFFFFFFFF))
#define CARRY_AVX2 _mm256_set_epi64x(INT64_C(0x20202020), \
	INT64_C(0x2020202020202020), 0, INT64_C(0x6060606060606060))

#define SOLVED_L INT64_C(0x0706050403020100)
#define SOLVED_H INT64_C(0x0B0A0908)

#define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
	e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
	_mm256_set_epi8(0, 0, 0, 0, e_br, e_bl, e_fl, e_fr, \
		e_dr, e_dl, e_ul, e_ur, e_df, e_db, e_ub, e_uf, \
		0, 0, 0, 0, 0, 0, 0, 0, \
		c_dbl, c_dfr, c_ubr, c_ufl, c_dbr, c_dfl, c_ubl, c_ufr)
#define ZERO_CUBE _mm256_set_epi64x(0, 0, 0, 0)
#define SOLVED_CUBE _mm256_set_epi64x(SOLVED_H, SOLVED_L, 0, SOLVED_L)

STATIC_INLINE uint64_t permtoindex_Nx8(uint64_t, int64_t);
STATIC_INLINE int64_t indextoperm_8x8(uint64_t);
STATIC_INLINE int64_t indextoperm_4x8(uint64_t);

STATIC_INLINE int
popcount_u32(uint32_t x)
{
	return _mm_popcnt_u32(x);
}

STATIC void
pieces(cube_t cube[static 1], uint8_t c[static 8], uint8_t e[static 12])
{
	uint8_t aux[32];

	_mm256_storeu_si256((__m256i *)aux, *cube);
	memcpy(c, aux, 8);
	memcpy(e, aux+16, 12);
}

STATIC_INLINE bool
equal(cube_t c1, cube_t c2)
{
	int32_t mask;
	__m256i cmp;

	cmp = _mm256_cmpeq_epi8(c1, c2);
	mask = _mm256_movemask_epi8(cmp);

	return mask == ~0;
}

STATIC_INLINE cube_t
invertco(cube_t c)
{
	cube_t co, shleft, shright, summed, newco, cleanco, ret;

	co = _mm256_and_si256(c, CO2_AVX2);
	shleft = _mm256_slli_epi32(co, 1);
	shright = _mm256_srli_epi32(co, 1);
	summed = _mm256_or_si256(shleft, shright);
	newco = _mm256_and_si256(summed, CO2_AVX2);
	cleanco = _mm256_xor_si256(c, co);
	ret = _mm256_or_si256(cleanco, newco);

	return ret;
}

STATIC_INLINE cube_t
compose_edges(cube_t c1, cube_t c2)
{
	return compose(c1, c2);
}

STATIC_INLINE cube_t
compose_corners(cube_t c1, cube_t c2)
{
	return compose(c1, c2);
}

STATIC_INLINE cube_t
compose(cube_t c1, cube_t c2)
{
	/*
	 * Method taken from Andrew Skalski's vcube (thanks to Arhan Chaudhary
	 * for pointing this out)
	 */
	cube_t ss, so, su;

	/* Permute */
	ss = _mm256_shuffle_epi8(c1, c2);

	/* Orient */
	so = _mm256_and_si256(c2, ORIENT_AVX2);
	ss = _mm256_add_epi8(ss, so);
	su = _mm256_sub_epi8(ss, CARRY_AVX2);
	ss = _mm256_min_epu8(ss, su);

	return _mm256_and_si256(ss, USED_AVX2);
}
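
/*
 * Illustrative sketch of the orientation trick used in compose() above:
 * the "Orient" step is a byte-wise modular addition done without division.
 * Corner orientations sit in bits 5-6 of each byte (0x00, 0x20 or 0x40,
 * carry constant 0x60), edge orientations in bit 4 (0x00 or 0x10, carry
 * constant 0x20). The unsigned minimum of the sum and of the sum minus the
 * carry keeps the sum when no reduction is needed (the subtraction wraps
 * around to a large value) and the reduced value otherwise. The helper
 * below is hypothetical and only shows the same idea on a single byte.
 */
STATIC_INLINE uint8_t
compose_orient_byte_sketch(uint8_t shuffled, uint8_t orient, uint8_t carry)
{
	uint8_t sum, reduced;

	sum = (uint8_t)(shuffled + orient);
	reduced = (uint8_t)(sum - carry);

	/* Same selection as _mm256_min_epu8() in compose() */
	return sum < reduced ? sum : reduced;
}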

STATIC_INLINE cube_t
inverse(cube_t c)
{
	/* Method taken from Andrew Skalski's vcube[1]. The addition sequence
	 * was generated using [2].
	 * [1] https://github.com/Voltara/vcube
	 * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
	 */
	cube_t v3, vi, vo, vp, ret;

	v3 = _mm256_shuffle_epi8(c, c);
	v3 = _mm256_shuffle_epi8(v3, c);
	vi = _mm256_shuffle_epi8(v3, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, c);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, c);

	vo = _mm256_and_si256(c, ORIENT_AVX2);
	vo = _mm256_shuffle_epi8(vo, vi);
	vp = _mm256_andnot_si256(ORIENT_AVX2, vi);
	ret = _mm256_or_si256(vp, vo);
	ret = _mm256_and_si256(ret, USED_AVX2);

	return invertco(ret);
}
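
/*
 * Illustrative sketch: within each 128-bit lane, _mm256_shuffle_epi8(a, b)
 * yields a[b[i] & 0x0F] for every byte i (bit 7 of the control bytes is
 * never set here), which is exactly composition of the permutations stored
 * in the low nibbles. Read as exponentiation, the 19 shuffles above compute
 * c^27719; since the order of any cube permutation divides
 * 27720 = lcm(1, ..., 12), that power is the inverse permutation. The
 * orientation bits are then fixed by the vo/vp masking and by invertco().
 * The hypothetical helper below is the scalar equivalent of one shuffle
 * step on an 8-byte permutation.
 */
STATIC_INLINE void
perm_compose_8_sketch(uint8_t dst[static 8],
	const uint8_t a[static 8], const uint8_t b[static 8])
{
	int i;

	/* dst = a composed with b, i.e. dst[i] = a[b[i]] */
	for (i = 0; i < 8; i++)
		dst[i] = a[b[i]];
}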

STATIC_INLINE uint64_t
coord_co(cube_t c)
{
	cube_t co;
	uint64_t mem[4], ret, i, p;

	co = _mm256_and_si256(c, CO2_AVX2);
	_mm256_storeu_si256((__m256i *)mem, co);

	mem[0] >>= 5;
	for (i = 0, ret = 0, p = 1; i < 7; i++, mem[0] >>= 8, p *= 3)
		ret += (mem[0] & 3) * p;

	return ret;
}

STATIC_INLINE void
copy_co(cube_t cube[static 1], cube_t co)
{
	cube_t coclean;

	coclean = _mm256_and_si256(co, CO2_AVX2);
	*cube = _mm256_andnot_si256(CO2_AVX2, *cube);
	*cube = _mm256_or_si256(*cube, coclean);
}

STATIC_INLINE cube_t
invcoord_co(uint64_t coord)
{
	uint64_t i, c, p, co, mem[4] = {0};
	cube_t cube, cc;

	for (i = 0, p = 0, c = coord; i < 8; i++, c /= 3) {
		co = i == 7 ? ((3 - (p % 3)) % 3) : (c % 3);
		p += co;
		mem[0] |= (uint64_t)(i + (co << COSHIFT)) << (uint64_t)(8 * i);
	}

	cc = _mm256_loadu_si256((__m256i *)mem);
	cube = SOLVED_CUBE;
	copy_co(&cube, cc);

	return cube;
}

STATIC_INLINE uint64_t
coord_csep(cube_t c)
{
	cube_t cp, shifted;
	int mask;

	cp = _mm256_and_si256(c, CP_AVX2);
	shifted = _mm256_slli_epi32(cp, 5);
	mask = _mm256_movemask_epi8(shifted);

	return (uint64_t)(mask & 0x7F);
}

STATIC_INLINE uint64_t
coord_cocsep(cube_t c)
{
	return (coord_co(c) << UINT8_C(7)) + coord_csep(c);
}

STATIC_INLINE uint64_t
coord_eo(cube_t c)
{
	cube_t eo, shifted;
	int mask;

	eo = _mm256_and_si256(c, EO_AVX2);
	shifted = _mm256_slli_epi32(eo, 3);
	mask = _mm256_movemask_epi8(shifted);

	return (uint64_t)(mask >> 17);
}

STATIC_INLINE uint64_t
coord_esep(cube_t c)
{
	cube_t ep;
	uint64_t e, mem[4], i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;

	ep = _mm256_and_si256(c, EP_AVX2);
	_mm256_storeu_si256((__m256i *)mem, ep);

	mem[3] <<= 8;
	ret1 = ret2 = 0;
	k = l = 4;
	for (i = 0, j = 0; i < 12; i++, mem[i/8 + 2] >>= 8) {
		e = mem[i/8 + 2];

		bit1 = (e & ESEPBIT_1) >> 2;
		bit2 = (e & ESEPBIT_2) >> 3;
		is1 = (1 - bit2) * bit1;

		ret1 += bit2 * binomial[11-i][k];
		k -= bit2;

		jj = j < 8;
		ret2 += jj * is1 * binomial[7-(j*jj)][l];
		l -= is1;
		j += (1-bit2);
	}

	return ret1 * 70 + ret2;
}

STATIC_INLINE cube_t
invcoord_esep(uint64_t esep)
{
	cube_t eee, ret;
	uint8_t mem[32] = {0};

	invcoord_esep_array(esep % UINT64_C(70), esep / UINT64_C(70), mem+16);

	ret = SOLVED_CUBE;
	eee = _mm256_loadu_si256((__m256i *)mem);
	copy_edges(&ret, eee);

	return ret;
}

STATIC_INLINE void
copy_corners(cube_t dest[static 1], cube_t src)
{
	*dest = _mm256_blend_epi32(*dest, src, 0x0F);
}

STATIC_INLINE void
copy_edges(cube_t dest[static 1], cube_t src)
{
	*dest = _mm256_blend_epi32(*dest, src, 0xF0);
}

STATIC_INLINE void
set_eo(cube_t cube[static 1], uint64_t eo)
{
	uint64_t eo12, eotop, eobot;
	__m256i veo;

	eo12 = (eo << 1) + (_mm_popcnt_u64(eo) % 2);
	eotop = (eo12 & (1 << 11)) << 17 |
		(eo12 & (1 << 10)) << 10 |
		(eo12 & (1 << 9)) << 3 |
		(eo12 & (1 << 8)) >> 4;
	eobot = (eo12 & (1 << 7)) << 53 |
		(eo12 & (1 << 6)) << 46 |
		(eo12 & (1 << 5)) << 39 |
		(eo12 & (1 << 4)) << 32 |
		(eo12 & (1 << 3)) << 25 |
		(eo12 & (1 << 2)) << 18 |
		(eo12 & (1 << 1)) << 11 |
		(eo12 & 1) << 4;
	veo = _mm256_set_epi64x(eotop, eobot, 0, 0);

	*cube = _mm256_andnot_si256(EO_AVX2, *cube);
	*cube = _mm256_or_si256(*cube, veo);
}

STATIC_INLINE uint64_t
permtoindex_Nx8(uint64_t n, int64_t a)
{
	uint64_t i, c, ret;
	__m64 cmp;

	for (i = 0, ret = 0; i < n; i++) {
		cmp = _mm_set1_pi8(a & INT64_C(0xFF));
		a = (a >> INT64_C(8)) | INT64_C(0x0F00000000000000);
		cmp = _mm_cmpgt_pi8(cmp, _mm_cvtsi64_m64(a));
		c = _mm_popcnt_u64(_mm_cvtm64_si64(cmp)) >> UINT64_C(3);
		ret += c * factorial[n-1-i];
	}

	return ret;
}
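
/*
 * Illustrative sketch: permtoindex_Nx8() above ranks a permutation packed
 * one element per byte in the factorial number system (Lehmer code). For
 * each position it counts, with a byte-wise compare and a popcount, how
 * many of the remaining elements are smaller than the current one, and
 * weights that count by the corresponding factorial. The hypothetical
 * scalar helper below computes the same index for n <= 8 elements, using
 * the same factorial[] table.
 */
STATIC_INLINE uint64_t
permtoindex_scalar_sketch(uint64_t n, const uint8_t p[static 8])
{
	uint64_t i, j, c, ret;

	for (i = 0, ret = 0; i < n; i++) {
		/* c = how many later elements are smaller than p[i] */
		for (j = i + 1, c = 0; j < n; j++)
			c += p[j] < p[i];
		ret += c * factorial[n-1-i];
	}

	return ret;
}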

STATIC_INLINE int64_t
indextoperm_8x8(uint64_t p)
{
	int used;
	uint64_t c, k, i, j, ret;

	for (i = 0, ret = 0, used = 0; i < 8; i++) {
		k = p / factorial[7-i];

		/* Find k-th unused number */
		for (j = 0, c = 0; c <= k; j++)
			c += 1 - ((used & (1 << j)) >> j);

		ret |= (j-1) << (8*i);
		used |= 1 << (j-1);
		p %= factorial[7-i];
	}

	return ret;
}

STATIC_INLINE int64_t
indextoperm_4x8(uint64_t p)
{
	static const int64_t A[FACT_4] = {
		[0] = INT64_C(0x03020100),
		[1] = INT64_C(0x02030100),
		[2] = INT64_C(0x03010200),
		[3] = INT64_C(0x01030200),
		[4] = INT64_C(0x02010300),
		[5] = INT64_C(0x01020300),
		[6] = INT64_C(0x03020001),
		[7] = INT64_C(0x02030001),
		[8] = INT64_C(0x03000201),
		[9] = INT64_C(0x00030201),
		[10] = INT64_C(0x02000301),
		[11] = INT64_C(0x00020301),
		[12] = INT64_C(0x03010002),
		[13] = INT64_C(0x01030002),
		[14] = INT64_C(0x03000102),
		[15] = INT64_C(0x00030102),
		[16] = INT64_C(0x01000302),
		[17] = INT64_C(0x00010302),
		[18] = INT64_C(0x02010003),
		[19] = INT64_C(0x01020003),
		[20] = INT64_C(0x02000103),
		[21] = INT64_C(0x00020103),
		[22] = INT64_C(0x01000203),
		[23] = INT64_C(0x00010203),
	};

	return A[p];
}

STATIC_INLINE uint64_t
coord_cp(cube_t cube)
{
	cube_t cp;
	int64_t aux[4];

	cp = _mm256_and_si256(cube, CP_AVX2);
	_mm256_storeu_si256((__m256i *)aux, cp);

	return permtoindex_Nx8(8, aux[0]);
}

STATIC_INLINE cube_t
invcoord_cp(uint64_t i)
{
	return _mm256_set_epi64x(SOLVED_H, SOLVED_L, 0, indextoperm_8x8(i));
}

STATIC_INLINE uint64_t
coord_epud(cube_t cube)
{
	cube_t ep;
	int64_t aux[4];

	ep = _mm256_and_si256(cube, EP_AVX2);
	_mm256_storeu_si256((__m256i *)aux, ep);

	return permtoindex_Nx8(8, aux[2]);
}

STATIC_INLINE cube_t
invcoord_epud(uint64_t i)
{
	return _mm256_set_epi64x(SOLVED_H, indextoperm_8x8(i), 0, SOLVED_L);
}

STATIC_INLINE uint64_t
coord_epe(cube_t cube)
{
	cube_t ep;
	int64_t aux[4];

	ep = _mm256_and_si256(cube, EP_AVX2);
	ep = _mm256_xor_si256(ep, _mm256_set1_epi8(8));
	_mm256_storeu_si256((__m256i *)aux, ep);

	return permtoindex_Nx8(4, aux[3]);
}

STATIC_INLINE cube_t
invcoord_epe(uint64_t i)
{
	int64_t a;
	__m64 a64;

	a = indextoperm_4x8(i);
	a64 = _mm_add_pi8(_mm_cvtsi64_m64(a), _mm_set_pi32(0, 0x08080808));
	a = _mm_cvtm64_si64(a64);

	return _mm256_set_epi64x(a, SOLVED_L, 0, SOLVED_L);
}

STATIC_INLINE bool
is_eo_even(cube_t cube)
{
	uint32_t mask;
	__m256i e;

	e = _mm256_and_si256(cube, EO_AVX2);
	e = _mm256_slli_epi16(e, 7-EOSHIFT);
	mask = _mm256_movemask_epi8(e);

	return popcount_u32(mask) % 2 == 0;
}

STATIC_INLINE uint64_t
coord_epudsep(cube_t cube)
{
	uint8_t aux[32];

	_mm256_storeu_si256((__m256i *)aux, cube);
	return coord_epudsep_array(aux + 16);
}

STATIC_INLINE cube_t
invcoord_epudsep(uint64_t i)
{
	cube_t cube, elow;
	uint8_t e[32] = {0};

	invcoord_epudsep_array(i, e+16);
	elow = _mm256_loadu_si256((__m256i *)e);
	cube = _mm256_set_epi64x(SOLVED_H, 0, 0, SOLVED_L);

	return _mm256_or_si256(elow, cube);
}
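
/*
 * Illustrative sketch: the coord_*() and invcoord_*() routines above are
 * coordinate/inverse-coordinate pairs and are expected to round-trip, and
 * inverse() is expected to be the group inverse with respect to compose().
 * The hypothetical self-test below checks two such properties, assuming
 * COSHIFT is the corner-orientation bit offset matching CO2_AVX2 and that
 * there are 3^7 = 2187 corner-orientation coordinates.
 */
STATIC_INLINE bool
avx2_selftest_sketch(void)
{
	uint64_t i;
	cube_t c;

	/* Corner-orientation coordinate round trip */
	for (i = 0; i < 2187; i++)
		if (coord_co(invcoord_co(i)) != i)
			return false;

	/* Composing a cube with its inverse should give the solved cube */
	c = invcoord_co(1);
	return equal(compose(c, inverse(c)), SOLVED_CUBE);
}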