avx2.h (6904B)
/*
 * Cube representation: one byte per piece. Corners live in bytes 0-7
 * (permutation in bits 0-2, orientation in bits 5-6) and edges in bytes
 * 16-27 (permutation in bits 0-3, orientation in bit 4). The masks below
 * select the corresponding fields.
 */
#define CO2_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x6060606060606060))
#define COCW_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x2020202020202020))
#define CP_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x0707070707070707))
#define EP_AVX2 _mm256_set_epi64x(\
        INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0)
#define EO_AVX2 _mm256_set_epi64x(\
        INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0)
#define ORIENT_AVX2 _mm256_set_epi64x(INT64_C(0x10101010), \
        INT64_C(0x1010101010101010), 0, INT64_C(0x6060606060606060))
#define USED_AVX2 _mm256_set_epi64x(INT64_C(0x00000000FFFFFFFF), \
        INT64_C(0xFFFFFFFFFFFFFFFF), 0, INT64_C(0xFFFFFFFFFFFFFFFF))
#define CARRY_AVX2 _mm256_set_epi64x(INT64_C(0x20202020), \
        INT64_C(0x2020202020202020), 0, INT64_C(0x6060606060606060))

#define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
        e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
        _mm256_set_epi8(0, 0, 0, 0, e_br, e_bl, e_fl, e_fr, \
                e_dr, e_dl, e_ul, e_ur, e_df, e_db, e_ub, e_uf, \
                0, 0, 0, 0, 0, 0, 0, 0, \
                c_dbl, c_dfr, c_ubr, c_ufl, c_dbr, c_dfl, c_ubl, c_ufr)
#define ZERO_CUBE _mm256_set_epi64x(0, 0, 0, 0)
#define SOLVED_CUBE STATIC_CUBE( \
        0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)

STATIC_INLINE int
popcount_u32(uint32_t x)
{
        return _mm_popcnt_u32(x);
}

/* Extract the 8 corner bytes and the 12 edge bytes of the cube. */
STATIC void
pieces(cube_t cube[static 1], uint8_t c[static 8], uint8_t e[static 12])
{
        uint8_t aux[32];

        _mm256_storeu_si256((__m256i_u *)aux, *cube);
        memcpy(c, aux, 8);
        memcpy(e, aux+16, 12);
}

STATIC_INLINE bool
equal(cube_t c1, cube_t c2)
{
        int32_t mask;
        __m256i cmp;

        cmp = _mm256_cmpeq_epi8(c1, c2);
        mask = _mm256_movemask_epi8(cmp);

        return mask == ~0;
}

/* Invert all corner orientations by swapping the two CO bits (1 <-> 2). */
STATIC_INLINE cube_t
invertco(cube_t c)
{
        cube_t co, shleft, shright, summed, newco, cleanco, ret;

        co = _mm256_and_si256(c, CO2_AVX2);
        shleft = _mm256_slli_epi32(co, 1);
        shright = _mm256_srli_epi32(co, 1);
        summed = _mm256_or_si256(shleft, shright);
        newco = _mm256_and_si256(summed, CO2_AVX2);
        cleanco = _mm256_xor_si256(c, co);
        ret = _mm256_or_si256(cleanco, newco);

        return ret;
}

/* Edges and corners live in disjoint bytes, so the generic compose works. */
STATIC_INLINE cube_t
compose_edges(cube_t c1, cube_t c2)
{
        return compose(c1, c2);
}

STATIC_INLINE cube_t
compose_corners(cube_t c1, cube_t c2)
{
        return compose(c1, c2);
}

STATIC_INLINE cube_t
compose(cube_t c1, cube_t c2)
{
        /*
         * Method taken from Andrew Skalski's vcube (thanks to Arhan Chaudhary
         * for pointing this out)
         */
        cube_t ss, so, su;

        /* Permute */
        ss = _mm256_shuffle_epi8(c1, c2);

        /* Orient: add the orientations, then reduce modulo 3 (corners) or 2 (edges) */
        so = _mm256_and_si256(c2, ORIENT_AVX2);
        ss = _mm256_add_epi8(ss, so);
        su = _mm256_sub_epi8(ss, CARRY_AVX2);
        ss = _mm256_min_epu8(ss, su);

        return _mm256_and_si256(ss, USED_AVX2);
}
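
/*
 * Illustrative sketch, not part of the original header: the scalar
 * equivalent of the orientation step in compose() for one corner byte.
 * A corner's orientation lives in bits 5-6 (0x00, 0x20 or 0x40). Adding
 * two orientations can reach 0x60 or 0x80; taking the unsigned minimum of
 * the sum and the sum minus 0x60 reduces it modulo 3 without a branch,
 * which is what _mm256_min_epu8 with CARRY_AVX2 does for all pieces at
 * once (edge bytes use 0x20 and reduce modulo 2). The helper name is made
 * up for illustration.
 */
STATIC_INLINE uint8_t
compose_corner_byte_sketch(uint8_t permuted, uint8_t reference)
{
        uint8_t sum, wrapped;

        sum = (uint8_t)(permuted + (reference & 0x60));
        wrapped = (uint8_t)(sum - 0x60);

        /* Unsigned minimum, like _mm256_min_epu8 on a single byte */
        return sum < wrapped ? sum : wrapped;
}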
STATIC_INLINE cube_t
inverse(cube_t c)
{
        /*
         * Method taken from Andrew Skalski's vcube [1]. The addition sequence
         * was generated using [2].
         * [1] https://github.com/Voltara/vcube
         * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
         */
        cube_t v3, vi, vo, vp, ret;

        v3 = _mm256_shuffle_epi8(c, c);
        v3 = _mm256_shuffle_epi8(v3, c);
        vi = _mm256_shuffle_epi8(v3, v3);
        vi = _mm256_shuffle_epi8(vi, vi);
        vi = _mm256_shuffle_epi8(vi, vi);
        vi = _mm256_shuffle_epi8(vi, v3);
        vi = _mm256_shuffle_epi8(vi, vi);
        vi = _mm256_shuffle_epi8(vi, vi);
        vi = _mm256_shuffle_epi8(vi, vi);
        vi = _mm256_shuffle_epi8(vi, vi);
        vi = _mm256_shuffle_epi8(vi, c);
        vi = _mm256_shuffle_epi8(vi, vi);
        vi = _mm256_shuffle_epi8(vi, vi);
        vi = _mm256_shuffle_epi8(vi, vi);
        vi = _mm256_shuffle_epi8(vi, vi);
        vi = _mm256_shuffle_epi8(vi, vi);
        vi = _mm256_shuffle_epi8(vi, v3);
        vi = _mm256_shuffle_epi8(vi, vi);
        vi = _mm256_shuffle_epi8(vi, c);

        vo = _mm256_and_si256(c, ORIENT_AVX2);
        vo = _mm256_shuffle_epi8(vo, vi);
        vp = _mm256_andnot_si256(ORIENT_AVX2, vi);
        ret = _mm256_or_si256(vp, vo);
        ret = _mm256_and_si256(ret, USED_AVX2);

        return invertco(ret);
}

/*
 * Corner orientation coordinate: the orientations of the first 7 corners
 * read as a base-3 number (the 8th orientation is determined by the rest).
 */
STATIC_INLINE int64_t
coord_co(cube_t c)
{
        cube_t co;
        int64_t mem[4], ret, i, p;

        co = _mm256_and_si256(c, CO2_AVX2);
        _mm256_storeu_si256((__m256i *)mem, co);

        mem[0] >>= 5;
        for (i = 0, ret = 0, p = 1; i < 7; i++, mem[0] >>= 8, p *= 3)
                ret += (mem[0] & 3) * p;

        return ret;
}

/* Inverse of coord_co: a cube with the given CO and solved permutation. */
STATIC_INLINE cube_t
invcoord_co(int64_t coord)
{
        int64_t i, c, p, co, mem[4] = {0};
        cube_t cube, cc;

        for (i = 0, p = 0, c = coord; i < 8; i++, c /= 3) {
                co = i == 7 ? ((3 - (p % 3)) % 3) : (c % 3);
                p += co;
                mem[0] |= (int64_t)(i + (co << COSHIFT)) << (int64_t)(8 * i);
        }

        cc = _mm256_loadu_si256((const __m256i *)mem);
        cube = SOLVED_CUBE;
        copy_corners(&cube, cc);

        return cube;
}

/*
 * Corner separation coordinate: one bit per corner slot 0-6, set when the
 * slot contains a corner with index 4-7.
 */
STATIC_INLINE int64_t
coord_csep(cube_t c)
{
        cube_t cp, shifted;
        int64_t mask;

        cp = _mm256_and_si256(c, CP_AVX2);
        shifted = _mm256_slli_epi32(cp, 5);
        mask = _mm256_movemask_epi8(shifted);

        return mask & 0x7F;
}

STATIC_INLINE int64_t
coord_cocsep(cube_t c)
{
        return (coord_co(c) << 7) + coord_csep(c);
}

/*
 * Edge orientation coordinate: the EO bits of edges 1-11 packed into an
 * 11-bit integer (the orientation of edge 0 follows by parity).
 */
STATIC_INLINE int64_t
coord_eo(cube_t c)
{
        cube_t eo, shifted;
        int64_t mask;

        eo = _mm256_and_si256(c, EO_AVX2);
        shifted = _mm256_slli_epi32(eo, 3);
        mask = _mm256_movemask_epi8(shifted);

        return mask >> 17;
}

/*
 * Edge separation coordinate: a combinatorial encoding of which slots hold
 * the last 4 edges (FR FL BL BR) and, among the remaining 8 slots, which
 * hold edges UR UL DL DR.
 */
STATIC_INLINE int64_t
coord_esep(cube_t c)
{
        cube_t ep;
        int64_t e, mem[4], i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;

        ep = _mm256_and_si256(c, EP_AVX2);
        _mm256_storeu_si256((__m256i *)mem, ep);

        mem[3] <<= 8;
        ret1 = ret2 = 0;
        k = l = 4;
        for (i = 0, j = 0; i < 12; i++, mem[i/8 + 2] >>= 8) {
                e = mem[i/8 + 2];

                bit1 = (e & ESEPBIT_1) >> 2;
                bit2 = (e & ESEPBIT_2) >> 3;
                is1 = (1 - bit2) * bit1;

                ret1 += bit2 * binomial[11-i][k];
                k -= bit2;

                jj = j < 8;
                ret2 += jj * is1 * binomial[7-(j*jj)][l];
                l -= is1;
                j += (1-bit2);
        }

        return ret1 * 70 + ret2;
}

/* Inverse of coord_esep: decode into the edge half of a solved cube. */
STATIC_INLINE cube_t
invcoord_esep(int64_t esep)
{
        cube_t eee, ret;
        uint8_t mem[32] = {0};

        invcoord_esep_array(esep % 70, esep / 70, mem+16);

        ret = SOLVED_CUBE;
        eee = _mm256_loadu_si256((__m256i_u *)&mem);
        copy_edges(&ret, eee);

        return ret;
}
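
/*
 * Illustrative sketch, not part of the original header: a scalar reference
 * for coord_co. The coordinate is the base-3 number whose digits are the
 * orientations of the first 7 corners; the 8th is omitted because the total
 * twist of a legal cube is divisible by 3, which is also how invcoord_co
 * reconstructs it from the running sum p. The helper name is made up.
 */
STATIC_INLINE int64_t
coord_co_sketch(cube_t cube)
{
        uint8_t c[8], e[12];
        int64_t i, p, ret;

        pieces(&cube, c, e);
        for (i = 0, ret = 0, p = 1; i < 7; i++, p *= 3)
                ret += (int64_t)((c[i] >> 5) & 3) * p;

        return ret;
}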
/* Replace the corner half (low 128 bits) of dest with that of src. */
STATIC_INLINE void
copy_corners(cube_t dest[static 1], cube_t src)
{
        *dest = _mm256_blend_epi32(*dest, src, 0x0F);
}

/* Replace the edge half (high 128 bits) of dest with that of src. */
STATIC_INLINE void
copy_edges(cube_t dest[static 1], cube_t src)
{
        *dest = _mm256_blend_epi32(*dest, src, 0xF0);
}

/*
 * Overwrite the cube's edge orientations with the 11-bit eo coordinate;
 * the orientation of the remaining edge is fixed by parity.
 */
STATIC_INLINE void
set_eo(cube_t cube[static 1], int64_t eo)
{
        int64_t eo12, eotop, eobot;
        __m256i veo;

        eo12 = (eo << 1) + (_mm_popcnt_u64(eo) % 2);
        eotop = (eo12 & (1 << 11)) << 17 |
                (eo12 & (1 << 10)) << 10 |
                (eo12 & (1 << 9)) << 3 |
                (eo12 & (1 << 8)) >> 4;
        eobot = (eo12 & (1 << 7)) << 53 |
                (eo12 & (1 << 6)) << 46 |
                (eo12 & (1 << 5)) << 39 |
                (eo12 & (1 << 4)) << 32 |
                (eo12 & (1 << 3)) << 25 |
                (eo12 & (1 << 2)) << 18 |
                (eo12 & (1 << 1)) << 11 |
                (eo12 & 1) << 4;
        veo = _mm256_set_epi64x(eotop, eobot, 0, 0);

        *cube = _mm256_andnot_si256(EO_AVX2, *cube);
        *cube = _mm256_or_si256(*cube, veo);
}
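
/*
 * Illustrative usage sketch, not part of the original header: writing an
 * 11-bit EO coordinate to a cube and reading it back is the identity,
 * since set_eo derives the 12th orientation from parity and coord_eo drops
 * it again. The function name is made up; it assumes 0 <= eo < 2^11.
 */
STATIC_INLINE bool
eo_roundtrip_sketch(int64_t eo)
{
        cube_t cube;

        cube = SOLVED_CUBE;
        set_eo(&cube, eo);

        return coord_eo(cube) == eo;
}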