avx2.h (7038B)
#define CO2_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x6060606060606060))
#define COCW_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x2020202020202020))
#define CP_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x0707070707070707))
#define EP_AVX2 \
	_mm256_set_epi64x(INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0)
#define EO_AVX2 \
	_mm256_set_epi64x(INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0)

#define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
    e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
	_mm256_set_epi8(0, 0, 0, 0, e_br, e_bl, e_fl, e_fr, \
	    e_dr, e_dl, e_ul, e_ur, e_df, e_db, e_ub, e_uf, \
	    0, 0, 0, 0, 0, 0, 0, 0, \
	    c_dbl, c_dfr, c_ubr, c_ufl, c_dbr, c_dfl, c_ubl, c_ufr)
#define ZERO_CUBE _mm256_set_epi64x(0, 0, 0, 0)
#define SOLVED_CUBE STATIC_CUBE( \
	0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)

STATIC_INLINE int
popcount_u32(uint32_t x)
{
	return _mm_popcnt_u32(x);
}

STATIC void
pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
{
	uint8_t aux[32];

	_mm256_storeu_si256((__m256i_u *)aux, *cube);
	memcpy(c, aux, 8);
	memcpy(e, aux+16, 12);
}

STATIC_INLINE bool
equal(cube_t c1, cube_t c2)
{
	int32_t mask;
	__m256i cmp;

	cmp = _mm256_cmpeq_epi8(c1, c2);
	mask = _mm256_movemask_epi8(cmp);

	return mask == ~0;
}

STATIC_INLINE cube_t
invertco(cube_t c)
{
	cube_t co, shleft, shright, summed, newco, cleanco, ret;

	co = _mm256_and_si256(c, CO2_AVX2);
	shleft = _mm256_slli_epi32(co, 1);
	shright = _mm256_srli_epi32(co, 1);
	summed = _mm256_or_si256(shleft, shright);
	newco = _mm256_and_si256(summed, CO2_AVX2);
	cleanco = _mm256_xor_si256(c, co);
	ret = _mm256_or_si256(cleanco, newco);

	return ret;
}

STATIC_INLINE cube_t
compose_epcpeo(cube_t c1, cube_t c2)
{
	cube_t b, s, eo2;

	/* Permute and clean unused bits */
	s = _mm256_shuffle_epi8(c1, c2);
	b = _mm256_set_epi8(
	    ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	    ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
	);
	s = _mm256_andnot_si256(b, s);

	/* Change EO */
	eo2 = _mm256_and_si256(c2, EO_AVX2);
	s = _mm256_xor_si256(s, eo2);

	return s;
}

STATIC_INLINE cube_t
compose_edges(cube_t c1, cube_t c2)
{
	return compose_epcpeo(c1, c2);
}
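
/*
 * Illustrative sketch, not part of the original header: the scalar
 * equivalent of compose_epcpeo for a single edge, assuming the byte
 * layout set up by STATIC_CUBE above (edge index in the low 4 bits,
 * edge orientation in bit 4). The vector version performs this lookup
 * and orientation flip for all 12 edges at once with one shuffle, one
 * andnot and one xor. The function name and parameters are hypothetical.
 */
STATIC_INLINE uint8_t
compose_edge_scalar_sketch(const uint8_t edges1[static 12], uint8_t edge2)
{
	/* Take the edge that c1 has in the slot c2 points to,
	 * then apply c2's orientation flip by xor-ing bit 4. */
	return edges1[edge2 & 0x0F] ^ (edge2 & 0x10);
}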

STATIC_INLINE cube_t
compose_corners(cube_t c1, cube_t c2)
{
	/*
	 * We do a full compose. Minor optimizations are possible, like
	 * saving one instruction by not doing EO, but it should not
	 * be significant.
	 */
	return compose(c1, c2);
}

STATIC_INLINE cube_t
compose(cube_t c1, cube_t c2)
{
	cube_t s, co1, co2, aux, auy1, auy2, auz1, auz2;

	s = compose_epcpeo(c1, c2);

	/* Change CO: add the two orientations and reduce modulo 3 branchlessly */
	co1 = _mm256_and_si256(s, CO2_AVX2);
	co2 = _mm256_and_si256(c2, CO2_AVX2);
	aux = _mm256_add_epi8(co1, co2);
	auy1 = _mm256_add_epi8(aux, COCW_AVX2);
	auy2 = _mm256_srli_epi32(auy1, 2);
	auz1 = _mm256_add_epi8(aux, auy2);
	auz2 = _mm256_and_si256(auz1, CO2_AVX2);

	/* Put together */
	s = _mm256_andnot_si256(CO2_AVX2, s);
	s = _mm256_or_si256(s, auz2);

	return s;
}

STATIC_INLINE cube_t
cleanaftershuffle(cube_t c)
{
	__m256i b;

	b = _mm256_set_epi8(
	    ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	    ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
	);

	return _mm256_andnot_si256(b, c);
}

STATIC_INLINE cube_t
inverse(cube_t c)
{
	/* Method taken from Andrew Skalski's vcube[1]. The addition sequence
	 * was generated using [2].
	 * [1] https://github.com/Voltara/vcube
	 * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
	 */
	cube_t v3, vi, vo, vp, ret;

	v3 = _mm256_shuffle_epi8(c, c);
	v3 = _mm256_shuffle_epi8(v3, c);
	vi = _mm256_shuffle_epi8(v3, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, c);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, c);

	vo = _mm256_and_si256(c, _mm256_or_si256(EO_AVX2, CO2_AVX2));
	vo = _mm256_shuffle_epi8(vo, vi);
	vp = _mm256_andnot_si256(_mm256_or_si256(EO_AVX2, CO2_AVX2), vi);
	ret = _mm256_or_si256(vp, vo);
	ret = cleanaftershuffle(ret);

	return invertco(ret);
}

STATIC_INLINE int64_t
coord_co(cube_t c)
{
	cube_t co;
	int64_t mem[4], ret, i, p;

	co = _mm256_and_si256(c, CO2_AVX2);
	_mm256_storeu_si256((__m256i *)mem, co);

	mem[0] >>= 5;
	for (i = 0, ret = 0, p = 1; i < 7; i++, mem[0] >>= 8, p *= 3)
		ret += (mem[0] & 3) * p;

	return ret;
}

STATIC_INLINE int64_t
coord_csep(cube_t c)
{
	cube_t cp, shifted;
	int64_t mask;

	cp = _mm256_and_si256(c, CP_AVX2);
	shifted = _mm256_slli_epi32(cp, 5);
	mask = _mm256_movemask_epi8(shifted);

	return mask & 0x7F;
}

STATIC_INLINE int64_t
coord_cocsep(cube_t c)
{
	return (coord_co(c) << 7) + coord_csep(c);
}

STATIC_INLINE int64_t
coord_eo(cube_t c)
{
	cube_t eo, shifted;
	int64_t mask;

	eo = _mm256_and_si256(c, EO_AVX2);
	shifted = _mm256_slli_epi32(eo, 3);
	mask = _mm256_movemask_epi8(shifted);

	return mask >> 17;
}

STATIC_INLINE int64_t
coord_esep(cube_t c)
{
	cube_t ep;
	int64_t e, mem[4], i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;

	ep = _mm256_and_si256(c, EP_AVX2);
	_mm256_storeu_si256((__m256i *)mem, ep);

	mem[3] <<= 8;
	ret1 = ret2 = 0;
	k = l = 4;
	for (i = 0, j = 0; i < 12; i++, mem[i/8 + 2] >>= 8) {
		e = mem[i/8 + 2];

		bit1 = (e & ESEPBIT_1) >> 2;
		bit2 = (e & ESEPBIT_2) >> 3;
		is1 = (1 - bit2) * bit1;

		ret1 += bit2 * binomial[11-i][k];
		k -= bit2;

		jj = j < 8;
		ret2 += jj * is1 * binomial[7-(j*jj)][l];
		l -= is1;
		j += (1-bit2);
	}

	return ret1 * 70 + ret2;
}
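
/*
 * Illustrative sketch, not part of the original header: coord_co above
 * reads the orientation of the first seven corners as a base-3 number
 * (the eighth orientation is determined by the other seven on a valid
 * cube). Written against a plain byte array with CO stored in bits 5-6,
 * as extracted by pieces(), the same computation is the following.
 * The function name is hypothetical.
 */
STATIC_INLINE int64_t
coord_co_scalar_sketch(const uint8_t corner[static 8])
{
	int64_t i, p, ret;

	/* Corner 0 is the least significant base-3 digit. */
	for (i = 0, ret = 0, p = 1; i < 7; i++, p *= 3)
		ret += ((corner[i] >> 5) & 3) * p;

	return ret;
}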

STATIC_INLINE void
copy_corners(cube_t *dest, cube_t src)
{
	*dest = _mm256_blend_epi32(*dest, src, 0x0F);
}

STATIC_INLINE void
copy_edges(cube_t *dest, cube_t src)
{
	*dest = _mm256_blend_epi32(*dest, src, 0xF0);
}

STATIC_INLINE void
set_eo(cube_t *cube, int64_t eo)
{
	int64_t eo12, eotop, eobot;
	__m256i veo;

	eo12 = (eo << 1) + (_mm_popcnt_u64(eo) % 2);
	eotop = (eo12 & (1 << 11)) << 17 |
	    (eo12 & (1 << 10)) << 10 |
	    (eo12 & (1 << 9)) << 3 |
	    (eo12 & (1 << 8)) >> 4;
	eobot = (eo12 & (1 << 7)) << 53 |
	    (eo12 & (1 << 6)) << 46 |
	    (eo12 & (1 << 5)) << 39 |
	    (eo12 & (1 << 4)) << 32 |
	    (eo12 & (1 << 3)) << 25 |
	    (eo12 & (1 << 2)) << 18 |
	    (eo12 & (1 << 1)) << 11 |
	    (eo12 & 1) << 4;
	veo = _mm256_set_epi64x(eotop, eobot, 0, 0);

	*cube = _mm256_andnot_si256(EO_AVX2, *cube);
	*cube = _mm256_or_si256(*cube, veo);
}

STATIC_INLINE cube_t
invcoord_esep(int64_t esep)
{
	cube_t eee, ret;
	uint8_t mem[32] = {0};

	invcoord_esep_array(esep % 70, esep / 70, mem+16);

	ret = SOLVED_CUBE;
	eee = _mm256_loadu_si256((__m256i_u *)&mem);
	copy_edges(&ret, eee);

	return ret;
}
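
/*
 * Usage sketch, not part of the original header: set_eo and coord_eo
 * act on the same 11-bit coordinate (the orientation of edges 1-11;
 * edge 0 is reconstructed from the parity of the other eleven), so
 * writing a coordinate and reading it back is the identity. Assumes
 * 0 <= eo < 2048. The function name is hypothetical.
 */
STATIC_INLINE bool
eo_roundtrip_sketch(cube_t c, int64_t eo)
{
	/* c is a by-value copy, so modifying it here is harmless. */
	set_eo(&c, eo);

	return coord_eo(c) == eo;
}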