avx2.h (6963B)
#define CO2_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x6060606060606060))
#define COCW_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x2020202020202020))
#define CP_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x0707070707070707))
#define EP_AVX2 \
	_mm256_set_epi64x(INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0)
#define EO_AVX2 \
	_mm256_set_epi64x(INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0)

#define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
    e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
	_mm256_set_epi8(0, 0, 0, 0, e_br, e_bl, e_fl, e_fr, \
	    e_dr, e_dl, e_ul, e_ur, e_df, e_db, e_ub, e_uf, \
	    0, 0, 0, 0, 0, 0, 0, 0, \
	    c_dbl, c_dfr, c_ubr, c_ufl, c_dbr, c_dfl, c_ubl, c_ufr)
#define ZERO_CUBE _mm256_set_epi64x(0, 0, 0, 0)
#define SOLVED_CUBE STATIC_CUBE( \
    0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)

STATIC void
pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
{
	uint8_t aux[32];

	_mm256_storeu_si256((__m256i_u *)aux, *cube);
	memcpy(c, aux, 8);
	memcpy(e, aux+16, 12);
}

STATIC_INLINE bool
equal(cube_t c1, cube_t c2)
{
	int32_t mask;
	__m256i cmp;

	cmp = _mm256_cmpeq_epi8(c1, c2);
	mask = _mm256_movemask_epi8(cmp);

	return mask == ~0;
}

STATIC_INLINE cube_t
invertco(cube_t c)
{
	cube_t co, shleft, shright, summed, newco, cleanco, ret;

	co = _mm256_and_si256(c, CO2_AVX2);
	shleft = _mm256_slli_epi32(co, 1);
	shright = _mm256_srli_epi32(co, 1);
	summed = _mm256_or_si256(shleft, shright);
	newco = _mm256_and_si256(summed, CO2_AVX2);
	cleanco = _mm256_xor_si256(c, co);
	ret = _mm256_or_si256(cleanco, newco);

	return ret;
}

STATIC_INLINE cube_t
compose_epcpeo(cube_t c1, cube_t c2)
{
	cube_t b, s, eo2;

	/* Permute and clean unused bits */
	s = _mm256_shuffle_epi8(c1, c2);
	b = _mm256_set_epi8(
		~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
	);
	s = _mm256_andnot_si256(b, s);

	/* Change EO */
	eo2 = _mm256_and_si256(c2, EO_AVX2);
	s = _mm256_xor_si256(s, eo2);

	return s;
}

STATIC_INLINE cube_t
compose_edges(cube_t c1, cube_t c2)
{
	return compose_epcpeo(c1, c2);
}
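
/*
 * For reference, a plausible scalar equivalent of the edge half of
 * compose_epcpeo, assuming the layout implied by the masks above: each
 * edge byte keeps its permutation index in the low 4 bits (EP_AVX2) and
 * its orientation in bit 4 (EO_AVX2), so the shuffle indexes c1 by the
 * permutation of c2 and the orientations compose by XOR. The function
 * below is an illustrative sketch, not part of this header's API, and
 * is compiled out.
 */
#if 0
STATIC_INLINE void
compose_edges_scalar(const uint8_t c1[static 12], const uint8_t c2[static 12],
	uint8_t ret[static 12])
{
	int i;

	for (i = 0; i < 12; i++)
		/* Pick the piece that c2 moves into slot i, add EO mod 2 */
		ret[i] = c1[c2[i] & 0x0F] ^ (c2[i] & 0x10);
}
#endif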

STATIC_INLINE cube_t
compose_corners(cube_t c1, cube_t c2)
{
	/*
	 * We do a full compose. Minor optimizations are possible, like
	 * saving one instruction by not doing EO, but it should not
	 * be significant.
	 */
	return compose(c1, c2);
}

STATIC_INLINE cube_t
compose(cube_t c1, cube_t c2)
{
	cube_t s, co1, co2, aux, auy1, auy2, auz1, auz2;

	s = compose_epcpeo(c1, c2);

	/* Change CO */
	co1 = _mm256_and_si256(s, CO2_AVX2);
	co2 = _mm256_and_si256(c2, CO2_AVX2);
	aux = _mm256_add_epi8(co1, co2);
	auy1 = _mm256_add_epi8(aux, COCW_AVX2);
	auy2 = _mm256_srli_epi32(auy1, 2);
	auz1 = _mm256_add_epi8(aux, auy2);
	auz2 = _mm256_and_si256(auz1, CO2_AVX2);

	/* Put together */
	s = _mm256_andnot_si256(CO2_AVX2, s);
	s = _mm256_or_si256(s, auz2);

	return s;
}

STATIC_INLINE cube_t
cleanaftershuffle(cube_t c)
{
	__m256i b;

	b = _mm256_set_epi8(
		~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
	);

	return _mm256_andnot_si256(b, c);
}

STATIC_INLINE cube_t
inverse(cube_t c)
{
	/* Method taken from Andrew Skalski's vcube[1]. The addition sequence
	 * was generated using [2].
	 * [1] https://github.com/Voltara/vcube
	 * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
	 */
	cube_t v3, vi, vo, vp, ret;

	v3 = _mm256_shuffle_epi8(c, c);
	v3 = _mm256_shuffle_epi8(v3, c);
	vi = _mm256_shuffle_epi8(v3, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, c);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, c);

	vo = _mm256_and_si256(c, _mm256_or_si256(EO_AVX2, CO2_AVX2));
	vo = _mm256_shuffle_epi8(vo, vi);
	vp = _mm256_andnot_si256(_mm256_or_si256(EO_AVX2, CO2_AVX2), vi);
	ret = _mm256_or_si256(vp, vo);
	ret = cleanaftershuffle(ret);

	return invertco(ret);
}

STATIC_INLINE int64_t
coord_co(cube_t c)
{
	cube_t co;
	int64_t mem[4], ret, i, p;

	co = _mm256_and_si256(c, CO2_AVX2);
	_mm256_storeu_si256((__m256i *)mem, co);

	mem[0] >>= 5;
	for (i = 0, ret = 0, p = 1; i < 7; i++, mem[0] >>= 8, p *= 3)
		ret += (mem[0] & 3) * p;

	return ret;
}

STATIC_INLINE int64_t
coord_csep(cube_t c)
{
	cube_t cp, shifted;
	int64_t mask;

	cp = _mm256_and_si256(c, CP_AVX2);
	shifted = _mm256_slli_epi32(cp, 5);
	mask = _mm256_movemask_epi8(shifted);

	return mask & 0x7F;
}

STATIC_INLINE int64_t
coord_cocsep(cube_t c)
{
	return (coord_co(c) << 7) + coord_csep(c);
}

STATIC_INLINE int64_t
coord_eo(cube_t c)
{
	cube_t eo, shifted;
	int64_t mask;

	eo = _mm256_and_si256(c, EO_AVX2);
	shifted = _mm256_slli_epi32(eo, 3);
	mask = _mm256_movemask_epi8(shifted);

	return mask >> 17;
}
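
/*
 * For reference, the corner orientation coordinate computed by coord_co
 * above admits a simple scalar reading, assuming corners sit in bytes
 * 0-7 with their orientation in bits 5-6 (CO2_AVX2): the orientations of
 * the first seven corners are packed as base-3 digits, the eighth being
 * determined by the others. The function below is an illustrative
 * sketch, not part of this header's API, and is compiled out.
 */
#if 0
STATIC_INLINE int64_t
coord_co_scalar(const uint8_t corner[static 8])
{
	int64_t i, p, ret;

	/* Base-3 digit i is the orientation (0, 1 or 2) of corner i */
	for (i = 0, ret = 0, p = 1; i < 7; i++, p *= 3)
		ret += ((corner[i] >> 5) & 3) * p;

	return ret;
}
#endif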

STATIC_INLINE int64_t
coord_esep(cube_t c)
{
	cube_t ep;
	int64_t e, mem[4], i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;

	ep = _mm256_and_si256(c, EP_AVX2);
	_mm256_storeu_si256((__m256i *)mem, ep);

	mem[3] <<= 8;
	ret1 = ret2 = 0;
	k = l = 4;
	for (i = 0, j = 0; i < 12; i++, mem[i/8 + 2] >>= 8) {
		e = mem[i/8 + 2];

		bit1 = (e & ESEPBIT_1) >> 2;
		bit2 = (e & ESEPBIT_2) >> 3;
		is1 = (1 - bit2) * bit1;

		ret1 += bit2 * binomial[11-i][k];
		k -= bit2;

		jj = j < 8;
		ret2 += jj * is1 * binomial[7-(j*jj)][l];
		l -= is1;
		j += (1-bit2);
	}

	return ret1 * 70 + ret2;
}

STATIC_INLINE void
copy_corners(cube_t *dest, cube_t src)
{
	*dest = _mm256_blend_epi32(*dest, src, 0x0F);
}

STATIC_INLINE void
copy_edges(cube_t *dest, cube_t src)
{
	*dest = _mm256_blend_epi32(*dest, src, 0xF0);
}

STATIC_INLINE void
set_eo(cube_t *cube, int64_t eo)
{
	int64_t eo12, eotop, eobot;
	__m256i veo;

	eo12 = (eo << 1) + (_mm_popcnt_u64(eo) % 2);
	eotop = (eo12 & (1 << 11)) << 17 |
	    (eo12 & (1 << 10)) << 10 |
	    (eo12 & (1 << 9)) << 3 |
	    (eo12 & (1 << 8)) >> 4;
	eobot = (eo12 & (1 << 7)) << 53 |
	    (eo12 & (1 << 6)) << 46 |
	    (eo12 & (1 << 5)) << 39 |
	    (eo12 & (1 << 4)) << 32 |
	    (eo12 & (1 << 3)) << 25 |
	    (eo12 & (1 << 2)) << 18 |
	    (eo12 & (1 << 1)) << 11 |
	    (eo12 & 1) << 4;
	veo = _mm256_set_epi64x(eotop, eobot, 0, 0);

	*cube = _mm256_andnot_si256(EO_AVX2, *cube);
	*cube = _mm256_or_si256(*cube, veo);
}

STATIC_INLINE cube_t
invcoord_esep(int64_t esep)
{
	cube_t eee, ret;
	uint8_t mem[32] = {0};

	invcoord_esep_array(esep % 70, esep / 70, mem+16);

	ret = SOLVED_CUBE;
	eee = _mm256_loadu_si256((__m256i_u *)&mem);
	copy_edges(&ret, eee);

	return ret;
}
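
/*
 * Usage sketch: the operations above compose. Assuming a cube_t value c
 * that holds a valid cube, one plausible sanity check is that composing
 * c with its inverse yields the solved cube, whose coordinates are all
 * zero. The function below is illustrative only and is compiled out.
 */
#if 0
STATIC_INLINE bool
sanity_check(cube_t c)
{
	return equal(compose(c, inverse(c)), SOLVED_CUBE) &&
	    coord_co(SOLVED_CUBE) == 0 && coord_eo(SOLVED_CUBE) == 0;
}
#endif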