nissy-core

The "engine" of nissy, including the H48 optimal solver.
git clone https://git.tronto.net/nissy-core

avx2.h (10853B)


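/*
 * Layout of the 256-bit cube register, as implied by the masks below:
 * corners live in bytes 0-7 (permutation in the low 3 bits, orientation
 * in bits 5-6) and edges in bytes 16-27 (permutation in the low 4 bits,
 * orientation in bit 4); all other bytes are kept at zero.
 */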
#define CO2_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x6060606060606060))
#define COCW_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x2020202020202020))
#define CP_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x0707070707070707))
#define EP_AVX2 _mm256_set_epi64x(\
    INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0)
#define EO_AVX2 _mm256_set_epi64x(\
    INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0)
#define ORIENT_AVX2 _mm256_set_epi64x(INT64_C(0x10101010), \
    INT64_C(0x1010101010101010), 0, INT64_C(0x6060606060606060))
#define USED_AVX2 _mm256_set_epi64x(INT64_C(0x00000000FFFFFFFF), \
    INT64_C(0xFFFFFFFFFFFFFFFF), 0, INT64_C(0xFFFFFFFFFFFFFFFF))
#define CARRY_AVX2 _mm256_set_epi64x(INT64_C(0x20202020), \
    INT64_C(0x2020202020202020), 0, INT64_C(0x6060606060606060))

#define SOLVED_L INT64_C(0x0706050403020100)
#define SOLVED_H INT64_C(0x0B0A0908)

#define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
    e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
	_mm256_set_epi8(0, 0, 0, 0, e_br, e_bl, e_fl, e_fr, \
	    e_dr, e_dl, e_ul, e_ur, e_df, e_db, e_ub, e_uf, \
	    0, 0, 0, 0, 0, 0, 0, 0, \
	    c_dbl, c_dfr, c_ubr, c_ufl, c_dbr, c_dfl, c_ubl, c_ufr)
#define ZERO_CUBE _mm256_set_epi64x(0, 0, 0, 0)
#define SOLVED_CUBE _mm256_set_epi64x(SOLVED_H, SOLVED_L, 0, SOLVED_L)

STATIC_INLINE uint64_t permtoindex_Nx8(uint64_t, int64_t);
STATIC_INLINE int64_t indextoperm_8x8(uint64_t);
STATIC_INLINE int64_t indextoperm_4x8(uint64_t);

STATIC_INLINE int
popcount_u32(uint32_t x)
{
	return _mm_popcnt_u32(x);
}

STATIC_INLINE int
popcount_u64(uint64_t x)
{
	return (int)_mm_popcnt_u64(x);
}

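/* Extract the 8 corner bytes and the 12 edge bytes into c[] and e[]. */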
STATIC void
pieces(cube_t cube[NON_NULL], uint8_t c[SIZE(8)], uint8_t e[SIZE(12)])
{
	uint8_t aux[32];

	_mm256_storeu_si256((__m256i *)aux, *cube);
	memcpy(c, aux, 8);
	memcpy(e, aux+16, 12);
}

STATIC_INLINE bool
equal(cube_t c1, cube_t c2)
{
	int32_t mask;
	__m256i cmp;

	cmp = _mm256_cmpeq_epi8(c1, c2);
	mask = _mm256_movemask_epi8(cmp);

	return mask == ~0;
}

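/*
 * Invert the corner orientations: with the orientation stored as 0, 1
 * or 2 in bits 5-6, OR-ing the field shifted left and right by one maps
 * 0 -> 0, 1 -> 2 and 2 -> 1, i.e. negation modulo 3.
 */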
STATIC_INLINE cube_t
invertco(cube_t c)
{
	cube_t co, shleft, shright, summed, newco, cleanco, ret;

	co = _mm256_and_si256(c, CO2_AVX2);
	shleft = _mm256_slli_epi32(co, 1);
	shright = _mm256_srli_epi32(co, 1);
	summed = _mm256_or_si256(shleft, shright);
	newco = _mm256_and_si256(summed, CO2_AVX2);
	cleanco = _mm256_xor_si256(c, co);
	ret = _mm256_or_si256(cleanco, newco);

	return ret;
}

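/*
 * With this representation corners and edges live in separate 128-bit
 * lanes, so composing only the edges or only the corners can reuse the
 * generic compose() routine.
 */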
STATIC_INLINE cube_t
compose_edges(cube_t c1, cube_t c2)
{
	return compose(c1, c2);
}

STATIC_INLINE cube_t
compose_corners(cube_t c1, cube_t c2)
{
	return compose(c1, c2);
}

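/*
 * Compose two cubes: the byte shuffle applies the permutation of c2 to
 * c1, then the orientation of c2 is added byte-wise; subtracting the
 * carry mask and taking the unsigned minimum reduces each orientation
 * modulo 3 (corners) or modulo 2 (edges) without branching.
 */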
STATIC_INLINE cube_t
compose(cube_t c1, cube_t c2)
{
	/*
	 * Method taken from Andrew Skalski's vcube (thanks to Arhan Chaudhary
	 * for pointing this out)
	 */
	cube_t ss, so, su;

	/* Permute */
	ss = _mm256_shuffle_epi8(c1, c2);

	/* Orient */
	so = _mm256_and_si256(c2, ORIENT_AVX2);
	ss = _mm256_add_epi8(ss, so);
	su = _mm256_sub_epi8(ss, CARRY_AVX2);
	ss = _mm256_min_epu8(ss, su);

	return _mm256_and_si256(ss, USED_AVX2);
}

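/*
 * The chain of shuffles below raises c to the power 27719.  Since the
 * order of any cube permutation divides lcm(1, ..., 12) = 27720, this
 * yields the inverse permutation; the orientations are then copied over
 * and fixed up by invertco().
 */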
STATIC_INLINE cube_t
inverse(cube_t c)
{
	/* Method taken from Andrew Skalski's vcube[1]. The addition sequence
	 * was generated using [2].
	 * [1] https://github.com/Voltara/vcube
	 * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
	 */
	cube_t v3, vi, vo, vp, ret;

	v3 = _mm256_shuffle_epi8(c, c);
	v3 = _mm256_shuffle_epi8(v3, c);
	vi = _mm256_shuffle_epi8(v3, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, c);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, c);

	vo = _mm256_and_si256(c, ORIENT_AVX2);
	vo = _mm256_shuffle_epi8(vo, vi);
	vp = _mm256_andnot_si256(ORIENT_AVX2, vi);
	ret = _mm256_or_si256(vp, vo);
	ret = _mm256_and_si256(ret, USED_AVX2);

	return invertco(ret);
}

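/*
 * Corner orientation coordinate: the orientations of the first 7 corners
 * read as a base-3 number (the 8th orientation is determined by the
 * other seven).
 */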
STATIC_INLINE uint64_t
coord_co(cube_t c)
{
	cube_t co;
	uint64_t mem[4], ret, i, p;

	co = _mm256_and_si256(c, CO2_AVX2);
	_mm256_storeu_si256((__m256i *)mem, co);

	mem[0] >>= 5;
	for (i = 0, ret = 0, p = 1; i < 7; i++, mem[0] >>= 8, p *= 3)
		ret += (mem[0] & 3) * p;

	return ret;
}

STATIC_INLINE void
copy_co(cube_t cube[NON_NULL], cube_t co)
{
	cube_t coclean;

	coclean = _mm256_and_si256(co, CO2_AVX2);
	*cube = _mm256_andnot_si256(CO2_AVX2, *cube);
	*cube = _mm256_or_si256(*cube, coclean);
}

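/*
 * Inverse of coord_co(): rebuild the corner bytes from the base-3 digits,
 * choosing the last orientation so that the total is divisible by 3.
 */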
STATIC_INLINE cube_t
invcoord_co(uint64_t coord)
{
	uint64_t i, c, p, co, mem[4] = {0};
	cube_t cube, cc;

	for (i = 0, p = 0, c = coord; i < 8; i++, c /= 3) {
		co = i == 7 ? ((3 - (p % 3)) % 3) : (c % 3);
		p += co;
		mem[0] |= (uint64_t)(i + (co << COSHIFT)) << (uint64_t)(8 * i);
	}

	cc = _mm256_loadu_si256((__m256i *)mem);
	cube = SOLVED_CUBE;
	copy_co(&cube, cc);

	return cube;
}

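/*
 * Corner separation coordinate: bit 2 of each corner's permutation value
 * (i.e. which half of the corner set it belongs to) collected via
 * movemask; only 7 bits are kept, the last one being redundant.
 */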
STATIC_INLINE uint64_t
coord_csep(cube_t c)
{
	cube_t cp, shifted;
	int mask;

	cp = _mm256_and_si256(c, CP_AVX2);
	shifted = _mm256_slli_epi32(cp, 5);
	mask = _mm256_movemask_epi8(shifted);

	return (uint64_t)(mask & 0x7F);
}

STATIC_INLINE uint64_t
coord_cocsep(cube_t c)
{
	return (coord_co(c) << UINT8_C(7)) + coord_csep(c);
}

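/*
 * Edge orientation coordinate: the orientation bits of 11 edges packed
 * into an 11-bit number; the remaining edge is determined by parity.
 */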
STATIC_INLINE uint64_t
coord_eo(cube_t c)
{
	cube_t eo, shifted;
	int mask;

	eo = _mm256_and_si256(c, EO_AVX2);
	shifted = _mm256_slli_epi32(eo, 3);
	mask = _mm256_movemask_epi8(shifted);

	return (uint64_t)(mask >> 17);
}

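/*
 * Edge separation coordinate: ret1 indexes the positions of the 4 edges
 * whose index has bit 3 set (FR, FL, BL, BR) among all 12 positions,
 * ret2 the positions of those with bit 2 set among the remaining 8; the
 * result is ret1 * 70 + ret2, with 70 = C(8,4).  The pre-shift of mem[3]
 * compensates for the loop update shifting it once before its first byte
 * is read.
 */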
STATIC_INLINE uint64_t
coord_esep(cube_t c)
{
	cube_t ep;
	uint64_t e, mem[4], i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;

	ep = _mm256_and_si256(c, EP_AVX2);
	_mm256_storeu_si256((__m256i *)mem, ep);

	mem[3] <<= 8;
	ret1 = ret2 = 0;
	k = l = 4;
	for (i = 0, j = 0; i < 12; i++, mem[i/8 + 2] >>= 8) {
		e = mem[i/8 + 2];

		bit1 = (e & ESEPBIT_1) >> 2;
		bit2 = (e & ESEPBIT_2) >> 3;
		is1 = (1 - bit2) * bit1;

		ret1 += bit2 * binomial[11-i][k];
		k -= bit2;

		jj = j < 8;
		ret2 += jj * is1 * binomial[7-(j*jj)][l];
		l -= is1;
		j += (1-bit2);
	}

	return ret1 * 70 + ret2;
}

STATIC_INLINE cube_t
invcoord_esep(uint64_t esep)
{
	cube_t eee, ret;
	uint8_t mem[32] = {0};

	invcoord_esep_array(esep % UINT64_C(70), esep / UINT64_C(70), mem+16);

	ret = SOLVED_CUBE;
	eee = _mm256_loadu_si256((__m256i *)mem);
	copy_edges(&ret, eee);

	return ret;
}

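/*
 * Corners sit in the low 128-bit lane and edges in the high one, so a
 * single 32-bit blend copies exactly one of the two halves.
 */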
STATIC_INLINE void
copy_corners(cube_t dest[NON_NULL], cube_t src)
{
	*dest = _mm256_blend_epi32(*dest, src, 0x0F);
}

STATIC_INLINE void
copy_edges(cube_t dest[NON_NULL], cube_t src)
{
	*dest = _mm256_blend_epi32(*dest, src, 0xF0);
}

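/*
 * Write an 11-bit edge orientation coordinate back into the cube: the
 * 12th bit is chosen so that the total parity is even, then every bit is
 * moved to bit 4 of the corresponding edge byte.
 */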
STATIC_INLINE void
set_eo(cube_t cube[NON_NULL], uint64_t eo)
{
	uint64_t eo12, eotop, eobot;
	__m256i veo;

	eo12 = (eo << 1) + (_mm_popcnt_u64(eo) % 2);
	eotop = (eo12 & (1 << 11)) << 17 |
		(eo12 & (1 << 10)) << 10 |
		(eo12 & (1 << 9)) << 3 |
		(eo12 & (1 << 8)) >> 4;
	eobot = (eo12 & (1 << 7)) << 53 |
		(eo12 & (1 << 6)) << 46 |
		(eo12 & (1 << 5)) << 39 |
		(eo12 & (1 << 4)) << 32 |
		(eo12 & (1 << 3)) << 25 |
		(eo12 & (1 << 2)) << 18 |
		(eo12 & (1 << 1)) << 11 |
		(eo12 & 1) << 4;
	veo = _mm256_set_epi64x(eotop, eobot, 0, 0);

	*cube = _mm256_andnot_si256(EO_AVX2, *cube);
	*cube = _mm256_or_si256(*cube, veo);
}

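/*
 * Map a permutation of n <= 8 values, packed one per byte in a, to its
 * lexicographic index: for each element, a broadcast compare plus
 * popcount counts how many of the following elements are smaller, giving
 * the digits of the factorial number system.
 */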
STATIC_INLINE uint64_t
permtoindex_Nx8(uint64_t n, int64_t a)
{
	uint64_t i, c, ret;
	__m256i cmp;

	for (i = 0, ret = 0; i < n; i++) {
		cmp = _mm256_set1_epi8((char)(a & INT64_C(0xFF)));
		a = (a >> INT64_C(8)) | INT64_C(0x0F00000000000000);
		cmp = _mm256_cmpgt_epi8(cmp, _mm256_set1_epi64x(a));
		c = _mm_popcnt_u32(_mm256_movemask_epi8(cmp)) >> 2;
		ret += c * factorial[n-1-i];
	}

	return ret;
}

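/*
 * Inverse of permtoindex_Nx8() for n = 8: decode the factorial-base
 * digits and pick, for each position, the k-th value not used yet.
 */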
STATIC_INLINE int64_t
indextoperm_8x8(uint64_t p)
{
	int used;
	uint64_t c, k, i, j, ret;

	for (i = 0, ret = 0, used = 0; i < 8; i++) {
		k = p / factorial[7-i];

		/* Find k-th unused number */
		for (j = 0, c = 0; c <= k; j++)
			c += 1 - ((used & (1 << j)) >> j);

		ret |= (j-1) << (8*i);
		used |= 1 << (j-1);
		p %= factorial[7-i];
	}

	return ret;
}

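/* For n = 4 the 24 possible permutations are simply looked up in a table. */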
STATIC_INLINE int64_t
indextoperm_4x8(uint64_t p)
{
	static const int64_t A[FACT_4] = {
		[0] = INT64_C(0x03020100),
		[1] = INT64_C(0x02030100),
		[2] = INT64_C(0x03010200),
		[3] = INT64_C(0x01030200),
		[4] = INT64_C(0x02010300),
		[5] = INT64_C(0x01020300),
		[6] = INT64_C(0x03020001),
		[7] = INT64_C(0x02030001),
		[8] = INT64_C(0x03000201),
		[9] = INT64_C(0x00030201),
		[10] = INT64_C(0x02000301),
		[11] = INT64_C(0x00020301),
		[12] = INT64_C(0x03010002),
		[13] = INT64_C(0x01030002),
		[14] = INT64_C(0x03000102),
		[15] = INT64_C(0x00030102),
		[16] = INT64_C(0x01000302),
		[17] = INT64_C(0x00010302),
		[18] = INT64_C(0x02010003),
		[19] = INT64_C(0x01020003),
		[20] = INT64_C(0x02000103),
		[21] = INT64_C(0x00020103),
		[22] = INT64_C(0x01000203),
		[23] = INT64_C(0x00010203),
	};

	return A[p];
}

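/* Corner permutation coordinate, in the range [0, 8!). */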
STATIC_INLINE uint64_t
coord_cp(cube_t cube)
{
	cube_t cp;
	int64_t aux[4];

	cp = _mm256_and_si256(cube, CP_AVX2);
	_mm256_storeu_si256((__m256i *)aux, cp);

	return permtoindex_Nx8(8, aux[0]);
}

STATIC_INLINE cube_t
invcoord_cp(uint64_t i)
{
	return _mm256_set_epi64x(SOLVED_H, SOLVED_L, 0, indextoperm_8x8(i));
}

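/*
 * Permutation coordinate of the first 8 edges (the U/D-layer edges,
 * bytes 16-23); presumably meaningful only when the remaining 4 edges
 * are in the E slice.
 */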
STATIC_INLINE uint64_t
coord_epud(cube_t cube)
{
	cube_t ep;
	int64_t aux[4];

	ep = _mm256_and_si256(cube, EP_AVX2);
	_mm256_storeu_si256((__m256i *)aux, ep);

	return permtoindex_Nx8(8, aux[2]);
}

STATIC_INLINE cube_t
invcoord_epud(uint64_t i)
{
	return _mm256_set_epi64x(SOLVED_H, indextoperm_8x8(i), 0, SOLVED_L);
}

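/*
 * Permutation coordinate of the 4 E-slice edges: XOR-ing with 8 maps
 * their values 8-11 down to 0-3 before indexing.
 */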
STATIC_INLINE uint64_t
coord_epe(cube_t cube)
{
	cube_t ep;
	int64_t aux[4];

	ep = _mm256_and_si256(cube, EP_AVX2);
	ep = _mm256_xor_si256(ep, _mm256_set1_epi8(8));
	_mm256_storeu_si256((__m256i *)aux, ep);

	return permtoindex_Nx8(4, aux[3]);
}

STATIC_INLINE cube_t
invcoord_epe(uint64_t i)
{
	int64_t a;
	__m256i b, r;

	a = indextoperm_4x8(i);
	b = _mm256_set1_epi64x(INT64_C(0x08080808));
	r = _mm256_add_epi8(_mm256_set1_epi64x(a), b);
	a = _mm256_extract_epi64(r, 0);

	return _mm256_set_epi64x(a, SOLVED_L, 0, SOLVED_L);
}

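/* Return true if the number of flipped edges is even. */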
STATIC_INLINE bool
is_eo_even(cube_t cube)
{
	uint32_t mask;
	__m256i e;

	e = _mm256_and_si256(cube, EO_AVX2);
	e = _mm256_slli_epi16(e, 7-EOSHIFT);
	mask = _mm256_movemask_epi8(e);

	return popcount_u32(mask) % 2 == 0;
}

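/*
 * These last two coordinates are computed by the portable array-based
 * helpers, operating directly on the edge bytes.
 */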
STATIC_INLINE uint64_t
coord_epudsep(cube_t cube)
{
	uint8_t aux[32];

	_mm256_storeu_si256((__m256i *)aux, cube);
	return coord_epudsep_array(aux + 16);
}

STATIC_INLINE cube_t
invcoord_epudsep(uint64_t i)
{
	cube_t cube, elow;
	uint8_t e[32] = {0};

	invcoord_epudsep_array(i, e+16);
	elow = _mm256_loadu_si256((__m256i *)e);
	cube = _mm256_set_epi64x(SOLVED_H, 0, 0, SOLVED_L);

	return _mm256_or_si256(elow, cube);
}