h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48

avx2.h (6963B)


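/*
 * Layout notes (added commentary, inferred from the masks and STATIC_CUBE
 * below): a cube_t is a 256-bit vector. Bytes 0-7 hold the corners and
 * bytes 16-27 hold the edges; all other bytes are always zero. In a corner
 * byte, bits 0-2 are the corner permutation (CP) and bits 5-6 the
 * orientation (CO). In an edge byte, bits 0-3 are the edge permutation
 * (EP) and bit 4 the orientation (EO).
 */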
#define CO2_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x6060606060606060))
#define COCW_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x2020202020202020))
#define CP_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x0707070707070707))
#define EP_AVX2 \
    _mm256_set_epi64x(INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0)
#define EO_AVX2 \
    _mm256_set_epi64x(INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0)

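/*
 * Note (added commentary): _mm256_set_epi8 takes its arguments from the
 * most significant byte down, so the piece arguments below appear in
 * reverse order with respect to the memory layout described above.
 */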
#define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
    e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
    _mm256_set_epi8(0, 0, 0, 0, e_br, e_bl, e_fl, e_fr, \
        e_dr, e_dl, e_ul, e_ur, e_df, e_db, e_ub, e_uf, \
        0, 0, 0, 0, 0, 0, 0, 0, \
        c_dbl, c_dfr, c_ubr, c_ufl, c_dbr, c_dfl, c_ubl, c_ufr)
#define ZERO_CUBE _mm256_set_epi64x(0, 0, 0, 0)
#define SOLVED_CUBE STATIC_CUBE( \
    0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)

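/* Store the 8 corner bytes and the 12 edge bytes into plain arrays. */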
STATIC void
pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
{
	uint8_t aux[32];

	_mm256_storeu_si256((__m256i_u *)aux, *cube);
	memcpy(c, aux, 8);
	memcpy(e, aux+16, 12);
}

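/*
 * Compare all 32 bytes at once: movemask collects the top bit of each
 * byte of the comparison result, so the cubes are equal exactly when
 * the 32-bit mask is all ones.
 */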
STATIC_INLINE bool
equal(cube_t c1, cube_t c2)
{
	int32_t mask;
	__m256i cmp;

	cmp = _mm256_cmpeq_epi8(c1, c2);
	mask = _mm256_movemask_epi8(cmp);

	return mask == ~0;
}

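/*
 * Invert the corner orientations: swapping the two CO bits maps twist
 * 1 (0x20) to 2 (0x40) and vice versa, and leaves 0 unchanged, which
 * is exactly negation modulo 3.
 */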
STATIC_INLINE cube_t
invertco(cube_t c)
{
	cube_t co, shleft, shright, summed, newco, cleanco, ret;

	co = _mm256_and_si256(c, CO2_AVX2);
	shleft = _mm256_slli_epi32(co, 1);
	shright = _mm256_srli_epi32(co, 1);
	summed = _mm256_or_si256(shleft, shright);
	newco = _mm256_and_si256(summed, CO2_AVX2);
	cleanco = _mm256_xor_si256(c, co);
	ret = _mm256_or_si256(cleanco, newco);

	return ret;
}

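/*
 * Compose permutations and edge orientations: _mm256_shuffle_epi8 picks
 * byte c2[i] of c1 into slot i, which is exactly permutation composition.
 * The unused bytes of c2 are zero and would wrongly copy piece 0 of c1,
 * so they are cleared with the mask b. EO composes by xor, since
 * orientations add modulo 2.
 */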
STATIC_INLINE cube_t
compose_epcpeo(cube_t c1, cube_t c2)
{
	cube_t b, s, eo2;

	/* Permute and clean unused bits */
	s = _mm256_shuffle_epi8(c1, c2);
	b = _mm256_set_epi8(
		~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
	);
	s = _mm256_andnot_si256(b, s);

	/* Change EO */
	eo2 = _mm256_and_si256(c2, EO_AVX2);
	s = _mm256_xor_si256(s, eo2);

	return s;
}

STATIC_INLINE cube_t
compose_edges(cube_t c1, cube_t c2)
{
	return compose_epcpeo(c1, c2);
}

STATIC_INLINE cube_t
compose_corners(cube_t c1, cube_t c2)
{
	/*
	 * We do a full compose. Minor optimizations are possible, like
	 * saving one instruction by skipping EO, but the gain would not
	 * be significant.
	 */
	return compose(c1, c2);
}

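/*
 * Corner orientations add modulo 3. With CO stored in bits 5-6, the
 * byte-wise sum co1+co2 is n*0x20 for n in 0..4, and the sequence below
 * computes n mod 3 without a per-byte division. Worked values:
 *   n=0: aux=0x00, aux+0x20=0x20, >>2=0x08, sum=0x08, &0x60=0x00
 *   n=1: aux=0x20, aux+0x20=0x40, >>2=0x10, sum=0x30, &0x60=0x20
 *   n=3: aux=0x60, aux+0x20=0x80, >>2=0x20, sum=0x80, &0x60=0x00
 *   n=4: aux=0x80, aux+0x20=0xA0, >>2=0x28, sum=0xA8, &0x60=0x20
 * The 32-bit shift cannot leak bits across bytes here, because every
 * byte of auy1 has its two low bits clear.
 */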
STATIC_INLINE cube_t
compose(cube_t c1, cube_t c2)
{
	cube_t s, co1, co2, aux, auy1, auy2, auz1, auz2;

	s = compose_epcpeo(c1, c2);

	/* Change CO */
	co1 = _mm256_and_si256(s, CO2_AVX2);
	co2 = _mm256_and_si256(c2, CO2_AVX2);
	aux = _mm256_add_epi8(co1, co2);
	auy1 = _mm256_add_epi8(aux, COCW_AVX2);
	auy2 = _mm256_srli_epi32(auy1, 2);
	auz1 = _mm256_add_epi8(aux, auy2);
	auz2 = _mm256_and_si256(auz1, CO2_AVX2);

	/* Put together */
	s = _mm256_andnot_si256(CO2_AVX2, s);
	s = _mm256_or_si256(s, auz2);

	return s;
}

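/*
 * Zero the unused bytes (8-15 and 28-31) that a byte shuffle may have
 * filled with copies of piece 0.
 */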
STATIC_INLINE cube_t
cleanaftershuffle(cube_t c)
{
	__m256i b;

	b = _mm256_set_epi8(
		~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
	);

	return _mm256_andnot_si256(b, c);
}

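/*
 * Added commentary: shuffle(a, b) computes the composition a∘b, so the
 * chain below raises c to the 27719th power. Since 27720 is the least
 * common multiple of 1, ..., 12, every cycle length of a cube
 * permutation divides it, and c^27719 is therefore the inverse
 * permutation. The orientation bits are fixed up separately at the end.
 */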
STATIC_INLINE cube_t
inverse(cube_t c)
{
	/* Method taken from Andrew Skalski's vcube[1]. The addition sequence
	 * was generated using [2].
	 * [1] https://github.com/Voltara/vcube
	 * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
	 */
	cube_t v3, vi, vo, vp, ret;

	v3 = _mm256_shuffle_epi8(c, c);
	v3 = _mm256_shuffle_epi8(v3, c);
	vi = _mm256_shuffle_epi8(v3, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, c);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, c);

	vo = _mm256_and_si256(c, _mm256_or_si256(EO_AVX2, CO2_AVX2));
	vo = _mm256_shuffle_epi8(vo, vi);
	vp = _mm256_andnot_si256(_mm256_or_si256(EO_AVX2, CO2_AVX2), vi);
	ret = _mm256_or_si256(vp, vo);
	ret = cleanaftershuffle(ret);

	return invertco(ret);
}

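/*
 * The CO coordinate encodes the orientations of the first 7 corners in
 * base 3; the 8th is determined by the others, since the total twist
 * is 0 modulo 3.
 */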
STATIC_INLINE int64_t
coord_co(cube_t c)
{
	cube_t co;
	int64_t mem[4], ret, i, p;

	co = _mm256_and_si256(c, CO2_AVX2);
	_mm256_storeu_si256((__m256i *)mem, co);

	mem[0] >>= 5;
	for (i = 0, ret = 0, p = 1; i < 7; i++, mem[0] >>= 8, p *= 3)
		ret += (mem[0] & 3) * p;

	return ret;
}

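/*
 * The corner separation coordinate: bit 2 of each CP value tells which
 * of the two corner tetrads the corner belongs to. Shifting it to the
 * top of the byte lets movemask collect one bit per corner; the 8th bit
 * is dropped because it is determined by the other 7.
 */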
STATIC_INLINE int64_t
coord_csep(cube_t c)
{
	cube_t cp, shifted;
	int64_t mask;

	cp = _mm256_and_si256(c, CP_AVX2);
	shifted = _mm256_slli_epi32(cp, 5);
	mask = _mm256_movemask_epi8(shifted);

	return mask & 0x7F;
}

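/* Combined coordinate: CO (base 3, 2187 values) in the high bits,
 * corner separation in the low 7 bits. */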
STATIC_INLINE int64_t
coord_cocsep(cube_t c)
{
	return (coord_co(c) << 7) + coord_csep(c);
}

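/*
 * The EO coordinate packs the orientations of edges 1-11 (edge 0 is
 * determined by parity): shifting the EO bit to the top of each byte
 * and taking movemask yields 12 bits at positions 16-27, and the shift
 * by 17 drops the lowest edge.
 */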
STATIC_INLINE int64_t
coord_eo(cube_t c)
{
	cube_t eo, shifted;
	int64_t mask;

	eo = _mm256_and_si256(c, EO_AVX2);
	shifted = _mm256_slli_epi32(eo, 3);
	mask = _mm256_movemask_epi8(shifted);

	return mask >> 17;
}

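/*
 * The edge separation coordinate: ret1 ranks the 4 positions of edges
 * 8-11 among all 12 (one of binomial(12,4) = 495 values) and ret2 ranks
 * the positions of edges 4-7 among the remaining 8 (binomial(8,4) = 70
 * values). ESEPBIT_1 and ESEPBIT_2 (defined elsewhere) select bits 2
 * and 3 of the current edge byte.
 */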
STATIC_INLINE int64_t
coord_esep(cube_t c)
{
	cube_t ep;
	int64_t e, mem[4], i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;

	ep = _mm256_and_si256(c, EP_AVX2);
	_mm256_storeu_si256((__m256i *)mem, ep);

	mem[3] <<= 8;
	ret1 = ret2 = 0;
	k = l = 4;
	for (i = 0, j = 0; i < 12; i++, mem[i/8 + 2] >>= 8) {
		e = mem[i/8 + 2];

		bit1 = (e & ESEPBIT_1) >> 2;
		bit2 = (e & ESEPBIT_2) >> 3;
		is1 = (1 - bit2) * bit1;

		ret1 += bit2 * binomial[11-i][k];
		k -= bit2;

		jj = j < 8;
		ret2 += jj * is1 * binomial[7-(j*jj)][l];
		l -= is1;
		j += (1-bit2);
	}

	return ret1 * 70 + ret2;
}

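/*
 * Blend selects 32-bit lanes: mask 0x0F takes the low 128 bits (the
 * corners) from src, 0xF0 the high 128 bits (the edges).
 */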
STATIC_INLINE void
copy_corners(cube_t *dest, cube_t src)
{
	*dest = _mm256_blend_epi32(*dest, src, 0x0F);
}

STATIC_INLINE void
copy_edges(cube_t *dest, cube_t src)
{
	*dest = _mm256_blend_epi32(*dest, src, 0xF0);
}

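/*
 * Rebuild the EO bits from an 11-bit coordinate: the orientation of
 * edge 0 is the parity of the other 11, and each bit of the resulting
 * 12-bit word is scattered to bit 4 of its edge byte (bytes 16-27).
 */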
STATIC_INLINE void
set_eo(cube_t *cube, int64_t eo)
{
	int64_t eo12, eotop, eobot;
	__m256i veo;

	eo12 = (eo << 1) + (_mm_popcnt_u64(eo) % 2);
	eotop = (eo12 & (1 << 11)) << 17 |
		(eo12 & (1 << 10)) << 10 |
		(eo12 & (1 << 9)) << 3 |
		(eo12 & (1 << 8)) >> 4;
	eobot = (eo12 & (1 << 7)) << 53 |
		(eo12 & (1 << 6)) << 46 |
		(eo12 & (1 << 5)) << 39 |
		(eo12 & (1 << 4)) << 32 |
		(eo12 & (1 << 3)) << 25 |
		(eo12 & (1 << 2)) << 18 |
		(eo12 & (1 << 1)) << 11 |
		(eo12 & 1) << 4;
	veo = _mm256_set_epi64x(eotop, eobot, 0, 0);

	*cube = _mm256_andnot_si256(EO_AVX2, *cube);
	*cube = _mm256_or_si256(*cube, veo);
}

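/*
 * Inverse of coord_esep: invcoord_esep_array (defined elsewhere)
 * decodes the two ranks into 12 edge bytes, which are then blended
 * into a solved cube.
 */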
STATIC_INLINE cube_t
invcoord_esep(int64_t esep)
{
	cube_t eee, ret;
	uint8_t mem[32] = {0};

	invcoord_esep_array(esep % 70, esep / 70, mem+16);

	ret = SOLVED_CUBE;
	eee = _mm256_loadu_si256((__m256i_u *)&mem);
	copy_edges(&ret, eee);

	return ret;
}
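
/*
 * Illustrative usage sketch (added, not part of the original file;
 * assumes cube_t is a typedef for __m256i and that STATIC/STATIC_INLINE
 * expand so these helpers are visible from a test translation unit):
 *
 *	cube_t c = SOLVED_CUBE;
 *	uint8_t corners[8], edges[12];
 *
 *	pieces(&c, corners, edges);       // corners = 0..7, edges = 0..11
 *	assert(equal(compose(c, c), c));  // identity composed with itself
 *	assert(equal(inverse(c), c));     // the identity is its own inverse
 *	assert(coord_co(c) == 0);         // all corners are oriented
 *	assert(coord_eo(c) == 0);         // all edges are oriented
 */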