h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48

avx2.h (7038B)


#define CO2_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x6060606060606060))
#define COCW_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x2020202020202020))
#define CP_AVX2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x0707070707070707))
#define EP_AVX2 \
    _mm256_set_epi64x(INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0)
#define EO_AVX2 \
    _mm256_set_epi64x(INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0)

#define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
    e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
    _mm256_set_epi8(0, 0, 0, 0, e_br, e_bl, e_fl, e_fr, \
        e_dr, e_dl, e_ul, e_ur, e_df, e_db, e_ub, e_uf, \
        0, 0, 0, 0, 0, 0, 0, 0, \
        c_dbl, c_dfr, c_ubr, c_ufl, c_dbr, c_dfl, c_ubl, c_ufr)
#define ZERO_CUBE _mm256_set_epi64x(0, 0, 0, 0)
#define SOLVED_CUBE STATIC_CUBE( \
    0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)

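/*
 * Layout note (derived from STATIC_CUBE and the masks above): the cube is
 * a single 256-bit register.  Bytes 0-7 hold the corners UFR, UBL, DFL,
 * DBR, UFL, UBR, DFR, DBL, with the permutation in bits 0-2 (CP_AVX2) and
 * the orientation in bits 5-6 (CO2_AVX2).  Bytes 16-27 hold the edges UF,
 * UB, DB, DF, UR, UL, DL, DR, FR, FL, BL, BR, with the permutation in bits
 * 0-3 (EP_AVX2) and the orientation in bit 4 (EO_AVX2).  Bytes 8-15 and
 * 28-31 are unused and kept at zero.
 */
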
STATIC_INLINE int
popcount_u32(uint32_t x)
{
	return _mm_popcnt_u32(x);
}

STATIC void
pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
{
	uint8_t aux[32];

	_mm256_storeu_si256((__m256i_u *)aux, *cube);
	memcpy(c, aux, 8);
	memcpy(e, aux+16, 12);
}

STATIC_INLINE bool
equal(cube_t c1, cube_t c2)
{
	int32_t mask;
	__m256i cmp;

	cmp = _mm256_cmpeq_epi8(c1, c2);
	mask = _mm256_movemask_epi8(cmp);

	return mask == ~0;
}

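/*
 * Invert the corner orientations: an orientation is 0, 1 or 2 stored in
 * bits 5-6, so 0x00, 0x20 or 0x40.  Swapping the two bits maps
 * 0x20 <-> 0x40 and fixes 0x00, which is exactly negation modulo 3.
 */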
STATIC_INLINE cube_t
invertco(cube_t c)
{
	cube_t co, shleft, shright, summed, newco, cleanco, ret;

	co = _mm256_and_si256(c, CO2_AVX2);
	shleft = _mm256_slli_epi32(co, 1);
	shright = _mm256_srli_epi32(co, 1);
	summed = _mm256_or_si256(shleft, shright);
	newco = _mm256_and_si256(summed, CO2_AVX2);
	cleanco = _mm256_xor_si256(c, co);
	ret = _mm256_or_si256(cleanco, newco);

	return ret;
}

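/*
 * Compose the edge and corner permutations and the edge orientation of c1
 * and c2.  The byte shuffle uses the low 4 bits of each byte of c2 as an
 * index into the same 128-bit lane of c1 (for corners that is CP, for
 * edges EP), which is permutation composition for both halves.  The unused
 * bytes (8-15 and 28-31) can pick up garbage from the shuffle, so they are
 * masked off; the edge orientations of c2 are then added with a xor.
 * Corner orientation is left to the caller, see compose() below.
 */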
STATIC_INLINE cube_t
compose_epcpeo(cube_t c1, cube_t c2)
{
	cube_t b, s, eo2;

	/* Permute and clean unused bits */
	s = _mm256_shuffle_epi8(c1, c2);
	b = _mm256_set_epi8(
		~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
	);
	s = _mm256_andnot_si256(b, s);

	/* Change EO */
	eo2 = _mm256_and_si256(c2, EO_AVX2);
	s = _mm256_xor_si256(s, eo2);

	return s;
}

STATIC_INLINE cube_t
compose_edges(cube_t c1, cube_t c2)
{
	return compose_epcpeo(c1, c2);
}

STATIC_INLINE cube_t
compose_corners(cube_t c1, cube_t c2)
{
	/*
	 * We do a full compose. Minor optimizations are possible, like
	 * saving one instruction by not doing EO, but it should not
	 * be significant.
	 */
	return compose(c1, c2);
}

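/*
 * Full composition, including corner orientation.  Orientations are 0, 1
 * or 2 in bits 5-6, so co1 + co2 is one of 0x00, 0x20, 0x40, 0x60, 0x80
 * and has to be brought back to 0x00, 0x20 or 0x40, i.e. reduced modulo 3.
 * Adding COCW_AVX2 (0x20) and shifting right by 2 gives a correction term
 * that, added to the sum and masked with CO2_AVX2, leaves exactly
 * (co1 + co2) mod 3 in bits 5-6.  For example, with co1 = co2 = 2:
 * aux = 0x80, auy2 = (0x80 + 0x20) >> 2 = 0x28, (0x80 + 0x28) & 0x60 =
 * 0x20, i.e. (2 + 2) mod 3 = 1.  The 32-bit shift cannot leak bits across
 * byte boundaries because the low bits of every byte are zero at that
 * point.
 */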
STATIC_INLINE cube_t
compose(cube_t c1, cube_t c2)
{
	cube_t s, co1, co2, aux, auy1, auy2, auz1, auz2;

	s = compose_epcpeo(c1, c2);

	/* Change CO */
	co1 = _mm256_and_si256(s, CO2_AVX2);
	co2 = _mm256_and_si256(c2, CO2_AVX2);
	aux = _mm256_add_epi8(co1, co2);
	auy1 = _mm256_add_epi8(aux, COCW_AVX2);
	auy2 = _mm256_srli_epi32(auy1, 2);
	auz1 = _mm256_add_epi8(aux, auy2);
	auz2 = _mm256_and_si256(auz1, CO2_AVX2);

	/* Put together */
	s = _mm256_andnot_si256(CO2_AVX2, s);
	s = _mm256_or_si256(s, auz2);

	return s;
}

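/*
 * Zero the unused bytes (8-15 in the corner half, 28-31 in the edge half),
 * which may contain garbage after a byte shuffle.
 */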
STATIC_INLINE cube_t
cleanaftershuffle(cube_t c)
{
	__m256i b;

	b = _mm256_set_epi8(
		~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
	);

	return _mm256_andnot_si256(b, c);
}

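/*
 * The chain of shuffles below computes vi = c^27719 on the permutation
 * part (the exponents added by the chain sum to 27719).  Since
 * 27720 = lcm(1, ..., 12), every cube permutation raised to the 27720th
 * power is the identity, so c^27719 is the inverse permutation.  The
 * orientation bits of c are then carried to their new positions (vo) and
 * merged with the inverse permutation (vp); edge orientation is its own
 * inverse, while corner orientation is fixed at the end by invertco().
 */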
STATIC_INLINE cube_t
inverse(cube_t c)
{
	/* Method taken from Andrew Skalski's vcube[1]. The addition sequence
	 * was generated using [2].
	 * [1] https://github.com/Voltara/vcube
	 * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
	 */
	cube_t v3, vi, vo, vp, ret;

	v3 = _mm256_shuffle_epi8(c, c);
	v3 = _mm256_shuffle_epi8(v3, c);
	vi = _mm256_shuffle_epi8(v3, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, c);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, v3);
	vi = _mm256_shuffle_epi8(vi, vi);
	vi = _mm256_shuffle_epi8(vi, c);

	vo = _mm256_and_si256(c, _mm256_or_si256(EO_AVX2, CO2_AVX2));
	vo = _mm256_shuffle_epi8(vo, vi);
	vp = _mm256_andnot_si256(_mm256_or_si256(EO_AVX2, CO2_AVX2), vi);
	ret = _mm256_or_si256(vp, vo);
	ret = cleanaftershuffle(ret);

	return invertco(ret);
}

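/*
 * Corner orientation coordinate: the orientations of the first 7 corners
 * read as a base-3 number (0..2186).  The 8th orientation is redundant
 * because the total twist is always 0 modulo 3.
 */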
STATIC_INLINE int64_t
coord_co(cube_t c)
{
	cube_t co;
	int64_t mem[4], ret, i, p;

	co = _mm256_and_si256(c, CO2_AVX2);
	_mm256_storeu_si256((__m256i *)mem, co);

	mem[0] >>= 5;
	for (i = 0, ret = 0, p = 1; i < 7; i++, mem[0] >>= 8, p *= 3)
		ret += (mem[0] & 3) * p;

	return ret;
}

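/*
 * Corner separation coordinate: one bit per corner position, set when the
 * piece in that position is one of UFL, UBR, DFR, DBL (index >= 4).  The
 * shift moves that bit into the sign bit of each byte so that the movemask
 * collects it.  Only the first 7 positions are kept (0..127); the last bit
 * is redundant because exactly four corners have index >= 4.
 */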
STATIC_INLINE int64_t
coord_csep(cube_t c)
{
	cube_t cp, shifted;
	int64_t mask;

	cp = _mm256_and_si256(c, CP_AVX2);
	shifted = _mm256_slli_epi32(cp, 5);
	mask = _mm256_movemask_epi8(shifted);

	return mask & 0x7F;
}

STATIC_INLINE int64_t
coord_cocsep(cube_t c)
{
	return (coord_co(c) << 7) + coord_csep(c);
}

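/*
 * Edge orientation coordinate: bit 4 of each edge byte is moved to the
 * byte's top bit and collected with a movemask, giving the 12 orientations
 * in bits 16-27.  Shifting right by 17 drops the UF edge, whose
 * orientation is determined by the others (the total flip is always even),
 * leaving an 11-bit value (0..2047).
 */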
STATIC_INLINE int64_t
coord_eo(cube_t c)
{
	cube_t eo, shifted;
	int64_t mask;

	eo = _mm256_and_si256(c, EO_AVX2);
	shifted = _mm256_slli_epi32(eo, 3);
	mask = _mm256_movemask_epi8(shifted);

	return mask >> 17;
}

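/*
 * Edge separation coordinate.  ESEPBIT_1, ESEPBIT_2 and binomial[][] are
 * defined elsewhere; from the shifts below the two masks are presumably
 * 0x04 and 0x08, the top bits of the edge permutation, and binomial[n][k]
 * is assumed to be C(n, k).  bit2 marks the four edges FR, FL, BL, BR
 * (index >= 8), is1 the four edges UR, UL, DL, DR among the rest.  ret1
 * ranks the positions of the first group among all 12 (0..494, since
 * C(12,4) = 495) and ret2 ranks the second group among the remaining 8
 * (0..69, since C(8,4) = 70); the coordinate is ret1 * 70 + ret2.  mem[3]
 * is pre-shifted left by 8 because the loop shifts it once before its
 * first edge is read.
 */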
STATIC_INLINE int64_t
coord_esep(cube_t c)
{
	cube_t ep;
	int64_t e, mem[4], i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;

	ep = _mm256_and_si256(c, EP_AVX2);
	_mm256_storeu_si256((__m256i *)mem, ep);

	mem[3] <<= 8;
	ret1 = ret2 = 0;
	k = l = 4;
	for (i = 0, j = 0; i < 12; i++, mem[i/8 + 2] >>= 8) {
		e = mem[i/8 + 2];

		bit1 = (e & ESEPBIT_1) >> 2;
		bit2 = (e & ESEPBIT_2) >> 3;
		is1 = (1 - bit2) * bit1;

		ret1 += bit2 * binomial[11-i][k];
		k -= bit2;

		jj = j < 8;
		ret2 += jj * is1 * binomial[7-(j*jj)][l];
		l -= is1;
		j += (1-bit2);
	}

	return ret1 * 70 + ret2;
}

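/*
 * Copy only the corner half (low 128 bits) or only the edge half (high
 * 128 bits) of src into dest, using a 32-bit blend.
 */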
STATIC_INLINE void
copy_corners(cube_t *dest, cube_t src)
{
	*dest = _mm256_blend_epi32(*dest, src, 0x0F);
}

STATIC_INLINE void
copy_edges(cube_t *dest, cube_t src)
{
	*dest = _mm256_blend_epi32(*dest, src, 0xF0);
}

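/*
 * Set the edge orientation from an 11-bit coordinate (see coord_eo).  The
 * orientation of UF is reconstructed as the parity of the other 11, then
 * bit j of eo12 is moved to bit 4 of the j-th edge byte: eobot covers the
 * edges UF..DR (bytes 16-23) and eotop the edges FR..BR (bytes 24-27).
 */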
STATIC_INLINE void
set_eo(cube_t *cube, int64_t eo)
{
	int64_t eo12, eotop, eobot;
	__m256i veo;

	eo12 = (eo << 1) + (_mm_popcnt_u64(eo) % 2);
	eotop = (eo12 & (1 << 11)) << 17 |
		(eo12 & (1 << 10)) << 10 |
		(eo12 & (1 << 9)) << 3 |
		(eo12 & (1 << 8)) >> 4;
	eobot = (eo12 & (1 << 7)) << 53 |
		(eo12 & (1 << 6)) << 46 |
		(eo12 & (1 << 5)) << 39 |
		(eo12 & (1 << 4)) << 32 |
		(eo12 & (1 << 3)) << 25 |
		(eo12 & (1 << 2)) << 18 |
		(eo12 & (1 << 1)) << 11 |
		(eo12 & 1) << 4;
	veo = _mm256_set_epi64x(eotop, eobot, 0, 0);

	*cube = _mm256_andnot_si256(EO_AVX2, *cube);
	*cube = _mm256_or_si256(*cube, veo);
}

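/*
 * Inverse of coord_esep: build a cube whose edges realize the given
 * coordinate.  invcoord_esep_array() (defined elsewhere) writes the 12
 * edge bytes into mem+16; they are then loaded and blended into a solved
 * cube, so the corners are left solved.
 */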
STATIC_INLINE cube_t
invcoord_esep(int64_t esep)
{
	cube_t eee, ret;
	uint8_t mem[32] = {0};

	invcoord_esep_array(esep % 70, esep / 70, mem+16);

	ret = SOLVED_CUBE;
	eee = _mm256_loadu_si256((__m256i_u *)&mem);
	copy_edges(&ret, eee);

	return ret;
}