h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48
Download | Log | Files | Refs | README | LICENSE

neon.h (7372B)


      1 #define CO2_NEON vdup_n_u8(0x60)
      2 #define COCW_NEON vdup_n_u8(0x20)
      3 #define CP_NEON vdup_n_u8(0x07)
      4 #define EP_NEON vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F))
      5 #define EO_NEON vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10))
      6 
      7 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t);
      8 STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t);
      9 
     10 // static cube
     11 #define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
     12 					e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
     13 	((cube_t){ \
     14 		.corner = {c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl}, \
     15 		.edge = {e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0}})
     16 
     17 // zero cube
     18 #define ZERO_CUBE \
     19 	(cube_t) \
     20 	{ \
     21 		.corner = vdup_n_u8(0), \
     22 		.edge = vdupq_n_u8(0) \
     23 	}
     24 
     25 // solved cube
     26 #define SOLVED_CUBE STATIC_CUBE(	\
     27 	0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
     28 
     29 /* TODO: optimize this (use intrinsics?) */
     30 STATIC_INLINE int
     31 {
     32 	int ret;
     33 
     34 	for (ret = 0; x != 0; x >>= 1)
     35 		ret += x & 1;
     36 
     37 	return ret;
     38 }
     39 
     40 STATIC void
     41 pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
     42 {
     43 	// First 8 bytes of the corner vector are copied from the c array
     44 	vst1_u8(c, cube->corner);
     45 
     46 	// 12 bytes of the edge vector are copied from the e array
     47 	// First 8 bytes
     48 	vst1_u8(e, vget_low_u8(cube->edge));
     49 	// Next 4 bytes
     50 	vst1_lane_u32((uint32_t *)(e + 8), vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0);
     51 }
     52 
     53 STATIC_INLINE bool
     54 equal(cube_t c1, cube_t c2)
     55 {
     56 	uint8x8_t cmp_corner;
     57 	uint8x16_t cmp_edge;
     58 	uint64x2_t cmp_corner_u64, cmp_edge_u64, cmp_result;
     59 
     60 	// compare the corner vectors and the edge vectors
     61 	cmp_corner = vceq_u8(c1.corner, c2.corner);
     62 	cmp_edge = vceqq_u8(c1.edge, c2.edge);
     63 
     64 	// convert the comparison vectors to 64-bit vectors and combine them
     65 	cmp_corner_u64 = vreinterpretq_u64_u8(vcombine_u64(cmp_corner, cmp_corner));
     66 	cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge);
     67 	cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64);
     68 
     69 	// check if all the bits are set
     70 	return vgetq_lane_u64(cmp_result, 0) == ~0ULL && vgetq_lane_u64(cmp_result, 1) == ~0ULL;
     71 }
     72 
     73 STATIC_INLINE cube_t
     74 invertco(cube_t c)
     75 {
     76 	cube_t ret;
     77 	uint8x8_t co, shleft, shright, summed, newco, cleanco;
     78 
     79 	co = vand_u8(c.corner, CO2_NEON);
     80 	shleft = vshl_n_u8(co, 1);
     81 	shright = vshr_n_u8(co, 1);
     82 	summed = vorr_u8(shleft, shright);
     83 	newco = vand_u8(summed, CO2_NEON);
     84 	cleanco = veor_u8(c.corner, co);
     85 	ret.corner = vorr_u8(cleanco, newco);
     86 	ret.edge = c.edge;
     87 	
     88 	return ret;
     89 }
     90 
     91 STATIC_INLINE cube_t
     92 compose_edges(cube_t c1, cube_t c2)
     93 {
     94 	cube_t ret = {0};
     95 	ret.edge = compose_edges_slim(c1.edge, c2.edge);
     96 	return ret;
     97 }
     98 
     99 STATIC_INLINE cube_t
    100 compose_corners(cube_t c1, cube_t c2)
    101 {
    102 	cube_t ret = {0};
    103 	ret.corner = compose_corners_slim(c1.corner, c2.corner);
    104 	return ret;
    105 }
    106 
    107 STATIC_INLINE uint8x16_t
    108 compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2)
    109 {
    110 	// Masks
    111 	uint8x16_t p_bits = vdupq_n_u8(PBITS);
    112 	uint8x16_t eo_bit = vdupq_n_u8(EOBIT);
    113 
    114 	// Find the index and permutation
    115 	uint8x16_t p = vandq_u8(edge2, p_bits);
    116 	uint8x16_t piece1 = vqtbl1q_u8(edge1, p);
    117 
    118 	// Calculate the orientation through XOR
    119 	uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit);
    120 
    121 	// Combine the results
    122 	uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
    123 
    124 	// Mask to clear the last 32 bits of the result
    125 	uint8x16_t mask_last_32 = vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3);
    126 	ret = vreinterpretq_u8_u32(mask_last_32);
    127 
    128 	return ret;
    129 }
    130 
    131 STATIC_INLINE uint8x8_t
    132 compose_corners_slim(uint8x8_t corner1, uint8x8_t corner2)
    133 {
    134 	// Masks
    135 	uint8x8_t p_bits = vdup_n_u8(PBITS);
    136 	uint8x8_t cobits = vdup_n_u8(COBITS);
    137 	uint8x8_t cobits2 = vdup_n_u8(COBITS_2);
    138 	uint8x8_t twist_cw = vdup_n_u8(CTWIST_CW);
    139 
    140 	// Find the index and permutation
    141 	uint8x8_t p = vand_u8(corner2, p_bits);
    142 	uint8x8_t piece1 = vtbl1_u8(corner1, p);
    143 
    144 	// Calculate the orientation
    145 	uint8x8_t aux = vadd_u8(vand_u8(corner2, cobits), vand_u8(piece1, cobits));
    146 	uint8x8_t auy = vshr_n_u8(vadd_u8(aux, twist_cw), 2);
    147 	uint8x8_t orien = vand_u8(vadd_u8(aux, auy), cobits2);
    148 
    149 	uint8x8_t ret = vorr_u8(vand_u8(piece1, p_bits), orien);
    150 
    151 	return ret;
    152 }
    153 
    154 STATIC_INLINE cube_t
    155 compose(cube_t c1, cube_t c2)
    156 {
    157 	cube_t ret = {0};
    158 
    159 	ret.edge = compose_edges_slim(c1.edge, c2.edge);
    160 	ret.corner = compose_corners_slim(c1.corner, c2.corner);
    161 
    162 	return ret;
    163 }
    164 
    165 STATIC_INLINE cube_t
    166 inverse(cube_t cube)
    167 {
    168 	uint8_t i, piece, orien;
    169 	cube_t ret;
    170 
    171 	// Temp arrays to store the NEON vectors
    172 	uint8_t edges[16];
    173 	uint8_t corners[8];
    174 
    175 	// Copy the NEON vectors to the arrays
    176 	vst1q_u8(edges, cube.edge);
    177 	vst1_u8(corners, cube.corner);
    178 
    179 	uint8_t edge_result[16] = {0};
    180 	uint8_t corner_result[8] = {0};
    181 
    182 	// Process the edges
    183 	for (i = 0; i < 12; i++)
    184 	{
    185 		piece = edges[i];
    186 		orien = piece & EOBIT;
    187 		edge_result[piece & PBITS] = i | orien;
    188 	}
    189 
    190 	// Process the corners
    191 	for (i = 0; i < 8; i++)
    192 	{
    193 		piece = corners[i];
    194 		orien = ((piece << 1) | (piece >> 1)) & COBITS_2;
    195 		corner_result[piece & PBITS] = i | orien;
    196 	}
    197 
    198 	// Copy the results back to the NEON vectors
    199 	ret.edge = vld1q_u8(edge_result);
    200 	ret.corner = vld1_u8(corner_result);
    201 
    202 	return ret;
    203 }
    204 
    205 STATIC_INLINE int64_t
    206 coord_co(cube_t c)
    207 {
    208 	// Temp array to store the NEON vector
    209 	uint8_t mem[8];
    210 	vst1_u8(mem, c.corner);
    211 
    212 	int i, p;
    213 	int64_t ret;
    214 
    215 	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3)
    216 		ret += p * (mem[i] >> COSHIFT);
    217 
    218 	return ret;
    219 }
    220 
    221 STATIC_INLINE int64_t
    222 coord_csep(cube_t c)
    223 {
    224 	// Temp array to store the NEON vector
    225 	uint8_t mem[8];
    226 	vst1_u8(mem, c.corner);
    227 
    228 	int64_t ret = 0;
    229 	int i, p;
    230 	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2)
    231 		ret += p * ((mem[i] & CSEPBIT) >> 2);
    232 
    233 	return ret;
    234 	return 0;
    235 }
    236 
    237 STATIC_INLINE int64_t
    238 coord_cocsep(cube_t c)
    239 {
    240 	return (coord_co(c) << 7) + coord_csep(c);
    241 }
    242 
    243 STATIC_INLINE int64_t
    244 coord_eo(cube_t c)
    245 {
    246 	int64_t ret = 0;
    247 	int64_t p = 1;
    248 
    249 	// Temp array to store the NEON vector
    250 	uint8_t mem[16];
    251 	vst1q_u8(mem, c.edge);
    252 
    253 	for (int i = 1; i < 12; i++, p *= 2)
    254 	{
    255 		ret += p * (mem[i] >> EOSHIFT);
    256 	}
    257 
    258 	return ret;
    259 }
    260 
    261 STATIC_INLINE int64_t
    262 coord_esep(cube_t c)
    263 {
    264 	int64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
    265 
    266 	// Temp array to store the NEON vector
    267 	uint8_t mem[16];
    268 	vst1q_u8(mem, c.edge);
    269 
    270 	for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++)
    271 	{
    272 		bit1 = (mem[i] & ESEPBIT_1) >> 2;
    273 		bit2 = (mem[i] & ESEPBIT_2) >> 3;
    274 		is1 = (1 - bit2) * bit1;
    275 
    276 		ret1 += bit2 * binomial[11 - i][k];
    277 		k -= bit2;
    278 
    279 		jj = j < 8;
    280 		ret2 += jj * is1 * binomial[7 - (j * jj)][l];
    281 		l -= is1;
    282 		j += (1 - bit2);
    283 	}
    284 
    285 	return ret1 * 70 + ret2;
    286 }
    287 
    288 STATIC_INLINE void
    289 copy_corners(cube_t *dst, cube_t src)
    290 {
    291 	dst->corner = src.corner;
    292 }
    293 
    294 STATIC_INLINE void
    295 copy_edges(cube_t *dst, cube_t src)
    296 {
    297 	dst->edge = src.edge;
    298 }
    299 
    300 STATIC_INLINE void
    301 set_eo(cube_t *cube, int64_t eo)
    302 {
    303 	// Temp array to store the NEON vector
    304 	uint8_t mem[16];
    305 	vst1q_u8(mem, cube->edge);
    306 	uint8_t i, sum, flip;
    307 
    308 	for (sum = 0, i = 1; i < 12; i++, eo >>= 1)
    309 	{
    310 		flip = eo % 2;
    311 		sum += flip;
    312 		mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip);
    313 	}
    314 	mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2));
    315 
    316 	// Copy the results back to the NEON vector
    317 	cube->edge = vld1q_u8(mem);
    318 	return;
    319 }
    320 
    321 STATIC_INLINE cube_t
    322 invcoord_esep(int64_t esep)
    323 {
    324 	cube_t ret;
    325 	uint8_t mem[16] = {0};
    326 
    327 	invcoord_esep_array(esep % 70, esep / 70, mem);
    328 
    329 	ret = SOLVED_CUBE;
    330 	ret.edge = vld1q_u8(mem);
    331 
    332 	return ret;
    333 }