h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48
Download | Log | Files | Refs | README | LICENSE

neon.h (7517B)


      1 #define CO2_NEON vdupq_n_u8(0x60)
      2 #define COCW_NEON vdupq_n_u8(0x20)
      3 #define CP_NEON vdupq_n_u8(0x07)
      4 #define EP_NEON vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F))
      5 #define EO_NEON vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10))
      6 
      7 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t);
      8 STATIC_INLINE uint8x16_t compose_corners_slim(uint8x16_t, uint8x16_t);
      9 
     10 // static cube
     11 #define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
     12 					e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
     13 	((cube_t){ \
     14 		.corner = {c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, 0, 0, 0, 0, 0, 0, 0, 0}, \
     15 		.edge = {e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0}})
     16 
     17 // zero cube
     18 #define ZERO_CUBE \
     19 	(cube_t) \
     20 	{ \
     21 		.corner = vdupq_n_u8(0), \
     22 		.edge = vdupq_n_u8(0) \
     23 	}
     24 
     25 // solved cube
     26 #define SOLVED_CUBE STATIC_CUBE(	\
     27 	0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
     28 
     29 STATIC void
     30 pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
     31 {
     32 	// First 8 bytes of the corner vector are copied from the c array
     33 	vst1_u8(c, vget_low_u8(cube->corner));
     34 
     35 	// 12 bytes of the edge vector are copied from the e array
     36 	// First 8 bytes
     37 	vst1_u8(e, vget_low_u8(cube->edge));
     38 	// Next 4 bytes
     39 	vst1_lane_u32((uint32_t *)(e + 8), vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0);
     40 }
     41 
     42 STATIC_INLINE bool
     43 equal(cube_t c1, cube_t c2)
     44 {
     45 	uint8x16_t cmp_corner, cmp_edge;
     46 	uint64x2_t cmp_corner_u64, cmp_edge_u64;
     47 	uint64x2_t cmp_result;
     48 
     49 	// compare the corner vectors
     50 	cmp_corner = vceqq_u8(c1.corner, c2.corner);
     51 	// compare the edge vectors
     52 	cmp_edge = vceqq_u8(c1.edge, c2.edge);
     53 
     54 	// convert the comparison vectors to 64-bit vectors
     55 	cmp_corner_u64 = vreinterpretq_u64_u8(cmp_corner);
     56 	cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge);
     57 
     58 	// combine the comparison vectors
     59 	cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64);
     60 
     61 	// check if all the bits are set
     62 	return vgetq_lane_u64(cmp_result, 0) == ~0ULL && vgetq_lane_u64(cmp_result, 1) == ~0ULL;
     63 }
     64 
     65 STATIC_INLINE cube_t
     66 invertco(cube_t c)
     67 {
     68 	cube_t ret;
     69 	uint8x16_t co, shleft, shright, summed, newco, cleanco;
     70 
     71 	co = vandq_u8(c.corner, CO2_NEON);
     72 	shleft = vshlq_n_u8(co, 1);
     73 	shright = vshrq_n_u8(co, 1);
     74 	summed = vorrq_u8(shleft, shright);
     75 	newco = vandq_u8(summed, CO2_NEON);
     76 	cleanco = veorq_u8(c.corner, co);
     77 	ret.corner = vorrq_u8(cleanco, newco);
     78 	ret.edge = c.edge;
     79 	
     80 	return ret;
     81 }
     82 
     83 STATIC_INLINE cube_t
     84 compose_edges(cube_t c1, cube_t c2)
     85 {
     86 	cube_t ret = {0};
     87 	ret.edge = compose_edges_slim(c1.edge, c2.edge);
     88 	return ret;
     89 }
     90 
     91 STATIC_INLINE cube_t
     92 compose_corners(cube_t c1, cube_t c2)
     93 {
     94 	cube_t ret = {0};
     95 	ret.corner = compose_corners_slim(c1.corner, c2.corner);
     96 	return ret;
     97 }
     98 
     99 STATIC_INLINE uint8x16_t
    100 compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2)
    101 {
    102 	// Masks
    103 	uint8x16_t p_bits = vdupq_n_u8(PBITS);
    104 	uint8x16_t eo_bit = vdupq_n_u8(EOBIT);
    105 
    106 	// Find the index and permutation
    107 	uint8x16_t p = vandq_u8(edge2, p_bits);
    108 	uint8x16_t piece1 = vqtbl1q_u8(edge1, p);
    109 
    110 	// Calculate the orientation through XOR
    111 	uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit);
    112 
    113 	// Combine the results
    114 	uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
    115 
    116 	// Mask to clear the last 32 bits of the result
    117 	uint8x16_t mask_last_32 = vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3);
    118 	ret = vreinterpretq_u8_u32(mask_last_32);
    119 
    120 	return ret;
    121 }
    122 
    123 STATIC_INLINE uint8x16_t
    124 compose_corners_slim(uint8x16_t corner1, uint8x16_t corner2)
    125 {
    126 	// Masks
    127 	uint8x16_t p_bits = vdupq_n_u8(PBITS);
    128 	uint8x16_t cobits = vdupq_n_u8(COBITS);
    129 	uint8x16_t cobits2 = vdupq_n_u8(COBITS_2);
    130 	uint8x16_t twist_cw = vdupq_n_u8(CTWIST_CW);
    131 
    132 	// Find the index and permutation
    133 	uint8x16_t p = vandq_u8(corner2, p_bits);
    134 	uint8x16_t piece1 = vqtbl1q_u8(corner1, p);
    135 
    136 	// Calculate the orientation
    137 	uint8x16_t aux = vaddq_u8(vandq_u8(corner2, cobits), vandq_u8(piece1, cobits));
    138 	uint8x16_t auy = vshrq_n_u8(vaddq_u8(aux, twist_cw), 2);
    139 	uint8x16_t orien = vandq_u8(vaddq_u8(aux, auy), cobits2);
    140 
    141 	// Combine the results
    142 	uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
    143 
    144 	// Mask to clear the last 64 bits of the result
    145 	uint8x16_t mask_last_64 = vsetq_lane_u64(0, vreinterpretq_u64_u8(ret), 1);
    146 	ret = vreinterpretq_u8_u64(mask_last_64);
    147 
    148 	return ret;
    149 }
    150 
    151 STATIC_INLINE cube_t
    152 compose(cube_t c1, cube_t c2)
    153 {
    154 	cube_t ret = {0};
    155 
    156 	ret.edge = compose_edges_slim(c1.edge, c2.edge);
    157 	ret.corner = compose_corners_slim(c1.corner, c2.corner);
    158 
    159 	return ret;
    160 }
    161 
    162 STATIC_INLINE cube_t
    163 inverse(cube_t cube)
    164 {
    165 	uint8_t i, piece, orien;
    166 	cube_t ret;
    167 
    168 	// Temp arrays to store the NEON vectors
    169 	uint8_t edges[16];
    170 	uint8_t corners[16];
    171 
    172 	// Copy the NEON vectors to the arrays
    173 	vst1q_u8(edges, cube.edge);
    174 	vst1q_u8(corners, cube.corner);
    175 
    176 	uint8_t edge_result[16] = {0};
    177 	uint8_t corner_result[16] = {0};
    178 
    179 	// Process the edges
    180 	for (i = 0; i < 12; i++)
    181 	{
    182 		piece = edges[i];
    183 		orien = piece & EOBIT;
    184 		edge_result[piece & PBITS] = i | orien;
    185 	}
    186 
    187 	// Process the corners
    188 	for (i = 0; i < 8; i++)
    189 	{
    190 		piece = corners[i];
    191 		orien = ((piece << 1) | (piece >> 1)) & COBITS_2;
    192 		corner_result[piece & PBITS] = i | orien;
    193 	}
    194 
    195 	// Copy the results back to the NEON vectors
    196 	ret.edge = vld1q_u8(edge_result);
    197 	ret.corner = vld1q_u8(corner_result);
    198 
    199 	return ret;
    200 }
    201 
    202 STATIC_INLINE int64_t
    203 coord_co(cube_t c)
    204 {
    205 	// Temp array to store the NEON vector
    206 	uint8_t mem[16];
    207 	vst1q_u8(mem, c.corner);
    208 
    209 	int i, p;
    210 	int64_t ret;
    211 
    212 	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3)
    213 		ret += p * (mem[i] >> COSHIFT);
    214 
    215 	return ret;
    216 }
    217 
    218 STATIC_INLINE int64_t
    219 coord_csep(cube_t c)
    220 {
    221 	// Temp array to store the NEON vector
    222 	uint8_t mem[16];
    223 	vst1q_u8(mem, c.corner);
    224 
    225 	int64_t ret = 0;
    226 	int i, p;
    227 	for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2)
    228 		ret += p * ((mem[i] & CSEPBIT) >> 2);
    229 
    230 	return ret;
    231 	return 0;
    232 }
    233 
    234 STATIC_INLINE int64_t
    235 coord_cocsep(cube_t c)
    236 {
    237 	return (coord_co(c) << 7) + coord_csep(c);
    238 }
    239 
    240 STATIC_INLINE int64_t
    241 coord_eo(cube_t c)
    242 {
    243 	int64_t ret = 0;
    244 	int64_t p = 1;
    245 
    246 	// Temp array to store the NEON vector
    247 	uint8_t mem[16];
    248 	vst1q_u8(mem, c.edge);
    249 
    250 	for (int i = 1; i < 12; i++, p *= 2)
    251 	{
    252 		ret += p * (mem[i] >> EOSHIFT);
    253 	}
    254 
    255 	return ret;
    256 }
    257 
    258 STATIC_INLINE int64_t
    259 coord_esep(cube_t c)
    260 {
    261 	int64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
    262 
    263 	// Temp array to store the NEON vector
    264 	uint8_t mem[16];
    265 	vst1q_u8(mem, c.edge);
    266 
    267 	for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++)
    268 	{
    269 		bit1 = (mem[i] & ESEPBIT_1) >> 2;
    270 		bit2 = (mem[i] & ESEPBIT_2) >> 3;
    271 		is1 = (1 - bit2) * bit1;
    272 
    273 		ret1 += bit2 * binomial[11 - i][k];
    274 		k -= bit2;
    275 
    276 		jj = j < 8;
    277 		ret2 += jj * is1 * binomial[7 - (j * jj)][l];
    278 		l -= is1;
    279 		j += (1 - bit2);
    280 	}
    281 
    282 	return ret1 * 70 + ret2;
    283 }
    284 
    285 STATIC_INLINE void
    286 copy_corners(cube_t *dst, cube_t src)
    287 {
    288 	dst->corner = src.corner;
    289 }
    290 
    291 STATIC_INLINE void
    292 copy_edges(cube_t *dst, cube_t src)
    293 {
    294 	dst->edge = src.edge;
    295 }
    296 
    297 STATIC_INLINE void
    298 set_eo(cube_t *cube, int64_t eo)
    299 {
    300 	// Temp array to store the NEON vector
    301 	uint8_t mem[16];
    302 	vst1q_u8(mem, cube->edge);
    303 	uint8_t i, sum, flip;
    304 
    305 	for (sum = 0, i = 1; i < 12; i++, eo >>= 1)
    306 	{
    307 		flip = eo % 2;
    308 		sum += flip;
    309 		mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip);
    310 	}
    311 	mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2));
    312 
    313 	// Copy the results back to the NEON vector
    314 	cube->edge = vld1q_u8(mem);
    315 	return;
    316 }
    317 
    318 STATIC_INLINE cube_t
    319 invcoord_esep(int64_t esep)
    320 {
    321 	cube_t ret;
    322 	uint8_t mem[16] = {0};
    323 
    324 	invcoord_esep_array(esep % 70, esep / 70, mem);
    325 
    326 	ret = SOLVED_CUBE;
    327 	ret.edge = vld1q_u8(mem);
    328 
    329 	return ret;
    330 }