neon.h (7785B)
1 #define CO2_NEON vdup_n_u8(0x60) 2 #define COCW_NEON vdup_n_u8(0x20) 3 #define CP_NEON vdup_n_u8(0x07) 4 #define EP_NEON vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F)) 5 #define EO_NEON vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10)) 6 7 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t); 8 STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t); 9 10 #define STATIC_CUBE( \ 11 c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \ 12 e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \ 13 ((cube_t){ \ 14 .corner = { \ 15 c_ufr, c_ubl, c_dfl, c_dbr, \ 16 c_ufl, c_ubr, c_dfr, c_dbl \ 17 }, \ 18 .edge = { \ 19 e_uf, e_ub, e_db, e_df, e_ur, e_ul, \ 20 e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0 \ 21 } \ 22 }) 23 24 #define ZERO_CUBE \ 25 ((cube_t){ \ 26 .corner = vdup_n_u8(0), \ 27 .edge = vdupq_n_u8(0) \ 28 }) 29 30 #define SOLVED_CUBE STATIC_CUBE( \ 31 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) 32 33 /* TODO: optimize this (use intrinsics?) */ 34 STATIC_INLINE int 35 popcount_u32(uint32_t x) 36 { 37 int ret; 38 39 for (ret = 0; x != 0; x >>= 1) 40 ret += x & 1; 41 42 return ret; 43 } 44 45 STATIC void 46 pieces(cube_t cube[static 1], uint8_t c[static 8], uint8_t e[static 12]) 47 { 48 // First 8 bytes of the corner vector are copied from the c array 49 vst1_u8(c, cube->corner); 50 51 // 12 bytes of the edge vector are copied from the e array 52 // First 8 bytes 53 vst1_u8(e, vget_low_u8(cube->edge)); 54 // Next 4 bytes 55 vst1_lane_u32((uint32_t *)(e + 8), 56 vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0); 57 } 58 59 STATIC_INLINE bool 60 equal(cube_t c1, cube_t c2) 61 { 62 uint8x8_t cmp_corner; 63 uint8x16_t cmp_edge; 64 uint64x2_t cmp_corner_u64, cmp_edge_u64, cmp_result; 65 66 // compare the corner vectors and the edge vectors 67 cmp_corner = vceq_u8(c1.corner, c2.corner); 68 cmp_edge = vceqq_u8(c1.edge, c2.edge); 69 70 // convert the comparison vectors to 64-bit vectors and combine them 71 cmp_corner_u64 = vreinterpretq_u64_u8( 72 vcombine_u64(cmp_corner, cmp_corner)); 73 cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge); 74 cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64); 75 76 // check if all the bits are set 77 return vgetq_lane_u64(cmp_result, 0) == ~0ULL && 78 vgetq_lane_u64(cmp_result, 1) == ~0ULL; 79 } 80 81 STATIC_INLINE cube_t 82 invertco(cube_t c) 83 { 84 cube_t ret; 85 uint8x8_t co, shleft, shright, summed, newco, cleanco; 86 87 co = vand_u8(c.corner, CO2_NEON); 88 shleft = vshl_n_u8(co, 1); 89 shright = vshr_n_u8(co, 1); 90 summed = vorr_u8(shleft, shright); 91 newco = vand_u8(summed, CO2_NEON); 92 cleanco = veor_u8(c.corner, co); 93 ret.corner = vorr_u8(cleanco, newco); 94 ret.edge = c.edge; 95 96 return ret; 97 } 98 99 STATIC_INLINE cube_t 100 compose_edges(cube_t c1, cube_t c2) 101 { 102 cube_t ret = {0}; 103 ret.edge = compose_edges_slim(c1.edge, c2.edge); 104 return ret; 105 } 106 107 STATIC_INLINE cube_t 108 compose_corners(cube_t c1, cube_t c2) 109 { 110 cube_t ret = {0}; 111 ret.corner = compose_corners_slim(c1.corner, c2.corner); 112 return ret; 113 } 114 115 STATIC_INLINE uint8x16_t 116 compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2) 117 { 118 // Masks 119 uint8x16_t p_bits = vdupq_n_u8(PBITS); 120 uint8x16_t eo_bit = vdupq_n_u8(EOBIT); 121 122 // Find the index and permutation 123 uint8x16_t p = vandq_u8(edge2, p_bits); 124 uint8x16_t piece1 = vqtbl1q_u8(edge1, p); 125 126 // Calculate the orientation through XOR 127 uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit); 128 129 // Combine the results 130 uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien); 131 132 // Mask to clear the last 32 bits of the result 133 uint8x16_t mask_last_32 = 134 vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3); 135 ret = vreinterpretq_u8_u32(mask_last_32); 136 137 return ret; 138 } 139 140 STATIC_INLINE uint8x8_t 141 compose_corners_slim(uint8x8_t corner1, uint8x8_t corner2) 142 { 143 // Masks 144 uint8x8_t p_bits = vdup_n_u8(PBITS); 145 uint8x8_t cobits = vdup_n_u8(COBITS); 146 uint8x8_t cobits2 = vdup_n_u8(COBITS_2); 147 uint8x8_t twist_cw = vdup_n_u8(CTWIST_CW); 148 149 // Find the index and permutation 150 uint8x8_t p = vand_u8(corner2, p_bits); 151 uint8x8_t piece1 = vtbl1_u8(corner1, p); 152 153 // Calculate the orientation 154 uint8x8_t aux = 155 vadd_u8(vand_u8(corner2, cobits), vand_u8(piece1, cobits)); 156 uint8x8_t auy = vshr_n_u8(vadd_u8(aux, twist_cw), 2); 157 uint8x8_t orien = vand_u8(vadd_u8(aux, auy), cobits2); 158 159 uint8x8_t ret = vorr_u8(vand_u8(piece1, p_bits), orien); 160 161 return ret; 162 } 163 164 STATIC_INLINE cube_t 165 compose(cube_t c1, cube_t c2) 166 { 167 cube_t ret = {0}; 168 169 ret.edge = compose_edges_slim(c1.edge, c2.edge); 170 ret.corner = compose_corners_slim(c1.corner, c2.corner); 171 172 return ret; 173 } 174 175 STATIC_INLINE cube_t 176 inverse(cube_t cube) 177 { 178 uint8_t i, piece, orien; 179 cube_t ret; 180 181 // Temp arrays to store the NEON vectors 182 uint8_t edges[16]; 183 uint8_t corners[8]; 184 185 // Copy the NEON vectors to the arrays 186 vst1q_u8(edges, cube.edge); 187 vst1_u8(corners, cube.corner); 188 189 uint8_t edge_result[16] = {0}; 190 uint8_t corner_result[8] = {0}; 191 192 // Process the edges 193 for (i = 0; i < 12; i++) 194 { 195 piece = edges[i]; 196 orien = piece & EOBIT; 197 edge_result[piece & PBITS] = i | orien; 198 } 199 200 // Process the corners 201 for (i = 0; i < 8; i++) 202 { 203 piece = corners[i]; 204 orien = ((piece << 1) | (piece >> 1)) & COBITS_2; 205 corner_result[piece & PBITS] = i | orien; 206 } 207 208 // Copy the results back to the NEON vectors 209 ret.edge = vld1q_u8(edge_result); 210 ret.corner = vld1_u8(corner_result); 211 212 return ret; 213 } 214 215 STATIC_INLINE int64_t 216 coord_co(cube_t c) 217 { 218 // Temp array to store the NEON vector 219 uint8_t mem[8]; 220 vst1_u8(mem, c.corner); 221 222 int i, p; 223 int64_t ret; 224 225 for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3) 226 ret += p * (mem[i] >> COSHIFT); 227 228 return ret; 229 } 230 231 STATIC_INLINE cube_t 232 invcoord_co(int64_t coord) 233 { 234 int64_t co, c, i, p; 235 uint8_t mem[8]; 236 cube_t cube; 237 238 for (i = 0, p = 0, c = coord; i < 8; i++, c /= 3) { 239 co = i == 7 ? ((3 - (p % 3)) % 3) : (c % 3); 240 p += co; 241 mem[i] = i + (co << COSHIFT); 242 } 243 244 cube.corner = vld1_u8(mem); 245 cube.edge = SOLVED_CUBE.edge; 246 247 return cube; 248 } 249 250 STATIC_INLINE int64_t 251 coord_csep(cube_t c) 252 { 253 // Temp array to store the NEON vector 254 uint8_t mem[8]; 255 vst1_u8(mem, c.corner); 256 257 int64_t ret = 0; 258 int i, p; 259 for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2) 260 ret += p * ((mem[i] & CSEPBIT) >> 2); 261 262 return ret; 263 return 0; 264 } 265 266 STATIC_INLINE int64_t 267 coord_cocsep(cube_t c) 268 { 269 return (coord_co(c) << 7) + coord_csep(c); 270 } 271 272 STATIC_INLINE int64_t 273 coord_eo(cube_t c) 274 { 275 int64_t ret = 0; 276 int64_t p = 1; 277 278 // Temp array to store the NEON vector 279 uint8_t mem[16]; 280 vst1q_u8(mem, c.edge); 281 282 for (int i = 1; i < 12; i++, p *= 2) 283 { 284 ret += p * (mem[i] >> EOSHIFT); 285 } 286 287 return ret; 288 } 289 290 STATIC_INLINE int64_t 291 coord_esep(cube_t c) 292 { 293 int64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1; 294 295 // Temp array to store the NEON vector 296 uint8_t mem[16]; 297 vst1q_u8(mem, c.edge); 298 299 for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++) 300 { 301 bit1 = (mem[i] & ESEPBIT_1) >> 2; 302 bit2 = (mem[i] & ESEPBIT_2) >> 3; 303 is1 = (1 - bit2) * bit1; 304 305 ret1 += bit2 * binomial[11 - i][k]; 306 k -= bit2; 307 308 jj = j < 8; 309 ret2 += jj * is1 * binomial[7 - (j * jj)][l]; 310 l -= is1; 311 j += (1 - bit2); 312 } 313 314 return ret1 * 70 + ret2; 315 } 316 317 STATIC_INLINE void 318 copy_corners(cube_t dst[static 1], cube_t src) 319 { 320 dst->corner = src.corner; 321 } 322 323 STATIC_INLINE void 324 copy_edges(cube_t dst[static 1], cube_t src) 325 { 326 dst->edge = src.edge; 327 } 328 329 STATIC_INLINE void 330 set_eo(cube_t cube[static 1], int64_t eo) 331 { 332 // Temp array to store the NEON vector 333 uint8_t mem[16]; 334 vst1q_u8(mem, cube->edge); 335 uint8_t i, sum, flip; 336 337 for (sum = 0, i = 1; i < 12; i++, eo >>= 1) 338 { 339 flip = eo % 2; 340 sum += flip; 341 mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip); 342 } 343 mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2)); 344 345 // Copy the results back to the NEON vector 346 cube->edge = vld1q_u8(mem); 347 return; 348 } 349 350 STATIC_INLINE cube_t 351 invcoord_esep(int64_t esep) 352 { 353 cube_t ret; 354 uint8_t mem[16] = {0}; 355 356 invcoord_esep_array(esep % 70, esep / 70, mem); 357 358 ret = SOLVED_CUBE; 359 ret.edge = vld1q_u8(mem); 360 361 return ret; 362 }