neon.h (7517B)
1 #define CO2_NEON vdupq_n_u8(0x60) 2 #define COCW_NEON vdupq_n_u8(0x20) 3 #define CP_NEON vdupq_n_u8(0x07) 4 #define EP_NEON vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F)) 5 #define EO_NEON vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10)) 6 7 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t); 8 STATIC_INLINE uint8x16_t compose_corners_slim(uint8x16_t, uint8x16_t); 9 10 // static cube 11 #define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \ 12 e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \ 13 ((cube_t){ \ 14 .corner = {c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, 0, 0, 0, 0, 0, 0, 0, 0}, \ 15 .edge = {e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0}}) 16 17 // zero cube 18 #define ZERO_CUBE \ 19 (cube_t) \ 20 { \ 21 .corner = vdupq_n_u8(0), \ 22 .edge = vdupq_n_u8(0) \ 23 } 24 25 // solved cube 26 #define SOLVED_CUBE STATIC_CUBE( \ 27 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) 28 29 STATIC void 30 pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12]) 31 { 32 // First 8 bytes of the corner vector are copied from the c array 33 vst1_u8(c, vget_low_u8(cube->corner)); 34 35 // 12 bytes of the edge vector are copied from the e array 36 // First 8 bytes 37 vst1_u8(e, vget_low_u8(cube->edge)); 38 // Next 4 bytes 39 vst1_lane_u32((uint32_t *)(e + 8), vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0); 40 } 41 42 STATIC_INLINE bool 43 equal(cube_t c1, cube_t c2) 44 { 45 uint8x16_t cmp_corner, cmp_edge; 46 uint64x2_t cmp_corner_u64, cmp_edge_u64; 47 uint64x2_t cmp_result; 48 49 // compare the corner vectors 50 cmp_corner = vceqq_u8(c1.corner, c2.corner); 51 // compare the edge vectors 52 cmp_edge = vceqq_u8(c1.edge, c2.edge); 53 54 // convert the comparison vectors to 64-bit vectors 55 cmp_corner_u64 = vreinterpretq_u64_u8(cmp_corner); 56 cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge); 57 58 // combine the comparison vectors 59 cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64); 60 61 // check if all the bits are set 62 return vgetq_lane_u64(cmp_result, 0) == ~0ULL && vgetq_lane_u64(cmp_result, 1) == ~0ULL; 63 } 64 65 STATIC_INLINE cube_t 66 invertco(cube_t c) 67 { 68 cube_t ret; 69 uint8x16_t co, shleft, shright, summed, newco, cleanco; 70 71 co = vandq_u8(c.corner, CO2_NEON); 72 shleft = vshlq_n_u8(co, 1); 73 shright = vshrq_n_u8(co, 1); 74 summed = vorrq_u8(shleft, shright); 75 newco = vandq_u8(summed, CO2_NEON); 76 cleanco = veorq_u8(c.corner, co); 77 ret.corner = vorrq_u8(cleanco, newco); 78 ret.edge = c.edge; 79 80 return ret; 81 } 82 83 STATIC_INLINE cube_t 84 compose_edges(cube_t c1, cube_t c2) 85 { 86 cube_t ret = {0}; 87 ret.edge = compose_edges_slim(c1.edge, c2.edge); 88 return ret; 89 } 90 91 STATIC_INLINE cube_t 92 compose_corners(cube_t c1, cube_t c2) 93 { 94 cube_t ret = {0}; 95 ret.corner = compose_corners_slim(c1.corner, c2.corner); 96 return ret; 97 } 98 99 STATIC_INLINE uint8x16_t 100 compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2) 101 { 102 // Masks 103 uint8x16_t p_bits = vdupq_n_u8(PBITS); 104 uint8x16_t eo_bit = vdupq_n_u8(EOBIT); 105 106 // Find the index and permutation 107 uint8x16_t p = vandq_u8(edge2, p_bits); 108 uint8x16_t piece1 = vqtbl1q_u8(edge1, p); 109 110 // Calculate the orientation through XOR 111 uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit); 112 113 // Combine the results 114 uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien); 115 116 // Mask to clear the last 32 bits of the result 117 uint8x16_t mask_last_32 = vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3); 118 ret = vreinterpretq_u8_u32(mask_last_32); 119 120 return ret; 121 } 122 123 STATIC_INLINE uint8x16_t 124 compose_corners_slim(uint8x16_t corner1, uint8x16_t corner2) 125 { 126 // Masks 127 uint8x16_t p_bits = vdupq_n_u8(PBITS); 128 uint8x16_t cobits = vdupq_n_u8(COBITS); 129 uint8x16_t cobits2 = vdupq_n_u8(COBITS_2); 130 uint8x16_t twist_cw = vdupq_n_u8(CTWIST_CW); 131 132 // Find the index and permutation 133 uint8x16_t p = vandq_u8(corner2, p_bits); 134 uint8x16_t piece1 = vqtbl1q_u8(corner1, p); 135 136 // Calculate the orientation 137 uint8x16_t aux = vaddq_u8(vandq_u8(corner2, cobits), vandq_u8(piece1, cobits)); 138 uint8x16_t auy = vshrq_n_u8(vaddq_u8(aux, twist_cw), 2); 139 uint8x16_t orien = vandq_u8(vaddq_u8(aux, auy), cobits2); 140 141 // Combine the results 142 uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien); 143 144 // Mask to clear the last 64 bits of the result 145 uint8x16_t mask_last_64 = vsetq_lane_u64(0, vreinterpretq_u64_u8(ret), 1); 146 ret = vreinterpretq_u8_u64(mask_last_64); 147 148 return ret; 149 } 150 151 STATIC_INLINE cube_t 152 compose(cube_t c1, cube_t c2) 153 { 154 cube_t ret = {0}; 155 156 ret.edge = compose_edges_slim(c1.edge, c2.edge); 157 ret.corner = compose_corners_slim(c1.corner, c2.corner); 158 159 return ret; 160 } 161 162 STATIC_INLINE cube_t 163 inverse(cube_t cube) 164 { 165 uint8_t i, piece, orien; 166 cube_t ret; 167 168 // Temp arrays to store the NEON vectors 169 uint8_t edges[16]; 170 uint8_t corners[16]; 171 172 // Copy the NEON vectors to the arrays 173 vst1q_u8(edges, cube.edge); 174 vst1q_u8(corners, cube.corner); 175 176 uint8_t edge_result[16] = {0}; 177 uint8_t corner_result[16] = {0}; 178 179 // Process the edges 180 for (i = 0; i < 12; i++) 181 { 182 piece = edges[i]; 183 orien = piece & EOBIT; 184 edge_result[piece & PBITS] = i | orien; 185 } 186 187 // Process the corners 188 for (i = 0; i < 8; i++) 189 { 190 piece = corners[i]; 191 orien = ((piece << 1) | (piece >> 1)) & COBITS_2; 192 corner_result[piece & PBITS] = i | orien; 193 } 194 195 // Copy the results back to the NEON vectors 196 ret.edge = vld1q_u8(edge_result); 197 ret.corner = vld1q_u8(corner_result); 198 199 return ret; 200 } 201 202 STATIC_INLINE int64_t 203 coord_co(cube_t c) 204 { 205 // Temp array to store the NEON vector 206 uint8_t mem[16]; 207 vst1q_u8(mem, c.corner); 208 209 int i, p; 210 int64_t ret; 211 212 for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3) 213 ret += p * (mem[i] >> COSHIFT); 214 215 return ret; 216 } 217 218 STATIC_INLINE int64_t 219 coord_csep(cube_t c) 220 { 221 // Temp array to store the NEON vector 222 uint8_t mem[16]; 223 vst1q_u8(mem, c.corner); 224 225 int64_t ret = 0; 226 int i, p; 227 for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2) 228 ret += p * ((mem[i] & CSEPBIT) >> 2); 229 230 return ret; 231 return 0; 232 } 233 234 STATIC_INLINE int64_t 235 coord_cocsep(cube_t c) 236 { 237 return (coord_co(c) << 7) + coord_csep(c); 238 } 239 240 STATIC_INLINE int64_t 241 coord_eo(cube_t c) 242 { 243 int64_t ret = 0; 244 int64_t p = 1; 245 246 // Temp array to store the NEON vector 247 uint8_t mem[16]; 248 vst1q_u8(mem, c.edge); 249 250 for (int i = 1; i < 12; i++, p *= 2) 251 { 252 ret += p * (mem[i] >> EOSHIFT); 253 } 254 255 return ret; 256 } 257 258 STATIC_INLINE int64_t 259 coord_esep(cube_t c) 260 { 261 int64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1; 262 263 // Temp array to store the NEON vector 264 uint8_t mem[16]; 265 vst1q_u8(mem, c.edge); 266 267 for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++) 268 { 269 bit1 = (mem[i] & ESEPBIT_1) >> 2; 270 bit2 = (mem[i] & ESEPBIT_2) >> 3; 271 is1 = (1 - bit2) * bit1; 272 273 ret1 += bit2 * binomial[11 - i][k]; 274 k -= bit2; 275 276 jj = j < 8; 277 ret2 += jj * is1 * binomial[7 - (j * jj)][l]; 278 l -= is1; 279 j += (1 - bit2); 280 } 281 282 return ret1 * 70 + ret2; 283 } 284 285 STATIC_INLINE void 286 copy_corners(cube_t *dst, cube_t src) 287 { 288 dst->corner = src.corner; 289 } 290 291 STATIC_INLINE void 292 copy_edges(cube_t *dst, cube_t src) 293 { 294 dst->edge = src.edge; 295 } 296 297 STATIC_INLINE void 298 set_eo(cube_t *cube, int64_t eo) 299 { 300 // Temp array to store the NEON vector 301 uint8_t mem[16]; 302 vst1q_u8(mem, cube->edge); 303 uint8_t i, sum, flip; 304 305 for (sum = 0, i = 1; i < 12; i++, eo >>= 1) 306 { 307 flip = eo % 2; 308 sum += flip; 309 mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip); 310 } 311 mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2)); 312 313 // Copy the results back to the NEON vector 314 cube->edge = vld1q_u8(mem); 315 return; 316 } 317 318 STATIC_INLINE cube_t 319 invcoord_esep(int64_t esep) 320 { 321 cube_t ret; 322 uint8_t mem[16] = {0}; 323 324 invcoord_esep_array(esep % 70, esep / 70, mem); 325 326 ret = SOLVED_CUBE; 327 ret.edge = vld1q_u8(mem); 328 329 return ret; 330 }