neon.h (7372B)
1 #define CO2_NEON vdup_n_u8(0x60) 2 #define COCW_NEON vdup_n_u8(0x20) 3 #define CP_NEON vdup_n_u8(0x07) 4 #define EP_NEON vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F)) 5 #define EO_NEON vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10)) 6 7 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t); 8 STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t); 9 10 // static cube 11 #define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \ 12 e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \ 13 ((cube_t){ \ 14 .corner = {c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl}, \ 15 .edge = {e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0}}) 16 17 // zero cube 18 #define ZERO_CUBE \ 19 (cube_t) \ 20 { \ 21 .corner = vdup_n_u8(0), \ 22 .edge = vdupq_n_u8(0) \ 23 } 24 25 // solved cube 26 #define SOLVED_CUBE STATIC_CUBE( \ 27 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) 28 29 /* TODO: optimize this (use intrinsics?) */ 30 STATIC_INLINE int 31 { 32 int ret; 33 34 for (ret = 0; x != 0; x >>= 1) 35 ret += x & 1; 36 37 return ret; 38 } 39 40 STATIC void 41 pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12]) 42 { 43 // First 8 bytes of the corner vector are copied from the c array 44 vst1_u8(c, cube->corner); 45 46 // 12 bytes of the edge vector are copied from the e array 47 // First 8 bytes 48 vst1_u8(e, vget_low_u8(cube->edge)); 49 // Next 4 bytes 50 vst1_lane_u32((uint32_t *)(e + 8), vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0); 51 } 52 53 STATIC_INLINE bool 54 equal(cube_t c1, cube_t c2) 55 { 56 uint8x8_t cmp_corner; 57 uint8x16_t cmp_edge; 58 uint64x2_t cmp_corner_u64, cmp_edge_u64, cmp_result; 59 60 // compare the corner vectors and the edge vectors 61 cmp_corner = vceq_u8(c1.corner, c2.corner); 62 cmp_edge = vceqq_u8(c1.edge, c2.edge); 63 64 // convert the comparison vectors to 64-bit vectors and combine them 65 cmp_corner_u64 = vreinterpretq_u64_u8(vcombine_u64(cmp_corner, cmp_corner)); 66 cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge); 67 cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64); 68 69 // check if all the bits are set 70 return vgetq_lane_u64(cmp_result, 0) == ~0ULL && vgetq_lane_u64(cmp_result, 1) == ~0ULL; 71 } 72 73 STATIC_INLINE cube_t 74 invertco(cube_t c) 75 { 76 cube_t ret; 77 uint8x8_t co, shleft, shright, summed, newco, cleanco; 78 79 co = vand_u8(c.corner, CO2_NEON); 80 shleft = vshl_n_u8(co, 1); 81 shright = vshr_n_u8(co, 1); 82 summed = vorr_u8(shleft, shright); 83 newco = vand_u8(summed, CO2_NEON); 84 cleanco = veor_u8(c.corner, co); 85 ret.corner = vorr_u8(cleanco, newco); 86 ret.edge = c.edge; 87 88 return ret; 89 } 90 91 STATIC_INLINE cube_t 92 compose_edges(cube_t c1, cube_t c2) 93 { 94 cube_t ret = {0}; 95 ret.edge = compose_edges_slim(c1.edge, c2.edge); 96 return ret; 97 } 98 99 STATIC_INLINE cube_t 100 compose_corners(cube_t c1, cube_t c2) 101 { 102 cube_t ret = {0}; 103 ret.corner = compose_corners_slim(c1.corner, c2.corner); 104 return ret; 105 } 106 107 STATIC_INLINE uint8x16_t 108 compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2) 109 { 110 // Masks 111 uint8x16_t p_bits = vdupq_n_u8(PBITS); 112 uint8x16_t eo_bit = vdupq_n_u8(EOBIT); 113 114 // Find the index and permutation 115 uint8x16_t p = vandq_u8(edge2, p_bits); 116 uint8x16_t piece1 = vqtbl1q_u8(edge1, p); 117 118 // Calculate the orientation through XOR 119 uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit); 120 121 // Combine the results 122 uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien); 123 124 // Mask to clear the last 32 bits of the result 125 uint8x16_t mask_last_32 = vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3); 126 ret = vreinterpretq_u8_u32(mask_last_32); 127 128 return ret; 129 } 130 131 STATIC_INLINE uint8x8_t 132 compose_corners_slim(uint8x8_t corner1, uint8x8_t corner2) 133 { 134 // Masks 135 uint8x8_t p_bits = vdup_n_u8(PBITS); 136 uint8x8_t cobits = vdup_n_u8(COBITS); 137 uint8x8_t cobits2 = vdup_n_u8(COBITS_2); 138 uint8x8_t twist_cw = vdup_n_u8(CTWIST_CW); 139 140 // Find the index and permutation 141 uint8x8_t p = vand_u8(corner2, p_bits); 142 uint8x8_t piece1 = vtbl1_u8(corner1, p); 143 144 // Calculate the orientation 145 uint8x8_t aux = vadd_u8(vand_u8(corner2, cobits), vand_u8(piece1, cobits)); 146 uint8x8_t auy = vshr_n_u8(vadd_u8(aux, twist_cw), 2); 147 uint8x8_t orien = vand_u8(vadd_u8(aux, auy), cobits2); 148 149 uint8x8_t ret = vorr_u8(vand_u8(piece1, p_bits), orien); 150 151 return ret; 152 } 153 154 STATIC_INLINE cube_t 155 compose(cube_t c1, cube_t c2) 156 { 157 cube_t ret = {0}; 158 159 ret.edge = compose_edges_slim(c1.edge, c2.edge); 160 ret.corner = compose_corners_slim(c1.corner, c2.corner); 161 162 return ret; 163 } 164 165 STATIC_INLINE cube_t 166 inverse(cube_t cube) 167 { 168 uint8_t i, piece, orien; 169 cube_t ret; 170 171 // Temp arrays to store the NEON vectors 172 uint8_t edges[16]; 173 uint8_t corners[8]; 174 175 // Copy the NEON vectors to the arrays 176 vst1q_u8(edges, cube.edge); 177 vst1_u8(corners, cube.corner); 178 179 uint8_t edge_result[16] = {0}; 180 uint8_t corner_result[8] = {0}; 181 182 // Process the edges 183 for (i = 0; i < 12; i++) 184 { 185 piece = edges[i]; 186 orien = piece & EOBIT; 187 edge_result[piece & PBITS] = i | orien; 188 } 189 190 // Process the corners 191 for (i = 0; i < 8; i++) 192 { 193 piece = corners[i]; 194 orien = ((piece << 1) | (piece >> 1)) & COBITS_2; 195 corner_result[piece & PBITS] = i | orien; 196 } 197 198 // Copy the results back to the NEON vectors 199 ret.edge = vld1q_u8(edge_result); 200 ret.corner = vld1_u8(corner_result); 201 202 return ret; 203 } 204 205 STATIC_INLINE int64_t 206 coord_co(cube_t c) 207 { 208 // Temp array to store the NEON vector 209 uint8_t mem[8]; 210 vst1_u8(mem, c.corner); 211 212 int i, p; 213 int64_t ret; 214 215 for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3) 216 ret += p * (mem[i] >> COSHIFT); 217 218 return ret; 219 } 220 221 STATIC_INLINE int64_t 222 coord_csep(cube_t c) 223 { 224 // Temp array to store the NEON vector 225 uint8_t mem[8]; 226 vst1_u8(mem, c.corner); 227 228 int64_t ret = 0; 229 int i, p; 230 for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2) 231 ret += p * ((mem[i] & CSEPBIT) >> 2); 232 233 return ret; 234 return 0; 235 } 236 237 STATIC_INLINE int64_t 238 coord_cocsep(cube_t c) 239 { 240 return (coord_co(c) << 7) + coord_csep(c); 241 } 242 243 STATIC_INLINE int64_t 244 coord_eo(cube_t c) 245 { 246 int64_t ret = 0; 247 int64_t p = 1; 248 249 // Temp array to store the NEON vector 250 uint8_t mem[16]; 251 vst1q_u8(mem, c.edge); 252 253 for (int i = 1; i < 12; i++, p *= 2) 254 { 255 ret += p * (mem[i] >> EOSHIFT); 256 } 257 258 return ret; 259 } 260 261 STATIC_INLINE int64_t 262 coord_esep(cube_t c) 263 { 264 int64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1; 265 266 // Temp array to store the NEON vector 267 uint8_t mem[16]; 268 vst1q_u8(mem, c.edge); 269 270 for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++) 271 { 272 bit1 = (mem[i] & ESEPBIT_1) >> 2; 273 bit2 = (mem[i] & ESEPBIT_2) >> 3; 274 is1 = (1 - bit2) * bit1; 275 276 ret1 += bit2 * binomial[11 - i][k]; 277 k -= bit2; 278 279 jj = j < 8; 280 ret2 += jj * is1 * binomial[7 - (j * jj)][l]; 281 l -= is1; 282 j += (1 - bit2); 283 } 284 285 return ret1 * 70 + ret2; 286 } 287 288 STATIC_INLINE void 289 copy_corners(cube_t *dst, cube_t src) 290 { 291 dst->corner = src.corner; 292 } 293 294 STATIC_INLINE void 295 copy_edges(cube_t *dst, cube_t src) 296 { 297 dst->edge = src.edge; 298 } 299 300 STATIC_INLINE void 301 set_eo(cube_t *cube, int64_t eo) 302 { 303 // Temp array to store the NEON vector 304 uint8_t mem[16]; 305 vst1q_u8(mem, cube->edge); 306 uint8_t i, sum, flip; 307 308 for (sum = 0, i = 1; i < 12; i++, eo >>= 1) 309 { 310 flip = eo % 2; 311 sum += flip; 312 mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip); 313 } 314 mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2)); 315 316 // Copy the results back to the NEON vector 317 cube->edge = vld1q_u8(mem); 318 return; 319 } 320 321 STATIC_INLINE cube_t 322 invcoord_esep(int64_t esep) 323 { 324 cube_t ret; 325 uint8_t mem[16] = {0}; 326 327 invcoord_esep_array(esep % 70, esep / 70, mem); 328 329 ret = SOLVED_CUBE; 330 ret.edge = vld1q_u8(mem); 331 332 return ret; 333 }