neon.h (7232B)
1 #define CO2_NEON vdup_n_u8(0x60) 2 #define COCW_NEON vdup_n_u8(0x20) 3 #define CP_NEON vdup_n_u8(0x07) 4 #define EP_NEON vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F)) 5 #define EO_NEON vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10)) 6 7 STATIC_INLINE uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t); 8 STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t); 9 10 // static cube 11 #define STATIC_CUBE(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \ 12 e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \ 13 ((cube_t){ \ 14 .corner = {c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl}, \ 15 .edge = {e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0}}) 16 17 // zero cube 18 #define ZERO_CUBE \ 19 (cube_t) \ 20 { \ 21 .corner = vdup_n_u8(0), \ 22 .edge = vdupq_n_u8(0) \ 23 } 24 25 // solved cube 26 #define SOLVED_CUBE STATIC_CUBE( \ 27 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) 28 29 STATIC void 30 pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12]) 31 { 32 // First 8 bytes of the corner vector are copied from the c array 33 vst1_u8(c, cube->corner); 34 35 // 12 bytes of the edge vector are copied from the e array 36 // First 8 bytes 37 vst1_u8(e, vget_low_u8(cube->edge)); 38 // Next 4 bytes 39 vst1_lane_u32((uint32_t *)(e + 8), vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0); 40 } 41 42 STATIC_INLINE bool 43 equal(cube_t c1, cube_t c2) 44 { 45 uint8x8_t cmp_corner; 46 uint8x16_t cmp_edge; 47 uint64x2_t cmp_corner_u64, cmp_edge_u64, cmp_result; 48 49 // compare the corner vectors and the edge vectors 50 cmp_corner = vceq_u8(c1.corner, c2.corner); 51 cmp_edge = vceqq_u8(c1.edge, c2.edge); 52 53 // convert the comparison vectors to 64-bit vectors and combine them 54 cmp_corner_u64 = vreinterpretq_u64_u8(vcombine_u64(cmp_corner, cmp_corner)); 55 cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge); 56 cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64); 57 58 // check if all the bits are set 59 return vgetq_lane_u64(cmp_result, 0) == ~0ULL && vgetq_lane_u64(cmp_result, 1) == ~0ULL; 60 } 61 62 STATIC_INLINE cube_t 63 invertco(cube_t c) 64 { 65 cube_t ret; 66 uint8x8_t co, shleft, shright, summed, newco, cleanco; 67 68 co = vand_u8(c.corner, CO2_NEON); 69 shleft = vshl_n_u8(co, 1); 70 shright = vshr_n_u8(co, 1); 71 summed = vorr_u8(shleft, shright); 72 newco = vand_u8(summed, CO2_NEON); 73 cleanco = veor_u8(c.corner, co); 74 ret.corner = vorr_u8(cleanco, newco); 75 ret.edge = c.edge; 76 77 return ret; 78 } 79 80 STATIC_INLINE cube_t 81 compose_edges(cube_t c1, cube_t c2) 82 { 83 cube_t ret = {0}; 84 ret.edge = compose_edges_slim(c1.edge, c2.edge); 85 return ret; 86 } 87 88 STATIC_INLINE cube_t 89 compose_corners(cube_t c1, cube_t c2) 90 { 91 cube_t ret = {0}; 92 ret.corner = compose_corners_slim(c1.corner, c2.corner); 93 return ret; 94 } 95 96 STATIC_INLINE uint8x16_t 97 compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2) 98 { 99 // Masks 100 uint8x16_t p_bits = vdupq_n_u8(PBITS); 101 uint8x16_t eo_bit = vdupq_n_u8(EOBIT); 102 103 // Find the index and permutation 104 uint8x16_t p = vandq_u8(edge2, p_bits); 105 uint8x16_t piece1 = vqtbl1q_u8(edge1, p); 106 107 // Calculate the orientation through XOR 108 uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit); 109 110 // Combine the results 111 uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien); 112 113 // Mask to clear the last 32 bits of the result 114 uint8x16_t mask_last_32 = vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3); 115 ret = vreinterpretq_u8_u32(mask_last_32); 116 117 return ret; 118 } 119 120 STATIC_INLINE uint8x8_t 121 compose_corners_slim(uint8x8_t corner1, uint8x8_t corner2) 122 { 123 // Masks 124 uint8x8_t p_bits = vdup_n_u8(PBITS); 125 uint8x8_t cobits = vdup_n_u8(COBITS); 126 uint8x8_t cobits2 = vdup_n_u8(COBITS_2); 127 uint8x8_t twist_cw = vdup_n_u8(CTWIST_CW); 128 129 // Find the index and permutation 130 uint8x8_t p = vand_u8(corner2, p_bits); 131 uint8x8_t piece1 = vtbl1_u8(corner1, p); 132 133 // Calculate the orientation 134 uint8x8_t aux = vadd_u8(vand_u8(corner2, cobits), vand_u8(piece1, cobits)); 135 uint8x8_t auy = vshr_n_u8(vadd_u8(aux, twist_cw), 2); 136 uint8x8_t orien = vand_u8(vadd_u8(aux, auy), cobits2); 137 138 uint8x8_t ret = vorr_u8(vand_u8(piece1, p_bits), orien); 139 140 return ret; 141 } 142 143 STATIC_INLINE cube_t 144 compose(cube_t c1, cube_t c2) 145 { 146 cube_t ret = {0}; 147 148 ret.edge = compose_edges_slim(c1.edge, c2.edge); 149 ret.corner = compose_corners_slim(c1.corner, c2.corner); 150 151 return ret; 152 } 153 154 STATIC_INLINE cube_t 155 inverse(cube_t cube) 156 { 157 uint8_t i, piece, orien; 158 cube_t ret; 159 160 // Temp arrays to store the NEON vectors 161 uint8_t edges[16]; 162 uint8_t corners[8]; 163 164 // Copy the NEON vectors to the arrays 165 vst1q_u8(edges, cube.edge); 166 vst1_u8(corners, cube.corner); 167 168 uint8_t edge_result[16] = {0}; 169 uint8_t corner_result[8] = {0}; 170 171 // Process the edges 172 for (i = 0; i < 12; i++) 173 { 174 piece = edges[i]; 175 orien = piece & EOBIT; 176 edge_result[piece & PBITS] = i | orien; 177 } 178 179 // Process the corners 180 for (i = 0; i < 8; i++) 181 { 182 piece = corners[i]; 183 orien = ((piece << 1) | (piece >> 1)) & COBITS_2; 184 corner_result[piece & PBITS] = i | orien; 185 } 186 187 // Copy the results back to the NEON vectors 188 ret.edge = vld1q_u8(edge_result); 189 ret.corner = vld1_u8(corner_result); 190 191 return ret; 192 } 193 194 STATIC_INLINE int64_t 195 coord_co(cube_t c) 196 { 197 // Temp array to store the NEON vector 198 uint8_t mem[8]; 199 vst1_u8(mem, c.corner); 200 201 int i, p; 202 int64_t ret; 203 204 for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3) 205 ret += p * (mem[i] >> COSHIFT); 206 207 return ret; 208 } 209 210 STATIC_INLINE int64_t 211 coord_csep(cube_t c) 212 { 213 // Temp array to store the NEON vector 214 uint8_t mem[8]; 215 vst1_u8(mem, c.corner); 216 217 int64_t ret = 0; 218 int i, p; 219 for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2) 220 ret += p * ((mem[i] & CSEPBIT) >> 2); 221 222 return ret; 223 return 0; 224 } 225 226 STATIC_INLINE int64_t 227 coord_cocsep(cube_t c) 228 { 229 return (coord_co(c) << 7) + coord_csep(c); 230 } 231 232 STATIC_INLINE int64_t 233 coord_eo(cube_t c) 234 { 235 int64_t ret = 0; 236 int64_t p = 1; 237 238 // Temp array to store the NEON vector 239 uint8_t mem[16]; 240 vst1q_u8(mem, c.edge); 241 242 for (int i = 1; i < 12; i++, p *= 2) 243 { 244 ret += p * (mem[i] >> EOSHIFT); 245 } 246 247 return ret; 248 } 249 250 STATIC_INLINE int64_t 251 coord_esep(cube_t c) 252 { 253 int64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1; 254 255 // Temp array to store the NEON vector 256 uint8_t mem[16]; 257 vst1q_u8(mem, c.edge); 258 259 for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++) 260 { 261 bit1 = (mem[i] & ESEPBIT_1) >> 2; 262 bit2 = (mem[i] & ESEPBIT_2) >> 3; 263 is1 = (1 - bit2) * bit1; 264 265 ret1 += bit2 * binomial[11 - i][k]; 266 k -= bit2; 267 268 jj = j < 8; 269 ret2 += jj * is1 * binomial[7 - (j * jj)][l]; 270 l -= is1; 271 j += (1 - bit2); 272 } 273 274 return ret1 * 70 + ret2; 275 } 276 277 STATIC_INLINE void 278 copy_corners(cube_t *dst, cube_t src) 279 { 280 dst->corner = src.corner; 281 } 282 283 STATIC_INLINE void 284 copy_edges(cube_t *dst, cube_t src) 285 { 286 dst->edge = src.edge; 287 } 288 289 STATIC_INLINE void 290 set_eo(cube_t *cube, int64_t eo) 291 { 292 // Temp array to store the NEON vector 293 uint8_t mem[16]; 294 vst1q_u8(mem, cube->edge); 295 uint8_t i, sum, flip; 296 297 for (sum = 0, i = 1; i < 12; i++, eo >>= 1) 298 { 299 flip = eo % 2; 300 sum += flip; 301 mem[i] = (mem[i] & ~EOBIT) | (EOBIT * flip); 302 } 303 mem[0] = (mem[0] & ~EOBIT) | (EOBIT * (sum % 2)); 304 305 // Copy the results back to the NEON vector 306 cube->edge = vld1q_u8(mem); 307 return; 308 } 309 310 STATIC_INLINE cube_t 311 invcoord_esep(int64_t esep) 312 { 313 cube_t ret; 314 uint8_t mem[16] = {0}; 315 316 invcoord_esep_array(esep % 70, esep / 70, mem); 317 318 ret = SOLVED_CUBE; 319 ret.edge = vld1q_u8(mem); 320 321 return ret; 322 }