commit 18c9a8b8905304cf5f8fc15825769046a3144866
parent f25a10e19eca294c4e6a99e4f80ce5cfd11a0e5f
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date: Sun, 18 Aug 2024 14:26:45 +0200
Reorganized folder structure
Diffstat:
44 files changed, 2246 insertions(+), 2281 deletions(-)
diff --git a/LICENSE b/LICENSE
@@ -2,7 +2,9 @@ The following license applies to every C source code and header file
distributed with this LICENSE file.
h48 - a prototype for an optimal rubik's cube solver
- Copyright (C) 2023 Sebastiano Tronto <sebastiano@tronto.net>
+ Copyright (C) 2023-2024
+ Sebastiano Tronto <sebastiano@tronto.net>
+ Enrico Tenuti <>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
diff --git a/Makefile b/Makefile
@@ -3,13 +3,13 @@ include config.mk
all: cube.o debugcube.o
cube.s: clean
- ${CC} -D${CUBETYPE} ${CFLAGS} -c -S -o cube.s src/cube.c
+ ${CC} -D${CUBETYPE} ${CFLAGS} -c -S -o cube.s src/nissy.c
cube.o: clean
- ${CC} -D${CUBETYPE} ${CFLAGS} -c -o cube.o src/cube.c
+ ${CC} -D${CUBETYPE} ${CFLAGS} -c -o cube.o src/nissy.c
debugcube.o: clean
- ${CC} -D${CUBETYPE} ${DBGFLAGS} -c -o debugcube.o src/cube.c
+ ${CC} -D${CUBETYPE} ${DBGFLAGS} -c -o debugcube.o src/nissy.c
clean:
rm -rf *.o run
diff --git a/shell.c b/shell.c
@@ -7,7 +7,7 @@
#include <string.h>
#include <time.h>
-#include "src/cube.h"
+#include "src/nissy.h"
#define PRINTCUBE_BUFFER_SIZE 1024 /* Should be enough */
#define SOLUTIONS_BUFFER_SIZE 500000 /* Should be enough */
diff --git a/src/arch/arch.h b/src/arch/arch.h
@@ -0,0 +1,41 @@
+#if defined(CUBE_AVX2)
+
+#include <immintrin.h>
+
+typedef __m256i cube_t;
+
+#if !defined(TEST_H)
+#include "common.h"
+#include "avx2.h"
+#endif
+
+#elif defined(CUBE_NEON)
+
+#include <stdlib.h>
+#include <arm_neon.h>
+
+typedef struct {
+ uint8x16_t corner;
+ uint8x16_t edge;
+} cube_t;
+
+#if !defined(TEST_H)
+#include "common.h"
+#include "neon.h"
+#endif
+
+#else
+
+#include <stdlib.h>
+
+typedef struct {
+ uint8_t corner[8];
+ uint8_t edge[12];
+} cube_t;
+
+#if !defined(TEST_H)
+#include "common.h"
+#include "portable.h"
+#endif
+
+#endif
diff --git a/src/arch/avx2.h b/src/arch/avx2.h
@@ -0,0 +1,319 @@
+#define _co2_avx2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x6060606060606060))
+#define _cocw_avx2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x2020202020202020))
+#define _cp_avx2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x0707070707070707))
+#define _ep_avx2 \
+ _mm256_set_epi64x(INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0)
+#define _eo_avx2 \
+ _mm256_set_epi64x(INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0)
+
+#define static_cube(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
+ e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
+ _mm256_set_epi8(0, 0, 0, 0, e_br, e_bl, e_fl, e_fr, \
+ e_dr, e_dl, e_ul, e_ur, e_df, e_db, e_ub, e_uf, \
+ 0, 0, 0, 0, 0, 0, 0, 0, \
+ c_dbl, c_dfr, c_ubr, c_ufl, c_dbr, c_dfl, c_ubl, c_ufr)
+#define zero _mm256_set_epi64x(0, 0, 0, 0)
+#define solved static_cube( \
+ 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
+
+_static void
+pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
+{
+ uint8_t aux[32];
+
+ _mm256_storeu_si256((__m256i_u *)aux, *cube);
+ memcpy(c, aux, 8);
+ memcpy(e, aux+16, 12);
+}
+
+_static_inline bool
+equal(cube_t c1, cube_t c2)
+{
+ int32_t mask;
+ __m256i cmp;
+
+ cmp = _mm256_cmpeq_epi8(c1, c2);
+ mask = _mm256_movemask_epi8(cmp);
+
+ return mask == ~0;
+}
+
+_static_inline cube_t
+invertco(cube_t c)
+{
+ cube_t co, shleft, shright, summed, newco, cleanco, ret;
+
+ co = _mm256_and_si256(c, _co2_avx2);
+ shleft = _mm256_slli_epi32(co, 1);
+ shright = _mm256_srli_epi32(co, 1);
+ summed = _mm256_or_si256(shleft, shright);
+ newco = _mm256_and_si256(summed, _co2_avx2);
+ cleanco = _mm256_xor_si256(c, co);
+ ret = _mm256_or_si256(cleanco, newco);
+
+ return ret;
+}
+
+_static_inline cube_t
+compose_epcpeo(cube_t c1, cube_t c2)
+{
+ cube_t b, s, eo2;
+
+ /* Permute and clean unused bits */
+ s = _mm256_shuffle_epi8(c1, c2);
+ b = _mm256_set_epi8(
+ ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
+ );
+ s = _mm256_andnot_si256(b, s);
+
+ /* Change EO */
+ eo2 = _mm256_and_si256(c2, _eo_avx2);
+ s = _mm256_xor_si256(s, eo2);
+
+ return s;
+}
+
+_static_inline cube_t
+compose_edges(cube_t c1, cube_t c2)
+{
+ return compose_epcpeo(c1, c2);
+}
+
+_static_inline cube_t
+compose_corners(cube_t c1, cube_t c2)
+{
+ /*
+ * We do a full compose. Minor optimizations are possible, like
+ * saving one instruction by not doing EO, but it should not
+ * be significant.
+ */
+ return compose(c1, c2);
+}
+
+_static_inline cube_t
+compose(cube_t c1, cube_t c2)
+{
+ cube_t s, co1, co2, aux, auy1, auy2, auz1, auz2;
+
+ s = compose_epcpeo(c1, c2);
+
+ /* Change CO */
+ co1 = _mm256_and_si256(s, _co2_avx2);
+ co2 = _mm256_and_si256(c2, _co2_avx2);
+ aux = _mm256_add_epi8(co1, co2);
+ auy1 = _mm256_add_epi8(aux, _cocw_avx2);
+ auy2 = _mm256_srli_epi32(auy1, 2);
+ auz1 = _mm256_add_epi8(aux, auy2);
+ auz2 = _mm256_and_si256(auz1, _co2_avx2);
+
+ /* Put together */
+ s = _mm256_andnot_si256(_co2_avx2, s);
+ s = _mm256_or_si256(s, auz2);
+
+ return s;
+}
+
+_static_inline cube_t
+cleanaftershuffle(cube_t c)
+{
+ __m256i b;
+
+ b = _mm256_set_epi8(
+ ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
+ );
+
+ return _mm256_andnot_si256(b, c);
+}
+
+_static_inline cube_t
+inverse(cube_t c)
+{
+ /* Method taken from Andrew Skalski's vcube[1]. The addition sequence
+ * was generated using [2].
+ * [1] https://github.com/Voltara/vcube
+ * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
+ */
+ cube_t v3, vi, vo, vp, ret;
+
+ v3 = _mm256_shuffle_epi8(c, c);
+ v3 = _mm256_shuffle_epi8(v3, c);
+ vi = _mm256_shuffle_epi8(v3, v3);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, v3);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, c);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, v3);
+ vi = _mm256_shuffle_epi8(vi, vi);
+ vi = _mm256_shuffle_epi8(vi, c);
+
+ vo = _mm256_and_si256(c, _mm256_or_si256(_eo_avx2, _co2_avx2));
+ vo = _mm256_shuffle_epi8(vo, vi);
+ vp = _mm256_andnot_si256(_mm256_or_si256(_eo_avx2, _co2_avx2), vi);
+ ret = _mm256_or_si256(vp, vo);
+ ret = cleanaftershuffle(ret);
+
+ return invertco(ret);
+}
+
+_static_inline int64_t
+coord_co(cube_t c)
+{
+ cube_t co;
+ int64_t mem[4], ret, i, p;
+
+ co = _mm256_and_si256(c, _co2_avx2);
+ _mm256_storeu_si256((__m256i *)mem, co);
+
+ mem[0] >>= 5;
+ for (i = 0, ret = 0, p = 1; i < 7; i++, mem[0] >>= 8, p *= 3)
+ ret += (mem[0] & 3) * p;
+
+ return ret;
+}
+
+_static_inline int64_t
+coord_csep(cube_t c)
+{
+ cube_t cp, shifted;
+ int64_t mask;
+
+ cp = _mm256_and_si256(c, _cp_avx2);
+ shifted = _mm256_slli_epi32(cp, 5);
+ mask = _mm256_movemask_epi8(shifted);
+
+ return mask & 0x7F;
+}
+
+_static_inline int64_t
+coord_cocsep(cube_t c)
+{
+ return (coord_co(c) << 7) + coord_csep(c);
+}
+
+_static_inline int64_t
+coord_eo(cube_t c)
+{
+ cube_t eo, shifted;
+ int64_t mask;
+
+ eo = _mm256_and_si256(c, _eo_avx2);
+ shifted = _mm256_slli_epi32(eo, 3);
+ mask = _mm256_movemask_epi8(shifted);
+
+ return mask >> 17;
+}
+
+_static_inline int64_t
+coord_esep(cube_t c)
+{
+ cube_t ep;
+ int64_t e, mem[4], i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
+
+ ep = _mm256_and_si256(c, _ep_avx2);
+ _mm256_storeu_si256((__m256i *)mem, ep);
+
+ mem[3] <<= 8;
+ ret1 = ret2 = 0;
+ k = l = 4;
+ for (i = 0, j = 0; i < 12; i++, mem[i/8 + 2] >>= 8) {
+ e = mem[i/8 + 2];
+
+ bit1 = (e & _esepbit1) >> 2;
+ bit2 = (e & _esepbit2) >> 3;
+ is1 = (1 - bit2) * bit1;
+
+ ret1 += bit2 * binomial[11-i][k];
+ k -= bit2;
+
+ jj = j < 8;
+ ret2 += jj * is1 * binomial[7-(j*jj)][l];
+ l -= is1;
+ j += (1-bit2);
+ }
+
+ return ret1 * 70 + ret2;
+}
+
+_static_inline void
+copy_corners(cube_t *dest, cube_t src)
+{
+ *dest = _mm256_blend_epi32(*dest, src, 0x0F);
+}
+
+_static_inline void
+copy_edges(cube_t *dest, cube_t src)
+{
+ *dest = _mm256_blend_epi32(*dest, src, 0xF0);
+}
+
+_static_inline void
+set_eo(cube_t *cube, int64_t eo)
+{
+ int64_t eo12, eotop, eobot;
+ __m256i veo;
+
+ eo12 = (eo << 1) + (_mm_popcnt_u64(eo) % 2);
+ eotop = (eo12 & (1 << 11)) << 17 |
+ (eo12 & (1 << 10)) << 10 |
+ (eo12 & (1 << 9)) << 3 |
+ (eo12 & (1 << 8)) >> 4;
+ eobot = (eo12 & (1 << 7)) << 53 |
+ (eo12 & (1 << 6)) << 46 |
+ (eo12 & (1 << 5)) << 39 |
+ (eo12 & (1 << 4)) << 32 |
+ (eo12 & (1 << 3)) << 25 |
+ (eo12 & (1 << 2)) << 18 |
+ (eo12 & (1 << 1)) << 11 |
+ (eo12 & 1) << 4;
+ veo = _mm256_set_epi64x(eotop, eobot, 0, 0);
+
+ *cube = _mm256_andnot_si256(_eo_avx2, *cube);
+ *cube = _mm256_or_si256(*cube, veo);
+}
+
+_static_inline cube_t
+invcoord_esep(int64_t esep)
+{
+ cube_t eee, ret;
+ int64_t bit1, bit2, i, j, jj, k, l, s, v, w, is1, set1, set2;
+	uint8_t mem[32] = {0};
+ uint8_t slice[3] = {0};
+
+ set1 = esep % 70;
+ set2 = esep / 70;
+
+ for (i = 0, j = 0, k = 4, l = 4; i < 12; i++) {
+ v = binomial[11-i][k];
+ jj = j < 8;
+ w = jj * binomial[7-(j*jj)][l];
+ bit2 = set2 >= v;
+ bit1 = set1 >= w;
+ is1 = (1 - bit2) * bit1;
+
+ set2 -= bit2 * v;
+ k -= bit2;
+ set1 -= is1 * w;
+ l -= is1;
+ j += (1-bit2);
+ s = 2*bit2 + (1-bit2)*bit1;
+
+ mem[i+16] = (slice[s]++) | (uint8_t)(s << 2);
+ }
+
+ ret = solved;
+ eee = _mm256_loadu_si256((__m256i_u *)&mem);
+ copy_edges(&ret, eee);
+
+ return ret;
+}
diff --git a/src/arch/common.h b/src/arch/common.h
@@ -0,0 +1,19 @@
+_static void pieces(cube_t *, uint8_t [static 8], uint8_t [static 12]);
+_static_inline bool equal(cube_t, cube_t);
+_static_inline cube_t invertco(cube_t);
+_static_inline cube_t compose_epcpeo(cube_t, cube_t);
+_static_inline cube_t compose_edges(cube_t, cube_t);
+_static_inline cube_t compose_corners(cube_t, cube_t);
+_static_inline cube_t compose(cube_t, cube_t);
+_static_inline cube_t inverse(cube_t);
+
+_static_inline int64_t coord_co(cube_t);
+_static_inline int64_t coord_csep(cube_t);
+_static_inline int64_t coord_cocsep(cube_t);
+_static_inline int64_t coord_eo(cube_t);
+_static_inline int64_t coord_esep(cube_t);
+
+_static_inline void copy_corners(cube_t *, cube_t);
+_static_inline void copy_edges(cube_t *, cube_t);
+_static_inline void set_eo(cube_t *, int64_t);
+_static_inline cube_t invcoord_esep(int64_t);
diff --git a/src/arch/neon.h b/src/arch/neon.h
@@ -0,0 +1,348 @@
+#define _co2_neon vdupq_n_u8(0x60)
+#define _cocw_neon vdupq_n_u8(0x20)
+#define _cp_neon vdupq_n_u8(0x07)
+#define _ep_neon vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F))
+#define _eo_neon vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10))
+
+// static cube
+#define static_cube(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
+ e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
+ ((cube_t){ \
+ .corner = {c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, 0, 0, 0, 0, 0, 0, 0, 0}, \
+ .edge = {e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0}})
+
+// zero cube
+#define zero \
+ (cube_t) \
+ { \
+ .corner = vdupq_n_u8(0), \
+ .edge = vdupq_n_u8(0) \
+ }
+
+// solved cube
+#define solved static_cube( \
+ 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
+
+_static void
+pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
+{
+ // First 8 bytes of the corner vector are copied from the c array
+ vst1_u8(c, vget_low_u8(cube->corner));
+
+ // 12 bytes of the edge vector are copied from the e array
+ // First 8 bytes
+ vst1_u8(e, vget_low_u8(cube->edge));
+ // Next 4 bytes
+ vst1_lane_u32((uint32_t *)(e + 8), vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0);
+}
+
+_static_inline bool
+equal(cube_t c1, cube_t c2)
+{
+ uint8x16_t cmp_corner, cmp_edge;
+ uint64x2_t cmp_corner_u64, cmp_edge_u64;
+ uint64x2_t cmp_result;
+
+ // compare the corner vectors
+ cmp_corner = vceqq_u8(c1.corner, c2.corner);
+ // compare the edge vectors
+ cmp_edge = vceqq_u8(c1.edge, c2.edge);
+
+ // convert the comparison vectors to 64-bit vectors
+ cmp_corner_u64 = vreinterpretq_u64_u8(cmp_corner);
+ cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge);
+
+ // combine the comparison vectors
+ cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64);
+
+ // check if all the bits are set
+ return vgetq_lane_u64(cmp_result, 0) == ~0ULL && vgetq_lane_u64(cmp_result, 1) == ~0ULL;
+}
+
+_static_inline cube_t
+invertco(cube_t c)
+{
+ cube_t ret;
+ uint8x16_t co, shleft, shright, summed, newco, cleanco;
+
+ co = vandq_u8(c.corner, _co2_neon);
+ shleft = vshlq_n_u8(co, 1);
+ shright = vshrq_n_u8(co, 1);
+ summed = vorrq_u8(shleft, shright);
+ newco = vandq_u8(summed, _co2_neon);
+ cleanco = veorq_u8(c.corner, co);
+ ret.corner = vorrq_u8(cleanco, newco);
+ ret.edge = c.edge;
+
+ return ret;
+}
+
+_static_inline uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t);
+_static_inline uint8x16_t compose_corners_slim(uint8x16_t, uint8x16_t);
+
+_static_inline cube_t
+compose_edges(cube_t c1, cube_t c2)
+{
+ cube_t ret = {0};
+ ret.edge = compose_edges_slim(c1.edge, c2.edge);
+ return ret;
+}
+
+_static_inline cube_t
+compose_corners(cube_t c1, cube_t c2)
+{
+ cube_t ret = {0};
+ ret.corner = compose_corners_slim(c1.corner, c2.corner);
+ return ret;
+}
+
+_static_inline uint8x16_t
+compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2)
+{
+ // Masks
+ uint8x16_t p_bits = vdupq_n_u8(_pbits);
+ uint8x16_t eo_bit = vdupq_n_u8(_eobit);
+
+ // Find the index and permutation
+ uint8x16_t p = vandq_u8(edge2, p_bits);
+ uint8x16_t piece1 = vqtbl1q_u8(edge1, p);
+
+ // Calculate the orientation through XOR
+ uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit);
+
+ // Combine the results
+ uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
+
+ // Mask to clear the last 32 bits of the result
+ uint8x16_t mask_last_32 = vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3);
+ ret = vreinterpretq_u8_u32(mask_last_32);
+
+ return ret;
+}
+
+_static_inline uint8x16_t
+compose_corners_slim(uint8x16_t corner1, uint8x16_t corner2)
+{
+ // Masks
+ uint8x16_t p_bits = vdupq_n_u8(_pbits);
+ uint8x16_t cobits = vdupq_n_u8(_cobits);
+ uint8x16_t cobits2 = vdupq_n_u8(_cobits2);
+ uint8x16_t twist_cw = vdupq_n_u8(_ctwist_cw);
+
+ // Find the index and permutation
+ uint8x16_t p = vandq_u8(corner2, p_bits);
+ uint8x16_t piece1 = vqtbl1q_u8(corner1, p);
+
+ // Calculate the orientation
+ uint8x16_t aux = vaddq_u8(vandq_u8(corner2, cobits), vandq_u8(piece1, cobits));
+ uint8x16_t auy = vshrq_n_u8(vaddq_u8(aux, twist_cw), 2);
+ uint8x16_t orien = vandq_u8(vaddq_u8(aux, auy), cobits2);
+
+ // Combine the results
+ uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
+
+ // Mask to clear the last 64 bits of the result
+ uint8x16_t mask_last_64 = vsetq_lane_u64(0, vreinterpretq_u64_u8(ret), 1);
+ ret = vreinterpretq_u8_u64(mask_last_64);
+
+ return ret;
+}
+
+_static_inline cube_t
+compose(cube_t c1, cube_t c2)
+{
+ cube_t ret = {0};
+
+ ret.edge = compose_edges_slim(c1.edge, c2.edge);
+ ret.corner = compose_corners_slim(c1.corner, c2.corner);
+
+ return ret;
+}
+
+_static_inline cube_t
+inverse(cube_t cube)
+{
+ uint8_t i, piece, orien;
+ cube_t ret;
+
+ // Temp arrays to store the NEON vectors
+ uint8_t edges[16];
+ uint8_t corners[16];
+
+ // Copy the NEON vectors to the arrays
+ vst1q_u8(edges, cube.edge);
+ vst1q_u8(corners, cube.corner);
+
+ uint8_t edge_result[16] = {0};
+ uint8_t corner_result[16] = {0};
+
+ // Process the edges
+ for (i = 0; i < 12; i++)
+ {
+ piece = edges[i];
+ orien = piece & _eobit;
+ edge_result[piece & _pbits] = i | orien;
+ }
+
+ // Process the corners
+ for (i = 0; i < 8; i++)
+ {
+ piece = corners[i];
+ orien = ((piece << 1) | (piece >> 1)) & _cobits2;
+ corner_result[piece & _pbits] = i | orien;
+ }
+
+ // Copy the results back to the NEON vectors
+ ret.edge = vld1q_u8(edge_result);
+ ret.corner = vld1q_u8(corner_result);
+
+ return ret;
+}
+
+_static_inline int64_t
+coord_co(cube_t c)
+{
+ // Temp array to store the NEON vector
+ uint8_t mem[16];
+ vst1q_u8(mem, c.corner);
+
+ int i, p;
+ int64_t ret;
+
+ for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3)
+ ret += p * (mem[i] >> _coshift);
+
+ return ret;
+}
+
+_static_inline int64_t
+coord_csep(cube_t c)
+{
+ // Temp array to store the NEON vector
+ uint8_t mem[16];
+ vst1q_u8(mem, c.corner);
+
+ int64_t ret = 0;
+ int i, p;
+ for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2)
+ ret += p * ((mem[i] & _csepbit) >> 2);
+
+	return ret;
+}
+
+_static_inline int64_t
+coord_cocsep(cube_t c)
+{
+ return (coord_co(c) << 7) + coord_csep(c);
+}
+
+_static_inline int64_t
+coord_eo(cube_t c)
+{
+ int64_t ret = 0;
+ int64_t p = 1;
+
+ // Temp array to store the NEON vector
+ uint8_t mem[16];
+ vst1q_u8(mem, c.edge);
+
+ for (int i = 1; i < 12; i++, p *= 2)
+ {
+ ret += p * (mem[i] >> _eoshift);
+ }
+
+ return ret;
+}
+
+_static_inline int64_t
+coord_esep(cube_t c)
+{
+ int64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
+
+ // Temp array to store the NEON vector
+ uint8_t mem[16];
+ vst1q_u8(mem, c.edge);
+
+ for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++)
+ {
+ bit1 = (mem[i] & _esepbit1) >> 2;
+ bit2 = (mem[i] & _esepbit2) >> 3;
+ is1 = (1 - bit2) * bit1;
+
+ ret1 += bit2 * binomial[11 - i][k];
+ k -= bit2;
+
+ jj = j < 8;
+ ret2 += jj * is1 * binomial[7 - (j * jj)][l];
+ l -= is1;
+ j += (1 - bit2);
+ }
+
+ return ret1 * 70 + ret2;
+}
+
+_static_inline void
+copy_corners(cube_t *dst, cube_t src)
+{
+ dst->corner = src.corner;
+}
+
+_static_inline void
+copy_edges(cube_t *dst, cube_t src)
+{
+ dst->edge = src.edge;
+}
+
+_static_inline void
+set_eo(cube_t *cube, int64_t eo)
+{
+ // Temp array to store the NEON vector
+ uint8_t mem[16];
+ vst1q_u8(mem, cube->edge);
+ uint8_t i, sum, flip;
+
+ for (sum = 0, i = 1; i < 12; i++, eo >>= 1)
+ {
+ flip = eo % 2;
+ sum += flip;
+ mem[i] = (mem[i] & ~_eobit) | (_eobit * flip);
+ }
+ mem[0] = (mem[0] & ~_eobit) | (_eobit * (sum % 2));
+
+ // Copy the results back to the NEON vector
+ cube->edge = vld1q_u8(mem);
+ return;
+}
+
+_static_inline cube_t
+invcoord_esep(int64_t esep)
+{
+ cube_t ret;
+ int64_t bit1, bit2, i, j, jj, k, l, s, v, w, is1, set1, set2;
+ uint8_t slice[3] = {0};
+
+ ret = solved;
+	uint8_t mem[16] = {0};
+ set1 = esep % 70;
+ set2 = esep / 70;
+
+ for (i = 0, j = 0, k = 4, l = 4; i < 12; i++)
+ {
+ v = binomial[11 - i][k];
+ jj = j < 8;
+ w = jj * binomial[7 - (j * jj)][l];
+ bit2 = set2 >= v;
+ bit1 = set1 >= w;
+ is1 = (1 - bit2) * bit1;
+
+ set2 -= bit2 * v;
+ k -= bit2;
+ set1 -= is1 * w;
+ l -= is1;
+ j += (1 - bit2);
+ s = 2 * bit2 + (1 - bit2) * bit1;
+
+ mem[i] = (slice[s]++) | (uint8_t)(s << 2);
+ }
+
+ ret.edge = vld1q_u8(mem);
+ return ret;
+}
diff --git a/src/arch/portable.h b/src/arch/portable.h
@@ -0,0 +1,272 @@
+#define static_cube(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
+ e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
+ ((cube_t) { \
+ .corner = { c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl }, \
+ .edge = { e_uf, e_ub, e_db, e_df, e_ur, e_ul, \
+ e_dl, e_dr, e_fr, e_fl, e_bl, e_br } })
+#define zero static_cube( \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+#define solved static_cube( \
+ 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
+
+_static void
+pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
+{
+ memcpy(c, cube->corner, 8);
+ memcpy(e, cube->edge, 12);
+}
+
+_static_inline bool
+equal(cube_t c1, cube_t c2)
+{
+ uint8_t i;
+ bool ret;
+
+ ret = true;
+ for (i = 0; i < 8; i++)
+ ret = ret && c1.corner[i] == c2.corner[i];
+ for (i = 0; i < 12; i++)
+ ret = ret && c1.edge[i] == c2.edge[i];
+
+ return ret;
+}
+
+_static_inline cube_t
+invertco(cube_t c)
+{
+ uint8_t i, piece, orien;
+ cube_t ret;
+
+ ret = c;
+ for (i = 0; i < 8; i++) {
+ piece = c.corner[i];
+ orien = ((piece << 1) | (piece >> 1)) & _cobits2;
+ ret.corner[i] = (piece & _pbits) | orien;
+ }
+
+ return ret;
+}
+
+_static_inline void
+compose_edges_inplace(cube_t c1, cube_t c2, cube_t *ret)
+{
+ uint8_t i, piece1, piece2, p, orien;
+
+ for (i = 0; i < 12; i++) {
+ piece2 = c2.edge[i];
+ p = piece2 & _pbits;
+ piece1 = c1.edge[p];
+ orien = (piece2 ^ piece1) & _eobit;
+ ret->edge[i] = (piece1 & _pbits) | orien;
+ }
+}
+
+_static_inline void
+compose_corners_inplace(cube_t c1, cube_t c2, cube_t *ret)
+{
+ uint8_t i, piece1, piece2, p, orien, aux, auy;
+
+ for (i = 0; i < 8; i++) {
+ piece2 = c2.corner[i];
+ p = piece2 & _pbits;
+ piece1 = c1.corner[p];
+ aux = (piece2 & _cobits) + (piece1 & _cobits);
+ auy = (aux + _ctwist_cw) >> 2;
+ orien = (aux + auy) & _cobits2;
+ ret->corner[i] = (piece1 & _pbits) | orien;
+ }
+}
+
+_static_inline cube_t
+compose_edges(cube_t c1, cube_t c2)
+{
+ cube_t ret = zero;
+
+ compose_edges_inplace(c1, c2, &ret);
+
+ return ret;
+}
+
+_static_inline cube_t
+compose_corners(cube_t c1, cube_t c2)
+{
+ cube_t ret = zero;
+
+ compose_corners_inplace(c1, c2, &ret);
+
+ return ret;
+}
+
+_static_inline cube_t
+compose(cube_t c1, cube_t c2)
+{
+ cube_t ret = zero;
+
+ compose_edges_inplace(c1, c2, &ret);
+ compose_corners_inplace(c1, c2, &ret);
+
+ return ret;
+}
+
+_static_inline cube_t
+inverse(cube_t cube)
+{
+ uint8_t i, piece, orien;
+ cube_t ret;
+
+ for (i = 0; i < 12; i++) {
+ piece = cube.edge[i];
+ orien = piece & _eobit;
+ ret.edge[piece & _pbits] = i | orien;
+ }
+
+ for (i = 0; i < 8; i++) {
+ piece = cube.corner[i];
+ orien = ((piece << 1) | (piece >> 1)) & _cobits2;
+ ret.corner[piece & _pbits] = i | orien;
+ }
+
+ return ret;
+}
+
+_static_inline int64_t
+coord_co(cube_t c)
+{
+ int i, p;
+ int64_t ret;
+
+ for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3)
+ ret += p * (c.corner[i] >> _coshift);
+
+ return ret;
+}
+
+/*
+For corner separation, we consider the axis (a.k.a. tetrad) each
+corner belongs to as 0 or 1 and we translate this sequence into binary.
+Ignoring the last bit, we have a value up to 2^7, but not all values are
+possible. Encoding this as a number from 0 to C(8,4) would save about 40%
+of space, but we are not going to use this coordinate in large tables.
+*/
+_static_inline int64_t
+coord_csep(cube_t c)
+{
+ int i, p;
+ int64_t ret;
+
+ for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2)
+ ret += p * ((c.corner[i] & _csepbit) >> 2);
+
+ return ret;
+}
+
+_static_inline int64_t
+coord_cocsep(cube_t c)
+{
+ return (coord_co(c) << 7) + coord_csep(c);
+}
+
+_static_inline int64_t
+coord_eo(cube_t c)
+{
+ int i, p;
+ int64_t ret;
+
+ for (ret = 0, i = 1, p = 1; i < 12; i++, p *= 2)
+ ret += p * (c.edge[i] >> _eoshift);
+
+ return ret;
+}
+
+/*
+We encode the edge separation as a number from 0 to C(12,4)*C(8,4).
+It can be seen as the composition of two "subset index" coordinates.
+*/
+_static_inline int64_t
+coord_esep(cube_t c)
+{
+ int64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
+
+ for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++) {
+ /* Simple version:
+ if (c.edge[i] & _esepbit2) {
+ ret1 += binomial[11-i][k--];
+ } else {
+ if (c.edge[i] & _esepbit1)
+ ret2 += binomial[7-j][l--];
+ j++;
+ }
+ */
+
+ bit1 = (c.edge[i] & _esepbit1) >> 2;
+ bit2 = (c.edge[i] & _esepbit2) >> 3;
+ is1 = (1 - bit2) * bit1;
+
+ ret1 += bit2 * binomial[11-i][k];
+ k -= bit2;
+
+ jj = j < 8;
+ ret2 += jj * is1 * binomial[7-(j*jj)][l];
+ l -= is1;
+ j += (1-bit2);
+ }
+
+ return ret1 * 70 + ret2;
+}
+
+_static_inline void
+copy_corners(cube_t *dest, cube_t src)
+{
+ memcpy(&dest->corner, src.corner, sizeof(src.corner));
+}
+
+_static_inline void
+copy_edges(cube_t *dest, cube_t src)
+{
+ memcpy(&dest->edge, src.edge, sizeof(src.edge));
+}
+
+_static_inline void
+set_eo(cube_t *cube, int64_t eo)
+{
+ uint8_t i, sum, flip;
+
+ for (sum = 0, i = 1; i < 12; i++, eo >>= 1) {
+ flip = eo % 2;
+ sum += flip;
+ cube->edge[i] = (cube->edge[i] & ~_eobit) | (_eobit * flip);
+ }
+ cube->edge[0] = (cube->edge[0] & ~_eobit) | (_eobit * (sum % 2));
+}
+
+_static_inline cube_t
+invcoord_esep(int64_t esep)
+{
+ cube_t ret;
+ int64_t bit1, bit2, i, j, jj, k, l, s, v, w, is1, set1, set2;
+ uint8_t slice[3] = {0};
+
+ ret = solved;
+ set1 = esep % 70;
+ set2 = esep / 70;
+
+ for (i = 0, j = 0, k = 4, l = 4; i < 12; i++) {
+ v = binomial[11-i][k];
+ jj = j < 8;
+ w = jj * binomial[7-(j*jj)][l];
+ bit2 = set2 >= v;
+ bit1 = set1 >= w;
+ is1 = (1 - bit2) * bit1;
+
+ set2 -= bit2 * v;
+ k -= bit2;
+ set1 -= is1 * w;
+ l -= is1;
+ j += (1-bit2);
+ s = 2*bit2 + (1-bit2)*bit1;
+
+ ret.edge[i] = (slice[s]++) | (uint8_t)(s << 2);
+ }
+
+ return ret;
+}
diff --git a/src/constant_cubes.h b/src/core/constant_cubes.h
diff --git a/src/core/core.h b/src/core/core.h
@@ -0,0 +1,13 @@
+#include "constant_cubes.h"
+#include "io_moves.h"
+#include "io_trans.h"
+#include "cube.h"
+#include "io_cube.h"
+#include "moves.h"
+
+/* TODO: work in progress */
+#if 0
+#include "transform.h"
+#else
+#include "transform_with_switch.h"
+#endif
diff --git a/src/cube_generic.h b/src/core/cube.h
diff --git a/src/io_cube.h b/src/core/io_cube.h
diff --git a/src/core/io_moves.h b/src/core/io_moves.h
@@ -0,0 +1,62 @@
+_static uint8_t readmove(char);
+_static uint8_t readmodifier(char);
+_static int writemoves(uint8_t *, int, char *);
+
+_static uint8_t
+readmove(char c)
+{
+ switch (c) {
+ case 'U':
+ return _move_U;
+ case 'D':
+ return _move_D;
+ case 'R':
+ return _move_R;
+ case 'L':
+ return _move_L;
+ case 'F':
+ return _move_F;
+ case 'B':
+ return _move_B;
+ default:
+ return _error;
+ }
+}
+
+_static uint8_t
+readmodifier(char c)
+{
+ switch (c) {
+ case '1': /* Fallthrough */
+ case '2': /* Fallthrough */
+ case '3':
+ return c - '0' - 1;
+ case '\'':
+ return 2;
+ default:
+ return 0;
+ }
+}
+
+_static int
+writemoves(uint8_t *m, int n, char *buf)
+{
+ int i;
+ size_t len;
+ const char *s;
+ char *b;
+
+ for (i = 0, b = buf; i < n; i++, b++) {
+ s = movestr[m[i]];
+ len = strlen(s);
+ memcpy(b, s, len);
+ b += len;
+ *b = ' ';
+ }
+
+ if (b != buf)
+ b--; /* Remove last space */
+ *b = '\0';
+
+ return b - buf;
+}
diff --git a/src/core/io_trans.h b/src/core/io_trans.h
@@ -0,0 +1,25 @@
+_static uint8_t readtrans(const char *);
+_static void writetrans(uint8_t, char *);
+
+_static uint8_t
+readtrans(const char *buf)
+{
+ uint8_t t;
+
+ for (t = 0; t < 48; t++)
+ if (!strncmp(buf, transstr[t], 11))
+ return t;
+
+ LOG("readtrans error\n");
+ return _error;
+}
+
+_static void
+writetrans(uint8_t t, char *buf)
+{
+ if (t >= 48)
+ memcpy(buf, "error trans", 11);
+ else
+ memcpy(buf, transstr[t], 11);
+ buf[11] = '\0';
+}
diff --git a/src/moves.h b/src/core/moves.h
diff --git a/src/cube_transform.h b/src/core/transform.h
diff --git a/src/cube_transform_with_switch.h b/src/core/transform_with_switch.h
diff --git a/src/cube.c b/src/cube.c
@@ -1,55 +0,0 @@
-#include <inttypes.h>
-#include <stdarg.h>
-#include <stdbool.h>
-#include <string.h>
-
-void (*nissy_log)(const char *, ...);
-
-#define LOG(...) if (nissy_log != NULL) nissy_log(__VA_ARGS__);
-
-#ifdef DEBUG
-#define _static
-#define _static_inline
-#define DBG_WARN(condition, ...) if (!(condition)) LOG(__VA_ARGS__);
-#define DBG_ASSERT(condition, retval, ...) \
- if (!(condition)) { LOG(__VA_ARGS__); return retval; }
-#else
-#define _static static
-#define _static_inline static inline
-#define DBG_WARN(condition, ...)
-#define DBG_ASSERT(condition, retval, ...)
-#endif
-
-#include "constants.h"
-#include "utils.h"
-
-#if defined(CUBE_AVX2)
-#include <immintrin.h>
-#include "cube_avx2.h"
-#elif defined(CUBE_NEON)
-#include <stdlib.h>
-#include <arm_neon.h>
-#include "cube_neon.h"
-#else
-#include <stdlib.h> /* TODO: check if can be removed */
-#include "cube_portable.h"
-#endif
-
-#include "io_move_trans.h"
-#include "constant_cubes.h"
-#include "cube_generic.h"
-#include "io_cube.h"
-
-/* TODO: work in progress */
-#if 0
-#include "constant_cubes_transform.h"
-#include "cube_transform.h"
-#else
-#include "cube_transform_with_switch.h"
-#endif
-
-#include "moves.h"
-#include "solve_h48.h"
-#include "solve_generic.h"
-
-#include "cube_public.h"
diff --git a/src/cube_avx2.h b/src/cube_avx2.h
@@ -1,341 +0,0 @@
-typedef __m256i cube_t;
-
-#define _co2_avx2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x6060606060606060))
-#define _cocw_avx2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x2020202020202020))
-#define _cp_avx2 _mm256_set_epi64x(0, 0, 0, INT64_C(0x0707070707070707))
-#define _ep_avx2 \
- _mm256_set_epi64x(INT64_C(0x0F0F0F0F), INT64_C(0x0F0F0F0F0F0F0F0F), 0, 0)
-#define _eo_avx2 \
- _mm256_set_epi64x(INT64_C(0x10101010), INT64_C(0x1010101010101010), 0, 0)
-
-#define static_cube(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
- e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
- _mm256_set_epi8(0, 0, 0, 0, e_br, e_bl, e_fl, e_fr, \
- e_dr, e_dl, e_ul, e_ur, e_df, e_db, e_ub, e_uf, \
- 0, 0, 0, 0, 0, 0, 0, 0, \
- c_dbl, c_dfr, c_ubr, c_ufl, c_dbr, c_dfl, c_ubl, c_ufr)
-#define zero _mm256_set_epi64x(0, 0, 0, 0)
-#define solved static_cube( \
- 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
-
-_static void pieces(cube_t *, uint8_t [static 8], uint8_t [static 12]);
-_static_inline bool equal(cube_t, cube_t);
-_static_inline cube_t invertco(cube_t);
-_static_inline cube_t compose_epcpeo(cube_t, cube_t);
-_static_inline cube_t compose_edges(cube_t, cube_t);
-_static_inline cube_t compose_corners(cube_t, cube_t);
-_static_inline cube_t compose(cube_t, cube_t);
-_static_inline cube_t inverse(cube_t);
-
-_static_inline int64_t coord_co(cube_t);
-_static_inline int64_t coord_csep(cube_t);
-_static_inline int64_t coord_cocsep(cube_t);
-_static_inline int64_t coord_eo(cube_t);
-_static_inline int64_t coord_esep(cube_t);
-
-_static_inline void copy_corners(cube_t *, cube_t);
-_static_inline void copy_edges(cube_t *, cube_t);
-_static_inline void set_eo(cube_t *, int64_t);
-_static_inline cube_t invcoord_esep(int64_t);
-
-_static void
-pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
-{
- uint8_t aux[32];
-
- _mm256_storeu_si256((__m256i_u *)aux, *cube);
- memcpy(c, aux, 8);
- memcpy(e, aux+16, 12);
-}
-
-_static_inline bool
-equal(cube_t c1, cube_t c2)
-{
- int32_t mask;
- __m256i cmp;
-
- cmp = _mm256_cmpeq_epi8(c1, c2);
- mask = _mm256_movemask_epi8(cmp);
-
- return mask == ~0;
-}
-
-_static_inline cube_t
-invertco(cube_t c)
-{
- cube_t co, shleft, shright, summed, newco, cleanco, ret;
-
- co = _mm256_and_si256(c, _co2_avx2);
- shleft = _mm256_slli_epi32(co, 1);
- shright = _mm256_srli_epi32(co, 1);
- summed = _mm256_or_si256(shleft, shright);
- newco = _mm256_and_si256(summed, _co2_avx2);
- cleanco = _mm256_xor_si256(c, co);
- ret = _mm256_or_si256(cleanco, newco);
-
- return ret;
-}
-
-_static_inline cube_t
-compose_epcpeo(cube_t c1, cube_t c2)
-{
- cube_t b, s, eo2;
-
- /* Permute and clean unused bits */
- s = _mm256_shuffle_epi8(c1, c2);
- b = _mm256_set_epi8(
- ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
- );
- s = _mm256_andnot_si256(b, s);
-
- /* Change EO */
- eo2 = _mm256_and_si256(c2, _eo_avx2);
- s = _mm256_xor_si256(s, eo2);
-
- return s;
-}
-
-_static_inline cube_t
-compose_edges(cube_t c1, cube_t c2)
-{
- return compose_epcpeo(c1, c2);
-}
-
-_static_inline cube_t
-compose_corners(cube_t c1, cube_t c2)
-{
- /*
- * We do a full compose. Minor optimizations are possible, like
- * saving one instruction by not doing EO, but it should not
- * be significant.
- */
- return compose(c1, c2);
-}
-
-_static_inline cube_t
-compose(cube_t c1, cube_t c2)
-{
- cube_t s, co1, co2, aux, auy1, auy2, auz1, auz2;
-
- s = compose_epcpeo(c1, c2);
-
- /* Change CO */
- co1 = _mm256_and_si256(s, _co2_avx2);
- co2 = _mm256_and_si256(c2, _co2_avx2);
- aux = _mm256_add_epi8(co1, co2);
- auy1 = _mm256_add_epi8(aux, _cocw_avx2);
- auy2 = _mm256_srli_epi32(auy1, 2);
- auz1 = _mm256_add_epi8(aux, auy2);
- auz2 = _mm256_and_si256(auz1, _co2_avx2);
-
- /* Put together */
- s = _mm256_andnot_si256(_co2_avx2, s);
- s = _mm256_or_si256(s, auz2);
-
- return s;
-}
-
-_static_inline cube_t
-cleanaftershuffle(cube_t c)
-{
- __m256i b;
-
- b = _mm256_set_epi8(
- ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0
- );
-
- return _mm256_andnot_si256(b, c);
-}
-
-_static_inline cube_t
-inverse(cube_t c)
-{
- /* Method taken from Andrew Skalski's vcube[1]. The addition sequence
- * was generated using [2].
- * [1] https://github.com/Voltara/vcube
- * [2] http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
- */
- cube_t v3, vi, vo, vp, ret;
-
- v3 = _mm256_shuffle_epi8(c, c);
- v3 = _mm256_shuffle_epi8(v3, c);
- vi = _mm256_shuffle_epi8(v3, v3);
- vi = _mm256_shuffle_epi8(vi, vi);
- vi = _mm256_shuffle_epi8(vi, vi);
- vi = _mm256_shuffle_epi8(vi, v3);
- vi = _mm256_shuffle_epi8(vi, vi);
- vi = _mm256_shuffle_epi8(vi, vi);
- vi = _mm256_shuffle_epi8(vi, vi);
- vi = _mm256_shuffle_epi8(vi, vi);
- vi = _mm256_shuffle_epi8(vi, c);
- vi = _mm256_shuffle_epi8(vi, vi);
- vi = _mm256_shuffle_epi8(vi, vi);
- vi = _mm256_shuffle_epi8(vi, vi);
- vi = _mm256_shuffle_epi8(vi, vi);
- vi = _mm256_shuffle_epi8(vi, vi);
- vi = _mm256_shuffle_epi8(vi, v3);
- vi = _mm256_shuffle_epi8(vi, vi);
- vi = _mm256_shuffle_epi8(vi, c);
-
- vo = _mm256_and_si256(c, _mm256_or_si256(_eo_avx2, _co2_avx2));
- vo = _mm256_shuffle_epi8(vo, vi);
- vp = _mm256_andnot_si256(_mm256_or_si256(_eo_avx2, _co2_avx2), vi);
- ret = _mm256_or_si256(vp, vo);
- ret = cleanaftershuffle(ret);
-
- return invertco(ret);
-}
-
-_static_inline int64_t
-coord_co(cube_t c)
-{
- cube_t co;
- int64_t mem[4], ret, i, p;
-
- co = _mm256_and_si256(c, _co2_avx2);
- _mm256_storeu_si256((__m256i *)mem, co);
-
- mem[0] >>= 5;
- for (i = 0, ret = 0, p = 1; i < 7; i++, mem[0] >>= 8, p *= 3)
- ret += (mem[0] & 3) * p;
-
- return ret;
-}
-
-_static_inline int64_t
-coord_csep(cube_t c)
-{
- cube_t cp, shifted;
- int64_t mask;
-
- cp = _mm256_and_si256(c, _cp_avx2);
- shifted = _mm256_slli_epi32(cp, 5);
- mask = _mm256_movemask_epi8(shifted);
-
- return mask & 0x7F;
-}
-
-_static_inline int64_t
-coord_cocsep(cube_t c)
-{
- return (coord_co(c) << 7) + coord_csep(c);
-}
-
-_static_inline int64_t
-coord_eo(cube_t c)
-{
- cube_t eo, shifted;
- int64_t mask;
-
- eo = _mm256_and_si256(c, _eo_avx2);
- shifted = _mm256_slli_epi32(eo, 3);
- mask = _mm256_movemask_epi8(shifted);
-
- return mask >> 17;
-}
-
-_static_inline int64_t
-coord_esep(cube_t c)
-{
- cube_t ep;
- int64_t e, mem[4], i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
-
- ep = _mm256_and_si256(c, _ep_avx2);
- _mm256_storeu_si256((__m256i *)mem, ep);
-
- mem[3] <<= 8;
- ret1 = ret2 = 0;
- k = l = 4;
- for (i = 0, j = 0; i < 12; i++, mem[i/8 + 2] >>= 8) {
- e = mem[i/8 + 2];
-
- bit1 = (e & _esepbit1) >> 2;
- bit2 = (e & _esepbit2) >> 3;
- is1 = (1 - bit2) * bit1;
-
- ret1 += bit2 * binomial[11-i][k];
- k -= bit2;
-
- jj = j < 8;
- ret2 += jj * is1 * binomial[7-(j*jj)][l];
- l -= is1;
- j += (1-bit2);
- }
-
- return ret1 * 70 + ret2;
-}
-
-_static_inline void
-copy_corners(cube_t *dest, cube_t src)
-{
- *dest = _mm256_blend_epi32(*dest, src, 0x0F);
-}
-
-_static_inline void
-copy_edges(cube_t *dest, cube_t src)
-{
- *dest = _mm256_blend_epi32(*dest, src, 0xF0);
-}
-
-_static_inline void
-set_eo(cube_t *cube, int64_t eo)
-{
- int64_t eo12, eotop, eobot;
- __m256i veo;
-
- eo12 = (eo << 1) + (_mm_popcnt_u64(eo) % 2);
- eotop = (eo12 & (1 << 11)) << 17 |
- (eo12 & (1 << 10)) << 10 |
- (eo12 & (1 << 9)) << 3 |
- (eo12 & (1 << 8)) >> 4;
- eobot = (eo12 & (1 << 7)) << 53 |
- (eo12 & (1 << 6)) << 46 |
- (eo12 & (1 << 5)) << 39 |
- (eo12 & (1 << 4)) << 32 |
- (eo12 & (1 << 3)) << 25 |
- (eo12 & (1 << 2)) << 18 |
- (eo12 & (1 << 1)) << 11 |
- (eo12 & 1) << 4;
- veo = _mm256_set_epi64x(eotop, eobot, 0, 0);
-
- *cube = _mm256_andnot_si256(_eo_avx2, *cube);
- *cube = _mm256_or_si256(*cube, veo);
-}
-
-_static_inline cube_t
-invcoord_esep(int64_t esep)
-{
- cube_t eee, ret;
- int64_t bit1, bit2, i, j, jj, k, l, s, v, w, is1, set1, set2;
- uint8_t mem[32];
- uint8_t slice[3] = {0};
-
- set1 = esep % 70;
- set2 = esep / 70;
-
- for (i = 0, j = 0, k = 4, l = 4; i < 12; i++) {
- v = binomial[11-i][k];
- jj = j < 8;
- w = jj * binomial[7-(j*jj)][l];
- bit2 = set2 >= v;
- bit1 = set1 >= w;
- is1 = (1 - bit2) * bit1;
-
- set2 -= bit2 * v;
- k -= bit2;
- set1 -= is1 * w;
- l -= is1;
- j += (1-bit2);
- s = 2*bit2 + (1-bit2)*bit1;
-
- mem[i+16] = (slice[s]++) | (uint8_t)(s << 2);
- }
-
- ret = solved;
- eee = _mm256_loadu_si256((__m256i_u *)&mem);
- copy_edges(&ret, eee);
-
- return ret;
-}
diff --git a/src/cube_neon.h b/src/cube_neon.h
@@ -1,377 +0,0 @@
-// cube_t
-typedef struct
-{
- uint8x16_t corner;
- uint8x16_t edge;
-} cube_t;
-
-#define _co2_neon vdupq_n_u8(0x60)
-#define _cocw_neon vdupq_n_u8(0x20)
-#define _cp_neon vdupq_n_u8(0x07)
-#define _ep_neon vcombine_u8(vdupq_n_u8(0x0F), vdupq_n_u8(0x0F))
-#define _eo_neon vcombine_u8(vdupq_n_u8(0x10), vdupq_n_u8(0x10))
-
-// static cube
-#define static_cube(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
- e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
- ((cube_t){ \
- .corner = {c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, 0, 0, 0, 0, 0, 0, 0, 0}, \
- .edge = {e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br, 0, 0, 0, 0}})
-
-// zero cube
-#define zero \
- (cube_t) \
- { \
- .corner = vdupq_n_u8(0), \
- .edge = vdupq_n_u8(0) \
- }
-
-// solved cube
-#define solved static_cube( \
- 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
-
-// Functions
-_static void pieces(cube_t *, uint8_t[static 8], uint8_t[static 12]);
-_static_inline bool equal(cube_t, cube_t);
-_static_inline cube_t invertco(cube_t);
-_static_inline cube_t compose_edges(cube_t, cube_t);
-_static_inline cube_t compose_corners(cube_t, cube_t);
-_static_inline uint8x16_t compose_edges_slim(uint8x16_t, uint8x16_t);
-_static_inline uint8x16_t compose_corners_slim(uint8x16_t, uint8x16_t);
-_static_inline cube_t compose(cube_t, cube_t);
-_static_inline cube_t inverse(cube_t);
-
-_static_inline int64_t coord_co(cube_t);
-_static_inline int64_t coord_csep(cube_t);
-_static_inline int64_t coord_cocsep(cube_t);
-_static_inline int64_t coord_eo(cube_t);
-_static_inline int64_t coord_esep(cube_t);
-
-_static_inline void copy_corners(cube_t *, cube_t);
-_static_inline void copy_edges(cube_t *, cube_t);
-_static_inline void set_eo(cube_t *, int64_t);
-_static_inline cube_t invcoord_esep(int64_t);
-
-_static void
-pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
-{
- // First 8 bytes of the corner vector are copied from the c array
- vst1_u8(c, vget_low_u8(cube->corner));
-
- // 12 bytes of the edge vector are copied from the e array
- // First 8 bytes
- vst1_u8(e, vget_low_u8(cube->edge));
- // Next 4 bytes
- vst1_lane_u32((uint32_t *)(e + 8), vreinterpret_u32_u8(vget_high_u8(cube->edge)), 0);
-}
-
-_static_inline bool
-equal(cube_t c1, cube_t c2)
-{
- uint8x16_t cmp_corner, cmp_edge;
- uint64x2_t cmp_corner_u64, cmp_edge_u64;
- uint64x2_t cmp_result;
-
- // compare the corner vectors
- cmp_corner = vceqq_u8(c1.corner, c2.corner);
- // compare the edge vectors
- cmp_edge = vceqq_u8(c1.edge, c2.edge);
-
- // convert the comparison vectors to 64-bit vectors
- cmp_corner_u64 = vreinterpretq_u64_u8(cmp_corner);
- cmp_edge_u64 = vreinterpretq_u64_u8(cmp_edge);
-
- // combine the comparison vectors
- cmp_result = vandq_u64(cmp_corner_u64, cmp_edge_u64);
-
- // check if all the bits are set
- return vgetq_lane_u64(cmp_result, 0) == ~0ULL && vgetq_lane_u64(cmp_result, 1) == ~0ULL;
-}
-
-_static_inline cube_t
-invertco(cube_t c)
-{
- cube_t ret;
- uint8x16_t co, shleft, shright, summed, newco, cleanco;
-
- co = vandq_u8(c.corner, _co2_neon);
- shleft = vshlq_n_u8(co, 1);
- shright = vshrq_n_u8(co, 1);
- summed = vorrq_u8(shleft, shright);
- newco = vandq_u8(summed, _co2_neon);
- cleanco = veorq_u8(c.corner, co);
- ret.corner = vorrq_u8(cleanco, newco);
- ret.edge = c.edge;
-
- return ret;
-}
-
-_static_inline cube_t
-compose_edges(cube_t c1, cube_t c2)
-{
- cube_t ret = {0};
- ret.edge = compose_edges_slim(c1.edge, c2.edge);
- return ret;
-}
-
-_static_inline cube_t
-compose_corners(cube_t c1, cube_t c2)
-{
- cube_t ret = {0};
- ret.corner = compose_corners_slim(c1.corner, c2.corner);
- return ret;
-}
-
-_static_inline uint8x16_t
-compose_edges_slim(uint8x16_t edge1, uint8x16_t edge2)
-{
- // Masks
- uint8x16_t p_bits = vdupq_n_u8(_pbits);
- uint8x16_t eo_bit = vdupq_n_u8(_eobit);
-
- // Find the index and permutation
- uint8x16_t p = vandq_u8(edge2, p_bits);
- uint8x16_t piece1 = vqtbl1q_u8(edge1, p);
-
- // Calculate the orientation through XOR
- uint8x16_t orien = vandq_u8(veorq_u8(edge2, piece1), eo_bit);
-
- // Combine the results
- uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
-
- // Mask to clear the last 32 bits of the result
- uint8x16_t mask_last_32 = vsetq_lane_u32(0, vreinterpretq_u32_u8(ret), 3);
- ret = vreinterpretq_u8_u32(mask_last_32);
-
- return ret;
-}
-
-_static_inline uint8x16_t
-compose_corners_slim(uint8x16_t corner1, uint8x16_t corner2)
-{
- // Masks
- uint8x16_t p_bits = vdupq_n_u8(_pbits);
- uint8x16_t cobits = vdupq_n_u8(_cobits);
- uint8x16_t cobits2 = vdupq_n_u8(_cobits2);
- uint8x16_t twist_cw = vdupq_n_u8(_ctwist_cw);
-
- // Find the index and permutation
- uint8x16_t p = vandq_u8(corner2, p_bits);
- uint8x16_t piece1 = vqtbl1q_u8(corner1, p);
-
- // Calculate the orientation
- uint8x16_t aux = vaddq_u8(vandq_u8(corner2, cobits), vandq_u8(piece1, cobits));
- uint8x16_t auy = vshrq_n_u8(vaddq_u8(aux, twist_cw), 2);
- uint8x16_t orien = vandq_u8(vaddq_u8(aux, auy), cobits2);
-
- // Combine the results
- uint8x16_t ret = vorrq_u8(vandq_u8(piece1, p_bits), orien);
-
- // Mask to clear the last 64 bits of the result
- uint8x16_t mask_last_64 = vsetq_lane_u64(0, vreinterpretq_u64_u8(ret), 1);
- ret = vreinterpretq_u8_u64(mask_last_64);
-
- return ret;
-}
-
-_static_inline cube_t
-compose(cube_t c1, cube_t c2)
-{
- cube_t ret = {0};
-
- ret.edge = compose_edges_slim(c1.edge, c2.edge);
- ret.corner = compose_corners_slim(c1.corner, c2.corner);
-
- return ret;
-}
-
-_static_inline cube_t
-inverse(cube_t cube)
-{
- uint8_t i, piece, orien;
- cube_t ret;
-
- // Temp arrays to store the NEON vectors
- uint8_t edges[16];
- uint8_t corners[16];
-
- // Copy the NEON vectors to the arrays
- vst1q_u8(edges, cube.edge);
- vst1q_u8(corners, cube.corner);
-
- uint8_t edge_result[16] = {0};
- uint8_t corner_result[16] = {0};
-
- // Process the edges
- for (i = 0; i < 12; i++)
- {
- piece = edges[i];
- orien = piece & _eobit;
- edge_result[piece & _pbits] = i | orien;
- }
-
- // Process the corners
- for (i = 0; i < 8; i++)
- {
- piece = corners[i];
- orien = ((piece << 1) | (piece >> 1)) & _cobits2;
- corner_result[piece & _pbits] = i | orien;
- }
-
- // Copy the results back to the NEON vectors
- ret.edge = vld1q_u8(edge_result);
- ret.corner = vld1q_u8(corner_result);
-
- return ret;
-}
-
-_static_inline int64_t
-coord_co(cube_t c)
-{
- // Temp array to store the NEON vector
- uint8_t mem[16];
- vst1q_u8(mem, c.corner);
-
- int i, p;
- int64_t ret;
-
- for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3)
- ret += p * (mem[i] >> _coshift);
-
- return ret;
-}
-
-_static_inline int64_t
-coord_csep(cube_t c)
-{
- // Temp array to store the NEON vector
- uint8_t mem[16];
- vst1q_u8(mem, c.corner);
-
- int64_t ret = 0;
- int i, p;
- for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2)
- ret += p * ((mem[i] & _csepbit) >> 2);
-
- return ret;
- return 0;
-}
-
-_static_inline int64_t
-coord_cocsep(cube_t c)
-{
- return (coord_co(c) << 7) + coord_csep(c);
-}
-
-_static_inline int64_t
-coord_eo(cube_t c)
-{
- int64_t ret = 0;
- int64_t p = 1;
-
- // Temp array to store the NEON vector
- uint8_t mem[16];
- vst1q_u8(mem, c.edge);
-
- for (int i = 1; i < 12; i++, p *= 2)
- {
- ret += p * (mem[i] >> _eoshift);
- }
-
- return ret;
-}
-
-_static_inline int64_t
-coord_esep(cube_t c)
-{
- int64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
-
- // Temp array to store the NEON vector
- uint8_t mem[16];
- vst1q_u8(mem, c.edge);
-
- for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++)
- {
- bit1 = (mem[i] & _esepbit1) >> 2;
- bit2 = (mem[i] & _esepbit2) >> 3;
- is1 = (1 - bit2) * bit1;
-
- ret1 += bit2 * binomial[11 - i][k];
- k -= bit2;
-
- jj = j < 8;
- ret2 += jj * is1 * binomial[7 - (j * jj)][l];
- l -= is1;
- j += (1 - bit2);
- }
-
- return ret1 * 70 + ret2;
-}
-
-_static_inline void
-copy_corners(cube_t *dst, cube_t src)
-{
- dst->corner = src.corner;
-}
-
-_static_inline void
-copy_edges(cube_t *dst, cube_t src)
-{
- dst->edge = src.edge;
-}
-
-_static_inline void
-set_eo(cube_t *cube, int64_t eo)
-{
- // Temp array to store the NEON vector
- uint8_t mem[16];
- vst1q_u8(mem, cube->edge);
- uint8_t i, sum, flip;
-
- for (sum = 0, i = 1; i < 12; i++, eo >>= 1)
- {
- flip = eo % 2;
- sum += flip;
- mem[i] = (mem[i] & ~_eobit) | (_eobit * flip);
- }
- mem[0] = (mem[0] & ~_eobit) | (_eobit * (sum % 2));
-
- // Copy the results back to the NEON vector
- cube->edge = vld1q_u8(mem);
- return;
-}
-
-_static_inline cube_t
-invcoord_esep(int64_t esep)
-{
- cube_t ret;
- int64_t bit1, bit2, i, j, jj, k, l, s, v, w, is1, set1, set2;
- uint8_t slice[3] = {0};
-
- ret = solved;
- uint8_t mem[16];
- set1 = esep % 70;
- set2 = esep / 70;
-
- for (i = 0, j = 0, k = 4, l = 4; i < 12; i++)
- {
- v = binomial[11 - i][k];
- jj = j < 8;
- w = jj * binomial[7 - (j * jj)][l];
- bit2 = set2 >= v;
- bit1 = set1 >= w;
- is1 = (1 - bit2) * bit1;
-
- set2 -= bit2 * v;
- k -= bit2;
- set1 -= is1 * w;
- l -= is1;
- j += (1 - bit2);
- s = 2 * bit2 + (1 - bit2) * bit1;
-
- mem[i] = (slice[s]++) | (uint8_t)(s << 2);
- }
-
- ret.edge = vld1q_u8(mem);
- return ret;
-}
diff --git a/src/cube_portable.h b/src/cube_portable.h
@@ -1,298 +0,0 @@
-typedef struct {
- uint8_t corner[8];
- uint8_t edge[12];
-} cube_t;
-
-#define static_cube(c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl, \
- e_uf, e_ub, e_db, e_df, e_ur, e_ul, e_dl, e_dr, e_fr, e_fl, e_bl, e_br) \
- ((cube_t) { \
- .corner = { c_ufr, c_ubl, c_dfl, c_dbr, c_ufl, c_ubr, c_dfr, c_dbl }, \
- .edge = { e_uf, e_ub, e_db, e_df, e_ur, e_ul, \
- e_dl, e_dr, e_fr, e_fl, e_bl, e_br } })
-#define zero static_cube( \
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
-#define solved static_cube( \
- 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
-
-_static void pieces(cube_t *, uint8_t [static 8], uint8_t [static 12]);
-_static_inline bool equal(cube_t, cube_t);
-_static_inline cube_t invertco(cube_t);
-_static_inline void compose_edges_inplace(cube_t, cube_t, cube_t *);
-_static_inline void compose_corners_inplace(cube_t, cube_t, cube_t *);
-_static_inline cube_t compose_edges(cube_t, cube_t);
-_static_inline cube_t compose_corners(cube_t, cube_t);
-_static_inline cube_t compose(cube_t, cube_t);
-_static_inline cube_t inverse(cube_t);
-
-_static_inline int64_t coord_co(cube_t);
-_static_inline int64_t coord_csep(cube_t);
-_static_inline int64_t coord_cocsep(cube_t);
-_static_inline int64_t coord_eo(cube_t);
-_static_inline int64_t coord_esep(cube_t);
-
-_static_inline void copy_corners(cube_t *, cube_t);
-_static_inline void copy_edges(cube_t *, cube_t);
-_static_inline void set_eo(cube_t *, int64_t);
-_static_inline cube_t invcoord_esep(int64_t);
-
-_static void
-pieces(cube_t *cube, uint8_t c[static 8], uint8_t e[static 12])
-{
- memcpy(c, cube->corner, 8);
- memcpy(e, cube->edge, 12);
-}
-
-_static_inline bool
-equal(cube_t c1, cube_t c2)
-{
- uint8_t i;
- bool ret;
-
- ret = true;
- for (i = 0; i < 8; i++)
- ret = ret && c1.corner[i] == c2.corner[i];
- for (i = 0; i < 12; i++)
- ret = ret && c1.edge[i] == c2.edge[i];
-
- return ret;
-}
-
-_static_inline cube_t
-invertco(cube_t c)
-{
- uint8_t i, piece, orien;
- cube_t ret;
-
- ret = c;
- for (i = 0; i < 8; i++) {
- piece = c.corner[i];
- orien = ((piece << 1) | (piece >> 1)) & _cobits2;
- ret.corner[i] = (piece & _pbits) | orien;
- }
-
- return ret;
-}
-
-_static_inline void
-compose_edges_inplace(cube_t c1, cube_t c2, cube_t *ret)
-{
- uint8_t i, piece1, piece2, p, orien;
-
- for (i = 0; i < 12; i++) {
- piece2 = c2.edge[i];
- p = piece2 & _pbits;
- piece1 = c1.edge[p];
- orien = (piece2 ^ piece1) & _eobit;
- ret->edge[i] = (piece1 & _pbits) | orien;
- }
-}
-
-_static_inline void
-compose_corners_inplace(cube_t c1, cube_t c2, cube_t *ret)
-{
- uint8_t i, piece1, piece2, p, orien, aux, auy;
-
- for (i = 0; i < 8; i++) {
- piece2 = c2.corner[i];
- p = piece2 & _pbits;
- piece1 = c1.corner[p];
- aux = (piece2 & _cobits) + (piece1 & _cobits);
- auy = (aux + _ctwist_cw) >> 2;
- orien = (aux + auy) & _cobits2;
- ret->corner[i] = (piece1 & _pbits) | orien;
- }
-}
-
-_static_inline cube_t
-compose_edges(cube_t c1, cube_t c2)
-{
- cube_t ret = zero;
-
- compose_edges_inplace(c1, c2, &ret);
-
- return ret;
-}
-
-_static_inline cube_t
-compose_corners(cube_t c1, cube_t c2)
-{
- cube_t ret = zero;
-
- compose_corners_inplace(c1, c2, &ret);
-
- return ret;
-}
-
-_static_inline cube_t
-compose(cube_t c1, cube_t c2)
-{
- cube_t ret = zero;
-
- compose_edges_inplace(c1, c2, &ret);
- compose_corners_inplace(c1, c2, &ret);
-
- return ret;
-}
-
-cube_t
-inverse(cube_t cube)
-{
- uint8_t i, piece, orien;
- cube_t ret;
-
- for (i = 0; i < 12; i++) {
- piece = cube.edge[i];
- orien = piece & _eobit;
- ret.edge[piece & _pbits] = i | orien;
- }
-
- for (i = 0; i < 8; i++) {
- piece = cube.corner[i];
- orien = ((piece << 1) | (piece >> 1)) & _cobits2;
- ret.corner[piece & _pbits] = i | orien;
- }
-
- return ret;
-}
-
-_static_inline int64_t
-coord_co(cube_t c)
-{
- int i, p;
- int64_t ret;
-
- for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 3)
- ret += p * (c.corner[i] >> _coshift);
-
- return ret;
-}
-
-/*
-For corner separation, we consider the axis (a.k.a. tetrad) each
-corner belongs to as 0 or 1 and we translate this sequence into binary.
-Ignoring the last bit, we have a value up to 2^7, but not all values are
-possible. Encoding this as a number from 0 to C(8,4) would save about 40%
-of space, but we are not going to use this coordinate in large tables.
-*/
-_static_inline int64_t
-coord_csep(cube_t c)
-{
- int i, p;
- int64_t ret;
-
- for (ret = 0, i = 0, p = 1; i < 7; i++, p *= 2)
- ret += p * ((c.corner[i] & _csepbit) >> 2);
-
- return ret;
-}
-
-_static_inline int64_t
-coord_cocsep(cube_t c)
-{
- return (coord_co(c) << 7) + coord_csep(c);
-}
-
-_static_inline int64_t
-coord_eo(cube_t c)
-{
- int i, p;
- int64_t ret;
-
- for (ret = 0, i = 1, p = 1; i < 12; i++, p *= 2)
- ret += p * (c.edge[i] >> _eoshift);
-
- return ret;
-}
-
-/*
-We encode the edge separation as a number from 0 to C(12,4)*C(8,4).
-It can be seen as the composition of two "subset index" coordinates.
-*/
-_static_inline int64_t
-coord_esep(cube_t c)
-{
- int64_t i, j, jj, k, l, ret1, ret2, bit1, bit2, is1;
-
- for (i = 0, j = 0, k = 4, l = 4, ret1 = 0, ret2 = 0; i < 12; i++) {
- /* Simple version:
- if (c.edge[i] & _esepbit2) {
- ret1 += binomial[11-i][k--];
- } else {
- if (c.edge[i] & _esepbit1)
- ret2 += binomial[7-j][l--];
- j++;
- }
- */
-
- bit1 = (c.edge[i] & _esepbit1) >> 2;
- bit2 = (c.edge[i] & _esepbit2) >> 3;
- is1 = (1 - bit2) * bit1;
-
- ret1 += bit2 * binomial[11-i][k];
- k -= bit2;
-
- jj = j < 8;
- ret2 += jj * is1 * binomial[7-(j*jj)][l];
- l -= is1;
- j += (1-bit2);
- }
-
- return ret1 * 70 + ret2;
-}
-
-_static_inline void
-copy_corners(cube_t *dest, cube_t src)
-{
- memcpy(&dest->corner, src.corner, sizeof(src.corner));
-}
-
-_static_inline void
-copy_edges(cube_t *dest, cube_t src)
-{
- memcpy(&dest->edge, src.edge, sizeof(src.edge));
-}
-
-_static_inline void
-set_eo(cube_t *cube, int64_t eo)
-{
- uint8_t i, sum, flip;
-
- for (sum = 0, i = 1; i < 12; i++, eo >>= 1) {
- flip = eo % 2;
- sum += flip;
- cube->edge[i] = (cube->edge[i] & ~_eobit) | (_eobit * flip);
- }
- cube->edge[0] = (cube->edge[0] & ~_eobit) | (_eobit * (sum % 2));
-}
-
-_static_inline cube_t
-invcoord_esep(int64_t esep)
-{
- cube_t ret;
- int64_t bit1, bit2, i, j, jj, k, l, s, v, w, is1, set1, set2;
- uint8_t slice[3] = {0};
-
- ret = solved;
- set1 = esep % 70;
- set2 = esep / 70;
-
- for (i = 0, j = 0, k = 4, l = 4; i < 12; i++) {
- v = binomial[11-i][k];
- jj = j < 8;
- w = jj * binomial[7-(j*jj)][l];
- bit2 = set2 >= v;
- bit1 = set1 >= w;
- is1 = (1 - bit2) * bit1;
-
- set2 -= bit2 * v;
- k -= bit2;
- set1 -= is1 * w;
- l -= is1;
- j += (1-bit2);
- s = 2*bit2 + (1-bit2)*bit1;
-
- ret.edge[i] = (slice[s]++) | (uint8_t)(s << 2);
- }
-
- return ret;
-}
diff --git a/src/cube_public.h b/src/cube_public.h
@@ -1,255 +0,0 @@
-#include "cube.h"
-
-_static int64_t write_result(cube_t, char [static 22]);
-
-/* TODO: add option to get DR, maybe C-only, E-only, eo... */
-#define GETCUBE_OPTIONS(S, F) { .option = S, .fix = F }
-struct {
- char *option;
- void (*fix)(int64_t *, int64_t *, int64_t *, int64_t *);
-} getcube_options[] = {
- GETCUBE_OPTIONS("fix", getcube_fix),
- GETCUBE_OPTIONS(NULL, NULL)
-};
-
-_static int64_t
-write_result(cube_t cube, char result[static 22])
-{
- if (!isconsistent(cube)) {
- writecube("B32", zero, result);
- return 2;
- }
-
- writecube("B32", cube, result);
-
- return issolvable(cube) ? 0 : 1;
-}
-
-int64_t
-nissy_compose(
- const char cube[static 22],
- const char permutation[static 22],
- char result[static 22]
-)
-{
- cube_t c, p, res;
-
- c = readcube("B32", cube);
- p = readcube("B32", permutation);
- res = compose(c, p);
-
- return write_result(res, result);
-}
-
-int64_t
-nissy_inverse(
- const char cube[static 22],
- char result[static 22]
-)
-{
- cube_t c, res;
-
- c = readcube("B32", cube);
- res = inverse(c);
-
- return write_result(res, result);
-}
-
-int64_t
-nissy_applymoves(
- const char cube[static 22],
- const char *moves,
- char result[static 22]
-)
-{
- cube_t c, res;
-
- c = readcube("B32", cube);
- res = applymoves(c, moves);
-
- return write_result(res, result);
-}
-
-int64_t
-nissy_applytrans(
- const char cube[static 22],
- const char *transformation,
- char result[static 22]
-)
-{
- cube_t c, res;
-
- c = readcube("B32", cube);
- res = applytrans(c, transformation);
-
- return write_result(res, result);
-}
-
-int64_t
-nissy_frommoves(
- const char *moves,
- char result[static 22]
-)
-{
- cube_t res;
-
- res = applymoves(solved, moves);
-
- return write_result(res, result);
-}
-
-int64_t
-nissy_convert(
- const char *format_in,
- const char *format_out,
- const char *cube_string,
- char *result
-)
-{
- cube_t c;
-
- c = readcube(format_in, cube_string);
- writecube(format_out, c, result);
-
- return isconsistent(c) ? 0 : 2;
-}
-
-int64_t
-nissy_getcube(
- int64_t ep,
- int64_t eo,
- int64_t cp,
- int64_t co,
- const char *options,
- char result[static 22]
-)
-{
- int i;
- cube_t c;
-
- for (i = 0; getcube_options[i].option != NULL; i++)
- if (!strcmp(options, getcube_options[i].option))
- getcube_options[i].fix(&ep, &eo, &cp, &co);
-
- c = getcube(ep, eo, cp, co);
-
- return write_result(c, result);
-}
-
-int64_t
-nissy_datasize(
- const char *solver,
- const char *options
-)
-{
- /* gendata() handles a NULL *data as a "dryrun" request */
- return nissy_gendata(solver, options, NULL);
-}
-
-int64_t
-nissy_gendata(
- const char *solver,
- const char *options,
- void *data
-)
-{
- int64_t ret;
- uint8_t maxdepth, h, i, j;
-
- if (!strcmp(solver, "h48")) {
- /* options are in the form "h;maxdepth" */
- for (i = 0; options[i] != ';'; i++) ;
- for (j = i; options[j]; j++) ;
- h = atoi(options);
- if (h != 0) {
- LOG("Temporarily only h=0 is supported\n");
- ret = -1;
- } else {
- maxdepth = atoi(&options[i+1]);
- ret = gendata_h48h0k4(data, maxdepth);
- }
- } else if (!strcmp(solver, "h48stats")) {
- ret = gendata_h48h0k4(data, 20);
- } else {
- LOG("gendata: implemented only for h48 solver\n");
- ret = -1;
- }
-
- return ret;
-}
-
-int64_t
-nissy_solve(
- const char cube[static 22],
- const char *solver,
- const char *options,
- const char *nisstype,
- int8_t minmoves,
- int8_t maxmoves,
- int64_t maxsolutions,
- int8_t optimal,
- const void *data,
- char *solutions
-)
-{
- cube_t c;
- int64_t ret;
- int h;
-
- c = readcube_B32(cube);
-
- if (!issolvable(c)) {
- LOG("solve: cube is not solvable\n");
- return -1;
- }
-
- if (minmoves < 0) {
- LOG("solve: 'minmoves' is negative, setting it to 0\n");
- minmoves = 0;
- }
-
- if (maxmoves < 0) {
- LOG("solve: 'maxmoves' is negative, setting it to 20\n");
- maxmoves = 20;
- }
-
- if (maxsolutions < 0) {
- LOG("solve: 'maxsols' is negative, stopping\n");
- return -1;
- }
-
- if (maxsolutions == 0) {
- LOG("solve: 'maxsols' is 0, returning no solution\n");
- return 0;
- }
-
- if (solutions == NULL) {
- LOG("solve: return parameter 'solutions' is NULL, stopping\n");
- return -1;
- }
-
- /* TODO define and use solve_options_t */
- if (!strcmp(solver, "h48")) {
- h = atoi(options); /* TODO: better parsing */
- ret = solve_h48(
- c, minmoves, maxmoves, maxsolutions,
- (uint8_t)h, data, solutions);
- ret = -1;
- } else if (!strcmp(solver, "h48stats")) {
- ret = solve_h48stats(c, maxmoves, data, solutions);
- } else if (!strcmp(solver, "simple")) {
- ret = solve_simple(
- c, minmoves, maxmoves, maxsolutions, optimal, solutions);
- } else {
- LOG("solve: unknown solver '%s'\n", solver);
- ret = -1;
- }
-
- return ret;
-}
-
-void
-nissy_setlogger(void (*log)(const char *, ...))
-{
- nissy_log = log;
-}
diff --git a/src/io_move_trans.h b/src/io_move_trans.h
@@ -1,87 +0,0 @@
-_static uint8_t readmove(char);
-_static uint8_t readmodifier(char);
-_static uint8_t readtrans(const char *);
-_static int writemoves(uint8_t *, int, char *);
-_static void writetrans(uint8_t, char *);
-
-_static uint8_t
-readmove(char c)
-{
- switch (c) {
- case 'U':
- return _move_U;
- case 'D':
- return _move_D;
- case 'R':
- return _move_R;
- case 'L':
- return _move_L;
- case 'F':
- return _move_F;
- case 'B':
- return _move_B;
- default:
- return _error;
- }
-}
-
-_static uint8_t
-readmodifier(char c)
-{
- switch (c) {
- case '1': /* Fallthrough */
- case '2': /* Fallthrough */
- case '3':
- return c - '0' - 1;
- case '\'':
- return 2;
- default:
- return 0;
- }
-}
-
-_static uint8_t
-readtrans(const char *buf)
-{
- uint8_t t;
-
- for (t = 0; t < 48; t++)
- if (!strncmp(buf, transstr[t], 11))
- return t;
-
- LOG("readtrans error\n");
- return _error;
-}
-
-_static int
-writemoves(uint8_t *m, int n, char *buf)
-{
- int i;
- size_t len;
- const char *s;
- char *b;
-
- for (i = 0, b = buf; i < n; i++, b++) {
- s = movestr[m[i]];
- len = strlen(s);
- memcpy(b, s, len);
- b += len;
- *b = ' ';
- }
-
- if (b != buf)
- b--; /* Remove last space */
- *b = '\0';
-
- return b - buf;
-}
-
-_static void
-writetrans(uint8_t t, char *buf)
-{
- if (t >= 48)
- memcpy(buf, "error trans", 11);
- else
- memcpy(buf, transstr[t], 11);
- buf[11] = '\0';
-}
diff --git a/src/nissy.c b/src/nissy.c
@@ -0,0 +1,265 @@
+#include <inttypes.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "utils/utils.h"
+#include "arch/arch.h"
+#include "core/core.h"
+#include "solvers/solvers.h"
+
+#include "nissy.h"
+
+_static int64_t write_result(cube_t, char [static 22]);
+
+/* TODO: add option to get DR, maybe C-only, E-only, eo... */
+#define GETCUBE_OPTIONS(S, F) { .option = S, .fix = F }
+struct {
+ char *option;
+ void (*fix)(int64_t *, int64_t *, int64_t *, int64_t *);
+} getcube_options[] = {
+ GETCUBE_OPTIONS("fix", getcube_fix),
+ GETCUBE_OPTIONS(NULL, NULL)
+};
+
+_static int64_t
+write_result(cube_t cube, char result[static 22])
+{
+ if (!isconsistent(cube)) {
+ writecube("B32", zero, result);
+ return 2;
+ }
+
+ writecube("B32", cube, result);
+
+ return issolvable(cube) ? 0 : 1;
+}
+
+int64_t
+nissy_compose(
+ const char cube[static 22],
+ const char permutation[static 22],
+ char result[static 22]
+)
+{
+ cube_t c, p, res;
+
+ c = readcube("B32", cube);
+ p = readcube("B32", permutation);
+ res = compose(c, p);
+
+ return write_result(res, result);
+}
+
+int64_t
+nissy_inverse(
+ const char cube[static 22],
+ char result[static 22]
+)
+{
+ cube_t c, res;
+
+ c = readcube("B32", cube);
+ res = inverse(c);
+
+ return write_result(res, result);
+}
+
+int64_t
+nissy_applymoves(
+ const char cube[static 22],
+ const char *moves,
+ char result[static 22]
+)
+{
+ cube_t c, res;
+
+ c = readcube("B32", cube);
+ res = applymoves(c, moves);
+
+ return write_result(res, result);
+}
+
+int64_t
+nissy_applytrans(
+ const char cube[static 22],
+ const char *transformation,
+ char result[static 22]
+)
+{
+ cube_t c, res;
+
+ c = readcube("B32", cube);
+ res = applytrans(c, transformation);
+
+ return write_result(res, result);
+}
+
+int64_t
+nissy_frommoves(
+ const char *moves,
+ char result[static 22]
+)
+{
+ cube_t res;
+
+ res = applymoves(solved, moves);
+
+ return write_result(res, result);
+}
+
+int64_t
+nissy_convert(
+ const char *format_in,
+ const char *format_out,
+ const char *cube_string,
+ char *result
+)
+{
+ cube_t c;
+
+ c = readcube(format_in, cube_string);
+ writecube(format_out, c, result);
+
+ return isconsistent(c) ? 0 : 2;
+}
+
+int64_t
+nissy_getcube(
+ int64_t ep,
+ int64_t eo,
+ int64_t cp,
+ int64_t co,
+ const char *options,
+ char result[static 22]
+)
+{
+ int i;
+ cube_t c;
+
+ for (i = 0; getcube_options[i].option != NULL; i++)
+ if (!strcmp(options, getcube_options[i].option))
+ getcube_options[i].fix(&ep, &eo, &cp, &co);
+
+ c = getcube(ep, eo, cp, co);
+
+ return write_result(c, result);
+}
+
+int64_t
+nissy_datasize(
+ const char *solver,
+ const char *options
+)
+{
+ /* gendata() handles a NULL *data as a "dryrun" request */
+ return nissy_gendata(solver, options, NULL);
+}
+
+int64_t
+nissy_gendata(
+ const char *solver,
+ const char *options,
+ void *data
+)
+{
+ int64_t ret;
+ uint8_t maxdepth, h, i, j;
+
+ if (!strcmp(solver, "h48")) {
+ /* options are in the form "h;maxdepth" */
+		for (i = 0; options[i] != '\0' && options[i] != ';'; i++) ;
+ for (j = i; options[j]; j++) ;
+ h = atoi(options);
+ if (h != 0) {
+ LOG("Temporarily only h=0 is supported\n");
+ ret = -1;
+ } else {
+ maxdepth = atoi(&options[i+1]);
+ ret = gendata_h48h0k4(data, maxdepth);
+ }
+ } else if (!strcmp(solver, "h48stats")) {
+ ret = gendata_h48h0k4(data, 20);
+ } else {
+ LOG("gendata: implemented only for h48 solver\n");
+ ret = -1;
+ }
+
+ return ret;
+}
+
+int64_t
+nissy_solve(
+ const char cube[static 22],
+ const char *solver,
+ const char *options,
+ const char *nisstype,
+ int8_t minmoves,
+ int8_t maxmoves,
+ int64_t maxsolutions,
+ int8_t optimal,
+ const void *data,
+ char *solutions
+)
+{
+ cube_t c;
+ int64_t ret;
+ int h;
+
+ c = readcube_B32(cube);
+
+ if (!issolvable(c)) {
+ LOG("solve: cube is not solvable\n");
+ return -1;
+ }
+
+ if (minmoves < 0) {
+ LOG("solve: 'minmoves' is negative, setting it to 0\n");
+ minmoves = 0;
+ }
+
+ if (maxmoves < 0) {
+ LOG("solve: 'maxmoves' is negative, setting it to 20\n");
+ maxmoves = 20;
+ }
+
+ if (maxsolutions < 0) {
+ LOG("solve: 'maxsols' is negative, stopping\n");
+ return -1;
+ }
+
+ if (maxsolutions == 0) {
+ LOG("solve: 'maxsols' is 0, returning no solution\n");
+ return 0;
+ }
+
+ if (solutions == NULL) {
+ LOG("solve: return parameter 'solutions' is NULL, stopping\n");
+ return -1;
+ }
+
+ /* TODO define and use solve_options_t */
+ if (!strcmp(solver, "h48")) {
+ h = atoi(options); /* TODO: better parsing */
+		ret = solve_h48(
+			c, minmoves, maxmoves,
+			maxsolutions, (uint8_t)h,
+			data, solutions);
+ } else if (!strcmp(solver, "h48stats")) {
+ ret = solve_h48stats(c, maxmoves, data, solutions);
+ } else if (!strcmp(solver, "simple")) {
+ ret = solve_simple(
+ c, minmoves, maxmoves, maxsolutions, optimal, solutions);
+ } else {
+ LOG("solve: unknown solver '%s'\n", solver);
+ ret = -1;
+ }
+
+ return ret;
+}
+
+void
+nissy_setlogger(void (*log)(const char *, ...))
+{
+ nissy_log = log;
+}
diff --git a/src/cube.h b/src/nissy.h
diff --git a/src/solve_h48.h b/src/solve_h48.h
@@ -1,839 +0,0 @@
-#define MAP_UNSET UINT64_C(0xFFFFFFFFFFFFFFFF)
-#define MAP_KEYMASK UINT64_C(0xFFFFFFFFFF)
-#define MAP_KEYSHIFT UINT64_C(40)
-
-#define COCSEP_CLASSES ((size_t)3393)
-#define COCSEP_TABLESIZE ((size_t)_3p7 << (size_t)7)
-#define COCSEP_VISITEDSIZE ((COCSEP_TABLESIZE + (size_t)7) / (size_t)8)
-#define COCSEP_FULLSIZE ((size_t)4 * (COCSEP_TABLESIZE + (size_t)12))
-
-#define ESEP_NOEO (COCSEP_CLASSES * (size_t)_12c4 * (size_t)_8c4)
-#define ESEP_MAX(h) (ESEP_NOEO << (size_t)(h))
-#define ESEP_TABLESIZE(h, k) (ESEP_MAX((h)) / ((size_t)8 / (size_t)(k)))
-
-#define COCLASS_MASK (UINT32_C(0xFFFF) << UINT32_C(16))
-#define COCLASS(x) (((x) & COCLASS_MASK) >> UINT32_C(16))
-#define TTREP_MASK (UINT32_C(0xFF) << UINT32_C(8))
-#define TTREP(x) (((x) & TTREP_MASK) >> UINT32_C(8))
-#define CBOUND_MASK UINT32_C(0xFF)
-#define CBOUND(x) ((x) & CBOUND_MASK)
-#define H48_ESIZE(h) ((_12c4 * _8c4) << (int64_t)(h))
-
-#define ESEP_IND(i) ((uint32_t)(i) / UINT32_C(8))
-#define ESEP_SHIFT(i) (UINT32_C(4) * ((uint32_t)(i) % UINT32_C(8)))
-#define ESEP_MASK(i) ((_bit_u32(4) - (uint32_t)(1)) << ESEP_SHIFT(i))
-#define VISITED_IND(i) ((uint32_t)(i) / UINT32_C(8))
-#define VISITED_MASK(i) (UINT32_C(1) << ((uint32_t)(i) % UINT32_C(8)))
-
-#define MAX_SOLUTION_LENGTH 20
-
-/*
-TODO: This loop other similar h48 coordinates can be improved by only
-transforming edges, but we need to compose transformations (i.e. conjugate
-_t by _ttrep).
-*/
-#define _foreach_h48sim(_cube, _cocsepdata, _selfsim, _h, _action) \
- int64_t _cocsep = coord_cocsep(_cube); \
- uint8_t _ttrep = TTREP(_cocsepdata[_cocsep]); \
- uint8_t _inverse_ttrep = inverse_trans(_ttrep); \
- int64_t _coclass = COCLASS(_cocsepdata[_cocsep]); \
- cube_t _rep = transform(_cube, _ttrep); \
- uint64_t _sim = _selfsim[_coclass]; \
- for (uint8_t _t = 0; _t < 48 && _sim; _t++, _sim >>= 1) { \
- if (!(_sim & 1)) continue; \
- _cube = transform(_rep, _t); \
- _cube = transform(_cube, _inverse_ttrep); \
- _action \
- }
-
-typedef struct {
- uint64_t n;
- uint64_t capacity;
- uint64_t randomizer;
- uint64_t *table;
-} h48map_t;
-
-typedef struct {
- uint64_t key;
- uint64_t val;
-} kvpair_t;
-
-typedef struct {
- cube_t cube;
- uint8_t depth;
- uint8_t maxdepth;
- uint16_t *n;
- uint32_t *buf32;
- uint8_t *visited;
- uint64_t *selfsim;
- cube_t *rep;
-} dfsarg_cocsep_t;
-
-/* TODO keep or not? */
-typedef struct {
- cube_t cube;
- int8_t nmoves;
- int8_t depth;
- uint8_t moves[MAX_SOLUTION_LENGTH];
- uint32_t *cocsepdata;
- h48map_t *visited;
-} dfsarg_genh48set_t;
-
-typedef struct {
- uint8_t depth;
- uint32_t *cocsepdata;
- uint32_t *buf32;
- uint64_t *selfsim;
- int64_t done;
- cube_t *crep;
-} bfsarg_esep_t;
-
-typedef struct {
- cube_t cube;
- cube_t inverse;
- int8_t nmoves;
- int8_t depth;
- uint8_t moves[MAX_SOLUTION_LENGTH];
- int64_t *nsols;
- int64_t maxsolutions;
- uint8_t h;
- uint32_t *cocsepdata;
- uint32_t *h48data;
- char **nextsol;
-} dfsarg_solveh48_t;
-
-typedef struct {
- cube_t cube;
- int8_t nmoves;
- int8_t depth;
- uint8_t moves[MAX_SOLUTION_LENGTH];
- uint32_t *cocsepdata;
- uint32_t *h48data;
- char *s;
-} dfsarg_solveh48stats_t;
-
-_static void h48map_create(h48map_t *, uint64_t, uint64_t);
-_static void h48map_clear(h48map_t *);
-_static void h48map_destroy(h48map_t *);
-_static uint64_t h48map_lookup(h48map_t *, uint64_t);
-_static void h48map_insertmin(h48map_t *, uint64_t, uint64_t);
-_static uint64_t h48map_value(h48map_t *, uint64_t);
-_static kvpair_t h48map_nextkvpair(h48map_t *, uint64_t *);
-
-_static_inline int64_t coord_h48(cube_t, const uint32_t *, uint8_t);
-_static_inline int64_t coord_h48_edges(cube_t, int64_t, uint8_t, uint8_t);
-_static_inline cube_t invcoord_h48(int64_t, const cube_t *, uint8_t);
-
-_static_inline bool get_visited(const uint8_t *, int64_t);
-_static_inline void set_visited(uint8_t *, int64_t);
-_static_inline uint8_t get_esep_pval(const uint32_t *, int64_t);
-_static_inline void set_esep_pval(uint32_t *, int64_t, uint8_t);
-
-_static size_t gendata_cocsep(void *, uint64_t *, cube_t *);
-_static uint32_t gendata_cocsep_dfs(dfsarg_cocsep_t *);
-_static uint64_t gen_h48short(
- uint8_t, const uint32_t *, const cube_t *, const uint64_t *, h48map_t *);
-_static size_t gendata_h48h0k4(void *, uint8_t);
-_static int64_t gendata_h48h0k4_bfs(bfsarg_esep_t *);
-_static int64_t gendata_h48h0k4_bfs_fromdone(bfsarg_esep_t *);
-_static int64_t gendata_h48h0k4_bfs_fromnew(bfsarg_esep_t *);
-
-_static void solve_h48_appendsolution(dfsarg_solveh48_t *);
-_static_inline int8_t get_h48_cdata(cube_t, uint32_t *, uint32_t *);
-_static_inline int8_t get_h48_bound(cube_t, uint32_t, uint8_t, uint32_t *);
-_static_inline bool solve_h48_stop(dfsarg_solveh48_t *);
-_static int64_t solve_h48_dfs(dfsarg_solveh48_t *);
-_static int64_t solve_h48(
- cube_t, int8_t, int8_t, int8_t, uint8_t, const void *, char *);
-
-_static int64_t solve_h48stats_dfs(dfsarg_solveh48stats_t *);
-_static int64_t solve_h48stats(cube_t, int8_t, const void *, char [static 12]);
-
-_static void
-h48map_create(h48map_t *map, uint64_t capacity, uint64_t randomizer)
-{
- map->capacity = capacity;
- map->randomizer = randomizer;
-
- map->table = malloc(map->capacity * sizeof(int64_t));
- h48map_clear(map);
-}
-
-_static void
-h48map_clear(h48map_t *map)
-{
- memset(map->table, 0xFF, map->capacity * sizeof(uint64_t));
- map->n = 0;
-}
-
-_static void
-h48map_destroy(h48map_t *map)
-{
- free(map->table);
-}
-
-_static_inline uint64_t
-h48map_lookup(h48map_t *map, uint64_t x)
-{
- uint64_t hash, i;
-
- hash = ((x % map->capacity) * map->randomizer) % map->capacity;
- for (i = hash;
- map->table[i] != MAP_UNSET && (map->table[i] & MAP_KEYMASK) != x;
- i = (i+1) % map->capacity
- ) ;
-
- return i;
-}
-
-_static_inline void
-h48map_insertmin(h48map_t *map, uint64_t key, uint64_t val)
-{
- uint64_t i, oldval, min;
-
- i = h48map_lookup(map, key);
- oldval = map->table[i] >> MAP_KEYSHIFT;
- min = _min(val, oldval);
-
- map->n += map->table[i] == MAP_UNSET;
- map->table[i] = (key & MAP_KEYMASK) | (min << MAP_KEYSHIFT);
-}
-
-_static_inline uint64_t
-h48map_value(h48map_t *map, uint64_t key)
-{
- return map->table[h48map_lookup(map, key)] >> MAP_KEYSHIFT;
-}
-
-_static kvpair_t
-h48map_nextkvpair(h48map_t *map, uint64_t *p)
-{
- kvpair_t kv;
- uint64_t pair;
-
- kv.key = MAP_UNSET;
- kv.val = MAP_UNSET;
-
- DBG_ASSERT(*p < map->capacity, kv,
- "Error looping over map: given index %" PRIu64 " is out of "
- "range [0,%" PRIu64 "]", *p, map->capacity);
-
- for ( ; *p < map->capacity; (*p)++) {
- if (map->table[*p] != MAP_UNSET) {
- pair = map->table[(*p)++];
- kv.key = pair & MAP_KEYMASK;
- kv.val = pair >> MAP_KEYSHIFT;
- return kv;
- }
- }
-
- return kv;
-}
-
-_static_inline int64_t
-coord_h48(cube_t c, const uint32_t *cocsepdata, uint8_t h)
-{
- int64_t cocsep, coclass;
- uint32_t data;
- uint8_t ttrep;
-
- DBG_ASSERT(h <= 11, -1, "coord_h48: h must be between 0 and 11\n");
-
- cocsep = coord_cocsep(c);
- data = cocsepdata[cocsep];
- coclass = (int64_t)COCLASS(data);
- ttrep = (int64_t)TTREP(data);
-
- return coord_h48_edges(c, coclass, ttrep, h);
-}
-
-_static_inline int64_t
-coord_h48_edges(cube_t c, int64_t coclass, uint8_t ttrep, uint8_t h)
-{
- cube_t d;
- int64_t esep, eo, edges;
-
- d = transform_edges(c, ttrep);
- esep = coord_esep(d);
- eo = coord_eo(d);
- edges = (esep << 11) + eo;
-
- return (coclass * H48_ESIZE(11) + edges) >> (11 - (int64_t)h);
-}
-
-/*
-This function does not necessarily return a cube whose coordinate is
-the given value, because it works up to symmetry. This means that the
-returned cube is a transformed cube of one that gives the correct value.
-*/
-_static_inline cube_t
-invcoord_h48(int64_t i, const cube_t *crep, uint8_t h)
-{
- cube_t ret;
- int64_t hh, coclass, ee, esep, eo;
-
- DBG_ASSERT(h <= 11, zero,
- "invcoord_h48: h must be between 0 and 11\n");
-
- hh = (int64_t)h;
- coclass = i / H48_ESIZE(h);
- ee = i % H48_ESIZE(h);
- esep = ee >> hh;
- eo = (ee & ((1 << hh) - 1)) << (11 - hh);
-
- ret = invcoord_esep(esep);
- copy_corners(&ret, crep[coclass]);
- set_eo(&ret, eo);
-
- return ret;
-}
-
-/*
-Each element of the cocsep table is a uint32_t used as follows:
- - Lowest 8-bit block: pruning value
- - Second-lowest 8-bit block: "ttrep" (transformation to representative)
- - Top 16-bit block: symcoord value
-After the data as described above, more auxiliary information is appended:
- - A uint32_t representing the number of symmetry classes
- - A uint32_t representing the highest value of the pruning table
- - One uint32_t for each "line" of the pruning table, representing the number
- of positions having that pruning value.
-*/
-_static size_t
-gendata_cocsep(void *buf, uint64_t *selfsim, cube_t *rep)
-{
- uint32_t *buf32, *info, cc;
- uint16_t n;
- uint8_t i, j, visited[COCSEP_VISITEDSIZE];
- dfsarg_cocsep_t arg;
-
- if (buf == NULL)
- goto gendata_cocsep_return_size;
-
- buf32 = (uint32_t *)buf;
- info = buf32 + COCSEP_TABLESIZE;
- memset(buf32, 0xFF, sizeof(uint32_t) * COCSEP_TABLESIZE);
- if (selfsim != NULL)
- memset(selfsim, 0, sizeof(uint64_t) * COCSEP_CLASSES);
-
- arg = (dfsarg_cocsep_t) {
- .cube = solved,
- .n = &n,
- .buf32 = buf32,
- .visited = visited,
- .selfsim = selfsim,
- .rep = rep
- };
- for (i = 0, n = 0, cc = 0; i < 10; i++) {
- LOG("cocsep: generating depth %" PRIu8 "\n", i);
- memset(visited, 0, COCSEP_VISITEDSIZE);
- arg.depth = 0;
- arg.maxdepth = i;
- cc = gendata_cocsep_dfs(&arg);
- info[i+2] = cc;
- LOG("found %" PRIu32 "\n", cc);
- }
-
- info[0] = (uint32_t)n;
- info[1] = 9; /* Known max pruning value */
- DBG_ASSERT(n == COCSEP_CLASSES, 0,
- "cocsep: computed %" PRIu16 " symmetry classes, "
- "expected %zu\n", n, COCSEP_CLASSES);
-
- LOG("cocsep data computed\n");
- LOG("Symmetry classes: %" PRIu32 "\n", info[0]);
- LOG("Maximum pruning value: %" PRIu32 "\n", info[1]);
- LOG("Pruning value distribution:\n");
- for (j = 0; j < 10; j++)
- LOG("%" PRIu8 ":\t%" PRIu32 "\n", j, info[j+2]);
-
-gendata_cocsep_return_size:
- return COCSEP_FULLSIZE;
-}
-
-_static uint32_t
-gendata_cocsep_dfs(dfsarg_cocsep_t *arg)
-{
- uint8_t m;
- uint32_t cc, class, ttrep, depth, olddepth, tinv;
- uint64_t t;
- int64_t i, j;
- cube_t d;
- dfsarg_cocsep_t nextarg;
-
- i = coord_cocsep(arg->cube);
- olddepth = (uint8_t)(arg->buf32[i] & 0xFF);
- if (olddepth < arg->depth || get_visited(arg->visited, i))
- return 0;
- set_visited(arg->visited, i);
-
- if (arg->depth == arg->maxdepth) {
- if ((arg->buf32[i] & 0xFF) != 0xFF)
- return 0;
-
- if (arg->rep != NULL)
- arg->rep[*arg->n] = arg->cube;
- for (t = 0, cc = 0; t < 48; t++) {
- d = transform_corners(arg->cube, t);
- j = coord_cocsep(d);
- if (i == j && arg->selfsim != NULL)
- arg->selfsim[*arg->n] |= UINT64_C(1) << t;
- if (COCLASS(arg->buf32[j]) != UINT32_C(0xFFFF))
- continue;
- set_visited(arg->visited, j);
- tinv = inverse_trans(t);
- olddepth = arg->buf32[j] & 0xFF;
- cc += olddepth == 0xFF;
-
- class = (uint32_t)(*arg->n) << UINT32_C(16);
- ttrep = (uint32_t)tinv << UINT32_C(8);
- depth = (uint32_t)arg->depth;
- arg->buf32[j] = class | ttrep | depth;
- }
- (*arg->n)++;
-
- return cc;
- }
-
- memcpy(&nextarg, arg, sizeof(dfsarg_cocsep_t));
- nextarg.depth++;
- for (m = 0, cc = 0; m < 18; m++) {
- nextarg.cube = move(arg->cube, m);
- cc += gendata_cocsep_dfs(&nextarg);
- }
-
- return cc;
-}
-
-_static uint64_t
-gen_h48short(
- uint8_t n,
- const uint32_t *cocsepdata,
- const cube_t *crep,
- const uint64_t *selfsim,
- h48map_t *map
-) {
- uint8_t i, m;
- int64_t coord;
- uint64_t j, oldn;
- kvpair_t kv;
- cube_t cube, d;
-
- cube = solvedcube();
- coord = coord_h48(cube, cocsepdata, 11);
- h48map_insertmin(map, coord, 0);
- oldn = 0;
- LOG("Short h48: generating depth 0\nfound %" PRIu8 "\n", map->n-oldn);
- for (i = 0; i < n; i++) {
- LOG("Short h48: generating depth %" PRIu8 "\n", i+1);
- j = 0;
- oldn = map->n;
- for (kv = h48map_nextkvpair(map, &j);
- j != map->capacity;
- kv = h48map_nextkvpair(map, &j)
- ) {
- if (kv.val != i)
- continue;
- cube = invcoord_h48(kv.key, crep, 11);
- for (m = 0; m < 18; m++) {
- d = move(cube, m);
- _foreach_h48sim(d, cocsepdata, selfsim, 11,
- coord = coord_h48(d, cocsepdata, 11);
- h48map_insertmin(map, coord, i+1);
- )
- }
- }
- LOG("found %" PRIu8 "\n", map->n-oldn);
- }
-
- return map->n;
-}
-
-/*
-TODO description
-generating fixed table with h=0, k=4
-*/
-_static size_t
-gendata_h48h0k4(void *buf, uint8_t maxdepth)
-{
- uint32_t j, *buf32, *info, *cocsepdata;
- bfsarg_esep_t arg;
- int64_t sc, cc, esep_max;
- uint64_t selfsim[COCSEP_CLASSES];
- cube_t crep[COCSEP_CLASSES];
- size_t cocsepsize, infosize;
-
- /* TODO: move info at start of tables (all tables!) */
- infosize = 4 * maxdepth;
- cocsepsize = gendata_cocsep(buf, selfsim, crep);
- infosize = 88;
-
- if (buf == NULL)
- goto gendata_h48h0k4_return_size;
-
- esep_max = (int64_t)ESEP_MAX(0);
- cocsepdata = (uint32_t *)buf;
- buf32 = cocsepdata + cocsepsize / 4;
- info = buf32 + (ESEP_TABLESIZE(0, 4) / sizeof(uint32_t));
- memset(buf32, 0xFF, ESEP_TABLESIZE(0, 4));
-
- sc = coord_h48(solved, cocsepdata, 0);
- set_esep_pval(buf32, sc, 0);
- info[1] = 1;
- arg = (bfsarg_esep_t) {
- .cocsepdata = cocsepdata,
- .buf32 = buf32,
- .selfsim = selfsim,
- .crep = crep
- };
- for (
- arg.done = 1, arg.depth = 1, cc = 0;
- arg.done < esep_max && arg.depth <= maxdepth;
- arg.depth++
- ) {
- LOG("esep: generating depth %" PRIu8 "\n", arg.depth);
- cc = gendata_h48h0k4_bfs(&arg);
- arg.done += cc;
- info[arg.depth+1] = cc;
- LOG("found %" PRId64 "\n", cc);
- }
-
- info[0] = arg.depth-1;
-
- LOG("h48 pruning table computed\n");
- LOG("Maximum pruning value: %" PRIu32 "\n", info[0]);
- LOG("Pruning value distribution:\n");
- for (j = 0; j <= info[0]; j++)
- LOG("%" PRIu8 ":\t%" PRIu32 "\n", j, info[j+1]);
-
-gendata_h48h0k4_return_size:
- return cocsepsize + ESEP_TABLESIZE(0, 4) + infosize;
-}
-
-_static int64_t
-gendata_h48h0k4_bfs(bfsarg_esep_t *arg)
-{
- const uint8_t breakpoint = 10; /* Hand-picked optimal */
-
- if (arg->depth < breakpoint)
- return gendata_h48h0k4_bfs_fromdone(arg);
- else
- return gendata_h48h0k4_bfs_fromnew(arg);
-}
-
-_static int64_t
-gendata_h48h0k4_bfs_fromdone(bfsarg_esep_t *arg)
-{
- uint8_t c, m, x;
- uint32_t cc;
- int64_t i, j, k;
- cube_t cube, moved;
-
- for (i = 0, cc = 0; i < (int64_t)ESEP_MAX(0); i++) {
- c = get_esep_pval(arg->buf32, i);
- if (c != arg->depth - 1)
- continue;
- cube = invcoord_h48(i, arg->crep, 0);
- for (m = 0; m < 18; m++) {
- moved = move(cube, m);
- j = coord_h48(moved, arg->cocsepdata, 0);
- if (get_esep_pval(arg->buf32, j) <= arg->depth)
- continue;
- _foreach_h48sim(moved, arg->cocsepdata, arg->selfsim, 0,
- k = coord_h48(moved, arg->cocsepdata, 0);
- x = get_esep_pval(arg->buf32, k);
- set_esep_pval(arg->buf32, k, arg->depth);
- cc += x != arg->depth;
- )
- }
- }
-
- return cc;
-}
-
-_static int64_t
-gendata_h48h0k4_bfs_fromnew(bfsarg_esep_t *arg)
-{
- uint8_t c, m, x;
- uint32_t cc;
- int64_t i, j;
- cube_t cube, moved;
-
- for (i = 0, cc = 0; i < (int64_t)ESEP_MAX(0); i++) {
- c = get_esep_pval(arg->buf32, i);
- if (c != 0xF)
- continue;
- cube = invcoord_h48(i, arg->crep, 0);
- for (m = 0; m < 18; m++) {
- moved = move(cube, m);
- j = coord_h48(moved, arg->cocsepdata, 0);
- x = get_esep_pval(arg->buf32, j);
- if (x >= arg->depth)
- continue;
- _foreach_h48sim(cube, arg->cocsepdata, arg->selfsim, 0,
- j = coord_h48(cube, arg->cocsepdata, 0);
- x = get_esep_pval(arg->buf32, j);
- set_esep_pval(arg->buf32, j, arg->depth);
- cc += x == 0xF;
- )
- break; /* Enough to find one, skip the rest */
- }
- }
-
- return cc;
-}
-
-_static_inline bool
-get_visited(const uint8_t *a, int64_t i)
-{
- return a[VISITED_IND(i)] & VISITED_MASK(i);
-}
-
-_static_inline void
-set_visited(uint8_t *a, int64_t i)
-{
- a[VISITED_IND(i)] |= VISITED_MASK(i);
-}
-
-_static_inline uint8_t
-get_esep_pval(const uint32_t *buf32, int64_t i)
-{
- return (buf32[ESEP_IND(i)] & ESEP_MASK(i)) >> ESEP_SHIFT(i);
-}
-
-_static_inline void
-set_esep_pval(uint32_t *buf32, int64_t i, uint8_t val)
-{
- buf32[ESEP_IND(i)] =
- (buf32[ESEP_IND(i)] & (~ESEP_MASK(i))) | (val << ESEP_SHIFT(i));
-}
-
-_static void
-solve_h48_appendsolution(dfsarg_solveh48_t *arg)
-{
- int strl;
-
- strl = writemoves(arg->moves, arg->nmoves, *arg->nextsol);
- LOG("Solution found: %s\n", *arg->nextsol);
- *arg->nextsol += strl;
- **arg->nextsol = '\n';
- (*arg->nextsol)++;
- (*arg->nsols)++;
-}
-
-_static_inline int8_t
-get_h48_cdata(cube_t cube, uint32_t *cocsepdata, uint32_t *cdata)
-{
- int64_t coord;
-
- coord = coord_cocsep(cube);
- *cdata = cocsepdata[coord];
-
- return CBOUND(*cdata);
-}
-
-_static_inline int8_t
-get_h48_bound(cube_t cube, uint32_t cdata, uint8_t h, uint32_t *h48data)
-{
- int64_t coord;
-
- coord = coord_h48_edges(cube, COCLASS(cdata), TTREP(cdata), h);
- return get_esep_pval(h48data, coord);
-}
-
-_static_inline bool
-solve_h48_stop(dfsarg_solveh48_t *arg)
-{
- uint32_t data, data_inv;
- int8_t bound;
-
- bound = get_h48_cdata(arg->cube, arg->cocsepdata, &data);
- if (bound + arg->nmoves > arg->depth)
- return true;
-
- bound = get_h48_cdata(arg->inverse, arg->cocsepdata, &data_inv);
- if (bound + arg->nmoves > arg->depth)
- return true;
-
-/*
- bound = get_h48_bound(arg->cube, data, arg->h, arg->h48data);
-LOG("Using pval %" PRId8 "\n", bound);
- if (bound + arg->nmoves > arg->depth)
- return true;
-
- bound = get_h48_bound(arg->inverse, data_inv, arg->h, arg->h48data);
- if (bound + arg->nmoves > arg->depth)
- return true;
-*/
-
- return false;
-}
-
-_static int64_t
-solve_h48_dfs(dfsarg_solveh48_t *arg)
-{
- dfsarg_solveh48_t nextarg;
- int64_t ret;
- uint8_t m;
-
- if (*arg->nsols == arg->maxsolutions)
- return 0;
-
- if (solve_h48_stop(arg))
- return 0;
-
- if (issolved(arg->cube)) {
- if (arg->nmoves != arg->depth)
- return 0;
- solve_h48_appendsolution(arg);
- return 1;
- }
-
- /* TODO: avoid copy, change arg and undo changes after recursion */
- nextarg = *arg;
- nextarg.nmoves = arg->nmoves + 1;
- ret = 0;
- for (m = 0; m < 18; m++) {
- nextarg.moves[arg->nmoves] = m;
- if (!allowednextmove(nextarg.moves, nextarg.nmoves)) {
- /* If a move is not allowed, neither are its 180
- * and 270 degree variations */
- m += 2;
- continue;
- }
- nextarg.cube = move(arg->cube, m);
- nextarg.inverse = inverse(nextarg.cube); /* TODO: use premove */
- ret += solve_h48_dfs(&nextarg);
- }
-
- return ret;
-}
-
-_static int64_t
-solve_h48(
- cube_t cube,
- int8_t minmoves,
- int8_t maxmoves,
- int8_t maxsolutions,
- uint8_t h,
- const void *data,
- char *solutions
-)
-{
- int64_t nsols;
- dfsarg_solveh48_t arg;
-
- arg = (dfsarg_solveh48_t) {
- .cube = cube,
- .inverse = inverse(cube),
- .nsols = &nsols,
- .maxsolutions = maxsolutions,
- .h = h,
- .cocsepdata = (uint32_t *)data,
- .h48data = ((uint32_t *)data) + COCSEP_FULLSIZE / 4,
- .nextsol = &solutions
- };
-
- nsols = 0;
- for (arg.depth = minmoves;
- arg.depth <= maxmoves && nsols < maxsolutions;
- arg.depth++)
- {
- LOG("Found %" PRId64 " solutions, searching at depth %"
- PRId8 "\n", nsols, arg.depth);
- arg.nmoves = 0;
- solve_h48_dfs(&arg);
- }
-
- return nsols;
-}
-
-/*
-The h48stats solver computes how many moves it takes to solve to
-each of the 12 h48 coordinates, one for each value of h from 0 to 11.
-The solutions array is filled with the length of the solutions. The
-solution array is therefore not a printable string.
-*/
-_static int64_t
-solve_h48stats_dfs(dfsarg_solveh48stats_t *arg)
-{
- const int64_t limit = 11;
-
- int8_t bound, u;
- uint8_t m;
- uint32_t d;
- int64_t coord, h;
- dfsarg_solveh48stats_t nextarg;
-
- /* Check cocsep lower bound (corners only) */
- bound = get_h48_cdata(arg->cube, arg->cocsepdata, &d);
- if (bound + arg->nmoves > arg->depth)
- return 0;
-
- /* Check h48 lower bound for h=0 (esep, but no eo) */
- coord = coord_h48_edges(arg->cube, COCLASS(d), TTREP(d), 0);
- bound = get_esep_pval(arg->h48data, coord);
- if (bound + arg->nmoves > arg->depth)
- return 0;
-
- /* Update all other values, if solved */
- coord = coord_h48_edges(arg->cube, COCLASS(d), TTREP(d), 11);
- for (h = 0; h <= limit; h++) {
- u = coord >> (11-h) == 0 && arg->s[h] == 99;
- arg->s[h] = u * arg->nmoves + (1-u) * arg->s[h];
- }
-
- if (arg->s[limit] != 99)
- return 0;
-
- nextarg = *arg;
- nextarg.nmoves = arg->nmoves + 1;
- for (m = 0; m < 18; m++) {
- nextarg.moves[arg->nmoves] = m;
- if (!allowednextmove(nextarg.moves, nextarg.nmoves)) {
- /* If a move is not allowed, neither are its 180
- * and 270 degree variations */
- m += 2;
- continue;
- }
- nextarg.cube = move(arg->cube, m);
- solve_h48stats_dfs(&nextarg);
- }
-
- return 0;
-}
-
-_static int64_t
-solve_h48stats(
- cube_t cube,
- int8_t maxmoves,
- const void *data,
- char solutions[static 12]
-)
-{
- int i;
- size_t cocsepsize;
- dfsarg_solveh48stats_t arg;
-
- cocsepsize = gendata_cocsep(NULL, NULL, NULL);
-
- arg = (dfsarg_solveh48stats_t) {
- .cube = cube,
- .cocsepdata = (uint32_t *)data,
- .h48data = ((uint32_t *)data) + (cocsepsize/4),
- .s = solutions
- };
-
- for (i = 0; i < 12; i++)
- solutions[i] = (char)99;
-
- for (arg.depth = 0;
- arg.depth <= maxmoves && solutions[11] == 99;
- arg.depth++)
- {
- arg.nmoves = 0;
- solve_h48stats_dfs(&arg);
- }
-
- return 0;
-}
diff --git a/src/solve_generic.h b/src/solvers/generic/generic.h
diff --git a/src/solvers/h48/coordinate.h b/src/solvers/h48/coordinate.h
@@ -0,0 +1,68 @@
+#define H48_ESIZE(h) ((_12c4 * _8c4) << (int64_t)(h))
+
+#define COCLASS_MASK (UINT32_C(0xFFFF) << UINT32_C(16))
+#define COCLASS(x) (((x) & COCLASS_MASK) >> UINT32_C(16))
+#define TTREP_MASK (UINT32_C(0xFF) << UINT32_C(8))
+#define TTREP(x) (((x) & TTREP_MASK) >> UINT32_C(8))
+
+_static_inline int64_t coord_h48(cube_t, const uint32_t *, uint8_t);
+_static_inline int64_t coord_h48_edges(cube_t, int64_t, uint8_t, uint8_t);
+_static_inline cube_t invcoord_h48(int64_t, const cube_t *, uint8_t);
+
+_static_inline int64_t
+coord_h48(cube_t c, const uint32_t *cocsepdata, uint8_t h)
+{
+ int64_t cocsep, coclass;
+ uint32_t data;
+ uint8_t ttrep;
+
+ DBG_ASSERT(h <= 11, -1, "coord_h48: h must be between 0 and 11\n");
+
+ cocsep = coord_cocsep(c);
+ data = cocsepdata[cocsep];
+ coclass = (int64_t)COCLASS(data);
+ ttrep = (int64_t)TTREP(data);
+
+ return coord_h48_edges(c, coclass, ttrep, h);
+}
+
+_static_inline int64_t
+coord_h48_edges(cube_t c, int64_t coclass, uint8_t ttrep, uint8_t h)
+{
+ cube_t d;
+ int64_t esep, eo, edges;
+
+ d = transform_edges(c, ttrep);
+ esep = coord_esep(d);
+ eo = coord_eo(d);
+ edges = (esep << 11) + eo;
+
+ return (coclass * H48_ESIZE(11) + edges) >> (11 - (int64_t)h);
+}
+
+/*
+This function does not necessarily return a cube whose coordinate is
+the given value, because it works up to symmetry. This means that the
+returned cube is a transformed cube of one that gives the correct value.
+*/
+_static_inline cube_t
+invcoord_h48(int64_t i, const cube_t *crep, uint8_t h)
+{
+ cube_t ret;
+ int64_t hh, coclass, ee, esep, eo;
+
+ DBG_ASSERT(h <= 11, zero,
+ "invcoord_h48: h must be between 0 and 11\n");
+
+ hh = (int64_t)h;
+ coclass = i / H48_ESIZE(h);
+ ee = i % H48_ESIZE(h);
+ esep = ee >> hh;
+ eo = (ee & ((1 << hh) - 1)) << (11 - hh);
+
+ ret = invcoord_esep(esep);
+ copy_corners(&ret, crep[coclass]);
+ set_eo(&ret, eo);
+
+ return ret;
+}
diff --git a/src/solvers/h48/gendata.h b/src/solvers/h48/gendata.h
@@ -0,0 +1,425 @@
+#define COCSEP_CLASSES ((size_t)3393)
+#define COCSEP_TABLESIZE ((size_t)_3p7 << (size_t)7)
+#define COCSEP_VISITEDSIZE ((COCSEP_TABLESIZE + (size_t)7) / (size_t)8)
+#define COCSEP_FULLSIZE ((size_t)4 * (COCSEP_TABLESIZE + (size_t)12))
+
+#define ESEP_NOEO (COCSEP_CLASSES * (size_t)_12c4 * (size_t)_8c4)
+#define ESEP_MAX(h) (ESEP_NOEO << (size_t)(h))
+#define ESEP_TABLESIZE(h, k) (ESEP_MAX((h)) / ((size_t)8 / (size_t)(k)))
+
+#define ESEP_IND(i) ((uint32_t)(i) / UINT32_C(8))
+#define ESEP_SHIFT(i) (UINT32_C(4) * ((uint32_t)(i) % UINT32_C(8)))
+#define ESEP_MASK(i) ((_bit_u32(4) - (uint32_t)(1)) << ESEP_SHIFT(i))
+#define VISITED_IND(i) ((uint32_t)(i) / UINT32_C(8))
+#define VISITED_MASK(i) (UINT32_C(1) << ((uint32_t)(i) % UINT32_C(8)))
+
+#define CBOUND_MASK UINT32_C(0xFF)
+#define CBOUND(x) ((x) & CBOUND_MASK)
+
+#define MAXLEN 20
+
+/*
+TODO: This loop over similar h48 coordinates can be improved by only
+transforming edges, but we need to compose transformations (i.e. conjugate
+_t by _ttrep).
+*/
+#define _foreach_h48sim(_cube, _cocsepdata, _selfsim, _h, _action) \
+ int64_t _cocsep = coord_cocsep(_cube); \
+ uint8_t _ttrep = TTREP(_cocsepdata[_cocsep]); \
+ uint8_t _inverse_ttrep = inverse_trans(_ttrep); \
+ int64_t _coclass = COCLASS(_cocsepdata[_cocsep]); \
+ cube_t _rep = transform(_cube, _ttrep); \
+ uint64_t _sim = _selfsim[_coclass]; \
+ for (uint8_t _t = 0; _t < 48 && _sim; _t++, _sim >>= 1) { \
+ if (!(_sim & 1)) continue; \
+ _cube = transform(_rep, _t); \
+ _cube = transform(_cube, _inverse_ttrep); \
+ _action \
+ }
+
+typedef struct {
+ cube_t cube;
+ uint8_t depth;
+ uint8_t maxdepth;
+ uint16_t *n;
+ uint32_t *buf32;
+ uint8_t *visited;
+ uint64_t *selfsim;
+ cube_t *rep;
+} dfsarg_cocsep_t;
+
+/* TODO keep or not? */
+typedef struct {
+ cube_t cube;
+ int8_t nmoves;
+ int8_t depth;
+ uint8_t moves[MAXLEN];
+ uint32_t *cocsepdata;
+ h48map_t *visited;
+} dfsarg_genh48set_t;
+
+typedef struct {
+ uint8_t depth;
+ uint32_t *cocsepdata;
+ uint32_t *buf32;
+ uint64_t *selfsim;
+ int64_t done;
+ cube_t *crep;
+} bfsarg_esep_t;
+
+_static_inline bool get_visited(const uint8_t *, int64_t);
+_static_inline void set_visited(uint8_t *, int64_t);
+_static_inline uint8_t get_esep_pval(const uint32_t *, int64_t);
+_static_inline void set_esep_pval(uint32_t *, int64_t, uint8_t);
+
+_static size_t gendata_cocsep(void *, uint64_t *, cube_t *);
+_static uint32_t gendata_cocsep_dfs(dfsarg_cocsep_t *);
+_static uint64_t gen_h48short(
+ uint8_t, const uint32_t *, const cube_t *, const uint64_t *, h48map_t *);
+_static size_t gendata_h48h0k4(void *, uint8_t);
+_static int64_t gendata_h48h0k4_bfs(bfsarg_esep_t *);
+_static int64_t gendata_h48h0k4_bfs_fromdone(bfsarg_esep_t *);
+_static int64_t gendata_h48h0k4_bfs_fromnew(bfsarg_esep_t *);
+
+_static_inline int8_t get_h48_cdata(cube_t, uint32_t *, uint32_t *);
+_static_inline int8_t get_h48_bound(cube_t, uint32_t, uint8_t, uint32_t *);
+
+/*
+Each element of the cocsep table is a uint32_t used as follows:
+ - Lowest 8-bit block: pruning value
+ - Second-lowest 8-bit block: "ttrep" (transformation to representative)
+ - Top 16-bit block: symcoord value
+After the data as described above, more auxiliary information is appended:
+ - A uint32_t representing the number of symmetry classes
+ - A uint32_t representing the highest value of the pruning table
+ - One uint32_t for each "line" of the pruning table, representing the number
+ of positions having that pruning value.
+*/
+_static size_t
+gendata_cocsep(void *buf, uint64_t *selfsim, cube_t *rep)
+{
+ uint32_t *buf32, *info, cc;
+ uint16_t n;
+ uint8_t i, j, visited[COCSEP_VISITEDSIZE];
+ dfsarg_cocsep_t arg;
+
+ if (buf == NULL)
+ goto gendata_cocsep_return_size;
+
+ buf32 = (uint32_t *)buf;
+ info = buf32 + COCSEP_TABLESIZE;
+ memset(buf32, 0xFF, sizeof(uint32_t) * COCSEP_TABLESIZE);
+ if (selfsim != NULL)
+ memset(selfsim, 0, sizeof(uint64_t) * COCSEP_CLASSES);
+
+ arg = (dfsarg_cocsep_t) {
+ .cube = solved,
+ .n = &n,
+ .buf32 = buf32,
+ .visited = visited,
+ .selfsim = selfsim,
+ .rep = rep
+ };
+ for (i = 0, n = 0, cc = 0; i < 10; i++) {
+ LOG("cocsep: generating depth %" PRIu8 "\n", i);
+ memset(visited, 0, COCSEP_VISITEDSIZE);
+ arg.depth = 0;
+ arg.maxdepth = i;
+ cc = gendata_cocsep_dfs(&arg);
+ info[i+2] = cc;
+ LOG("found %" PRIu32 "\n", cc);
+ }
+
+ info[0] = (uint32_t)n;
+ info[1] = 9; /* Known max pruning value */
+ DBG_ASSERT(n == COCSEP_CLASSES, 0,
+ "cocsep: computed %" PRIu16 " symmetry classes, "
+ "expected %zu\n", n, COCSEP_CLASSES);
+
+ LOG("cocsep data computed\n");
+ LOG("Symmetry classes: %" PRIu32 "\n", info[0]);
+ LOG("Maximum pruning value: %" PRIu32 "\n", info[1]);
+ LOG("Pruning value distribution:\n");
+ for (j = 0; j < 10; j++)
+ LOG("%" PRIu8 ":\t%" PRIu32 "\n", j, info[j+2]);
+
+gendata_cocsep_return_size:
+ return COCSEP_FULLSIZE;
+}
+
+_static uint32_t
+gendata_cocsep_dfs(dfsarg_cocsep_t *arg)
+{
+ uint8_t m;
+ uint32_t cc, class, ttrep, depth, olddepth, tinv;
+ uint64_t t;
+ int64_t i, j;
+ cube_t d;
+ dfsarg_cocsep_t nextarg;
+
+ i = coord_cocsep(arg->cube);
+ olddepth = (uint8_t)(arg->buf32[i] & 0xFF);
+ if (olddepth < arg->depth || get_visited(arg->visited, i))
+ return 0;
+ set_visited(arg->visited, i);
+
+ if (arg->depth == arg->maxdepth) {
+ if ((arg->buf32[i] & 0xFF) != 0xFF)
+ return 0;
+
+ if (arg->rep != NULL)
+ arg->rep[*arg->n] = arg->cube;
+ for (t = 0, cc = 0; t < 48; t++) {
+ d = transform_corners(arg->cube, t);
+ j = coord_cocsep(d);
+ if (i == j && arg->selfsim != NULL)
+ arg->selfsim[*arg->n] |= UINT64_C(1) << t;
+ if (COCLASS(arg->buf32[j]) != UINT32_C(0xFFFF))
+ continue;
+ set_visited(arg->visited, j);
+ tinv = inverse_trans(t);
+ olddepth = arg->buf32[j] & 0xFF;
+ cc += olddepth == 0xFF;
+
+ class = (uint32_t)(*arg->n) << UINT32_C(16);
+ ttrep = (uint32_t)tinv << UINT32_C(8);
+ depth = (uint32_t)arg->depth;
+ arg->buf32[j] = class | ttrep | depth;
+ }
+ (*arg->n)++;
+
+ return cc;
+ }
+
+ memcpy(&nextarg, arg, sizeof(dfsarg_cocsep_t));
+ nextarg.depth++;
+ for (m = 0, cc = 0; m < 18; m++) {
+ nextarg.cube = move(arg->cube, m);
+ cc += gendata_cocsep_dfs(&nextarg);
+ }
+
+ return cc;
+}
+
+_static uint64_t
+gen_h48short(
+ uint8_t n,
+ const uint32_t *cocsepdata,
+ const cube_t *crep,
+ const uint64_t *selfsim,
+ h48map_t *map
+) {
+ uint8_t i, m;
+ int64_t coord;
+ uint64_t j, oldn;
+ kvpair_t kv;
+ cube_t cube, d;
+
+ cube = solvedcube();
+ coord = coord_h48(cube, cocsepdata, 11);
+ h48map_insertmin(map, coord, 0);
+ oldn = 0;
+	LOG("Short h48: generating depth 0\nfound %" PRIu64 "\n", map->n-oldn);
+ for (i = 0; i < n; i++) {
+ LOG("Short h48: generating depth %" PRIu8 "\n", i+1);
+ j = 0;
+ oldn = map->n;
+ for (kv = h48map_nextkvpair(map, &j);
+ j != map->capacity;
+ kv = h48map_nextkvpair(map, &j)
+ ) {
+ if (kv.val != i)
+ continue;
+ cube = invcoord_h48(kv.key, crep, 11);
+ for (m = 0; m < 18; m++) {
+ d = move(cube, m);
+ _foreach_h48sim(d, cocsepdata, selfsim, 11,
+ coord = coord_h48(d, cocsepdata, 11);
+ h48map_insertmin(map, coord, i+1);
+ )
+ }
+ }
+		LOG("found %" PRIu64 "\n", map->n-oldn);
+ }
+
+ return map->n;
+}
+
+/*
+TODO description
+generating fixed table with h=0, k=4
+*/
+_static size_t
+gendata_h48h0k4(void *buf, uint8_t maxdepth)
+{
+ uint32_t j, *buf32, *info, *cocsepdata;
+ bfsarg_esep_t arg;
+ int64_t sc, cc, esep_max;
+ uint64_t selfsim[COCSEP_CLASSES];
+ cube_t crep[COCSEP_CLASSES];
+ size_t cocsepsize, infosize;
+
+ /* TODO: move info at start of tables (all tables!) */
+	cocsepsize = gendata_cocsep(buf, selfsim, crep);
+	/* NOTE(review): dead store "infosize = 4 * maxdepth" removed */
+	infosize = 88;
+
+ if (buf == NULL)
+ goto gendata_h48h0k4_return_size;
+
+ esep_max = (int64_t)ESEP_MAX(0);
+ cocsepdata = (uint32_t *)buf;
+ buf32 = cocsepdata + cocsepsize / 4;
+ info = buf32 + (ESEP_TABLESIZE(0, 4) / sizeof(uint32_t));
+ memset(buf32, 0xFF, ESEP_TABLESIZE(0, 4));
+
+ sc = coord_h48(solved, cocsepdata, 0);
+ set_esep_pval(buf32, sc, 0);
+ info[1] = 1;
+ arg = (bfsarg_esep_t) {
+ .cocsepdata = cocsepdata,
+ .buf32 = buf32,
+ .selfsim = selfsim,
+ .crep = crep
+ };
+ for (
+ arg.done = 1, arg.depth = 1, cc = 0;
+ arg.done < esep_max && arg.depth <= maxdepth;
+ arg.depth++
+ ) {
+ LOG("esep: generating depth %" PRIu8 "\n", arg.depth);
+ cc = gendata_h48h0k4_bfs(&arg);
+ arg.done += cc;
+ info[arg.depth+1] = cc;
+ LOG("found %" PRId64 "\n", cc);
+ }
+
+ info[0] = arg.depth-1;
+
+ LOG("h48 pruning table computed\n");
+ LOG("Maximum pruning value: %" PRIu32 "\n", info[0]);
+ LOG("Pruning value distribution:\n");
+ for (j = 0; j <= info[0]; j++)
+		LOG("%" PRIu32 ":\t%" PRIu32 "\n", j, info[j+1]);
+
+gendata_h48h0k4_return_size:
+ return cocsepsize + ESEP_TABLESIZE(0, 4) + infosize;
+}
+
+_static int64_t
+gendata_h48h0k4_bfs(bfsarg_esep_t *arg)
+{
+ const uint8_t breakpoint = 10; /* Hand-picked optimal */
+
+ if (arg->depth < breakpoint)
+ return gendata_h48h0k4_bfs_fromdone(arg);
+ else
+ return gendata_h48h0k4_bfs_fromnew(arg);
+}
+
+_static int64_t
+gendata_h48h0k4_bfs_fromdone(bfsarg_esep_t *arg)
+{
+ uint8_t c, m, x;
+ uint32_t cc;
+ int64_t i, j, k;
+ cube_t cube, moved;
+
+ for (i = 0, cc = 0; i < (int64_t)ESEP_MAX(0); i++) {
+ c = get_esep_pval(arg->buf32, i);
+ if (c != arg->depth - 1)
+ continue;
+ cube = invcoord_h48(i, arg->crep, 0);
+ for (m = 0; m < 18; m++) {
+ moved = move(cube, m);
+ j = coord_h48(moved, arg->cocsepdata, 0);
+ if (get_esep_pval(arg->buf32, j) <= arg->depth)
+ continue;
+ _foreach_h48sim(moved, arg->cocsepdata, arg->selfsim, 0,
+ k = coord_h48(moved, arg->cocsepdata, 0);
+ x = get_esep_pval(arg->buf32, k);
+ set_esep_pval(arg->buf32, k, arg->depth);
+ cc += x != arg->depth;
+ )
+ }
+ }
+
+ return cc;
+}
+
+_static int64_t
+gendata_h48h0k4_bfs_fromnew(bfsarg_esep_t *arg)
+{
+ uint8_t c, m, x;
+ uint32_t cc;
+ int64_t i, j;
+ cube_t cube, moved;
+
+ for (i = 0, cc = 0; i < (int64_t)ESEP_MAX(0); i++) {
+ c = get_esep_pval(arg->buf32, i);
+ if (c != 0xF)
+ continue;
+ cube = invcoord_h48(i, arg->crep, 0);
+ for (m = 0; m < 18; m++) {
+ moved = move(cube, m);
+ j = coord_h48(moved, arg->cocsepdata, 0);
+ x = get_esep_pval(arg->buf32, j);
+ if (x >= arg->depth)
+ continue;
+ _foreach_h48sim(cube, arg->cocsepdata, arg->selfsim, 0,
+ j = coord_h48(cube, arg->cocsepdata, 0);
+ x = get_esep_pval(arg->buf32, j);
+ set_esep_pval(arg->buf32, j, arg->depth);
+ cc += x == 0xF;
+ )
+ break; /* Enough to find one, skip the rest */
+ }
+ }
+
+ return cc;
+}
+
+_static_inline bool
+get_visited(const uint8_t *a, int64_t i)
+{
+ return a[VISITED_IND(i)] & VISITED_MASK(i);
+}
+
+_static_inline void
+set_visited(uint8_t *a, int64_t i)
+{
+ a[VISITED_IND(i)] |= VISITED_MASK(i);
+}
+
+_static_inline uint8_t
+get_esep_pval(const uint32_t *buf32, int64_t i)
+{
+ return (buf32[ESEP_IND(i)] & ESEP_MASK(i)) >> ESEP_SHIFT(i);
+}
+
+_static_inline void
+set_esep_pval(uint32_t *buf32, int64_t i, uint8_t val)
+{
+ buf32[ESEP_IND(i)] =
+	    (buf32[ESEP_IND(i)] & (~ESEP_MASK(i))) | ((uint32_t)val << ESEP_SHIFT(i));
+}
+
+_static_inline int8_t
+get_h48_cdata(cube_t cube, uint32_t *cocsepdata, uint32_t *cdata)
+{
+ int64_t coord;
+
+ coord = coord_cocsep(cube);
+ *cdata = cocsepdata[coord];
+
+ return CBOUND(*cdata);
+}
+
+_static_inline int8_t
+get_h48_bound(cube_t cube, uint32_t cdata, uint8_t h, uint32_t *h48data)
+{
+ int64_t coord;
+
+ coord = coord_h48_edges(cube, COCLASS(cdata), TTREP(cdata), h);
+ return get_esep_pval(h48data, coord);
+}
diff --git a/src/solvers/h48/h48.h b/src/solvers/h48/h48.h
@@ -0,0 +1,4 @@
+#include "coordinate.h"
+#include "map.h"
+#include "gendata.h"
+#include "solve.h"
diff --git a/src/solvers/h48/map.h b/src/solvers/h48/map.h
@@ -0,0 +1,104 @@
+#define MAP_UNSET UINT64_C(0xFFFFFFFFFFFFFFFF)
+#define MAP_KEYMASK UINT64_C(0xFFFFFFFFFF)
+#define MAP_KEYSHIFT UINT64_C(40)
+
+typedef struct {
+ uint64_t n;
+ uint64_t capacity;
+ uint64_t randomizer;
+ uint64_t *table;
+} h48map_t;
+
+typedef struct {
+ uint64_t key;
+ uint64_t val;
+} kvpair_t;
+
+_static void h48map_create(h48map_t *, uint64_t, uint64_t);
+_static void h48map_clear(h48map_t *);
+_static void h48map_destroy(h48map_t *);
+_static uint64_t h48map_lookup(h48map_t *, uint64_t);
+_static void h48map_insertmin(h48map_t *, uint64_t, uint64_t);
+_static uint64_t h48map_value(h48map_t *, uint64_t);
+_static kvpair_t h48map_nextkvpair(h48map_t *, uint64_t *);
+
+_static void
+h48map_create(h48map_t *map, uint64_t capacity, uint64_t randomizer)
+{
+ map->capacity = capacity;
+ map->randomizer = randomizer;
+
+	map->table = malloc(map->capacity * sizeof(uint64_t));
+ h48map_clear(map);
+}
+
+_static void
+h48map_clear(h48map_t *map)
+{
+ memset(map->table, 0xFF, map->capacity * sizeof(uint64_t));
+ map->n = 0;
+}
+
+_static void
+h48map_destroy(h48map_t *map)
+{
+ free(map->table);
+}
+
+_static_inline uint64_t
+h48map_lookup(h48map_t *map, uint64_t x)
+{
+ uint64_t hash, i;
+
+ hash = ((x % map->capacity) * map->randomizer) % map->capacity;
+ for (i = hash;
+ map->table[i] != MAP_UNSET && (map->table[i] & MAP_KEYMASK) != x;
+ i = (i+1) % map->capacity
+ ) ;
+
+ return i;
+}
+
+_static_inline void
+h48map_insertmin(h48map_t *map, uint64_t key, uint64_t val)
+{
+ uint64_t i, oldval, min;
+
+ i = h48map_lookup(map, key);
+ oldval = map->table[i] >> MAP_KEYSHIFT;
+ min = _min(val, oldval);
+
+ map->n += map->table[i] == MAP_UNSET;
+ map->table[i] = (key & MAP_KEYMASK) | (min << MAP_KEYSHIFT);
+}
+
+_static_inline uint64_t
+h48map_value(h48map_t *map, uint64_t key)
+{
+ return map->table[h48map_lookup(map, key)] >> MAP_KEYSHIFT;
+}
+
+_static kvpair_t
+h48map_nextkvpair(h48map_t *map, uint64_t *p)
+{
+ kvpair_t kv;
+ uint64_t pair;
+
+ kv.key = MAP_UNSET;
+ kv.val = MAP_UNSET;
+
+ DBG_ASSERT(*p < map->capacity, kv,
+ "Error looping over map: given index %" PRIu64 " is out of "
+	    "range [0,%" PRIu64 ")\n", *p, map->capacity);
+
+ for ( ; *p < map->capacity; (*p)++) {
+ if (map->table[*p] != MAP_UNSET) {
+ pair = map->table[(*p)++];
+ kv.key = pair & MAP_KEYMASK;
+ kv.val = pair >> MAP_KEYSHIFT;
+ return kv;
+ }
+ }
+
+ return kv;
+}
diff --git a/src/solvers/h48/solve.h b/src/solvers/h48/solve.h
@@ -0,0 +1,242 @@
+typedef struct {
+ cube_t cube;
+ cube_t inverse;
+ int8_t nmoves;
+ int8_t depth;
+ uint8_t moves[MAXLEN];
+ int64_t *nsols;
+ int64_t maxsolutions;
+ uint8_t h;
+ uint32_t *cocsepdata;
+ uint32_t *h48data;
+ char **nextsol;
+} dfsarg_solveh48_t;
+
+typedef struct {
+ cube_t cube;
+ int8_t nmoves;
+ int8_t depth;
+ uint8_t moves[MAXLEN];
+ uint32_t *cocsepdata;
+ uint32_t *h48data;
+ char *s;
+} dfsarg_solveh48stats_t;
+
+_static void solve_h48_appendsolution(dfsarg_solveh48_t *);
+_static_inline bool solve_h48_stop(dfsarg_solveh48_t *);
+_static int64_t solve_h48_dfs(dfsarg_solveh48_t *);
+_static int64_t solve_h48(
+ cube_t, int8_t, int8_t, int8_t, uint8_t, const void *, char *);
+
+_static int64_t solve_h48stats_dfs(dfsarg_solveh48stats_t *);
+_static int64_t solve_h48stats(cube_t, int8_t, const void *, char [static 12]);
+
+_static void
+solve_h48_appendsolution(dfsarg_solveh48_t *arg)
+{
+ int strl;
+
+ strl = writemoves(arg->moves, arg->nmoves, *arg->nextsol);
+ LOG("Solution found: %s\n", *arg->nextsol);
+ *arg->nextsol += strl;
+ **arg->nextsol = '\n';
+ (*arg->nextsol)++;
+ (*arg->nsols)++;
+}
+
+_static_inline bool
+solve_h48_stop(dfsarg_solveh48_t *arg)
+{
+ uint32_t data, data_inv;
+ int8_t bound;
+
+ bound = get_h48_cdata(arg->cube, arg->cocsepdata, &data);
+ if (bound + arg->nmoves > arg->depth)
+ return true;
+
+ bound = get_h48_cdata(arg->inverse, arg->cocsepdata, &data_inv);
+ if (bound + arg->nmoves > arg->depth)
+ return true;
+
+/*
+ bound = get_h48_bound(arg->cube, data, arg->h, arg->h48data);
+LOG("Using pval %" PRId8 "\n", bound);
+ if (bound + arg->nmoves > arg->depth)
+ return true;
+
+ bound = get_h48_bound(arg->inverse, data_inv, arg->h, arg->h48data);
+ if (bound + arg->nmoves > arg->depth)
+ return true;
+*/
+
+ return false;
+}
+
+_static int64_t
+solve_h48_dfs(dfsarg_solveh48_t *arg)
+{
+ dfsarg_solveh48_t nextarg;
+ int64_t ret;
+ uint8_t m;
+
+ if (*arg->nsols == arg->maxsolutions)
+ return 0;
+
+ if (solve_h48_stop(arg))
+ return 0;
+
+ if (issolved(arg->cube)) {
+ if (arg->nmoves != arg->depth)
+ return 0;
+ solve_h48_appendsolution(arg);
+ return 1;
+ }
+
+ /* TODO: avoid copy, change arg and undo changes after recursion */
+ nextarg = *arg;
+ nextarg.nmoves = arg->nmoves + 1;
+ ret = 0;
+ for (m = 0; m < 18; m++) {
+ nextarg.moves[arg->nmoves] = m;
+ if (!allowednextmove(nextarg.moves, nextarg.nmoves)) {
+ /* If a move is not allowed, neither are its 180
+ * and 270 degree variations */
+ m += 2;
+ continue;
+ }
+ nextarg.cube = move(arg->cube, m);
+ nextarg.inverse = inverse(nextarg.cube); /* TODO: use premove */
+ ret += solve_h48_dfs(&nextarg);
+ }
+
+ return ret;
+}
+
+_static int64_t
+solve_h48(
+ cube_t cube,
+ int8_t minmoves,
+ int8_t maxmoves,
+ int8_t maxsolutions,
+ uint8_t h,
+ const void *data,
+ char *solutions
+)
+{
+ int64_t nsols;
+ dfsarg_solveh48_t arg;
+
+ arg = (dfsarg_solveh48_t) {
+ .cube = cube,
+ .inverse = inverse(cube),
+ .nsols = &nsols,
+ .maxsolutions = maxsolutions,
+ .h = h,
+ .cocsepdata = (uint32_t *)data,
+ .h48data = ((uint32_t *)data) + COCSEP_FULLSIZE / 4,
+ .nextsol = &solutions
+ };
+
+ nsols = 0;
+ for (arg.depth = minmoves;
+ arg.depth <= maxmoves && nsols < maxsolutions;
+ arg.depth++)
+ {
+ LOG("Found %" PRId64 " solutions, searching at depth %"
+ PRId8 "\n", nsols, arg.depth);
+ arg.nmoves = 0;
+ solve_h48_dfs(&arg);
+ }
+
+ return nsols;
+}
+
+/*
+The h48stats solver computes how many moves it takes to solve to
+each of the 12 h48 coordinates, one for each value of h from 0 to 11.
+The solutions array is filled with the length of the solutions. The
+solution array is therefore not a printable string.
+*/
+_static int64_t
+solve_h48stats_dfs(dfsarg_solveh48stats_t *arg)
+{
+ const int64_t limit = 11;
+
+ int8_t bound, u;
+ uint8_t m;
+ uint32_t d;
+ int64_t coord, h;
+ dfsarg_solveh48stats_t nextarg;
+
+ /* Check cocsep lower bound (corners only) */
+ bound = get_h48_cdata(arg->cube, arg->cocsepdata, &d);
+ if (bound + arg->nmoves > arg->depth)
+ return 0;
+
+ /* Check h48 lower bound for h=0 (esep, but no eo) */
+ coord = coord_h48_edges(arg->cube, COCLASS(d), TTREP(d), 0);
+ bound = get_esep_pval(arg->h48data, coord);
+ if (bound + arg->nmoves > arg->depth)
+ return 0;
+
+ /* Update all other values, if solved */
+ coord = coord_h48_edges(arg->cube, COCLASS(d), TTREP(d), 11);
+ for (h = 0; h <= limit; h++) {
+ u = coord >> (11-h) == 0 && arg->s[h] == 99;
+ arg->s[h] = u * arg->nmoves + (1-u) * arg->s[h];
+ }
+
+ if (arg->s[limit] != 99)
+ return 0;
+
+ nextarg = *arg;
+ nextarg.nmoves = arg->nmoves + 1;
+ for (m = 0; m < 18; m++) {
+ nextarg.moves[arg->nmoves] = m;
+ if (!allowednextmove(nextarg.moves, nextarg.nmoves)) {
+ /* If a move is not allowed, neither are its 180
+ * and 270 degree variations */
+ m += 2;
+ continue;
+ }
+ nextarg.cube = move(arg->cube, m);
+ solve_h48stats_dfs(&nextarg);
+ }
+
+ return 0;
+}
+
+_static int64_t
+solve_h48stats(
+ cube_t cube,
+ int8_t maxmoves,
+ const void *data,
+ char solutions[static 12]
+)
+{
+ int i;
+ size_t cocsepsize;
+ dfsarg_solveh48stats_t arg;
+
+ cocsepsize = gendata_cocsep(NULL, NULL, NULL);
+
+ arg = (dfsarg_solveh48stats_t) {
+ .cube = cube,
+ .cocsepdata = (uint32_t *)data,
+ .h48data = ((uint32_t *)data) + (cocsepsize/4),
+ .s = solutions
+ };
+
+ for (i = 0; i < 12; i++)
+ solutions[i] = (char)99;
+
+ for (arg.depth = 0;
+ arg.depth <= maxmoves && solutions[11] == 99;
+ arg.depth++)
+ {
+ arg.nmoves = 0;
+ solve_h48stats_dfs(&arg);
+ }
+
+ return 0;
+}
diff --git a/src/solvers/solvers.h b/src/solvers/solvers.h
@@ -0,0 +1,2 @@
+#include "generic/generic.h"
+#include "h48/h48.h"
diff --git a/src/constants.h b/src/utils/constants.h
diff --git a/src/utils/dbg_log.h b/src/utils/dbg_log.h
@@ -0,0 +1,16 @@
+void (*nissy_log)(const char *, ...);
+
+#define LOG(...) if (nissy_log != NULL) nissy_log(__VA_ARGS__);
+
+#ifdef DEBUG
+#define _static
+#define _static_inline
+#define DBG_WARN(condition, ...) if (!(condition)) LOG(__VA_ARGS__);
+#define DBG_ASSERT(condition, retval, ...) \
+ if (!(condition)) { LOG(__VA_ARGS__); return retval; }
+#else
+#define _static static
+#define _static_inline static inline
+#define DBG_WARN(condition, ...)
+#define DBG_ASSERT(condition, retval, ...)
+#endif
diff --git a/src/utils.h b/src/utils/math.h
diff --git a/src/utils/utils.h b/src/utils/utils.h
@@ -0,0 +1,3 @@
+#include "dbg_log.h"
+#include "constants.h"
+#include "math.h"
diff --git a/test/test.h b/test/test.h
@@ -1,3 +1,5 @@
+#define TEST_H
+
#include <inttypes.h>
#include <stdarg.h>
#include <stdbool.h>
@@ -5,24 +7,9 @@
#include <stdlib.h>
#include <string.h>
-#define STRLENMAX 10000
+#include "../src/arch/arch.h"
-#if defined(CUBE_AVX2)
-#include <immintrin.h>
-typedef __m256i cube_t;
-#elif defined(CUBE_NEON)
-#include <stdlib.h>
-#include <arm_neon.h>
-typedef struct {
- uint8x16_t corner;
- uint8x16_t edge;
-} cube_t;
-#else
-typedef struct {
- uint8_t corner[8];
- uint8_t edge[12];
-} cube_t;
-#endif
+#define STRLENMAX 10000
/* Basic functions used in most tests */
cube_t solvedcube(void);
diff --git a/tools/001_gendata_h48/gendata_h48.c b/tools/001_gendata_h48/gendata_h48.c
@@ -1,5 +1,5 @@
#include "../timerun.h"
-#include "../../src/cube.h"
+#include "../../src/nissy.h"
#define MAXDEPTH 20
#define HVALUE 0
diff --git a/tools/002_stats_tables_h48/stats_tables_h48.c b/tools/002_stats_tables_h48/stats_tables_h48.c
@@ -1,7 +1,7 @@
#include <pthread.h>
#include <time.h>
#include "../timerun.h"
-#include "../../src/cube.h"
+#include "../../src/nissy.h"
#define MAXMOVES 20
#define NTHREADS 32
diff --git a/utils/genmovecode.sh b/utils/genmovecode.sh
@@ -1,6 +1,6 @@
#!/bin/sh
-cc -DDEBUG h48_to_lst.c ../src/cube.c -o h48_to_lst
+cc -DDEBUG h48_to_lst.c ../src/nissy.c -o h48_to_lst
gen() {
for f in cubes/move_??_*.txt; do
diff --git a/utils/gentranscode.sh b/utils/gentranscode.sh
@@ -1,7 +1,7 @@
#!/bin/sh
-cc -DDEBUG h48_to_lst.c ../src/cube.c -o h48_to_lst
-cc -DDEBUG invert.c ../src/cube.c -o invert
+cc -DDEBUG h48_to_lst.c ../src/nissy.c -o h48_to_lst
+cc -DDEBUG invert.c ../src/nissy.c -o invert
lineavx() { printf '#define _trans_cube_%s ' "$1"; }
linesrc() { printf '_static cube_fast_t _trans_cube_%s = ' "$1"; }
diff --git a/utils/h48_to_lst.c b/utils/h48_to_lst.c
@@ -2,7 +2,7 @@
#include <inttypes.h>
#include <stdbool.h>
-#include "../src/cube.h"
+#include "../src/nissy.h"
#define STRLENMAX 1000
diff --git a/utils/invert.c b/utils/invert.c
@@ -2,7 +2,7 @@
#include <inttypes.h>
#include <stdbool.h>
-#include "../src/cube.h"
+#include "../src/nissy.h"
#define STRLENMAX 1000