h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48
Download | Log | Files | Refs | README | LICENSE

commit 0eba10bf55283acdb687a172404a89e5a72a00c2
parent 1fcba7fcea462d67379c8d289bd7e0ad52170e02
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Sat,  7 Sep 2024 10:55:30 +0200

Improved gendata_h48k2 performance

Performance for the "real coordinate" case (h0) is ok, but still around 3x
slower than the gendata_h48k4 method. Performance for the intermediate
tables is not good, but for now I'll keep it like this.

This also shows that for h11 we should use the gendata_h48k4 method for h11.
For intermediate table, h8, h9 and h10 might be too slow to compute in an
acceptable time. We could restrict to h7 (largest with base = 9).

Diffstat:
Msrc/arch/arch.h | 4++--
Msrc/solvers/h48/gendata_h48.h | 122++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
Mtools/run_tool.sh | 3++-
3 files changed, 91 insertions(+), 38 deletions(-)

diff --git a/src/arch/arch.h b/src/arch/arch.h @@ -1,4 +1,4 @@ -#if defined(CUBE_AVX2) +#if defined(AVX2) #include <immintrin.h> @@ -9,7 +9,7 @@ typedef __m256i cube_t; #include "avx2.h" #endif -#elif defined(CUBE_NEON) +#elif defined(NEON) #include <stdlib.h> #include <arm_neon.h> diff --git a/src/solvers/h48/gendata_h48.h b/src/solvers/h48/gendata_h48.h @@ -60,13 +60,10 @@ typedef struct { typedef struct { cube_t cube; - uint8_t moves[MAXLEN]; uint8_t h; uint8_t k; uint8_t base; - uint8_t depth; uint8_t shortdepth; - uint8_t maxdepth; uint32_t *cocsepdata; uint32_t *h48data; uint64_t *selfsim; @@ -84,6 +81,8 @@ STATIC int64_t gendata_h48h0k4_bfs(h48h0k4_bfs_arg_t *); STATIC int64_t gendata_h48h0k4_bfs_fromdone(h48h0k4_bfs_arg_t *); STATIC int64_t gendata_h48h0k4_bfs_fromnew(h48h0k4_bfs_arg_t *); STATIC size_t gendata_h48k2(gendata_h48_arg_t *); +STATIC_INLINE void gendata_h48k2_mark(cube_t, int8_t, h48k2_dfs_arg_t *); +STATIC_INLINE bool gendata_h48k2_dfs_stop(cube_t, uint8_t, h48k2_dfs_arg_t *); STATIC void gendata_h48k2_dfs(h48k2_dfs_arg_t *arg); STATIC_INLINE int8_t get_h48_bound(cube_t, uint32_t, uint8_t, uint8_t, uint32_t *); @@ -328,9 +327,7 @@ gendata_h48k2(gendata_h48_arg_t *arg) .h = arg->h, .k = arg->k, .base = base[arg->h], - .depth = shortdepth, .shortdepth = shortdepth, - .maxdepth = MIN(arg->maxdepth, base[arg->h]+2), .cocsepdata = arg->cocsepdata, .h48data = arg->h48data, .selfsim = arg->selfsim, @@ -345,7 +342,7 @@ gendata_h48k2(gendata_h48_arg_t *arg) ) { dfsarg.cube = invcoord_h48(kv.key, arg->crep, 11); gendata_h48k2_dfs(&dfsarg); - if (++i % UINT64_C(1000000) == 0) + if (++ii % UINT64_C(1000000) == 0) LOG("Processed %" PRIu64 " short cubes\n", ii); } @@ -365,43 +362,98 @@ gendata_h48k2_return_size: STATIC void gendata_h48k2_dfs(h48k2_dfs_arg_t *arg) { - cube_t ccc; - bool toodeep, backtracked; - uint8_t nmoves, oldval, newval; - uint64_t mval; + int8_t d; + uint8_t m[4]; + cube_t cube[4]; + + d = (int8_t)arg->shortdepth - (int8_t)arg->base; + + /* Depth d+0 (shortcubes) */ + gendata_h48k2_mark(arg->cube, d, arg); + + /* Depth d+1 */ + for (m[0] = 0; m[0] < 18; m[0]++) { + cube[0] = move(arg->cube, m[0]); + if (gendata_h48k2_dfs_stop(cube[0], d+1, arg)) + continue; + gendata_h48k2_mark(cube[0], d+1, arg); + + /* Depth d+2 */ + for (m[1] = 0; m[1] < 18; m[1]++) { + if (m[0] / 3 == m[1] / 3) { + m[1] += 2; + continue; + } + cube[1] = move(cube[0], m[1]); + if (gendata_h48k2_dfs_stop(cube[1], d+2, arg)) + continue; + gendata_h48k2_mark(cube[1], d+2, arg); + if (d >= 0) + continue; + + /* Depth d+3 */ + for (m[2] = 0; m[2] < 18; m[2]++) { + if (!allowednextmove(m, 3)) { + m[2] += 2; + continue; + } + cube[2] = move(cube[1], m[2]); + if (gendata_h48k2_dfs_stop(cube[2], d+3, arg)) + continue; + gendata_h48k2_mark(cube[2], d+3, arg); + if (d >= -1) + continue; + + /* Depth d+4 */ + for (m[3] = 0; m[3] < 18; m[3]++) { + if (!allowednextmove(m, 4)) { + m[3] += 2; + continue; + } + cube[3] = move(cube[2], m[3]); + gendata_h48k2_mark(cube[3], d+4, arg); + } + } + } + } +} + +STATIC_INLINE void +gendata_h48k2_mark(cube_t cube, int8_t depth, h48k2_dfs_arg_t *arg) +{ + uint8_t oldval, newval; int64_t coord, fullcoord; - h48k2_dfs_arg_t nextarg; - uint8_t m; - ccc = arg->cube; - FOREACH_H48SIM(ccc, arg->cocsepdata, arg->selfsim, - fullcoord = coord_h48(ccc, arg->cocsepdata, 11); + FOREACH_H48SIM(cube, arg->cocsepdata, arg->selfsim, + fullcoord = coord_h48(cube, arg->cocsepdata, 11); coord = fullcoord >> (int64_t)(11 - arg->h); oldval = get_esep_pval(arg->h48data, coord, arg->k); - newval = arg->depth >= arg->base ? arg->depth - arg->base : 0; + newval = (uint8_t)MAX(depth, 0); set_esep_pval( arg->h48data, coord, arg->k, MIN(oldval, newval)); ) +} - fullcoord = coord_h48(arg->cube, arg->cocsepdata, 11); /* Necessary? */ - mval = h48map_value(arg->shortcubes, fullcoord); - backtracked = mval <= arg->shortdepth && arg->depth != arg->shortdepth; - toodeep = arg->depth >= arg->maxdepth; - if (backtracked || toodeep) - return; - - /* TODO: avoid copy, change arg and undo changes after recursion */ - nextarg = *arg; - nextarg.depth++; - nmoves = nextarg.depth - arg->shortdepth; - for (m = 0; m < 18; m++) { - nextarg.moves[nmoves - 1] = m; - if (!allowednextmove(nextarg.moves, nmoves)) { - m += 2; - continue; - } - nextarg.cube = move(arg->cube, m); - gendata_h48k2_dfs(&nextarg); +STATIC_INLINE bool +gendata_h48k2_dfs_stop(cube_t cube, uint8_t depth, h48k2_dfs_arg_t *arg) +{ + uint64_t val; + int64_t coord; + uint8_t oldval; + + if (arg->h == 0 || arg->h == 11) { + /* We are in the "real coordinate" case, we can stop + if this coordinate has already been visited */ + coord = coord_h48(cube, arg->cocsepdata, arg->h); + oldval = get_esep_pval(arg->h48data, coord, arg->k); + return oldval <= depth; + } else { + /* With 0 < k < 11 we do not have a "real coordinate". + The best we can do is checking if we backtracked to + one of the "short cubes". */ + coord = coord_h48(cube, arg->cocsepdata, 11); + val = h48map_value(arg->shortcubes, coord); + return val <= arg->shortdepth; } } diff --git a/tools/run_tool.sh b/tools/run_tool.sh @@ -20,4 +20,5 @@ for t in tools/*; do break done -rm -rf $BIN $CUBEOBJ +# $BIN is kept so it can be run manually for profiling +rm -rf $CUBEOBJ