h48

A prototype for an optimal Rubik's cube solver, work in progress.
git clone https://git.tronto.net/h48
Download | Log | Files | Refs | README | LICENSE

commit 1a092414735f244bffa3c6878e57463b7dd1e9ba
parent be2d3fe10fd8adbf3fa4ce7cd436e2ad33363fcc
Author: Enrico Tenuti <123324975+enricotenuti@users.noreply.github.com>
Date:   Tue, 24 Sep 2024 07:13:13 +0200

Merge branch 'sebastianotronto:master' into master

Diffstat:
Msrc/solvers/h48/gendata_h48.h | 370+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Mtools/tool.h | 1-
2 files changed, 232 insertions(+), 139 deletions(-)

diff --git a/src/solvers/h48/gendata_h48.h b/src/solvers/h48/gendata_h48.h @@ -33,6 +33,7 @@ _t by _ttrep). typedef struct { uint8_t h; uint8_t k; + uint8_t base; uint8_t maxdepth; tableinfo_t info; void *buf; @@ -56,6 +57,12 @@ typedef struct { uint8_t *table; uint64_t *selfsim; cube_t *crep; + uint64_t start; + uint64_t end; + uint64_t count; + uint64_t *distribution; + pthread_mutex_t *distribution_mutex; + pthread_mutex_t *table_mutex[CHUNKS]; } h48h0k4_bfs_arg_t; typedef struct { @@ -75,17 +82,28 @@ typedef struct { uint64_t *count; } h48k2_dfs_arg_t; +typedef struct { + cube_t cube; + int8_t depth; + uint8_t h; + uint8_t k; + uint32_t *cocsepdata; + uint64_t *selfsim; + uint8_t *table; + pthread_mutex_t **table_mutex; +} gendata_h48_mark_t; + STATIC uint64_t gendata_h48short(gendata_h48short_arg_t *); STATIC size_t gendata_h48(gendata_h48_arg_t *); STATIC size_t gendata_h48h0k4(gendata_h48_arg_t *); -STATIC int64_t gendata_h48h0k4_bfs(h48h0k4_bfs_arg_t *); -STATIC int64_t gendata_h48h0k4_bfs_fromdone(h48h0k4_bfs_arg_t *); -STATIC int64_t gendata_h48h0k4_bfs_fromnew(h48h0k4_bfs_arg_t *); STATIC size_t gendata_h48k2(gendata_h48_arg_t *); -STATIC void * gendata_h48k2_runthread(void *); -STATIC_INLINE void gendata_h48k2_mark(cube_t, int8_t, h48k2_dfs_arg_t *); -STATIC_INLINE bool gendata_h48k2_dfs_stop(cube_t, uint8_t, h48k2_dfs_arg_t *); +STATIC void * gendata_h48h0k4_runthread(void *); +STATIC_INLINE uint64_t gendata_h48_mark(gendata_h48_mark_t *); +STATIC_INLINE bool gendata_h48k2_dfs_stop(cube_t, int8_t, h48k2_dfs_arg_t *); +STATIC size_t gendata_h48k2_realcoord(gendata_h48_arg_t *); STATIC void gendata_h48k2_dfs(h48k2_dfs_arg_t *arg); +STATIC void * gendata_h48k2_runthread(void *); +STATIC tableinfo_t makeinfo_h48k2(gendata_h48_arg_t *, uint8_t); STATIC uint32_t *get_cocsepdata_ptr(const void *); STATIC uint8_t *get_h48data_ptr(const void *); @@ -147,8 +165,12 @@ gendata_h48(gendata_h48_arg_t *arg) arg->cocsepdata = (uint32_t *)cocsepdata_offset; arg->h48buf = (char *)arg->buf + cocsepsize; + arg->base = 99; // TODO: set this somewhere else + if (arg->h == 0 && arg->k == 4) { h48size = gendata_h48h0k4(arg); + } else if ((arg->h == 0 || arg->h == 11) && arg->k == 2) { + h48size = gendata_h48k2_realcoord(arg); } else if (arg->k == 2) { h48size = gendata_h48k2(arg); } else { @@ -179,17 +201,15 @@ gendata_h48_error: return 0; } -/* -TODO description -generating fixed table with h=0, k=4 -*/ STATIC size_t gendata_h48h0k4(gendata_h48_arg_t *arg) { - uint32_t j; uint8_t *table; - h48h0k4_bfs_arg_t bfsarg; - int64_t sc, cc, done, h48max; + int64_t sc, done, d, h48max; + uint64_t t, tt, isize, cc; + h48h0k4_bfs_arg_t bfsarg[THREADS]; + pthread_t thread[THREADS]; + pthread_mutex_t distribution_mutex, table_mutex[CHUNKS]; if (arg->buf == NULL) goto gendata_h48h0k4_return_size; @@ -216,109 +236,134 @@ gendata_h48h0k4(gendata_h48_arg_t *arg) sc = coord_h48(SOLVED_CUBE, arg->cocsepdata, 0); set_h48_pval(table, sc, 4, 0); arg->info.distribution[0] = 1; - bfsarg = (h48h0k4_bfs_arg_t) { - .cocsepdata = arg->cocsepdata, - .table = table, - .selfsim = arg->selfsim, - .crep = arg->crep - }; - for ( - done = 1, bfsarg.depth = 1, cc = 0; - done < h48max && bfsarg.depth <= arg->maxdepth; - bfsarg.depth++ - ) { - LOG("h48: generating depth %" PRIu8 "\n", bfsarg.depth); - cc = gendata_h48h0k4_bfs(&bfsarg); - done += cc; - arg->info.distribution[bfsarg.depth] = cc; - LOG("found %" PRId64 "\n", cc); + + isize = h48max / THREADS; + isize = (isize / H48_COEFF(arg->k)) * H48_COEFF(arg->k); + pthread_mutex_init(&distribution_mutex, NULL); + for (t = 0; t < CHUNKS; t++) + pthread_mutex_init(&table_mutex[t], NULL); + for (t = 0; t < THREADS; t++) { + bfsarg[t] = (h48h0k4_bfs_arg_t) { + .cocsepdata = arg->cocsepdata, + .table = table, + .selfsim = arg->selfsim, + .crep = arg->crep, + .start = isize * t, + .end = t == THREADS-1 ? (uint64_t)h48max : isize * (t+1), + .distribution = arg->info.distribution, + .distribution_mutex = &distribution_mutex, + }; + for (tt = 0; tt < CHUNKS; tt++) + bfsarg[t].table_mutex[tt] = &table_mutex[tt]; } + for (done = 1, d = 1; done < h48max && d <= arg->maxdepth; d++) { + LOG("h48: generating depth %" PRIu8 "\n", d); + + for (t = 0; t < THREADS; t++) { + bfsarg[t].depth = d; + bfsarg[t].count = 0; + pthread_create(&thread[t], NULL, + gendata_h48h0k4_runthread, &bfsarg[t]); + } + + for (t = 0, cc = 0; t < THREADS; t++) { + pthread_join(thread[t], NULL); + cc += bfsarg[t].count; + } - arg->info.maxvalue = bfsarg.depth-1; + done += cc; + arg->info.distribution[d] = cc; - LOG("h48 pruning table computed\n"); - LOG("Maximum pruning value: %" PRIu32 "\n", arg->info.maxvalue); - LOG("Pruning value distribution:\n"); - for (j = 0; j <= arg->info.maxvalue; j++) - LOG("%" PRIu8 ":\t%" PRIu32 "\n", j, arg->info.distribution[j]); + LOG("found %" PRId64 "\n", cc); + } + arg->info.maxvalue = d - 1; writetableinfo(&arg->info, arg->h48buf); gendata_h48h0k4_return_size: return H48_TABLESIZE(0, 4) + INFOSIZE; } -STATIC int64_t -gendata_h48h0k4_bfs(h48h0k4_bfs_arg_t *arg) -{ - const uint8_t breakpoint = 10; /* Hand-picked optimal */ +/* +TODO: the following function suffers from a big performance loss (about 40%) +because of all the mutex locking. It is possible to reduce the first lock +(at the beginning of the outermost for loop) by caching the next few items at +once and then looping over the cached elements. But for the second one (inner +loop) the access is non-sequential, so this cannot be done. - if (arg->depth < breakpoint) - return gendata_h48h0k4_bfs_fromdone(arg); - else - return gendata_h48h0k4_bfs_fromnew(arg); -} +However, in both cases in theory we do not care if we are reading the old +value or the updated one. Depending on the compiler guarantees on concurrent +memory access, we may be fine just removing the locks altogether. -STATIC int64_t -gendata_h48h0k4_bfs_fromdone(h48h0k4_bfs_arg_t *arg) +It is probably possible to solve part of this by using atomics. +*/ +STATIC void * +gendata_h48h0k4_runthread(void *arg) { - uint8_t c, m, x; - uint32_t cc; - int64_t i, j, k; - cube_t cube, moved; - - for (i = 0, cc = 0; i < (int64_t)H48_COORDMAX(0); i++) { - c = get_h48_pval(arg->table, i, 4); - if (c != arg->depth - 1) - continue; - cube = invcoord_h48(i, arg->crep, 0); - for (m = 0; m < 18; m++) { - moved = move(cube, m); - j = coord_h48(moved, arg->cocsepdata, 0); - if (get_h48_pval(arg->table, j, 4) <= arg->depth) - continue; - FOREACH_H48SIM(moved, arg->cocsepdata, arg->selfsim, - k = coord_h48(moved, arg->cocsepdata, 0); - x = get_h48_pval(arg->table, k, 4); - set_h48_pval(arg->table, k, 4, arg->depth); - cc += x != arg->depth; - ) - } - } - - return cc; -} + static const uint8_t breakpoint = 10; /* Hand-picked optimal */ -STATIC int64_t -gendata_h48h0k4_bfs_fromnew(h48h0k4_bfs_arg_t *arg) -{ - uint8_t c, m, x; - uint32_t cc; - int64_t i, j; + uint8_t c, m; + uint64_t i, d, mutex; + int64_t j; cube_t cube, moved; + gendata_h48_mark_t markarg; + h48h0k4_bfs_arg_t *bfsarg; + + bfsarg = (h48h0k4_bfs_arg_t *)arg; + + markarg = (gendata_h48_mark_t) { + .depth = bfsarg->depth, + .h = 0, + .k = 4, + .cocsepdata = bfsarg->cocsepdata, + .selfsim = bfsarg->selfsim, + .table = bfsarg->table, + .table_mutex = bfsarg->table_mutex, + }; - for (i = 0, cc = 0; i < (int64_t)H48_COORDMAX(0); i++) { - c = get_h48_pval(arg->table, i, 4); - if (c != 0xF) + /* + * If depth < breakpoint, scan all neighbors of coordinates at depth-1. + * Otherwise, scan all neighbors of unvisited coordinates. + */ + for (i = bfsarg->start, d = 0; i < bfsarg->end; i++) { + mutex = H48_INDEX(i, 4) % CHUNKS; + pthread_mutex_lock(bfsarg->table_mutex[mutex]); + c = get_h48_pval(bfsarg->table, i, 4); + pthread_mutex_unlock(bfsarg->table_mutex[mutex]); + + if ((bfsarg->depth < breakpoint && c != bfsarg->depth - 1) || + (bfsarg->depth >= breakpoint && c != 0xF)) continue; - cube = invcoord_h48(i, arg->crep, 0); + + cube = invcoord_h48(i, bfsarg->crep, 0); for (m = 0; m < 18; m++) { moved = move(cube, m); - j = coord_h48(moved, arg->cocsepdata, 0); - x = get_h48_pval(arg->table, j, 4); - if (x >= arg->depth) - continue; - FOREACH_H48SIM(cube, arg->cocsepdata, arg->selfsim, - j = coord_h48(cube, arg->cocsepdata, 0); - x = get_h48_pval(arg->table, j, 4); - set_h48_pval(arg->table, j, 4, arg->depth); - cc += x == 0xF; - ) - break; /* Enough to find one, skip the rest */ + j = coord_h48(moved, bfsarg->cocsepdata, 0); + mutex = H48_INDEX(j, 4) % CHUNKS; + pthread_mutex_lock(bfsarg->table_mutex[mutex]); + c = get_h48_pval(bfsarg->table, j, 4); + pthread_mutex_unlock(bfsarg->table_mutex[mutex]); + if (bfsarg->depth < breakpoint) { + if (c <= bfsarg->depth) + continue; + markarg.cube = moved; + d += gendata_h48_mark(&markarg); + } else { + if (c >= bfsarg->depth) + continue; + markarg.cube = cube; + d += gendata_h48_mark(&markarg); + break; /* Enough to find one, skip the rest */ + } } } - return cc; + pthread_mutex_lock(bfsarg->distribution_mutex); + bfsarg->count += d; + bfsarg->distribution[bfsarg->depth] += d; + pthread_mutex_unlock(bfsarg->distribution_mutex); + + return NULL; } STATIC size_t @@ -369,24 +414,8 @@ gendata_h48k2(gendata_h48_arg_t *arg) }; gendata_h48short(&shortarg); - selectedbase = base[arg->h]; - arg->info = (tableinfo_t) { - .solver = "h48 solver h = , k = 2", - .type = TABLETYPE_PRUNING, - .infosize = INFOSIZE, - .fullsize = H48_TABLESIZE(arg->h, 2) + INFOSIZE, - .hash = 0, /* TODO */ - .entries = H48_COORDMAX(arg->h), - .classes = 0, - .h48h = arg->h, - .bits = 2, - .base = selectedbase, - .maxvalue = 3, - .next = 0, - }; - arg->info.solver[15] = (arg->h % 10) + '0'; - if (arg->h >= 10) - arg->info.solver[14] = (arg->h / 10) + '0'; + selectedbase = arg->base < 20 ? arg->base : base[arg->h]; + arg->info = makeinfo_h48k2(arg, selectedbase); inext = count = 0; pthread_mutex_init(&shortcubes_mutex, NULL); @@ -419,6 +448,7 @@ gendata_h48k2(gendata_h48_arg_t *arg) h48map_destroy(&shortcubes); + /* TODO: inline into mark */ for (j = 0; j < H48_COORDMAX(arg->h); j++) { t = get_h48_pval(table, j, 2); arg->info.distribution[t]++; @@ -474,21 +504,36 @@ gendata_h48k2_dfs(h48k2_dfs_arg_t *arg) int8_t d; uint8_t m[4]; cube_t cube[4]; + gendata_h48_mark_t markarg; + + markarg = (gendata_h48_mark_t) { + .h = arg->h, + .k = arg->k, + .cocsepdata = arg->cocsepdata, + .selfsim = arg->selfsim, + .table = arg->table, + .table_mutex = arg->table_mutex, + }; d = (int8_t)arg->shortdepth - (int8_t)arg->base; /* Depth d+0 (shortcubes) */ - gendata_h48k2_mark(arg->cube, d, arg); + markarg.depth = d; + markarg.cube = arg->cube; + gendata_h48_mark(&markarg); /* Depth d+1 */ for (m[0] = 0; m[0] < 18; m[0]++) { + markarg.depth = d+1; cube[0] = move(arg->cube, m[0]); if (gendata_h48k2_dfs_stop(cube[0], d+1, arg)) continue; - gendata_h48k2_mark(cube[0], d+1, arg); + markarg.cube = cube[0]; + gendata_h48_mark(&markarg); /* Depth d+2 */ for (m[1] = 0; m[1] < 18; m[1]++) { + markarg.depth = d+2; if (m[0] / 3 == m[1] / 3) { m[1] += 2; continue; @@ -496,12 +541,14 @@ gendata_h48k2_dfs(h48k2_dfs_arg_t *arg) cube[1] = move(cube[0], m[1]); if (gendata_h48k2_dfs_stop(cube[1], d+2, arg)) continue; - gendata_h48k2_mark(cube[1], d+2, arg); + markarg.cube = cube[1]; + gendata_h48_mark(&markarg); if (d >= 0) continue; /* Depth d+3 */ for (m[2] = 0; m[2] < 18; m[2]++) { + markarg.depth = d+3; if (!allowednextmove(m, 3)) { m[2] += 2; continue; @@ -509,48 +556,55 @@ gendata_h48k2_dfs(h48k2_dfs_arg_t *arg) cube[2] = move(cube[1], m[2]); if (gendata_h48k2_dfs_stop(cube[2], d+3, arg)) continue; - gendata_h48k2_mark(cube[2], d+3, arg); + markarg.cube = cube[2]; + gendata_h48_mark(&markarg); if (d >= -1) continue; /* Depth d+4 */ for (m[3] = 0; m[3] < 18; m[3]++) { + markarg.depth = d+4; if (!allowednextmove(m, 4)) { m[3] += 2; continue; } cube[3] = move(cube[2], m[3]); - gendata_h48k2_mark(cube[3], d+4, arg); + markarg.cube = cube[3]; + gendata_h48_mark(&markarg); } } } } } -STATIC_INLINE void -gendata_h48k2_mark(cube_t cube, int8_t depth, h48k2_dfs_arg_t *arg) +STATIC_INLINE uint64_t +gendata_h48_mark(gendata_h48_mark_t *arg) { uint8_t oldval, newval; - int64_t coord, fullcoord, mutex; + uint64_t d; + int64_t coord, mutex; - FOREACH_H48SIM(cube, arg->cocsepdata, arg->selfsim, - fullcoord = coord_h48(cube, arg->cocsepdata, 11); - coord = fullcoord >> (int64_t)(11 - arg->h); + d = 0; + FOREACH_H48SIM(arg->cube, arg->cocsepdata, arg->selfsim, + coord = coord_h48(arg->cube, arg->cocsepdata, arg->h); mutex = H48_INDEX(coord, arg->k) % CHUNKS; pthread_mutex_lock(arg->table_mutex[mutex]); oldval = get_h48_pval(arg->table, coord, arg->k); - newval = (uint8_t)MAX(depth, 0); + newval = (uint8_t)MAX(arg->depth, 0); + d += newval < oldval; set_h48_pval(arg->table, coord, arg->k, MIN(oldval, newval)); pthread_mutex_unlock(arg->table_mutex[mutex]); ) + + return d; } STATIC_INLINE bool -gendata_h48k2_dfs_stop(cube_t cube, uint8_t depth, h48k2_dfs_arg_t *arg) +gendata_h48k2_dfs_stop(cube_t cube, int8_t depth, h48k2_dfs_arg_t *arg) { uint64_t val; int64_t coord; - uint8_t oldval; + int8_t oldval; if (arg->h == 0 || arg->h == 11) { /* We are in the "real coordinate" case, we can stop @@ -568,26 +622,44 @@ gendata_h48k2_dfs_stop(cube_t cube, uint8_t depth, h48k2_dfs_arg_t *arg) } } -STATIC_INLINE uint8_t -get_h48_pval(const uint8_t *table, int64_t i, uint8_t k) +STATIC size_t +gendata_h48k2_realcoord(gendata_h48_arg_t *arg) { - return (table[H48_INDEX(i, k)] & H48_MASK(i, k)) >> H48_SHIFT(i, k); + /* TODO */ + return gendata_h48k2(arg); } -STATIC_INLINE void -set_h48_pval(uint8_t *table, int64_t i, uint8_t k, uint8_t val) +STATIC void * +gendata_h48k2_realcoord_runthread(void *arg) { - table[H48_INDEX(i, k)] = (table[H48_INDEX(i, k)] & (~H48_MASK(i, k))) - | (val << H48_SHIFT(i, k)); + /* TODO */ + return NULL; } -STATIC_INLINE uint8_t -get_h48_bound(cube_t cube, uint32_t cdata, uint8_t h, uint8_t k, uint8_t *table) +STATIC tableinfo_t +makeinfo_h48k2(gendata_h48_arg_t *arg, uint8_t base) { - int64_t coord; + tableinfo_t info; - coord = coord_h48_edges(cube, COCLASS(cdata), TTREP(cdata), h); - return get_h48_pval(table, coord, k); + info = (tableinfo_t) { + .solver = "h48 solver h = , k = 2", + .type = TABLETYPE_PRUNING, + .infosize = INFOSIZE, + .fullsize = H48_TABLESIZE(arg->h, 2) + INFOSIZE, + .hash = 0, /* TODO */ + .entries = H48_COORDMAX(arg->h), + .classes = 0, + .h48h = arg->h, + .bits = 2, + .base = base, + .maxvalue = 3, + .next = 0, + }; + info.solver[15] = (arg->h % 10) + '0'; + if (arg->h >= 10) + info.solver[14] = (arg->h / 10) + '0'; + + return info; } STATIC uint32_t * @@ -601,3 +673,25 @@ get_h48data_ptr(const void *data) { return (uint8_t *)data + COCSEP_FULLSIZE + INFOSIZE; } + +STATIC_INLINE uint8_t +get_h48_pval(const uint8_t *table, int64_t i, uint8_t k) +{ + return (table[H48_INDEX(i, k)] & H48_MASK(i, k)) >> H48_SHIFT(i, k); +} + +STATIC_INLINE void +set_h48_pval(uint8_t *table, int64_t i, uint8_t k, uint8_t val) +{ + table[H48_INDEX(i, k)] = (table[H48_INDEX(i, k)] & (~H48_MASK(i, k))) + | (val << H48_SHIFT(i, k)); +} + +STATIC_INLINE uint8_t +get_h48_bound(cube_t cube, uint32_t cdata, uint8_t h, uint8_t k, uint8_t *table) +{ + int64_t coord; + + coord = coord_h48_edges(cube, COCLASS(cdata), TTREP(cdata), h); + return get_h48_pval(table, coord, k); +} diff --git a/tools/tool.h b/tools/tool.h @@ -4,7 +4,6 @@ #include <inttypes.h> #include <stdio.h> #include <stdlib.h> -#include <time.h> #include "../src/nissy.h"