Merge branch 'sebastianotronto:master' into master - h48 - A prototype for an optimal Rubik's cube solver, work in progress.

commit 1a092414735f244bffa3c6878e57463b7dd1e9ba
parent be2d3fe10fd8adbf3fa4ce7cd436e2ad33363fcc
Author: Enrico Tenuti <123324975+enricotenuti@users.noreply.github.com>
Date:   Tue, 24 Sep 2024 07:13:13 +0200

Merge branch 'sebastianotronto:master' into master

Diffstat:
M src/solvers/h48/gendata_h48.h  | 370 +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
M tools/tool.h  | 1 -

2 files changed, 232 insertions(+), 139 deletions(-)
diff --git a/src/solvers/h48/gendata_h48.h b/src/solvers/h48/gendata_h48.h
@@ -33,6 +33,7 @@ _t by _ttrep).
 typedef struct {
 	uint8_t h;
 	uint8_t k;
+	uint8_t base;
 	uint8_t maxdepth;
 	tableinfo_t info;
 	void *buf;
@@ -56,6 +57,12 @@ typedef struct {
 	uint8_t *table;
 	uint64_t *selfsim;
 	cube_t *crep;
+	uint64_t start;
+	uint64_t end;
+	uint64_t count;
+	uint64_t *distribution;
+	pthread_mutex_t *distribution_mutex;
+	pthread_mutex_t *table_mutex[CHUNKS];
 } h48h0k4_bfs_arg_t;
 
 typedef struct {
@@ -75,17 +82,28 @@ typedef struct {
 	uint64_t *count;
 } h48k2_dfs_arg_t;
 
+typedef struct {
+	cube_t cube;
+	int8_t depth;
+	uint8_t h;
+	uint8_t k;
+	uint32_t *cocsepdata;
+	uint64_t *selfsim;
+	uint8_t *table;
+	pthread_mutex_t **table_mutex;
+} gendata_h48_mark_t;
+
 STATIC uint64_t gendata_h48short(gendata_h48short_arg_t *);
 STATIC size_t gendata_h48(gendata_h48_arg_t *);
 STATIC size_t gendata_h48h0k4(gendata_h48_arg_t *);
-STATIC int64_t gendata_h48h0k4_bfs(h48h0k4_bfs_arg_t *);
-STATIC int64_t gendata_h48h0k4_bfs_fromdone(h48h0k4_bfs_arg_t *);
-STATIC int64_t gendata_h48h0k4_bfs_fromnew(h48h0k4_bfs_arg_t *);
 STATIC size_t gendata_h48k2(gendata_h48_arg_t *);
-STATIC void * gendata_h48k2_runthread(void *);
-STATIC_INLINE void gendata_h48k2_mark(cube_t, int8_t, h48k2_dfs_arg_t *);
-STATIC_INLINE bool gendata_h48k2_dfs_stop(cube_t, uint8_t, h48k2_dfs_arg_t *);
+STATIC void * gendata_h48h0k4_runthread(void *);
+STATIC_INLINE uint64_t gendata_h48_mark(gendata_h48_mark_t *);
+STATIC_INLINE bool gendata_h48k2_dfs_stop(cube_t, int8_t, h48k2_dfs_arg_t *);
+STATIC size_t gendata_h48k2_realcoord(gendata_h48_arg_t *);
 STATIC void gendata_h48k2_dfs(h48k2_dfs_arg_t *arg);
+STATIC void * gendata_h48k2_runthread(void *);
+STATIC tableinfo_t makeinfo_h48k2(gendata_h48_arg_t *, uint8_t);
 
 STATIC uint32_t *get_cocsepdata_ptr(const void *);
 STATIC uint8_t *get_h48data_ptr(const void *);
@@ -147,8 +165,12 @@ gendata_h48(gendata_h48_arg_t *arg)
 	arg->cocsepdata = (uint32_t *)cocsepdata_offset;
 	arg->h48buf = (char *)arg->buf + cocsepsize;
 
+	arg->base = 99; // TODO: set this somewhere else
+
 	if (arg->h == 0 && arg->k == 4) {
 		h48size = gendata_h48h0k4(arg);
+	} else if ((arg->h == 0 || arg->h == 11) && arg->k == 2) {
+		h48size = gendata_h48k2_realcoord(arg);
 	} else if (arg->k == 2) {
 		h48size = gendata_h48k2(arg);
 	} else {
@@ -179,17 +201,15 @@ gendata_h48_error:
 	return 0;
 }
 
-/*
-TODO description
-generating fixed table with h=0, k=4
-*/
 STATIC size_t
 gendata_h48h0k4(gendata_h48_arg_t *arg)
 {
-	uint32_t j;
 	uint8_t *table;
-	h48h0k4_bfs_arg_t bfsarg;
-	int64_t sc, cc, done, h48max;
+	int64_t sc, done, d, h48max;
+	uint64_t t, tt, isize, cc;
+	h48h0k4_bfs_arg_t bfsarg[THREADS];
+	pthread_t thread[THREADS];
+	pthread_mutex_t distribution_mutex, table_mutex[CHUNKS];
 
 	if (arg->buf == NULL)
 		goto gendata_h48h0k4_return_size;
@@ -216,109 +236,134 @@ gendata_h48h0k4(gendata_h48_arg_t *arg)
 	sc = coord_h48(SOLVED_CUBE, arg->cocsepdata, 0);
 	set_h48_pval(table, sc, 4, 0);
 	arg->info.distribution[0] = 1;
-	bfsarg = (h48h0k4_bfs_arg_t) {
-		.cocsepdata = arg->cocsepdata,
-		.table = table,
-		.selfsim = arg->selfsim,
-		.crep = arg->crep
-	};
-	for (
-		done = 1, bfsarg.depth = 1, cc = 0;
-		done < h48max && bfsarg.depth <= arg->maxdepth;
-		bfsarg.depth++
-	) {
-		LOG("h48: generating depth %" PRIu8 "\n", bfsarg.depth);
-		cc = gendata_h48h0k4_bfs(&bfsarg);
-		done += cc;
-		arg->info.distribution[bfsarg.depth] = cc;
-		LOG("found %" PRId64 "\n", cc);
+
+	isize = h48max / THREADS;
+	isize = (isize / H48_COEFF(arg->k)) * H48_COEFF(arg->k);
+	pthread_mutex_init(&distribution_mutex, NULL);
+	for (t = 0; t < CHUNKS; t++)
+		pthread_mutex_init(&table_mutex[t], NULL);
+	for (t = 0; t < THREADS; t++) {
+		bfsarg[t] = (h48h0k4_bfs_arg_t) {
+			.cocsepdata = arg->cocsepdata,
+			.table = table,
+			.selfsim = arg->selfsim,
+			.crep = arg->crep,
+			.start = isize * t,
+			.end = t == THREADS-1 ? (uint64_t)h48max : isize * (t+1),
+			.distribution = arg->info.distribution,
+			.distribution_mutex = &distribution_mutex,
+		};
+		for (tt = 0; tt < CHUNKS; tt++)
+			bfsarg[t].table_mutex[tt] = &table_mutex[tt];
 	}
+	for (done = 1, d = 1; done < h48max && d <= arg->maxdepth; d++) {
+		LOG("h48: generating depth %" PRIu8 "\n", d);
+
+		for (t = 0; t < THREADS; t++) {
+			bfsarg[t].depth = d;
+			bfsarg[t].count = 0;
+			pthread_create(&thread[t], NULL,
+			    gendata_h48h0k4_runthread, &bfsarg[t]);
+		}
+
+		for (t = 0, cc = 0; t < THREADS; t++) {
+			pthread_join(thread[t], NULL);
+			cc += bfsarg[t].count;
+		}
 
-	arg->info.maxvalue = bfsarg.depth-1;
+		done += cc;
+		arg->info.distribution[d] = cc;
 
-	LOG("h48 pruning table computed\n");
-	LOG("Maximum pruning value: %" PRIu32 "\n", arg->info.maxvalue);
-	LOG("Pruning value distribution:\n");
-	for (j = 0; j <= arg->info.maxvalue; j++)
-		LOG("%" PRIu8 ":\t%" PRIu32 "\n", j, arg->info.distribution[j]);
+		LOG("found %" PRId64 "\n", cc);
+	}
 
+	arg->info.maxvalue = d - 1;
 	writetableinfo(&arg->info, arg->h48buf);
 
 gendata_h48h0k4_return_size:
 	return H48_TABLESIZE(0, 4) + INFOSIZE;
 }
 
-STATIC int64_t
-gendata_h48h0k4_bfs(h48h0k4_bfs_arg_t *arg)
-{
-	const uint8_t breakpoint = 10; /* Hand-picked optimal */
+/*
+TODO: the following function suffers from a big performance loss (about 40%)
+because of all the mutex locking. It is possible to reduce the first lock
+(at the beginning of the outermost for loop) by caching the next few items at
+once and then looping over the cached elements. But for the second one (inner
+loop) the access is non-sequential, so this cannot be done.
 
-	if (arg->depth < breakpoint)
-		return gendata_h48h0k4_bfs_fromdone(arg);
-	else
-		return gendata_h48h0k4_bfs_fromnew(arg);
-}
+However, in both cases in theory we do not care if we are reading the old
+value or the updated one. Depending on the compiler guarantees on concurrent
+memory access, we may be fine just removing the locks altogether.
 
-STATIC int64_t
-gendata_h48h0k4_bfs_fromdone(h48h0k4_bfs_arg_t *arg)
+It is probably possible to solve part of this by using atomics.
+*/
+STATIC void *
+gendata_h48h0k4_runthread(void *arg)
 {
-	uint8_t c, m, x;
-	uint32_t cc;
-	int64_t i, j, k;
-	cube_t cube, moved;
-
-	for (i = 0, cc = 0; i < (int64_t)H48_COORDMAX(0); i++) {
-		c = get_h48_pval(arg->table, i, 4);
-		if (c != arg->depth - 1)
-			continue;
-		cube = invcoord_h48(i, arg->crep, 0);
-		for (m = 0; m < 18; m++) {
-			moved = move(cube, m);
-			j = coord_h48(moved, arg->cocsepdata, 0);
-			if (get_h48_pval(arg->table, j, 4) <= arg->depth)
-				continue;
-			FOREACH_H48SIM(moved, arg->cocsepdata, arg->selfsim,
-				k = coord_h48(moved, arg->cocsepdata, 0);
-				x = get_h48_pval(arg->table, k, 4);
-				set_h48_pval(arg->table, k, 4, arg->depth);
-				cc += x != arg->depth;
-			)
-		}
-	}
-
-	return cc;
-}
+	static const uint8_t breakpoint = 10; /* Hand-picked optimal */
 
-STATIC int64_t
-gendata_h48h0k4_bfs_fromnew(h48h0k4_bfs_arg_t *arg)
-{
-	uint8_t c, m, x;
-	uint32_t cc;
-	int64_t i, j;
+	uint8_t c, m;
+	uint64_t i, d, mutex;
+	int64_t j;
 	cube_t cube, moved;
+	gendata_h48_mark_t markarg;
+	h48h0k4_bfs_arg_t *bfsarg;
+
+	bfsarg = (h48h0k4_bfs_arg_t *)arg;
+
+	markarg = (gendata_h48_mark_t) {
+		.depth = bfsarg->depth,
+		.h = 0,
+		.k = 4,
+		.cocsepdata = bfsarg->cocsepdata,
+		.selfsim = bfsarg->selfsim,
+		.table = bfsarg->table,
+		.table_mutex = bfsarg->table_mutex,
+	};
 
-	for (i = 0, cc = 0; i < (int64_t)H48_COORDMAX(0); i++) {
-		c = get_h48_pval(arg->table, i, 4);
-		if (c != 0xF)
+	/*
+         * If depth < breakpoint, scan all neighbors of coordinates at depth-1.
+         * Otherwise, scan all neighbors of unvisited coordinates.
+	 */
+	for (i = bfsarg->start, d = 0; i < bfsarg->end; i++) {
+		mutex = H48_INDEX(i, 4) % CHUNKS;
+		pthread_mutex_lock(bfsarg->table_mutex[mutex]);
+		c = get_h48_pval(bfsarg->table, i, 4);
+		pthread_mutex_unlock(bfsarg->table_mutex[mutex]);
+
+		if ((bfsarg->depth < breakpoint && c != bfsarg->depth - 1) ||
+		    (bfsarg->depth >= breakpoint && c != 0xF))
 			continue;
-		cube = invcoord_h48(i, arg->crep, 0);
+
+		cube = invcoord_h48(i, bfsarg->crep, 0);
 		for (m = 0; m < 18; m++) {
 			moved = move(cube, m);
-			j = coord_h48(moved, arg->cocsepdata, 0);
-			x = get_h48_pval(arg->table, j, 4);
-			if (x >= arg->depth)
-				continue;
-			FOREACH_H48SIM(cube, arg->cocsepdata, arg->selfsim,
-				j = coord_h48(cube, arg->cocsepdata, 0);
-				x = get_h48_pval(arg->table, j, 4);
-				set_h48_pval(arg->table, j, 4, arg->depth);
-				cc += x == 0xF;
-			)
-			break; /* Enough to find one, skip the rest */
+			j = coord_h48(moved, bfsarg->cocsepdata, 0);
+			mutex = H48_INDEX(j, 4) % CHUNKS;
+			pthread_mutex_lock(bfsarg->table_mutex[mutex]);
+			c = get_h48_pval(bfsarg->table, j, 4);
+			pthread_mutex_unlock(bfsarg->table_mutex[mutex]);
+			if (bfsarg->depth < breakpoint) {
+				if (c <= bfsarg->depth)
+					continue;
+				markarg.cube = moved;
+				d += gendata_h48_mark(&markarg);
+			} else {
+				if (c >= bfsarg->depth)
+					continue;
+				markarg.cube = cube;
+				d += gendata_h48_mark(&markarg);
+				break; /* Enough to find one, skip the rest */
+			}
 		}
 	}
 
-	return cc;
+	pthread_mutex_lock(bfsarg->distribution_mutex);
+	bfsarg->count += d;
+	bfsarg->distribution[bfsarg->depth] += d;
+	pthread_mutex_unlock(bfsarg->distribution_mutex);
+
+	return NULL;
 }
 
 STATIC size_t
@@ -369,24 +414,8 @@ gendata_h48k2(gendata_h48_arg_t *arg)
 	};
 	gendata_h48short(&shortarg);
 
-	selectedbase = base[arg->h];
-	arg->info = (tableinfo_t) {
-		.solver = "h48 solver h =  , k = 2",
-		.type = TABLETYPE_PRUNING,
-		.infosize = INFOSIZE,
-		.fullsize = H48_TABLESIZE(arg->h, 2) + INFOSIZE,
-		.hash = 0, /* TODO */
-		.entries = H48_COORDMAX(arg->h),
-		.classes = 0,
-		.h48h = arg->h,
-		.bits = 2,
-		.base = selectedbase,
-		.maxvalue = 3,
-		.next = 0,
-	};
-	arg->info.solver[15] = (arg->h % 10) + '0';
-	if (arg->h >= 10)
-		arg->info.solver[14] = (arg->h / 10) + '0';
+	selectedbase = arg->base < 20 ? arg->base : base[arg->h];
+	arg->info = makeinfo_h48k2(arg, selectedbase);
 
 	inext = count = 0;
 	pthread_mutex_init(&shortcubes_mutex, NULL);
@@ -419,6 +448,7 @@ gendata_h48k2(gendata_h48_arg_t *arg)
 
 	h48map_destroy(&shortcubes);
 
+	/* TODO: inline into mark */
 	for (j = 0; j < H48_COORDMAX(arg->h); j++) {
 		t = get_h48_pval(table, j, 2);
 		arg->info.distribution[t]++;
@@ -474,21 +504,36 @@ gendata_h48k2_dfs(h48k2_dfs_arg_t *arg)
 	int8_t d;
 	uint8_t m[4];
 	cube_t cube[4];
+	gendata_h48_mark_t markarg;
+
+	markarg = (gendata_h48_mark_t) {
+		.h = arg->h,
+		.k = arg->k,
+		.cocsepdata = arg->cocsepdata,
+		.selfsim = arg->selfsim,
+		.table = arg->table,
+		.table_mutex = arg->table_mutex,
+	};
 
 	d = (int8_t)arg->shortdepth - (int8_t)arg->base;
 
 	/* Depth d+0 (shortcubes) */
-	gendata_h48k2_mark(arg->cube, d, arg);
+	markarg.depth = d;
+	markarg.cube = arg->cube;
+	gendata_h48_mark(&markarg);
 
 	/* Depth d+1 */
 	for (m[0] = 0; m[0] < 18; m[0]++) {
+		markarg.depth = d+1;
 		cube[0] = move(arg->cube, m[0]);
 		if (gendata_h48k2_dfs_stop(cube[0], d+1, arg))
 			continue;
-		gendata_h48k2_mark(cube[0], d+1, arg);
+		markarg.cube = cube[0];
+		gendata_h48_mark(&markarg);
 
 		/* Depth d+2 */
 		for (m[1] = 0; m[1] < 18; m[1]++) {
+			markarg.depth = d+2;
 			if (m[0] / 3 == m[1] / 3) {
 				m[1] += 2;
 				continue;
@@ -496,12 +541,14 @@ gendata_h48k2_dfs(h48k2_dfs_arg_t *arg)
 			cube[1] = move(cube[0], m[1]);
 			if (gendata_h48k2_dfs_stop(cube[1], d+2, arg))
 				continue;
-			gendata_h48k2_mark(cube[1], d+2, arg);
+			markarg.cube = cube[1];
+			gendata_h48_mark(&markarg);
 			if (d >= 0)
 				continue;
 
 			/* Depth d+3 */
 			for (m[2] = 0; m[2] < 18; m[2]++) {
+				markarg.depth = d+3;
 				if (!allowednextmove(m, 3)) {
 					m[2] += 2;
 					continue;
@@ -509,48 +556,55 @@ gendata_h48k2_dfs(h48k2_dfs_arg_t *arg)
 				cube[2] = move(cube[1], m[2]);
 				if (gendata_h48k2_dfs_stop(cube[2], d+3, arg))
 					continue;
-				gendata_h48k2_mark(cube[2], d+3, arg);
+				markarg.cube = cube[2];
+				gendata_h48_mark(&markarg);
 				if (d >= -1)
 					continue;
 
 				/* Depth d+4 */
 				for (m[3] = 0; m[3] < 18; m[3]++) {
+					markarg.depth = d+4;
 					if (!allowednextmove(m, 4)) {
 						m[3] += 2;
 						continue;
 					}
 					cube[3] = move(cube[2], m[3]);
-					gendata_h48k2_mark(cube[3], d+4, arg);
+					markarg.cube = cube[3];
+					gendata_h48_mark(&markarg);
 				}
 			}
 		}
 	}
 }
 
-STATIC_INLINE void
-gendata_h48k2_mark(cube_t cube, int8_t depth, h48k2_dfs_arg_t *arg)
+STATIC_INLINE uint64_t
+gendata_h48_mark(gendata_h48_mark_t *arg)
 {
 	uint8_t oldval, newval;
-	int64_t coord, fullcoord, mutex;
+	uint64_t d;
+	int64_t coord, mutex;
 
-	FOREACH_H48SIM(cube, arg->cocsepdata, arg->selfsim,
-		fullcoord = coord_h48(cube, arg->cocsepdata, 11);
-		coord = fullcoord >> (int64_t)(11 - arg->h);
+	d = 0;
+	FOREACH_H48SIM(arg->cube, arg->cocsepdata, arg->selfsim,
+		coord = coord_h48(arg->cube, arg->cocsepdata, arg->h);
 		mutex = H48_INDEX(coord, arg->k) % CHUNKS;
 		pthread_mutex_lock(arg->table_mutex[mutex]);
 		oldval = get_h48_pval(arg->table, coord, arg->k);
-		newval = (uint8_t)MAX(depth, 0);
+		newval = (uint8_t)MAX(arg->depth, 0);
+		d += newval < oldval;
 		set_h48_pval(arg->table, coord, arg->k, MIN(oldval, newval));
 		pthread_mutex_unlock(arg->table_mutex[mutex]);
 	)
+
+	return d;
 }
 
 STATIC_INLINE bool
-gendata_h48k2_dfs_stop(cube_t cube, uint8_t depth, h48k2_dfs_arg_t *arg)
+gendata_h48k2_dfs_stop(cube_t cube, int8_t depth, h48k2_dfs_arg_t *arg)
 {
 	uint64_t val;
 	int64_t coord;
-	uint8_t oldval;
+	int8_t oldval;
 
 	if (arg->h == 0 || arg->h == 11) {
 		/* We are in the "real coordinate" case, we can stop
@@ -568,26 +622,44 @@ gendata_h48k2_dfs_stop(cube_t cube, uint8_t depth, h48k2_dfs_arg_t *arg)
 	}
 }
 
-STATIC_INLINE uint8_t
-get_h48_pval(const uint8_t *table, int64_t i, uint8_t k)
+STATIC size_t
+gendata_h48k2_realcoord(gendata_h48_arg_t *arg)
 {
-	return (table[H48_INDEX(i, k)] & H48_MASK(i, k)) >> H48_SHIFT(i, k);
+	/* TODO */
+	return gendata_h48k2(arg);
 }
 
-STATIC_INLINE void
-set_h48_pval(uint8_t *table, int64_t i, uint8_t k, uint8_t val)
+STATIC void *
+gendata_h48k2_realcoord_runthread(void *arg)
 {
-	table[H48_INDEX(i, k)] = (table[H48_INDEX(i, k)] & (~H48_MASK(i, k)))
-	    | (val << H48_SHIFT(i, k));
+	/* TODO */
+	return NULL;
 }
 
-STATIC_INLINE uint8_t
-get_h48_bound(cube_t cube, uint32_t cdata, uint8_t h, uint8_t k, uint8_t *table)
+STATIC tableinfo_t
+makeinfo_h48k2(gendata_h48_arg_t *arg, uint8_t base)
 {
-	int64_t coord;
+	tableinfo_t info;
 
-	coord = coord_h48_edges(cube, COCLASS(cdata), TTREP(cdata), h);
-	return get_h48_pval(table, coord, k);
+	info = (tableinfo_t) {
+		.solver = "h48 solver h =  , k = 2",
+		.type = TABLETYPE_PRUNING,
+		.infosize = INFOSIZE,
+		.fullsize = H48_TABLESIZE(arg->h, 2) + INFOSIZE,
+		.hash = 0, /* TODO */
+		.entries = H48_COORDMAX(arg->h),
+		.classes = 0,
+		.h48h = arg->h,
+		.bits = 2,
+		.base = base,
+		.maxvalue = 3,
+		.next = 0,
+	};
+	info.solver[15] = (arg->h % 10) + '0';
+	if (arg->h >= 10)
+		info.solver[14] = (arg->h / 10) + '0';
+
+	return info;
 }
 
 STATIC uint32_t *
@@ -601,3 +673,25 @@ get_h48data_ptr(const void *data)
 {
 	return (uint8_t *)data + COCSEP_FULLSIZE + INFOSIZE;
 }
+
+STATIC_INLINE uint8_t
+get_h48_pval(const uint8_t *table, int64_t i, uint8_t k)
+{
+	return (table[H48_INDEX(i, k)] & H48_MASK(i, k)) >> H48_SHIFT(i, k);
+}
+
+STATIC_INLINE void
+set_h48_pval(uint8_t *table, int64_t i, uint8_t k, uint8_t val)
+{
+	table[H48_INDEX(i, k)] = (table[H48_INDEX(i, k)] & (~H48_MASK(i, k)))
+	    | (val << H48_SHIFT(i, k));
+}
+
+STATIC_INLINE uint8_t
+get_h48_bound(cube_t cube, uint32_t cdata, uint8_t h, uint8_t k, uint8_t *table)
+{
+	int64_t coord;
+
+	coord = coord_h48_edges(cube, COCLASS(cdata), TTREP(cdata), h);
+	return get_h48_pval(table, coord, k);
+}
diff --git a/tools/tool.h b/tools/tool.h
@@ -4,7 +4,6 @@
 #include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
 
 #include "../src/nissy.h"

	h48 A prototype for an optimal Rubik's cube solver, work in progress.
	git clone https://git.tronto.net/h48
	Download \| Log \| Files \| Refs \| README \| LICENSE

M	src/solvers/h48/gendata_h48.h	\|	370	+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
M	tools/tool.h	\|	1	-