Improve gendata performance by replacing locks with atomic operations - h48 - A prototype for an optimal Rubik's cube solver, work in progress.

commit 9de887d8f178cec5693cafb03b8715804669f65e
parent 4ee7739d78889cef33d0c60a870922398e0b5e31
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Thu,  3 Oct 2024 18:28:04 +0200

Improve gendata performance by replacing locks with atomic operations

Diffstat:
M Makefile  | 2 +-
M src/solvers/h48/gendata_cocsep.h  | 4 ++--
M src/solvers/h48/gendata_h48.h  | 91 ++++++++++++++++++++++++++++++++++++-------------------------------------------
M src/solvers/h48/solve.h  | 8 ++++----
M src/solvers/h48/solve_multithread.h  | 4 ++--
M src/solvers/h48/stats.h  | 8 ++++----

6 files changed, 54 insertions(+), 63 deletions(-)
diff --git a/Makefile b/Makefile
@@ -15,7 +15,7 @@ debugnissy.o:
 	${CC} ${MACROS} ${DBGFLAGS} -c -o debugnissy.o src/nissy.c
 
 clean:
-	rm -rf *.o run
+	rm -rf *.o *.so run
 
 test: debugnissy.o
 	CC="${CC} ${MACROS} ${DBGFLAGS}" OBJ=debugnissy.o ./test/test.sh
diff --git a/src/solvers/h48/gendata_cocsep.h b/src/solvers/h48/gendata_cocsep.h
@@ -27,7 +27,7 @@ STATIC size_t gendata_cocsep(void *, uint64_t *, cube_t *);
 STATIC uint32_t gendata_cocsep_dfs(cocsep_dfs_arg_t *);
 STATIC void getdistribution_cocsep(const uint32_t *, uint64_t [static 21]);
 
-STATIC_INLINE int8_t get_h48_cdata(cube_t, uint32_t *, uint32_t *);
+STATIC_INLINE int8_t get_h48_cdata(cube_t, const uint32_t *, uint32_t *);
 
 /*
 Each element of the cocsep table is a uint32_t used as follows:
@@ -182,7 +182,7 @@ set_visited(uint8_t *a, int64_t i)
 }
 
 STATIC_INLINE int8_t
-get_h48_cdata(cube_t cube, uint32_t *cocsepdata, uint32_t *cdata)
+get_h48_cdata(cube_t cube, const uint32_t *cocsepdata, uint32_t *cdata)
 {
 	int64_t coord;
 
diff --git a/src/solvers/h48/gendata_h48.h b/src/solvers/h48/gendata_h48.h
@@ -54,7 +54,7 @@ typedef struct {
 typedef struct {
 	uint8_t depth;
 	uint32_t *cocsepdata;
-	uint8_t *table;
+	_Atomic uint8_t *table;
 	uint64_t *selfsim;
 	cube_t *crep;
 	uint64_t start;
@@ -72,7 +72,7 @@ typedef struct {
 	uint8_t base;
 	uint8_t shortdepth;
 	uint32_t *cocsepdata;
-	uint8_t *table;
+	_Atomic uint8_t *table;
 	uint64_t *selfsim;
 	cube_t *crep;
 	h48map_t *shortcubes;
@@ -89,7 +89,7 @@ typedef struct {
 	uint8_t k;
 	uint32_t *cocsepdata;
 	uint64_t *selfsim;
-	uint8_t *table;
+	_Atomic uint8_t *table;
 	pthread_mutex_t **table_mutex;
 } gendata_h48_mark_t;
 
@@ -107,13 +107,13 @@ STATIC tableinfo_t makeinfo_h48k2(gendata_h48_arg_t *);
 STATIC void getdistribution_h48(const uint8_t *,
     uint64_t [static INFO_DISTRIBUTION_LEN], uint8_t, uint8_t);
 
-STATIC uint32_t *get_cocsepdata_ptr(const void *);
-STATIC uint8_t *get_h48data_ptr(const void *);
+STATIC const uint32_t *get_cocsepdata_constptr(const void *);
+STATIC const uint8_t *get_h48data_constptr(const void *);
 
 STATIC_INLINE uint8_t get_h48_pval(const uint8_t *, int64_t, uint8_t);
-STATIC_INLINE void set_h48_pval(uint8_t *, int64_t, uint8_t, uint8_t);
+STATIC_INLINE void set_h48_pval(_Atomic uint8_t *, int64_t, uint8_t, uint8_t);
 STATIC_INLINE uint8_t get_h48_bound(
-    cube_t, uint32_t, uint8_t, uint8_t, uint8_t *);
+    cube_t, uint32_t, uint8_t, uint8_t, const uint8_t *);
 
 size_t gendata_h48_derive(uint8_t, const void *, void *);
 
@@ -209,7 +209,7 @@ gendata_h48_error:
 STATIC size_t
 gendata_h48h0k4(gendata_h48_arg_t *arg)
 {
-	uint8_t *table;
+	_Atomic uint8_t *table;
 	int64_t sc, done, d, h48max;
 	uint64_t t, tt, isize, cc;
 	h48h0k4_bfs_arg_t bfsarg[THREADS];
@@ -234,7 +234,7 @@ gendata_h48h0k4(gendata_h48_arg_t *arg)
 		.next = 0,
 	};
 
-	table = (uint8_t *)arg->h48buf + INFOSIZE;
+	table = (_Atomic uint8_t *)arg->h48buf + INFOSIZE;
 	memset(table, 0xFF, H48_TABLESIZE(0, 4));
 
 	h48max = (int64_t)H48_COORDMAX(0);
@@ -289,26 +289,13 @@ gendata_h48h0k4_return_size:
 	return H48_TABLESIZE(0, 4) + INFOSIZE;
 }
 
-/*
-TODO: the following function suffers from a big performance loss (about 40%)
-because of all the mutex locking. It is possible to reduce the first lock
-(at the beginning of the outermost for loop) by caching the next few items at
-once and then looping over the cached elements. But for the second one (inner
-loop) the access is non-sequential, so this cannot be done.
-
-However, in both cases in theory we do not care if we are reading the old
-value or the updated one. Depending on the compiler guarantees on concurrent
-memory access, we may be fine just removing the locks altogether.
-
-It is probably possible to solve part of this by using atomics.
-*/
 STATIC void *
 gendata_h48h0k4_runthread(void *arg)
 {
 	static const uint8_t breakpoint = 10; /* Hand-picked optimal */
 
 	uint8_t c, m;
-	uint64_t i, d, mutex;
+	uint64_t i, d;
 	int64_t j;
 	cube_t cube, moved;
 	gendata_h48_mark_t markarg;
@@ -331,10 +318,7 @@ gendata_h48h0k4_runthread(void *arg)
          * Otherwise, scan all neighbors of unvisited coordinates.
 	 */
 	for (i = bfsarg->start, d = 0; i < bfsarg->end; i++) {
-		mutex = H48_INDEX(i, 4) % CHUNKS;
-		pthread_mutex_lock(bfsarg->table_mutex[mutex]);
-		c = get_h48_pval(bfsarg->table, i, 4);
-		pthread_mutex_unlock(bfsarg->table_mutex[mutex]);
+		c = get_h48_pval((uint8_t *)bfsarg->table, i, 4);
 
 		if ((bfsarg->depth < breakpoint && c != bfsarg->depth - 1) ||
 		    (bfsarg->depth >= breakpoint && c != 0xF))
@@ -344,10 +328,7 @@ gendata_h48h0k4_runthread(void *arg)
 		for (m = 0; m < 18; m++) {
 			moved = move(cube, m);
 			j = coord_h48(moved, bfsarg->cocsepdata, 0);
-			mutex = H48_INDEX(j, 4) % CHUNKS;
-			pthread_mutex_lock(bfsarg->table_mutex[mutex]);
-			c = get_h48_pval(bfsarg->table, j, 4);
-			pthread_mutex_unlock(bfsarg->table_mutex[mutex]);
+			c = get_h48_pval((uint8_t *)bfsarg->table, j, 4);
 			if (bfsarg->depth < breakpoint) {
 				if (c <= bfsarg->depth)
 					continue;
@@ -392,7 +373,8 @@ gendata_h48k2(gendata_h48_arg_t *arg)
 		[11] = 10
 	};
 
-	uint8_t t, *table;
+	uint8_t t;
+	_Atomic uint8_t *table;
 	int64_t j;
 	uint64_t i, ii, inext, count;
 	h48map_t shortcubes;
@@ -404,7 +386,7 @@ gendata_h48k2(gendata_h48_arg_t *arg)
 	if (arg->buf == NULL)
 		goto gendata_h48k2_return_size;
 
-	table = (uint8_t *)arg->h48buf + INFOSIZE;
+	table = (_Atomic uint8_t *)arg->h48buf + INFOSIZE;
 	if (arg->buf != NULL)
 		memset(table, 0xFF, H48_TABLESIZE(arg->h, arg->k));
 
@@ -456,7 +438,7 @@ gendata_h48k2(gendata_h48_arg_t *arg)
 
 	/* TODO: inline into mark */
 	for (j = 0; j < H48_COORDMAX(arg->h); j++) {
-		t = get_h48_pval(table, j, 2);
+		t = get_h48_pval((uint8_t *)table, j, 2);
 		arg->info.distribution[t]++;
 	}
 
@@ -595,7 +577,7 @@ gendata_h48_mark(gendata_h48_mark_t *arg)
 		coord = coord_h48(arg->cube, arg->cocsepdata, arg->h);
 		mutex = H48_INDEX(coord, arg->k) % CHUNKS;
 		pthread_mutex_lock(arg->table_mutex[mutex]);
-		oldval = get_h48_pval(arg->table, coord, arg->k);
+		oldval = get_h48_pval((uint8_t *)arg->table, coord, arg->k);
 		newval = (uint8_t)MAX(arg->depth, 0);
 		d += newval < oldval;
 		set_h48_pval(arg->table, coord, arg->k, MIN(oldval, newval));
@@ -618,7 +600,7 @@ gendata_h48k2_dfs_stop(cube_t cube, int8_t depth, h48k2_dfs_arg_t *arg)
 		coord = coord_h48(cube, arg->cocsepdata, arg->h);
 		mutex = H48_INDEX(coord, arg->k) % CHUNKS;
 		pthread_mutex_lock(arg->table_mutex[mutex]);
-		oldval = get_h48_pval(arg->table, coord, arg->k);
+		oldval = get_h48_pval((uint8_t *)arg->table, coord, arg->k);
 		pthread_mutex_unlock(arg->table_mutex[mutex]);
 		return oldval <= depth;
 	} else {
@@ -690,14 +672,14 @@ getdistribution_h48(
 	}
 }
 
-STATIC uint32_t *
-get_cocsepdata_ptr(const void *data)
+STATIC const uint32_t *
+get_cocsepdata_constptr(const void *data)
 {
 	return (uint32_t *)((char *)data + INFOSIZE);
 }
 
-STATIC uint8_t *
-get_h48data_ptr(const void *data)
+STATIC const uint8_t *
+get_h48data_constptr(const void *data)
 {
 	return (uint8_t *)data + COCSEP_FULLSIZE + INFOSIZE;
 }
@@ -709,15 +691,20 @@ get_h48_pval(const uint8_t *table, int64_t i, uint8_t k)
 }
 
 STATIC_INLINE void
-set_h48_pval(uint8_t *table, int64_t i, uint8_t k, uint8_t val)
+set_h48_pval(_Atomic uint8_t *table, int64_t i, uint8_t k, uint8_t val)
 {
 	table[H48_INDEX(i, k)] = (table[H48_INDEX(i, k)] & (~H48_MASK(i, k)))
 	    | (val << H48_SHIFT(i, k));
 }
 
 STATIC_INLINE uint8_t
-get_h48_bound(cube_t cube, uint32_t cdata, uint8_t h, uint8_t k, uint8_t *table)
-{
+get_h48_bound(
+	cube_t cube,
+	uint32_t cdata,
+	uint8_t h,
+	uint8_t k,
+	const uint8_t *table
+) {
 	int64_t coord;
 
 	coord = coord_h48_edges(cube, COCLASS(cdata), TTREP(cdata), h);
@@ -728,7 +715,9 @@ size_t
 gendata_h48_derive(uint8_t h, const void *fulltable, void *buf)
 {
 	size_t cocsepsize, h48size;
-	uint8_t val_full, val_derive, *h48full, *h48derive;
+	uint8_t val_full, val_derive;
+	const uint8_t *h48full;
+	_Atomic uint8_t *h48derive;
 	int64_t i, j, h48max;
 	gendata_h48_arg_t arg;
 	tableinfo_t cocsepinfo, fulltableinfo;
@@ -750,7 +739,7 @@ gendata_h48_derive(uint8_t h, const void *fulltable, void *buf)
 	/* Technically this step is redundant, except that we
 	   need selfsim and crep */
 	cocsepsize = gendata_cocsep(buf, arg.selfsim, arg.crep);
-	arg.h48buf = (uint8_t *)buf + cocsepsize;
+	arg.h48buf = (_Atomic uint8_t *)buf + cocsepsize;
 	h48size = H48_TABLESIZE(h, arg.k) + INFOSIZE;
 
 	if (buf == NULL)
@@ -768,8 +757,8 @@ gendata_h48_derive(uint8_t h, const void *fulltable, void *buf)
 		goto gendata_h48_derive_error;
 	}
 
-	h48full = (uint8_t *)fulltable + cocsepsize + INFOSIZE;
-	h48derive = (uint8_t *)arg.h48buf + INFOSIZE;
+	h48full = (const uint8_t *)fulltable + cocsepsize + INFOSIZE;
+	h48derive = (_Atomic uint8_t *)arg.h48buf + INFOSIZE;
 	memset(h48derive, 0xFF, H48_TABLESIZE(h, arg.k));
 	memset(arg.info.distribution, 0,
 	    INFO_DISTRIBUTION_LEN * sizeof(uint64_t));
@@ -780,11 +769,13 @@ gendata_h48_derive(uint8_t h, const void *fulltable, void *buf)
 			LOG("Processing %" PRId64 "th coordinate\n", i);
 		j = i >> (int64_t)(fulltableinfo.h48h - h);
 		val_full = get_h48_pval(h48full, i, arg.k);
-		val_derive = get_h48_pval(h48derive, j, arg.k);
-		set_h48_pval(h48derive, j, arg.k, MIN(val_full, val_derive));
+		val_derive = get_h48_pval((uint8_t *)h48derive, j, arg.k);
+		set_h48_pval(
+		    h48derive, j, arg.k, MIN(val_full, val_derive));
 	}
 
-	getdistribution_h48(h48derive, arg.info.distribution, h, arg.k);
+	getdistribution_h48(
+	    (uint8_t *)h48derive, arg.info.distribution, h, arg.k);
 
 	if (!writetableinfo(&arg.info, arg.h48buf)) {
 		LOG("gendata_h48_derive: could not write info for table\n");
diff --git a/src/solvers/h48/solve.h b/src/solvers/h48/solve.h
@@ -8,8 +8,8 @@ typedef struct {
 	int64_t maxsolutions;
 	uint8_t h;
 	uint8_t k;
-	uint32_t *cocsepdata;
-	uint8_t *h48data;
+	const uint32_t *cocsepdata;
+	const uint8_t *h48data;
 	char **nextsol;
 	uint8_t nissbranch;
 	int8_t npremoves;
@@ -183,8 +183,8 @@ solve_h48(
 		.maxsolutions = maxsolutions,
 		.h = info.h48h,
 		.k = info.bits,
-		.cocsepdata = get_cocsepdata_ptr(data),
-		.h48data = get_h48data_ptr(data),
+		.cocsepdata = get_cocsepdata_constptr(data),
+		.h48data = get_h48data_constptr(data),
 		.nextsol = &solutions
 	};
 
diff --git a/src/solvers/h48/solve_multithread.h b/src/solvers/h48/solve_multithread.h
@@ -258,8 +258,8 @@ solve_h48_multithread(
 		.maxsolutions = maxsolutions,
 		.h = info.h48h,
 		.k = info.bits,
-		.cocsepdata = get_cocsepdata_ptr(data),
-		.h48data = get_h48data_ptr(data),
+		.cocsepdata = get_cocsepdata_constptr(data),
+		.h48data = get_h48data_constptr(data),
 		.nextsol = &solutions};
 
 	task_queue_t q;
diff --git a/src/solvers/h48/stats.h b/src/solvers/h48/stats.h
@@ -10,8 +10,8 @@ typedef struct {
 	int8_t nmoves;
 	int8_t depth;
 	uint8_t moves[MAXLEN];
-	uint32_t *cocsepdata;
-	uint8_t *h48data;
+	const uint32_t *cocsepdata;
+	const uint8_t *h48data;
 	char *s;
 } dfsarg_solveh48stats_t;
 
@@ -80,8 +80,8 @@ solve_h48stats(
 
 	arg = (dfsarg_solveh48stats_t) {
 		.cube = cube,
-		.cocsepdata = get_cocsepdata_ptr(data),
-		.h48data = get_h48data_ptr(data),
+		.cocsepdata = get_cocsepdata_constptr(data),
+		.h48data = get_h48data_constptr(data),
 		.s = solutions
 	};

	h48 A prototype for an optimal Rubik's cube solver, work in progress.
	git clone https://git.tronto.net/h48
	Download \| Log \| Files \| Refs \| README \| LICENSE

M	Makefile	\|	2	+-
M	src/solvers/h48/gendata_cocsep.h	\|	4	++--
M	src/solvers/h48/gendata_h48.h	\|	91	++++++++++++++++++++++++++++++++++++-------------------------------------------
M	src/solvers/h48/solve.h	\|	8	++++----
M	src/solvers/h48/solve_multithread.h	\|	4	++--
M	src/solvers/h48/stats.h	\|	8	++++----