commit ec9593e2ff0856d78b37df3a977a753be82bfddc
parent ac3a91f4f173e7a38c70d5cd0caf5a025642312b
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date: Wed, 17 Dec 2025 17:36:16 +0100
Fix alignment
Diffstat:
6 files changed, 53 insertions(+), 7 deletions(-)
diff --git a/benchmarks/benchmarks.md b/benchmarks/benchmarks.md
@@ -247,3 +247,8 @@ Time per cube adjusted for table size (in seconds \* GiB, lower is better).
* For H48, both GCC and Clang have been tried, with the same options;
the resulting executable was about 10% faster with GCC compared to Clang.
vcube only supports compiling with Clang.
+* The performance of the H48 solver depends slightly, but measurably, on the
+ alignment of the pruning table in memory. This is not handled by the core
+ library, but by the program that uses it. For these tests, we have used as
+ reference implementation the program in `tools/301_solve_file`, which
+ ensures 64-byte alignment.
diff --git a/doc/h48.md b/doc/h48.md
@@ -225,6 +225,8 @@ of 1, 2 or 3 can be used directly as a lower bound of b+1, b+2 and b+3
respectively. However, a value of 0 could mean that the actual lower
bound is anything between 0 and b, so we cannot take b as a lower bound.
+#### Fallback tables
+
To be able to still use some sort of pruning value even when we get a
0 read, we use a **fallback table**. Inspired by nxopt, this table is
interleaved with the main table for cache efficiency: every 254 entries
@@ -235,6 +237,16 @@ when looking up the minimum value in the fallback table after a 0 read.
Smaller lines (of 256 or 128 bits) have been tried, but they do not
give any significant improvement over 512 bit lines.
+This trick provides the gratest performance benefits if the main pruning
+table is properly aligned. Unfortunately, as a design choice, the
+solver is implemented here as a library that defer all memory allocation
+business to the implementor. We do make sure that the table is properly
+aligned in the programs provided in this repository (for example, the
+rudimentary shell and the tools), but we do not enforce this in the
+main library code.
+
+#### Additional (fast) lookups
+
Moreover, as an additional heuristic, in case of a 0 read we also look
up another pruning value in a table that takes into account only the
position of the edges. This table is small (around 1MB), so repeated
diff --git a/doc/solvers.md b/doc/solvers.md
@@ -16,6 +16,15 @@ about how this solver works, see [h48.md](./h48.md). For benchmarks see
* Moveset: HTM (all 18 basic moves).
* From 115MB to 59GB (roughly 2<sup>X</sup>*56MB).
+*Note: for better performance, the solver's data should be 64-byte
+aligned. To achieve this, one can use
+[`aligned_alloc(64, size)`](https://en.cppreference.com/w/c/memory/aligned_alloc)
+in C11 or later,
+[`_aligned_malloc(size, 64)`](https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc)
+on Windows platforms, or the
+[aligned `new` operator](https://cppreference.com/w/cpp/memory/new/operator_new.html)
+in C++17 or later.*
+
## Coordinate solvers
Various solvers to solve different substeps, commonly used for Fewest
diff --git a/shell/shell.c b/shell/shell.c
@@ -14,6 +14,13 @@
#define SOLUTIONS_BUFFER_SIZE UINT64_C(500000)
#define MAX_PATH_LENGTH UINT64_C(10000)
+#if defined(_WIN32)
+#define wrap_aligned_alloc(align, size) malloc(size)
+#else
+#define wrap_aligned_alloc(align, size) \
+ (((size) % (align) == 0) ? aligned_alloc((align), (size)) : malloc(size))
+#endif
+
#define FLAG_CUBE "-cube"
#define FLAG_COMMAND "-command"
#define FLAG_STR_CUBE "-cubestr"
@@ -348,7 +355,7 @@ gendata_exec(args_t *args)
return -2;
}
- buf = malloc(size);
+ buf = wrap_aligned_alloc((size_t)64, size);
ret = nissy_gendata(args->str_solver, size, buf);
if (ret < 0) {
@@ -448,7 +455,7 @@ solve_exec(args_t *args)
if (args->maxsolutions == 0)
args->maxsolutions = args->optimal == 20 ? 1 : UINT_MAX;
- buf = malloc(size);
+ buf = wrap_aligned_alloc((size_t)64, size);
read = fread(buf, size, 1, file);
fclose(file);
if (read != 1) {
diff --git a/src/nissy.h b/src/nissy.h
@@ -304,7 +304,8 @@ Parameters:
data_size - The size of the data buffer. It is advised to use
nissy_solverinfo to check how much memory is needed.
data - The return parameter for the generated data.
- This buffer must have 8-byte alignment.
+ This buffer must have 8-byte alignment. Some solvers (such as
+ h48) will perform better if the buffer is 64-byte aligned.
Return values:
NISSY_ERROR_INVALID_SOLVER - The given solver is not known.
@@ -328,7 +329,10 @@ Parameters:
solver - The name of the solver.
data_size - The size of the data buffer.
data - The data for the solver. Can be computed with gendata.
- This buffer must have 8-byte alignment.
+ This buffer must have 8-byte alignment. Some solvers (such as
+ h48) will perform better if the buffer is 64-byte aligned, but
+ this is not relevant for the purpose of checking the integrity
+ of the data.
Return values:
NISSY_OK - The data is valid.
@@ -358,7 +362,9 @@ Parameters:
to 0, the default value THREADS will be used.
data_size - The size of the data buffer.
data - The data for the solver. Can be computed with gendata.
- This buffer must have 8-byte alignment.
+ This buffer must have 8-byte alignment. Some solvers
+ (such as h48) will perform better if the buffer is
+ 64-byte aligned.
sols_size - The size of the solutions buffer.
sols - The return parameter for the solutions. The solutions are
separated by a '\n' (newline) and a '\0' (NULL character)
diff --git a/tools/tool.h b/tools/tool.h
@@ -8,6 +8,13 @@
#include "../src/nissy.h"
+#if defined(_WIN32)
+#define wrap_aligned_alloc(align, size) malloc(size)
+#else
+#define wrap_aligned_alloc(align, size) \
+ (((size) % (align) == 0) ? aligned_alloc((align), (size)) : malloc(size))
+#endif
+
static void log_stderr(const char *, void *);
static double timerun(void (*)(void));
static void writetable(const unsigned char *, int64_t, const char *);
@@ -117,7 +124,7 @@ generatetable(
return -1;
}
- *buf = malloc(size);
+ *buf = wrap_aligned_alloc((size_t)64, size);
gensize = nissy_gendata(solver, size, *buf);
if (gensize != size) {
@@ -156,7 +163,7 @@ getdata(
} else {
printf("Reading tables from file %s\n", filename);
size = nissy_solverinfo(solver, dataid);
- *buf = malloc(size);
+ *buf = wrap_aligned_alloc((size_t)64, size);
sizeread = fread(*buf, size, 1, f);
fclose(f);
if (sizeread != 1) {