nissy-core

The "engine" of nissy, including the H48 optimal solver.
git clone https://git.tronto.net/nissy-core
Download | Log | Files | Refs | README | LICENSE

commit 2c291aed4bb80f07e8557284d3399cf1e2a4ccb9
parent dff3a040637f2985b5b5e369148a993183964096
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Sun,  1 Mar 2026 18:09:44 +0100

Improve Windows build support.

- Use multithreading (works with a sufficiently recent version of
  the Microsoft developer tools / C SDK).
- Detect CPU architecture and use AVX2 or NEON when appropriate.
- Automatically detect python installation path.

Diffstat:
M README.md | 6 +++---
M build.bat | 26 +++++++++++++++++++-------
M src/solvers/distribution.h | 10 +++++-----
M src/solvers/h48/distribution_h48.h | 10 +++++-----
M src/solvers/h48/gendata_h48.h | 14 +++++++-------
M src/solvers/h48/solve.h | 12 ++++++------
M src/utils/prefetch.h | 16 ++++------------
M src/utils/wrapthread.h | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
8 files changed, 112 insertions(+), 68 deletions(-)

diff --git a/README.md b/README.md @@ -64,9 +64,9 @@ can be used to build the basic shell, while Builds and runs the unit tests. See `build.bat help` for a list of all available options. -Note: At the moment certain optimizations, such as multithreading and -advanced CPU instructions, are not supported on Windows. -Work is ongoing to improve Windows support. +Note: The build script for Windows does not support all the options +available with build.sh. If you want to tune the build options, you'll +have to manually edit the build script. ## Running tests diff --git a/build.bat b/build.bat @@ -9,15 +9,25 @@ if [%1]==[/d] ( SET CC=clang SET CXX=clang++ -SET ARCH=PORTABLE -SET THREADS=1 +SET THREADS=16 SET SANITIZE= -:: TODO depends on ARCH variable -SET ARCHOPTS= -:: TODO depends on SANITIZE variable -SET DFLAGS=-g3 -DDEBUG +SET ARCH=PORTABLE +clang -march=native -dM -E - < NUL | findstr /C:"__AVX2__" >NUL 2>&1 +if %ERRORLEVEL% EQU 0 ( + SET ARCH=AVX2 + SET ARCHOPTS=-mavx2 + goto :ArchDone +) +clang -march=native -dM -E - < NUL | findstr /C:"__ARM_NEON" >NUL 2>&1 +if %ERRORLEVEL% EQU 0 ( + SET ARCH=NEON + goto :ArchDone +) +:ArchDone + +SET DFLAGS=-g3 -DDEBUG SET WARNINGS=-Wno-deprecated-declarations SET VARIABLES=-DTHREADS=%THREADS% -D%ARCH% SET OFLAGS=-O3 @@ -28,7 +38,9 @@ SET STACKSIZE=-Wl,-stack:16777216 SET LFLAGS=%STACKSIZE% :: Python libraries - change to match your local installation -SET PYPATH=%userprofile%\AppData\Local\Programs\Python\Python313 +for /f "delims=" %%i in ('python -c "import sys; print(sys.base_prefix)"') do set PYPATH=%%i +::SET PYPATH=%userprofile%\AppData\Local\Programs\Python\Python313 + SET PYINCLUDE=%PYPATH%\include SET PYLIBS=%PYPATH%\libs diff --git a/src/solvers/distribution.h b/src/solvers/distribution.h @@ -10,13 +10,13 @@ typedef struct { const unsigned char *table; } getdistribution_data_t; -STATIC void *getdistribution_runthread(void *); +STATIC wrapthread_return_t getdistribution_runthread(void *); STATIC void 
getdistribution(const unsigned char *, uint64_t [static INFO_DISTRIBUTION_LEN], const tableinfo_t [static 1]); STATIC bool distribution_equal(const uint64_t [static INFO_DISTRIBUTION_LEN], const uint64_t [static INFO_DISTRIBUTION_LEN], uint8_t); -STATIC void * +STATIC wrapthread_return_t getdistribution_runthread(void *arg) { getdistribution_data_t *data = (getdistribution_data_t *)arg; @@ -33,7 +33,7 @@ getdistribution_runthread(void *arg) for (j = 0; j < ENTRIES_PER_BYTE(k); j++) data->distr[(table[i] & (m << (j*k))) >> (j*k)]++; - return NULL; + return wrapthread_return_val; } STATIC void @@ -60,12 +60,12 @@ getdistribution( .distr = local_distr[i], .table = table, }; - wrapthread_create(&thread[i], NULL, + wrapthread_create(&thread[i], getdistribution_runthread, &targ[i]); } for (i = 0; i < THREADS; i++) - wrapthread_join(thread[i], NULL); + wrapthread_join(thread[i]); memset(distr, 0, INFO_DISTRIBUTION_LEN * sizeof(uint64_t)); for (i = 0; i < THREADS; i++) diff --git a/src/solvers/h48/distribution_h48.h b/src/solvers/h48/distribution_h48.h @@ -4,11 +4,11 @@ needed for H48 because of the intertwined fallback table, and it is easier to have some duplication than to make these functions needlessly generic. 
*/ -STATIC void *getdistribution_h48_runthread(void *); +STATIC wrapthread_return_t getdistribution_h48_runthread(void *); STATIC void getdistribution_h48(const unsigned char *, uint64_t [static INFO_DISTRIBUTION_LEN], const tableinfo_t [static 1]); -STATIC void * +STATIC wrapthread_return_t getdistribution_h48_runthread(void *arg) { getdistribution_data_t *data = (getdistribution_data_t *)arg; @@ -33,7 +33,7 @@ getdistribution_h48_runthread(void *arg) data->distr[(t & (m << (3*k))) >> (3*k)]--; } - return NULL; + return wrapthread_return_val; } STATIC void @@ -58,12 +58,12 @@ getdistribution_h48( .distr = local_distr[i], .table = table, }; - wrapthread_create(&thread[i], NULL, + wrapthread_create(&thread[i], getdistribution_h48_runthread, &targ[i]); } for (i = 0; i < THREADS; i++) - wrapthread_join(thread[i], NULL); + wrapthread_join(thread[i]); memset(distr, 0, INFO_DISTRIBUTION_LEN * sizeof(uint64_t)); for (i = 0; i < THREADS; i++) diff --git a/src/solvers/h48/gendata_h48.h b/src/solvers/h48/gendata_h48.h @@ -3,7 +3,7 @@ STATIC long long gendata_h48_dispatch( STATIC uint64_t gendata_h48short(gendata_h48short_arg_t [static 1]); STATIC int64_t gendata_h48(gendata_h48_arg_t [static 1]); STATIC void gendata_h48_maintable(gendata_h48_arg_t [static 1]); -STATIC void *gendata_h48_runthread(void *); +STATIC wrapthread_return_t gendata_h48_runthread(void *); STATIC_INLINE void gendata_h48_mark(gendata_h48_mark_t [static 1]); STATIC_INLINE bool gendata_h48_dfs_stop( @@ -230,9 +230,9 @@ gendata_h48_maintable(gendata_h48_arg_t arg[static 1]) inext = 0; count = 0; - wrapthread_mutex_init(&shortcubes_mutex, NULL); + wrapthread_mutex_init(&shortcubes_mutex); for (i = 0; i < CHUNKS; i++) - wrapthread_mutex_init(&table_mutex[i], NULL); + wrapthread_mutex_init(&table_mutex[i]); for (i = 0; i < THREADS; i++) { dfsarg[i] = (h48_dfs_arg_t){ .h = arg->h, @@ -251,7 +251,7 @@ gendata_h48_maintable(gendata_h48_arg_t arg[static 1]) dfsarg[i].table_mutex[ii] = &table_mutex[ii]; 
wrapthread_create( - &thread[i], NULL, gendata_h48_runthread, &dfsarg[i]); + &thread[i], gendata_h48_runthread, &dfsarg[i]); } if (NISSY_CANSLEEP) { @@ -281,7 +281,7 @@ gendata_h48_maintable(gendata_h48_arg_t arg[static 1]) } for (i = 0; i < THREADS; i++) - wrapthread_join(thread[i], NULL); + wrapthread_join(thread[i]); h48map_destroy(&shortcubes); @@ -291,7 +291,7 @@ gendata_h48_maintable(gendata_h48_arg_t arg[static 1]) writetableinfo(&arg->info, bufsize, (unsigned char *)arg->h48buf); } -STATIC void * +STATIC wrapthread_return_t gendata_h48_runthread(void *arg) { uint64_t coord, coordext, coordmin; @@ -328,7 +328,7 @@ gendata_h48_runthread(void *arg) } } - return NULL; + return wrapthread_return_val; } STATIC void diff --git a/src/solvers/h48/solve.h b/src/solvers/h48/solve.h @@ -78,7 +78,7 @@ STATIC_INLINE void h48_prune_restore_inverse(const h48_prune_t [static 1], STATIC int64_t solve_h48_maketasks( dfsarg_solve_h48_t [static 1], dfsarg_solve_h48_maketasks_t [static 1], solve_h48_task_t [static H48_STARTING_CUBES], int [static 1]); -STATIC void *solve_h48_runthread(void *); +STATIC wrapthread_return_t solve_h48_runthread(void *); STATIC int64_t solve_h48_dfs(dfsarg_solve_h48_t [static 1]); STATIC void solve_h48_log_solutions(solution_list_t [static 1], size_t); STATIC int solve_h48_compare_tasks(const void *, const void *); @@ -383,7 +383,7 @@ solve_h48_dfs(dfsarg_solve_h48_t arg[static 1]) return ret; } -STATIC void * +STATIC wrapthread_return_t solve_h48_runthread(void *arg) { int i, j; @@ -445,7 +445,7 @@ solve_h48_runthread(void *arg) solve_h48_runthread_end: dfsarg->thread_done = true; - return NULL; + return wrapthread_return_val; } STATIC int64_t @@ -638,7 +638,7 @@ solve_h48( } - wrapthread_mutex_init(&solutions_mutex, NULL); + wrapthread_mutex_init(&solutions_mutex); mtarg = (dfsarg_solve_h48_maketasks_t) { .cube = oc.cube, @@ -689,7 +689,7 @@ solve_h48( arg[i].target_depth = d; arg[i].thread_done = false; wrapthread_create( - &thread[i], NULL, 
solve_h48_runthread, &arg[i]); + &thread[i], solve_h48_runthread, &arg[i]); } /* Log solutions and handle pause / stop / resume */ @@ -718,7 +718,7 @@ solve_h48( } for (i = 0; i < threads; i++) - wrapthread_join(thread[i], NULL); + wrapthread_join(thread[i]); solve_h48_log_solutions(&sollist, lastused); lastused = sollist.used; diff --git a/src/utils/prefetch.h b/src/utils/prefetch.h @@ -1,15 +1,7 @@ -#if defined(AVX2) - -#define prefetch(a, i) _mm_prefetch(a+i, _MM_HINT_T0) - -#else #if defined(__GNUC__) || defined(__clang__) - -#define prefetch(a, i) __builtin_prefetch(a+i, 0, 0) - + #define prefetch(a, i) __builtin_prefetch(a+i, 0, 0) +#elif defined(AVX2) + #define prefetch(a, i) _mm_prefetch((char *)(a+i), _MM_HINT_T0) #else - -#define prefetch(a, i) (void)i - -#endif + #define prefetch(a, i) (void)i #endif diff --git a/src/utils/wrapthread.h b/src/utils/wrapthread.h @@ -1,37 +1,77 @@ #if THREADS == 1 + #define WRAPTHREAD_NOTHREADS 1 +#elif defined(__unix__) + #define WRAPTHREAD_PTHREADS 1 +#elif defined(__has_include) + #if __has_include(<pthreads.h>) + #define WRAPTHREAD_PTHREADS 1 + #elif __has_include(<threads.h>) + #define WRAPTHREAD_C11THREADS 1 + #endif +#else + #define WRAPTHREAD_NOTHREADS 1 +#endif + +#if WRAPTHREAD_PTHREADS + #include <pthread.h> + + #define wrapthread_atomic _Atomic + + #define wrapthread_return_t void * + #define wrapthread_return_val NULL + + #define wrapthread_define_var_thread_t(x) pthread_t x + #define wrapthread_define_var_mutex_t(x) pthread_mutex_t x + #define wrapthread_define_struct_thread_t(x) pthread_t x + #define wrapthread_define_struct_mutex_t(x) pthread_mutex_t x + + #define wrapthread_define_if_threads(T, x) T x + + #define wrapthread_create(a, f, arg) pthread_create(a, NULL, f, arg) + #define wrapthread_join(a) pthread_join(a, NULL) + #define wrapthread_mutex_init(a) pthread_mutex_init(a, NULL) + #define wrapthread_mutex_lock(a) pthread_mutex_lock(a) + #define wrapthread_mutex_unlock(a) pthread_mutex_unlock(a) + 
+#elif WRAPTHREAD_C11THREADS + #include <threads.h> + + #define wrapthread_atomic _Atomic -#define wrapthread_atomic + #define wrapthread_return_t int + #define wrapthread_return_val 0 -#define wrapthread_define_var_thread_t(x) unused char x -#define wrapthread_define_var_mutex_t(x) unused char x -#define wrapthread_define_struct_thread_t(x) char x -#define wrapthread_define_struct_mutex_t(x) char x + #define wrapthread_define_var_thread_t(x) thrd_t x + #define wrapthread_define_var_mutex_t(x) mtx_t x + #define wrapthread_define_struct_thread_t(x) thrd_t x + #define wrapthread_define_struct_mutex_t(x) mtx_t x -#define wrapthread_define_if_threads(T, x) T x; (void)(x) + #define wrapthread_define_if_threads(T, x) T x -#define wrapthread_create(a, b, c, d) c(d) -#define wrapthread_join(a, b) -#define wrapthread_mutex_init(a, b) -#define wrapthread_mutex_lock(a) -#define wrapthread_mutex_unlock(a) + #define wrapthread_create(a, f, arg) thrd_create(a, f, arg) + #define wrapthread_join(a) thrd_join(a, NULL) + #define wrapthread_mutex_init(a) mtx_init(a, mtx_plain) + #define wrapthread_mutex_lock(a) mtx_lock(a) + #define wrapthread_mutex_unlock(a) mtx_unlock(a) #else -#include <pthread.h> + #define wrapthread_atomic -#define wrapthread_atomic _Atomic + #define wrapthread_return_t int + #define wrapthread_return_val 0 -#define wrapthread_define_var_thread_t(x) pthread_t x -#define wrapthread_define_var_mutex_t(x) pthread_mutex_t x -#define wrapthread_define_struct_thread_t(x) pthread_t x -#define wrapthread_define_struct_mutex_t(x) pthread_mutex_t x + #define wrapthread_define_var_thread_t(x) unused char x + #define wrapthread_define_var_mutex_t(x) unused char x + #define wrapthread_define_struct_thread_t(x) char x + #define wrapthread_define_struct_mutex_t(x) char x -#define wrapthread_define_if_threads(T, x) T x + #define wrapthread_define_if_threads(T, x) T x; (void)(x) -#define wrapthread_create(a, b, c, d) pthread_create(a, b, c, d) -#define wrapthread_join(a, b) 
pthread_join(a, b) -#define wrapthread_mutex_init(a, b) pthread_mutex_init(a, b) -#define wrapthread_mutex_lock(a) pthread_mutex_lock(a) -#define wrapthread_mutex_unlock(a) pthread_mutex_unlock(a) + #define wrapthread_create(t, f, arg) f(arg) + #define wrapthread_join(a) + #define wrapthread_mutex_init(a) + #define wrapthread_mutex_lock(a) + #define wrapthread_mutex_unlock(a) #endif