commit 2c291aed4bb80f07e8557284d3399cf1e2a4ccb9
parent dff3a040637f2985b5b5e369148a993183964096
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date: Sun, 1 Mar 2026 18:09:44 +0100
Improve Windows build support.
- Use multithreading (works with a sufficiently recent version of
the Microsoft developer tools / C SDK).
- Detect CPU architecture and use AVX2 or NEON when appropriate.
- Automatically detect python installation path.
Diffstat:
8 files changed, 112 insertions(+), 68 deletions(-)
diff --git a/README.md b/README.md
@@ -64,9 +64,9 @@ can be used to build the basic shell, while
Builds and runs the unit tests. See `build.bat help` for a list of
all available options.
-Note: At the moment certain optimizations, such as multithreading and
-advanced CPU instructions, are not supported on Windows.
-Work is ongoing to improve Windows support.
+Note: The build script for Windows does not support all the options
+available with build.sh. If you want to tune the build options, you'll
+have to manually edit the build script.
## Running tests
diff --git a/build.bat b/build.bat
@@ -9,15 +9,25 @@ if [%1]==[/d] (
SET CC=clang
SET CXX=clang++
-SET ARCH=PORTABLE
-SET THREADS=1
+SET THREADS=16
SET SANITIZE=
-:: TODO depends on ARCH variable
-SET ARCHOPTS=
-:: TODO depends on SANITIZE variable
-SET DFLAGS=-g3 -DDEBUG
+SET ARCH=PORTABLE
+clang -march=native -dM -E - < NUL | findstr /C:"__AVX2__" >NUL 2>&1
+if %ERRORLEVEL% EQU 0 (
+ SET ARCH=AVX2
+ SET ARCHOPTS=-mavx2
+ goto :ArchDone
+)
+clang -march=native -dM -E - < NUL | findstr /C:"__ARM_NEON" >NUL 2>&1
+if %ERRORLEVEL% EQU 0 (
+ SET ARCH=NEON
+ goto :ArchDone
+)
+:ArchDone
+
+SET DFLAGS=-g3 -DDEBUG
SET WARNINGS=-Wno-deprecated-declarations
SET VARIABLES=-DTHREADS=%THREADS% -D%ARCH%
SET OFLAGS=-O3
@@ -28,7 +38,9 @@ SET STACKSIZE=-Wl,-stack:16777216
SET LFLAGS=%STACKSIZE%
:: Python libraries - change to match your local installation
-SET PYPATH=%userprofile%\AppData\Local\Programs\Python\Python313
+for /f "delims=" %%i in ('python -c "import sys; print(sys.base_prefix)"') do set PYPATH=%%i
+::SET PYPATH=%userprofile%\AppData\Local\Programs\Python\Python313
+
SET PYINCLUDE=%PYPATH%\include
SET PYLIBS=%PYPATH%\libs
diff --git a/src/solvers/distribution.h b/src/solvers/distribution.h
@@ -10,13 +10,13 @@ typedef struct {
const unsigned char *table;
} getdistribution_data_t;
-STATIC void *getdistribution_runthread(void *);
+STATIC wrapthread_return_t getdistribution_runthread(void *);
STATIC void getdistribution(const unsigned char *,
uint64_t [static INFO_DISTRIBUTION_LEN], const tableinfo_t [static 1]);
STATIC bool distribution_equal(const uint64_t [static INFO_DISTRIBUTION_LEN],
const uint64_t [static INFO_DISTRIBUTION_LEN], uint8_t);
-STATIC void *
+STATIC wrapthread_return_t
getdistribution_runthread(void *arg)
{
getdistribution_data_t *data = (getdistribution_data_t *)arg;
@@ -33,7 +33,7 @@ getdistribution_runthread(void *arg)
for (j = 0; j < ENTRIES_PER_BYTE(k); j++)
data->distr[(table[i] & (m << (j*k))) >> (j*k)]++;
- return NULL;
+ return wrapthread_return_val;
}
STATIC void
@@ -60,12 +60,12 @@ getdistribution(
.distr = local_distr[i],
.table = table,
};
- wrapthread_create(&thread[i], NULL,
+ wrapthread_create(&thread[i],
getdistribution_runthread, &targ[i]);
}
for (i = 0; i < THREADS; i++)
- wrapthread_join(thread[i], NULL);
+ wrapthread_join(thread[i]);
memset(distr, 0, INFO_DISTRIBUTION_LEN * sizeof(uint64_t));
for (i = 0; i < THREADS; i++)
diff --git a/src/solvers/h48/distribution_h48.h b/src/solvers/h48/distribution_h48.h
@@ -4,11 +4,11 @@ needed for H48 because of the intertwined fallback table, and it is easier
to have some duplication than to make these functions needlessly generic.
*/
-STATIC void *getdistribution_h48_runthread(void *);
+STATIC wrapthread_return_t getdistribution_h48_runthread(void *);
STATIC void getdistribution_h48(const unsigned char *,
uint64_t [static INFO_DISTRIBUTION_LEN], const tableinfo_t [static 1]);
-STATIC void *
+STATIC wrapthread_return_t
getdistribution_h48_runthread(void *arg)
{
getdistribution_data_t *data = (getdistribution_data_t *)arg;
@@ -33,7 +33,7 @@ getdistribution_h48_runthread(void *arg)
data->distr[(t & (m << (3*k))) >> (3*k)]--;
}
- return NULL;
+ return wrapthread_return_val;
}
STATIC void
@@ -58,12 +58,12 @@ getdistribution_h48(
.distr = local_distr[i],
.table = table,
};
- wrapthread_create(&thread[i], NULL,
+ wrapthread_create(&thread[i],
getdistribution_h48_runthread, &targ[i]);
}
for (i = 0; i < THREADS; i++)
- wrapthread_join(thread[i], NULL);
+ wrapthread_join(thread[i]);
memset(distr, 0, INFO_DISTRIBUTION_LEN * sizeof(uint64_t));
for (i = 0; i < THREADS; i++)
diff --git a/src/solvers/h48/gendata_h48.h b/src/solvers/h48/gendata_h48.h
@@ -3,7 +3,7 @@ STATIC long long gendata_h48_dispatch(
STATIC uint64_t gendata_h48short(gendata_h48short_arg_t [static 1]);
STATIC int64_t gendata_h48(gendata_h48_arg_t [static 1]);
STATIC void gendata_h48_maintable(gendata_h48_arg_t [static 1]);
-STATIC void *gendata_h48_runthread(void *);
+STATIC wrapthread_return_t gendata_h48_runthread(void *);
STATIC_INLINE void gendata_h48_mark(gendata_h48_mark_t [static 1]);
STATIC_INLINE bool gendata_h48_dfs_stop(
@@ -230,9 +230,9 @@ gendata_h48_maintable(gendata_h48_arg_t arg[static 1])
inext = 0;
count = 0;
- wrapthread_mutex_init(&shortcubes_mutex, NULL);
+ wrapthread_mutex_init(&shortcubes_mutex);
for (i = 0; i < CHUNKS; i++)
- wrapthread_mutex_init(&table_mutex[i], NULL);
+ wrapthread_mutex_init(&table_mutex[i]);
for (i = 0; i < THREADS; i++) {
dfsarg[i] = (h48_dfs_arg_t){
.h = arg->h,
@@ -251,7 +251,7 @@ gendata_h48_maintable(gendata_h48_arg_t arg[static 1])
dfsarg[i].table_mutex[ii] = &table_mutex[ii];
wrapthread_create(
- &thread[i], NULL, gendata_h48_runthread, &dfsarg[i]);
+ &thread[i], gendata_h48_runthread, &dfsarg[i]);
}
if (NISSY_CANSLEEP) {
@@ -281,7 +281,7 @@ gendata_h48_maintable(gendata_h48_arg_t arg[static 1])
}
for (i = 0; i < THREADS; i++)
- wrapthread_join(thread[i], NULL);
+ wrapthread_join(thread[i]);
h48map_destroy(&shortcubes);
@@ -291,7 +291,7 @@ gendata_h48_maintable(gendata_h48_arg_t arg[static 1])
writetableinfo(&arg->info, bufsize, (unsigned char *)arg->h48buf);
}
-STATIC void *
+STATIC wrapthread_return_t
gendata_h48_runthread(void *arg)
{
uint64_t coord, coordext, coordmin;
@@ -328,7 +328,7 @@ gendata_h48_runthread(void *arg)
}
}
- return NULL;
+ return wrapthread_return_val;
}
STATIC void
diff --git a/src/solvers/h48/solve.h b/src/solvers/h48/solve.h
@@ -78,7 +78,7 @@ STATIC_INLINE void h48_prune_restore_inverse(const h48_prune_t [static 1],
STATIC int64_t solve_h48_maketasks(
dfsarg_solve_h48_t [static 1], dfsarg_solve_h48_maketasks_t [static 1],
solve_h48_task_t [static H48_STARTING_CUBES], int [static 1]);
-STATIC void *solve_h48_runthread(void *);
+STATIC wrapthread_return_t solve_h48_runthread(void *);
STATIC int64_t solve_h48_dfs(dfsarg_solve_h48_t [static 1]);
STATIC void solve_h48_log_solutions(solution_list_t [static 1], size_t);
STATIC int solve_h48_compare_tasks(const void *, const void *);
@@ -383,7 +383,7 @@ solve_h48_dfs(dfsarg_solve_h48_t arg[static 1])
return ret;
}
-STATIC void *
+STATIC wrapthread_return_t
solve_h48_runthread(void *arg)
{
int i, j;
@@ -445,7 +445,7 @@ solve_h48_runthread(void *arg)
solve_h48_runthread_end:
dfsarg->thread_done = true;
- return NULL;
+ return wrapthread_return_val;
}
STATIC int64_t
@@ -638,7 +638,7 @@ solve_h48(
}
- wrapthread_mutex_init(&solutions_mutex, NULL);
+ wrapthread_mutex_init(&solutions_mutex);
mtarg = (dfsarg_solve_h48_maketasks_t) {
.cube = oc.cube,
@@ -689,7 +689,7 @@ solve_h48(
arg[i].target_depth = d;
arg[i].thread_done = false;
wrapthread_create(
- &thread[i], NULL, solve_h48_runthread, &arg[i]);
+ &thread[i], solve_h48_runthread, &arg[i]);
}
/* Log solutions and handle pause / stop / resume */
@@ -718,7 +718,7 @@ solve_h48(
}
for (i = 0; i < threads; i++)
- wrapthread_join(thread[i], NULL);
+ wrapthread_join(thread[i]);
solve_h48_log_solutions(&sollist, lastused);
lastused = sollist.used;
diff --git a/src/utils/prefetch.h b/src/utils/prefetch.h
@@ -1,15 +1,7 @@
-#if defined(AVX2)
-
-#define prefetch(a, i) _mm_prefetch(a+i, _MM_HINT_T0)
-
-#else
#if defined(__GNUC__) || defined(__clang__)
-
-#define prefetch(a, i) __builtin_prefetch(a+i, 0, 0)
-
+ #define prefetch(a, i) __builtin_prefetch(a+i, 0, 0)
+#elif defined(AVX2)
+ #define prefetch(a, i) _mm_prefetch((char *)(a+i), _MM_HINT_T0)
#else
-
-#define prefetch(a, i) (void)i
-
-#endif
+ #define prefetch(a, i) (void)i
#endif
diff --git a/src/utils/wrapthread.h b/src/utils/wrapthread.h
@@ -1,37 +1,77 @@
#if THREADS == 1
+ #define WRAPTHREAD_NOTHREADS 1
+#elif defined(__unix__)
+ #define WRAPTHREAD_PTHREADS 1
+#elif defined(__has_include)
+ #if __has_include(<pthread.h>) /* probe the same header the pthreads backend includes */
+ #define WRAPTHREAD_PTHREADS 1
+ #elif __has_include(<threads.h>)
+ #define WRAPTHREAD_C11THREADS 1
+ #endif
+#else
+ #define WRAPTHREAD_NOTHREADS 1
+#endif
+
+#if WRAPTHREAD_PTHREADS
+ #include <pthread.h>
+
+ #define wrapthread_atomic _Atomic
+
+ #define wrapthread_return_t void *
+ #define wrapthread_return_val NULL
+
+ #define wrapthread_define_var_thread_t(x) pthread_t x
+ #define wrapthread_define_var_mutex_t(x) pthread_mutex_t x
+ #define wrapthread_define_struct_thread_t(x) pthread_t x
+ #define wrapthread_define_struct_mutex_t(x) pthread_mutex_t x
+
+ #define wrapthread_define_if_threads(T, x) T x
+
+ #define wrapthread_create(a, f, arg) pthread_create(a, NULL, f, arg)
+ #define wrapthread_join(a) pthread_join(a, NULL)
+ #define wrapthread_mutex_init(a) pthread_mutex_init(a, NULL)
+ #define wrapthread_mutex_lock(a) pthread_mutex_lock(a)
+ #define wrapthread_mutex_unlock(a) pthread_mutex_unlock(a)
+
+#elif WRAPTHREAD_C11THREADS
+ #include <threads.h>
+
+ #define wrapthread_atomic _Atomic
-#define wrapthread_atomic
+ #define wrapthread_return_t int
+ #define wrapthread_return_val 0
-#define wrapthread_define_var_thread_t(x) unused char x
-#define wrapthread_define_var_mutex_t(x) unused char x
-#define wrapthread_define_struct_thread_t(x) char x
-#define wrapthread_define_struct_mutex_t(x) char x
+ #define wrapthread_define_var_thread_t(x) thrd_t x
+ #define wrapthread_define_var_mutex_t(x) mtx_t x
+ #define wrapthread_define_struct_thread_t(x) thrd_t x
+ #define wrapthread_define_struct_mutex_t(x) mtx_t x
-#define wrapthread_define_if_threads(T, x) T x; (void)(x)
+ #define wrapthread_define_if_threads(T, x) T x
-#define wrapthread_create(a, b, c, d) c(d)
-#define wrapthread_join(a, b)
-#define wrapthread_mutex_init(a, b)
-#define wrapthread_mutex_lock(a)
-#define wrapthread_mutex_unlock(a)
+ #define wrapthread_create(a, f, arg) thrd_create(a, f, arg)
+ #define wrapthread_join(a) thrd_join(a, NULL)
+ #define wrapthread_mutex_init(a) mtx_init(a, mtx_plain)
+ #define wrapthread_mutex_lock(a) mtx_lock(a)
+ #define wrapthread_mutex_unlock(a) mtx_unlock(a)
#else
-#include <pthread.h>
+ #define wrapthread_atomic
-#define wrapthread_atomic _Atomic
+ #define wrapthread_return_t int
+ #define wrapthread_return_val 0
-#define wrapthread_define_var_thread_t(x) pthread_t x
-#define wrapthread_define_var_mutex_t(x) pthread_mutex_t x
-#define wrapthread_define_struct_thread_t(x) pthread_t x
-#define wrapthread_define_struct_mutex_t(x) pthread_mutex_t x
+ #define wrapthread_define_var_thread_t(x) unused char x
+ #define wrapthread_define_var_mutex_t(x) unused char x
+ #define wrapthread_define_struct_thread_t(x) char x
+ #define wrapthread_define_struct_mutex_t(x) char x
-#define wrapthread_define_if_threads(T, x) T x
+ #define wrapthread_define_if_threads(T, x) T x; (void)(x)
-#define wrapthread_create(a, b, c, d) pthread_create(a, b, c, d)
-#define wrapthread_join(a, b) pthread_join(a, b)
-#define wrapthread_mutex_init(a, b) pthread_mutex_init(a, b)
-#define wrapthread_mutex_lock(a) pthread_mutex_lock(a)
-#define wrapthread_mutex_unlock(a) pthread_mutex_unlock(a)
+ #define wrapthread_create(t, f, arg) f(arg)
+ #define wrapthread_join(a)
+ #define wrapthread_mutex_init(a)
+ #define wrapthread_mutex_lock(a)
+ #define wrapthread_mutex_unlock(a)
#endif