nissy-core

The "engine" of nissy, including the H48 optimal solver.
git clone https://git.tronto.net/nissy-core
Download | Log | Files | Refs | README | LICENSE

commit 7c3701364542355c29b2e4ebc6d719ddd123c0f2
parent 836d626e7caba9578c3440ec0484e7ff1b4cc48e
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Thu, 24 Jul 2025 09:00:49 +0200

Use bit trick for portable and arm popcount

Diffstat:
Msrc/arch/neon.h | 15++++++++-------
Msrc/arch/portable.h | 27++++++++++++++++++++-------
Atest/016_popcount_u32/00_all.in | 0
Atest/016_popcount_u32/00_all.out | 1+
Atest/016_popcount_u32/popcount_u32_tests.c | 43+++++++++++++++++++++++++++++++++++++++++++
5 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/src/arch/neon.h b/src/arch/neon.h @@ -30,16 +30,17 @@ STATIC_INLINE uint8x8_t compose_corners_slim(uint8x8_t, uint8x8_t); #define SOLVED_CUBE STATIC_CUBE( \ 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) -/* TODO: optimize this (use intrinsics?) */ STATIC_INLINE int popcount_u32(uint32_t x) { - int ret; - - for (ret = 0; x != 0; x >>= 1) - ret += x & 1; - - return ret; + /* Same as the portable version */ + x -= (x >> UINT32_C(1)) & UINT32_C(0x55555555); + x = (x & UINT32_C(0x33333333)) + + ((x >> UINT32_C(2)) & UINT32_C(0x33333333)); + x = (x + (x >> UINT32_C(4))) & UINT32_C(0x0F0F0F0F); + x = (x * UINT32_C(0x01010101)) >> UINT32_C(24); + + return (int)x; } STATIC void diff --git a/src/arch/portable.h b/src/arch/portable.h @@ -9,16 +9,29 @@ #define SOLVED_CUBE STATIC_CUBE( \ 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) -/* TODO: optimize this (use bit tricks?) */ STATIC_INLINE int popcount_u32(uint32_t x) { - int ret; - - for (ret = 0; x != 0; x >>= 1) - ret += x & 1; - - return ret; + /* + Bit trick: accumulate in pairs of bits, quads of bits and so on, + until the final result is the sum of all bits. + + x = (x & 0x55555555) + ((x >> 1) & 0x55555555); + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (x & 0x0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F); + x = (x & 0x00FF00FF) + ((x >> 8) & 0x00FF00FF); + x = (x & 0x0000FFFF) + ((x >> 16) & 0x0000FFFF); + + The actual method we use is a small optimization of the one above. + */ + + x -= (x >> UINT32_C(1)) & UINT32_C(0x55555555); + x = (x & UINT32_C(0x33333333)) + + ((x >> UINT32_C(2)) & UINT32_C(0x33333333)); + x = (x + (x >> UINT32_C(4))) & UINT32_C(0x0F0F0F0F); + x = (x * UINT32_C(0x01010101)) >> UINT32_C(24); + + return (int)x; } STATIC void diff --git a/test/016_popcount_u32/00_all.in b/test/016_popcount_u32/00_all.in diff --git a/test/016_popcount_u32/00_all.out b/test/016_popcount_u32/00_all.out @@ -0,0 +1 @@ +Ok diff --git a/test/016_popcount_u32/popcount_u32_tests.c b/test/016_popcount_u32/popcount_u32_tests.c @@ -0,0 +1,43 @@ +#include "../test.h" + +int popcount_u32(uint32_t x); + +int +popcount_u32_simple(uint32_t x) +{ + int ret; + + for (ret = 0; x != 0; x >>= 1) + ret += x & 1; + + return ret; +} + +bool +correct(uint32_t x) +{ + int expected = popcount_u32_simple(x); + int actual = popcount_u32(x); + if (actual != expected) { + printf("Error at %" PRIu32 ": expected %d bits, found %d\n", + x, expected, actual); + return false; + } + return true; +} + +void run(void) { + uint32_t i; + + /* Test all numbers up to 2^16, and other ranges of 2^16 numbers */ + for (i = 0; i < 0xFFFF; i++) { + if (!correct(i) || + !correct(i + UINT32_C(0xFFFF0000)) || + !correct(i + UINT32_C(1000000)) || + !correct(i + UINT32_C(1)) || + !correct(i + UINT32_C(1234567))) + return; + } + + printf("Ok\n"); +}