Improve performance of H48 solver with prefetching - nissy-core - The "engine" of nissy, including the H48 optimal solver.

commit 9a013b7c68f94e6be0fe8748c9012a441fe0273f
parent 83f6533c384a617181e818d1941b08e40aa40b7d
Author: Sebastiano Tronto <sebastiano@tronto.net>
Date:   Mon, 12 Jan 2026 18:09:43 +0100

Improve performance of H48 solver with prefetching

With this commit we re-structure how the node expansion in the
H48 solution search works to allow prefetching of pruning values,
showing performance improvements in the range of 30-45% on x86,
depending on table size and solution length.

A small bug fix related to appending solutions is included in this commit.

Diffstat:
M benchmarks/benchmarks.md  | 104 ++++++++++++++++++++++++++++++++++++++++---------------------------------------
M benchmarks/img/17moves16threads.png  | 0 
M benchmarks/img/17moves1thread.png  | 0 
M benchmarks/img/17moves4threads.png  | 0 
M benchmarks/img/18moves16threads.png  | 0 
M benchmarks/img/18moves1thread.png  | 0 
M benchmarks/img/18moves4threads.png  | 0 
M benchmarks/img/19moves16threads.png  | 0 
M benchmarks/img/19moves1thread.png  | 0 
M benchmarks/img/19moves4threads.png  | 0 
M benchmarks/img/20moves16threads.png  | 0 
M benchmarks/img/20moves1thread.png  | 0 
M benchmarks/img/20moves4threads.png  | 0 
M benchmarks/results_h48.py  | 48 ++++++++++++++++++++++++------------------------
M doc/h48.md  | 35 ++++++++++++++++++++++++++---------
M src/solvers/h48/solve.h  | 379 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M src/solvers/solutions.h  | 2 +-
A src/utils/prefetch.h  | 15 +++++++++++++++
M src/utils/utils.h  | 1 +

19 files changed, 364 insertions(+), 220 deletions(-)
diff --git a/benchmarks/benchmarks.md b/benchmarks/benchmarks.md
@@ -70,32 +70,32 @@ Time per cube (in seconds, lower is better).
 |  Solver  |  Size  |17 moves|18 moves|19 moves|20 moves|Superflip|
 |:---------|:-------|-------:|-------:|-------:|-------:|--------:|
 |vcube 212 |58.2 GiB|    0.11|    0.75|    3.43|   27.28|    19.30|
-|H48 h11   |56.5 GiB|    0.13|    0.73|    3.43|   23.28|    19.26|
+|H48 h11   |56.5 GiB|    0.09|    0.50|    2.24|   15.73|    15.55|
 |vcube 404 |31.8 GiB|    0.23|    1.24|    6.10|   59.33|   268.26|
-|H48 h10   |28.3 GiB|    0.23|    1.20|    6.18|   43.51|    48.84|
+|H48 h10   |28.3 GiB|    0.15|    0.76|    3.36|   23.51|    38.05|
 |vcube 308 |21.2 GiB|    0.17|    1.02|    6.20|   58.70|   604.35|
-|H48 h9    |14.1 GiB|    0.36|    2.47|   11.79|        |         |
+|H48 h9    |14.1 GiB|    0.24|    1.48|    6.69|        |         |
 |vcube 208 | 7.3 GiB|    0.56|    4.36|   20.58|        |         |
-|H48 h8    | 7.1 GiB|    0.79|    6.09|   25.58|        |         |
-|H48 h7    | 3.5 GiB|    1.07|    8.59|   42.16|        |         |
+|H48 h8    | 7.1 GiB|    0.46|    3.36|   14.13|        |         |
+|H48 h7    | 3.5 GiB|    0.63|    4.85|   23.25|        |         |
 |vcube 112 | 2.4 GiB|    0.96|    9.29|   40.52|        |         |
-|H48 h6    | 1.8 GiB|    2.12|   16.29|   82.24|        |         |
+|H48 h6    | 1.8 GiB|    1.25|    9.45|   46.31|        |         |
 
 Time per cube adjusted for table size (in seconds \* GiB, lower is better).
 
 |  Solver  |  Size  |17 moves|18 moves|19 moves|20 moves|Superflip|
 |:---------|:-------|-------:|-------:|-------:|-------:|--------:|
 |vcube 212 |58.2 GiB|    6.43|   43.80|  199.38| 1587.43|  1122.73|
-|H48 h11   |56.5 GiB|    7.33|   41.47|  193.67| 1315.32|  1088.34|
+|H48 h11   |56.5 GiB|    5.21|   28.28|  126.68|  889.01|   878.77|
 |vcube 404 |31.8 GiB|    7.40|   39.47|  194.01| 1887.94|  8535.87|
-|H48 h10   |28.3 GiB|    6.60|   34.00|  174.65| 1229.18|  1379.90|
+|H48 h10   |28.3 GiB|    4.22|   21.37|   94.86|  664.22|  1075.08|
 |vcube 308 |21.2 GiB|    3.51|   21.71|  131.50| 1245.26| 12819.94|
-|H48 h9    |14.1 GiB|    5.03|   34.85|  166.51|        |         |
+|H48 h9    |14.1 GiB|    3.34|   20.97|   94.57|        |         |
 |vcube 208 | 7.3 GiB|    4.08|   31.74|  149.68|        |         |
-|H48 h8    | 7.1 GiB|    5.61|   43.02|  180.71|        |         |
-|H48 h7    | 3.5 GiB|    3.80|   30.34|  148.96|        |         |
+|H48 h8    | 7.1 GiB|    3.25|   23.75|   99.82|        |         |
+|H48 h7    | 3.5 GiB|    2.22|   17.12|   82.14|        |         |
 |vcube 112 | 2.4 GiB|    2.33|   22.53|   98.23|        |         |
-|H48 h6    | 1.8 GiB|    3.75|   28.79|  145.36|        |         |
+|H48 h6    | 1.8 GiB|    2.21|   16.70|   81.86|        |         |
 
 <img src="img/17moves1thread.png">
 <img src="img/18moves1thread.png">
@@ -110,32 +110,32 @@ Time per cube (in seconds, lower is better).
 |  Solver  |  Size  |17 moves|18 moves|19 moves|20 moves|Superflip|
 |:---------|:-------|-------:|-------:|-------:|-------:|--------:|
 |vcube 212 |58.2 GiB|    0.03|    0.27|    1.04|    7.70|   (a)   |
-|H48 h11   |56.5 GiB|    0.04|    0.22|    0.97|    6.98|     5.95|
+|H48 h11   |56.5 GiB|    0.04|    0.14|    0.60|    3.97|     4.19|
 |vcube 404 |31.8 GiB|    0.07|    0.30|    1.65|   16.17|   (a)   |
-|H48 h10   |28.3 GiB|    0.07|    0.36|    1.79|   12.42|    14.03|
+|H48 h10   |28.3 GiB|    0.05|    0.20|    0.91|    6.65|     9.81|
 |vcube 308 |21.2 GiB|    0.05|    0.35|    1.78|   16.61|   (a)   |
-|H48 h9    |14.1 GiB|    0.11|    0.70|    3.48|   25.61|    25.37|
+|H48 h9    |14.1 GiB|    0.07|    0.39|    1.74|   12.54|    18.96|
 |vcube 208 | 7.3 GiB|    0.16|    1.47|    5.86|        |   (a)   |
-|H48 h8    | 7.1 GiB|    0.23|    1.74|    7.19|        |         |
-|H48 h7    | 3.5 GiB|    0.32|    2.45|   12.12|        |         |
+|H48 h8    | 7.1 GiB|    0.13|    0.90|    3.71|        |         |
+|H48 h7    | 3.5 GiB|    0.17|    1.28|    6.12|        |         |
 |vcube 112 | 2.4 GiB|    0.29|    3.13|   11.95|        |   (a)   |
-|H48 h6    | 1.8 GiB|    0.61|    4.61|   24.22|        |         |
+|H48 h6    | 1.8 GiB|    0.33|    2.47|   12.22|        |         |
 
 Time per cube adjusted for table size (in seconds \* GiB, lower is better).
 
 |  Solver  |  Size  |17 moves|18 moves|19 moves|20 moves|Superflip|
 |:---------|:-------|-------:|-------:|-------:|-------:|--------:|
 |vcube 212 |58.2 GiB|    2.03|   15.69|   60.25|  447.97|   (a)   |
-|H48 h11   |56.5 GiB|    2.53|   12.71|   54.58|  394.33|   336.47|
+|H48 h11   |56.5 GiB|    2.04|    8.17|   33.77|  224.59|   236.92|
 |vcube 404 |31.8 GiB|    2.32|    9.50|   52.46|  514.50|   (a)   |
-|H48 h10   |28.3 GiB|    2.11|   10.23|   50.61|  350.78|   396.48|
+|H48 h10   |28.3 GiB|    1.37|    5.76|   25.83|  187.88|   277.18|
 |vcube 308 |21.2 GiB|    1.02|    7.52|   37.82|  352.36|   (a)   |
-|H48 h9    |14.1 GiB|    1.57|    9.96|   49.23|  361.85|   358.45|
+|H48 h9    |14.1 GiB|    1.03|    5.45|   24.58|  177.17|   267.79|
 |vcube 208 | 7.3 GiB|    1.18|   10.69|   42.63|        |   (a)   |
-|H48 h8    | 7.1 GiB|    1.66|   12.27|   50.82|        |         |
-|H48 h7    | 3.5 GiB|    1.11|    8.67|   42.83|        |         |
+|H48 h8    | 7.1 GiB|    0.92|    6.39|   26.20|        |         |
+|H48 h7    | 3.5 GiB|    0.62|    4.53|   21.62|        |         |
 |vcube 112 | 2.4 GiB|    0.69|    7.59|   28.97|        |   (a)   |
-|H48 h6    | 1.8 GiB|    1.08|    8.16|   42.81|        |         |
+|H48 h6    | 1.8 GiB|    0.59|    4.37|   21.61|        |         |
 
 (a) vcube cannot parallelize on a single scramble, the results for the
 superflip are going to be the same as in the single thread case.
@@ -153,32 +153,32 @@ Time per cube (in seconds, lower is better).
 |  Solver  |  Size  |17 moves|18 moves|19 moves|20 moves|Superflip|
 |:---------|:-------|-------:|-------:|-------:|-------:|--------:|
 |vcube 212 |58.2 GiB|    0.02|    0.13|    0.45|    2.84|   (a)   |
-|H48 h11   |56.5 GiB|    0.02|    0.08|    0.33|    2.22|     2.41|
+|H48 h11   |56.5 GiB|    0.02|    0.06|    0.22|    1.33|     1.84|
 |vcube 404 |31.8 GiB|    0.04|    0.14|    0.65|    6.08|   (a)   |
-|H48 h10   |28.3 GiB|    0.03|    0.13|    0.58|    4.21|     5.56|
+|H48 h10   |28.3 GiB|    0.03|    0.08|    0.33|    2.34|     4.18|
 |vcube 308 |21.2 GiB|    0.03|    0.19|    0.78|    6.67|   (a)   |
-|H48 h9    |14.1 GiB|    0.04|    0.25|    1.10|    8.05|    10.77|
+|H48 h9    |14.1 GiB|    0.04|    0.15|    0.64|    4.45|     8.09|
 |vcube 208 | 7.3 GiB|    0.08|    0.79|    2.43|        |   (a)   |
-|H48 h8    | 7.1 GiB|    0.08|    0.57|    2.40|        |         |
-|H48 h7    | 3.5 GiB|    0.11|    0.83|    4.00|        |         |
+|H48 h8    | 7.1 GiB|    0.06|    0.34|    1.36|        |         |
+|H48 h7    | 3.5 GiB|    0.07|    0.47|    2.20|        |         |
 |vcube 112 | 2.4 GiB|    0.15|    1.63|    5.10|        |   (a)   |
-|H48 h6    | 1.8 GiB|    0.21|    1.51|    7.74|        |         |
+|H48 h6    | 1.8 GiB|    0.13|    0.91|    4.39|        |         |
 
 Time per cube adjusted for table size (in seconds \* GiB, lower is better).
 
 |  Solver  |  Size  |17 moves|18 moves|19 moves|20 moves|Superflip|
 |:---------|:-------|-------:|-------:|-------:|-------:|--------:|
 |vcube 212 |58.2 GiB|    0.95|    7.83|   26.04|  165.03|   (a)   |
-|H48 h11   |56.5 GiB|    1.12|    4.61|   18.90|  125.27|   136.10|
+|H48 h11   |56.5 GiB|    1.34|    3.48|   12.19|   74.93|   103.98|
 |vcube 404 |31.8 GiB|    1.21|    4.60|   20.76|  193.43|   (a)   |
-|H48 h10   |28.3 GiB|    0.86|    3.61|   16.47|  118.85|   157.13|
+|H48 h10   |28.3 GiB|    0.74|    2.37|    9.46|   66.13|   118.23|
 |vcube 308 |21.2 GiB|    0.67|    4.01|   16.48|  141.49|   (a)   |
-|H48 h9    |14.1 GiB|    0.60|    3.51|   15.57|  113.71|   152.21|
+|H48 h9    |14.1 GiB|    0.51|    2.15|    8.98|   62.92|   114.31|
 |vcube 208 | 7.3 GiB|    0.56|    5.78|   17.68|        |   (a)   |
-|H48 h8    | 7.1 GiB|    0.59|    4.03|   16.98|        |         |
-|H48 h7    | 3.5 GiB|    0.41|    2.94|   14.12|        |         |
+|H48 h8    | 7.1 GiB|    0.40|    2.40|    9.59|        |         |
+|H48 h7    | 3.5 GiB|    0.26|    1.66|    7.77|        |         |
 |vcube 112 | 2.4 GiB|    0.35|    3.95|   12.37|        |   (a)   |
-|H48 h6    | 1.8 GiB|    0.38|    2.67|   13.68|        |         |
+|H48 h6    | 1.8 GiB|    0.23|    1.61|    7.76|        |         |
 
 (a) vcube cannot parallelize on a single scramble, the results for the
 superflip are going to be the same as in the single thread case.
@@ -197,32 +197,34 @@ Time per cube (in seconds, lower is better).
 
 |  Solver  |  Size  |17 moves|18 moves|19 moves|20 moves|Superflip|
 |:---------|:-------|-------:|-------:|-------:|-------:|--------:|
-|H48 h11   |56.5 GiB|    0.05|    0.41|    3.69|   31.65|    21.80|
-|H48 h10   |28.3 GiB|    0.07|    0.70|    6.73|   58.61|    48.62|
-|H48 h9    |14.1 GiB|    0.12|    1.28|   12.93|        |         |
-|H48 h8    | 7.1 GiB|    0.25|    2.72|   27.42|        |         |
-|H48 h7    | 3.5 GiB|    0.37|    4.24|   47.10|        |         |
-|H48 h6    | 1.8 GiB|    0.69|    8.18|   91.85|        |         |
+|H48 h11   |56.5 GiB|    0.04|    0.26|    2.22|   18.96|    16.95|
+|H48 h10   |28.3 GiB|    0.05|    0.42|    3.82|   34.42|    36.80|
+|H48 h9    |14.1 GiB|    0.08|    0.73|    7.28|        |         |
+|H48 h8    | 7.1 GiB|    0.15|    1.56|   15.41|        |         |
+|H48 h7    | 3.5 GiB|    0.21|    2.38|   26.52|        |         |
+|H48 h6    | 1.8 GiB|    0.39|    4.67|   53.00|        |         |
 
 Time per cube adjusted for table size (in seconds \* GiB, lower is better).
 
 |  Solver  |  Size  |17 moves|18 moves|19 moves|20 moves|Superflip|
 |:---------|:-------|-------:|-------:|-------:|-------:|--------:|
-|H48 h11   |56.5 GiB|    2.57|   23.07|  208.71| 1788.12|  1231.86|
-|H48 h10   |28.3 GiB|    2.01|   19.77|  190.14| 1655.95|  1373.79|
-|H48 h9    |14.1 GiB|    1.72|   18.07|  182.67|        |         |
-|H48 h8    | 7.1 GiB|    1.74|   19.21|  193.74|        |         |
-|H48 h7    | 3.5 GiB|    1.29|   14.98|  166.41|        |         |
-|H48 h6    | 1.8 GiB|    1.21|   14.47|  162.36|        |         |
+|H48 h11   |56.5 GiB|    2.11|   14.55|  125.21| 1071.23|   957.47|
+|H48 h10   |28.3 GiB|    1.48|   11.96|  107.83|  972.35|  1039.73|
+|H48 h9    |14.1 GiB|    1.08|   10.29|  102.88|        |         |
+|H48 h8    | 7.1 GiB|    1.03|   10.99|  108.87|        |         |
+|H48 h7    | 3.5 GiB|    0.74|    8.41|   93.69|        |         |
+|H48 h6    | 1.8 GiB|    0.70|    8.25|   93.68|        |         |
 
 </details>
 
 ## Comments on the results
 
-* Adjusting for table size, vcube generally is a bit faster than H48,
-  except for 20 moves scrambles where H48 is a clear winner.
+* Adjusting for table size, H48 is generally faster than vcube.
+* The gap between the two solvers is larger for scrambles with
+  longer optimal solutions.
 * On sets of 25 scrambles, H48 performs better than vcube when using
-  multiple threads. However, this advantage will likely disappear (or
+  multiple threads, compared to their single-threaded performance.
+  However, this advantage will likely disappear (or
   even invert) if we increase the size of the set.
 
 ## Other notes
diff --git a/benchmarks/img/17moves16threads.png b/benchmarks/img/17moves16threads.png
Binary files differ.
diff --git a/benchmarks/img/17moves1thread.png b/benchmarks/img/17moves1thread.png
Binary files differ.
diff --git a/benchmarks/img/17moves4threads.png b/benchmarks/img/17moves4threads.png
Binary files differ.
diff --git a/benchmarks/img/18moves16threads.png b/benchmarks/img/18moves16threads.png
Binary files differ.
diff --git a/benchmarks/img/18moves1thread.png b/benchmarks/img/18moves1thread.png
Binary files differ.
diff --git a/benchmarks/img/18moves4threads.png b/benchmarks/img/18moves4threads.png
Binary files differ.
diff --git a/benchmarks/img/19moves16threads.png b/benchmarks/img/19moves16threads.png
Binary files differ.
diff --git a/benchmarks/img/19moves1thread.png b/benchmarks/img/19moves1thread.png
Binary files differ.
diff --git a/benchmarks/img/19moves4threads.png b/benchmarks/img/19moves4threads.png
Binary files differ.
diff --git a/benchmarks/img/20moves16threads.png b/benchmarks/img/20moves16threads.png
Binary files differ.
diff --git a/benchmarks/img/20moves1thread.png b/benchmarks/img/20moves1thread.png
Binary files differ.
diff --git a/benchmarks/img/20moves4threads.png b/benchmarks/img/20moves4threads.png
Binary files differ.
diff --git a/benchmarks/results_h48.py b/benchmarks/results_h48.py
@@ -1,36 +1,36 @@
 h48_single_thread = {
-	6: {17: 52.9727, 18: 407.1435, 19: 2055.8785},
-	7: {17: 26.8541, 18: 214.7064, 19: 1053.9732},
-	8: {17: 19.8408, 18: 152.2214, 19: 639.4834},
-	9: {17: 8.9053, 18: 61.6760, 19: 294.6517},
-	10: {17: 5.8445, 18: 30.0880, 19: 154.5389, 20: 1087.6552, "superflip": 48.8410},
-	11: {17: 3.2420, 18: 18.3482, 19: 85.6898, 20: 581.9588, "superflip": 19.2614},
+	6: {17: 31.2308, 18: 236.1986, 19: 1157.7502},
+	7: {17: 15.7227, 18: 121.1488, 19: 581.2004},
+	8: {17: 11.5065, 18: 84.0366, 19: 353.2224},
+	9: {17: 5.9123, 18: 37.1034, 19: 167.3496},
+	10: {17: 3.7338, 18: 18.9061, 19: 83.9382, 20: 587.7489, "superflip": 38.0522},
+	11: {17: 2.3070, 18: 12.5145, 19: 56.0479, 20: 393.3391, "superflip": 15.5523},
 }
 
 h48_4_threads = {
-	6: {17: 15.2251, 18: 115.3688, 19: 605.5267},
-	7: {17: 7.8799, 18: 61.3355, 19: 303.0388},
-	8: {17: 5.8624, 18: 43.4193, 19: 179.8491},
-	9: {17: 2.7696, 18: 17.6190, 19: 87.1236, 20: 640.3405, "superflip": 25.3726},
-	10: {17: 1.8676, 18: 9.0543, 19: 44.7847, 20: 310.3914, "superflip": 14.0333},
-	11: {17: 1.1173, 18: 5.6240, 19: 24.1467, 20: 174.4689, "superflip": 5.9548},
+	6: {17: 8.2859, 18: 61.8403, 19: 305.5934},
+	7: {17: 4.3713, 18: 32.0550, 19: 152.9403},
+	8: {17: 3.2607, 18: 22.6002, 19: 92.7235},
+	9: {17: 1.8164, 18: 9.6520, 19: 43.5002, 20: 313.5156, "superflip": 18.9551},
+	10: {17: 1.2098, 18: 5.1007, 19: 22.8530, 20: 166.2468, "superflip": 9.8108},
+	11: {17: 0.9047, 18: 3.6152, 19: 14.9408, 20: 99.3673, "superflip": 4.1930},
 }
 
 h48_16_threads = {
-	6: {17: 5.3309, 18: 37.8142, 19: 193.5352},
-	7: {17: 2.8657, 18: 20.8215, 19: 99.9139},
-	8: {17: 2.0775, 18: 14.2672, 19: 60.0995},
-	9: {17: 1.0595, 18: 6.2074, 19: 27.5523, 20: 201.2200, "superflip": 10.7739},
-	10: {17: 0.7600, 18: 3.1980, 19: 14.5771, 20: 105.1686, "superflip": 5.5616},
-	11: {17: 0.4946, 18: 2.0381, 19: 8.3613, 20: 55.4269, "superflip": 2.4087},
+	6: {17: 3.2536, 18: 22.7283, 19: 109.8073},
+	7: {17: 1.8312, 18: 11.7609, 19: 54.9974},
+	8: {17: 1.4242, 18: 8.4798, 19: 33.9483},
+	9: {17: 0.9010, 18: 3.8064, 19: 15.8985, 20: 111.3421, "superflip": 8.0916},
+	10: {17: 0.6528, 18: 2.1009, 19: 8.3680, 20: 58.5190, "superflip": 4.1848},
+	11: {17: 0.5918, 18: 1.5379, 19: 5.3942, 20: 33.1522, "superflip": 1.8402},
 }
 
 h48_all_solutions = {
-	6: {17: 17.1587, 18: 204.6182, 19: 2296.3135},
-	7: {17: 9.1380, 18: 106.0127, 19: 1177.4728},
-	8: {17: 6.1641, 18: 67.9895, 19: 685.6033},
-	9: {17: 3.0422, 18: 31.9821, 19: 323.2516},
-	10: {17: 1.7822, 18: 17.4947, 19: 168.2516, 20: 1465.2950, "superflip": 48.6248},
-	11: {17: 1.1364, 18: 10.2076, 19: 92.3419, 20: 791.1480, "superflip": 21.8014},
+	6: {17: 9.8554, 18: 116.6374, 19: 1324.9303},
+	7: {17: 5.2495, 18: 59.4768, 19: 662.9106},
+	8: {17: 3.6359, 18: 38.8943, 19: 385.2452},
+	9: {17: 1.9026, 18: 18.2076, 19: 182.0522},
+	10: {17: 1.3117, 18: 10.5836, 19: 95.4136, 20: 860.4004, "superflip": 36.8007},
+	11: {17: 0.9357, 18: 6.4383, 19: 55.3969, 20: 473.9622, "superflip": 16.9452},
 }
 
diff --git a/doc/h48.md b/doc/h48.md
@@ -251,15 +251,18 @@ Moreover, as an additional heuristic, in case of a 0 read we also look
 up another pruning value in a table that takes into account only the
 position of the edges. This table is small (around 1MB), so repeated
 accesses to it are not too slow. In practice, this gives a small speed up
-of around 5%.  More tables could be used to refine the fallback estimate,
-but each additional table leads to longer lookup times, especially if
-it is too large to fit in cache.
-
-Previous versions of this implementation (up to commit 6c42463, or
-to version 0.2) also included the possibility of a table with 4 bits
-per entry, at least for the `h0` case. Such tables did not require
-a fallback lookup, but due to their large size they were not less
-efficient. Therefore, they have been removed.
+of around 5% for random scrambles. However, for small solvers (low values
+of h) and positions where corners are close to solved, this fallback
+table gives dramatic improvements (up to a factor of 1000x in some
+manual tests).
+
+More tables could be used to refine the fallback estimate, but each
+additional table leads to longer lookup times, especially if it is too
+large to fit in cache.  Previous versions of this implementation (up
+to commit 6c42463, or to version 0.2) also included the possibility of
+a table with 4 bits per entry, at least for the `h0` case. Such tables
+did not require a fallback lookup, but due to their large size they were
+not less efficient. Therefore, they have been removed.
 
 ### Estimation refinements
 
@@ -312,6 +315,20 @@ inverse position*, then the coordinate on the normal position has not
 changed. Thus if we keep track of the last computed pruning value,
 we can reuse it and avoid an expensive table lookup.
 
+### Pruning pipeline and prefetching
+
+To improve the memory access pattern and exploit
+[prefetching](https://en.wikipedia.org/wiki/Cache_prefetching)
+opportunities, we don't expand each neighbor one by one in the
+pruning phase. Instead, we employ a *pruning pipeline*, where we
+first compute the cube and inverse position for each neighbor, and
+then we proceed through a series of stages, at each of which we
+compute some prune off some nodes and we pre-compute the data
+necessary for the next step.
+
+This pipeline-based strategy gives an speedup of around 25%.
+Adding manual prefetching, we get an additional 20% speedup.
+
 ### Other optimizations
 
 The H48 solver uses various other optimizations.
diff --git a/src/solvers/h48/solve.h b/src/solvers/h48/solve.h
@@ -1,11 +1,5 @@
 #define H48_STARTING_MOVES 4
-
-#if H48_STARTING_MOVES == 3
-#define H48_STARTING_CUBES 3240 /* Number of 3-move sequences */
-#elif H48_STARTING_MOVES == 4
-#define H48_STARTING_CUBES 43254 /* Number of 4-move sequences */
-#endif
-
+#define H48_STARTING_CUBES 43254
 #define H48_SORT_TASKS_MIN_DEPTH 16
 #define H48_LOG_PROGRESS_MIN_DEPTH 15
 
@@ -14,6 +8,7 @@ typedef struct {
 	uint8_t moves[H48_STARTING_MOVES];
 	int64_t rank;
 	uint64_t tmask[H48_STARTING_MOVES];
+	uint8_t pval;
 } solve_h48_task_t;
 
 typedef struct {
@@ -25,15 +20,13 @@ typedef struct {
 	solution_settings_t *solution_settings;
 	const uint64_t *tmask;
 	solution_list_t *solution_list;
-	int8_t lb_normal;
-	int8_t lb_inverse;
-	bool use_lb_normal;
-	bool use_lb_inverse;
+	uint8_t lb_normal;
+	uint8_t lb_inverse;
 	uint8_t h;
 	uint8_t base;
 	const uint32_t *cocsepdata;
 	const unsigned char *h48data;
-	const unsigned char *h48data_fallback_eoesep;
+	const unsigned char *eoesepdata;
 	uint64_t movemask_normal;
 	uint64_t movemask_inverse;
 	uint64_t nodes_visited;
@@ -58,11 +51,30 @@ typedef struct {
 	uint64_t tmask[H48_STARTING_MOVES];
 } dfsarg_solve_h48_maketasks_t;
 
+typedef struct {
+	cube_t cube;
+	cube_t inverse;
+	uint64_t coord;
+	uint8_t m;
+	uint8_t pn;
+	uint8_t pi;
+	uint8_t stop;
+} h48_prune_t;
+
 STATIC long long solve_h48_dispatch(oriented_cube_t, const char *, unsigned,
     unsigned, unsigned, unsigned, unsigned, unsigned, unsigned long long,
     const unsigned char *, unsigned, char *,
     long long [static NISSY_SIZE_SOLVE_STATS], int (*)(void *), void *);
-STATIC_INLINE bool solve_h48_stop(dfsarg_solve_h48_t [static 1]);
+STATIC_INLINE void h48_prune_pipeline(dfsarg_solve_h48_t [static 1],
+    h48_prune_t [static NMOVES], uint8_t, bool);
+STATIC_INLINE uint8_t h48_prune_lookup(
+    uint64_t, cube_t, dfsarg_solve_h48_t [static 1]);
+STATIC_INLINE uint8_t h48_prune_lookup_nocoord(
+    cube_t, dfsarg_solve_h48_t [static 1]);
+STATIC_INLINE void h48_prune_restore_normal(const h48_prune_t [static 1],
+    dfsarg_solve_h48_t [static 1], uint8_t);
+STATIC_INLINE void h48_prune_restore_inverse(const h48_prune_t [static 1],
+    dfsarg_solve_h48_t [static 1], uint8_t);
 STATIC int64_t solve_h48_maketasks(
     dfsarg_solve_h48_t [static 1], dfsarg_solve_h48_maketasks_t [static 1],
     solve_h48_task_t [static H48_STARTING_CUBES], int [static 1]);
@@ -104,151 +116,242 @@ STATIC long long solve_h48_dispatch(
 	    poll_status, poll_status_data);
 }
 
-STATIC_INLINE bool
-solve_h48_stop(dfsarg_solve_h48_t arg[static 1])
+STATIC_INLINE uint8_t
+h48_prune_lookup(
+	uint64_t coord,
+	cube_t cube,
+	dfsarg_solve_h48_t arg[static 1]
+)
 {
-	uint32_t data, data_inv;
-	int64_t coord;
-	int8_t target, nh, n;
-	uint8_t pval, pval_min, pval_eoesep;
-
-	arg->movemask_normal = arg->movemask_inverse = MM18_ALLMOVES;
-	arg->nodes_visited++;
-
-	n = arg->solution_moves->nmoves + arg->solution_moves->npremoves;
-	target = arg->target_depth - n;
+	uint8_t p, pmin, pe;
+
+	arg->table_lookups++;
+	p = get_h48_pval_and_min(arg->h48data, coord, &pmin);
+	if (p == 0) {
+		arg->table_fallbacks++;
+		pe = get_eoesep_pval_cube(arg->eoesepdata, cube);
+		return MAX(pmin, pe);
+	} else {
+		return p + arg->base;
+	}
+}
 
-	/* We'll never get a bound higher than base + 3 */
-	if (arg->base + 3 <= target)
-		return false;
+STATIC_INLINE uint8_t
+h48_prune_lookup_nocoord(
+	cube_t cube,
+	dfsarg_solve_h48_t arg[static 1]
+)
+{
+	uint32_t cdata;
+	uint64_t coord;
 
-	/* Preliminary probing using last computed bound, if possible */
+	get_h48_cdata(cube, arg->cocsepdata, &cdata);
+	coord = coord_h48_edges(cube, COCLASS(cdata), TTREP(cdata), arg->h);
+	return h48_prune_lookup(coord, cube, arg);
+}
 
-	if ((arg->use_lb_normal && arg->lb_normal > target) ||
-	    (arg->use_lb_inverse && arg->lb_inverse > target))
-		return true;
+STATIC_INLINE void
+h48_prune_pipeline(
+	dfsarg_solve_h48_t arg[static 1],
+	h48_prune_t prune[static NMOVES],
+	uint8_t target,
+	bool normal
+)
+{
+	uint64_t i;
+	uint32_t cdata;
+	uint8_t m, p;
+
+	/* Stage 0: initialize the neighbors array */
+	memset(prune, 0, NMOVES * sizeof(h48_prune_t));
+	if (normal) {
+		for (m = 0; m < NMOVES; m++) {
+			prune[m].pi = m % 3 == 1 ? arg->lb_inverse : 0;
+			if (!(arg->movemask_normal & MM_SINGLE(m)) ||
+			    prune[m].pi > target) {
+				prune[m].stop = 1;
+				continue;
+			}
+			prune[m].cube = move(arg->cube, m);
+			prune[m].inverse = premove(arg->inverse, m);
+			prune[m].m = m;
+			arg->nodes_visited++;
+		}
+	} else {
+		for (m = 0; m < NMOVES; m++) {
+			prune[m].pi = m % 3 == 1 ? arg->lb_normal : 0;
+			if (!(arg->movemask_inverse & MM_SINGLE(m)) ||
+			    prune[m].pi > target) {
+				prune[m].stop = 1;
+				continue;
+			}
+			prune[m].cube = move(arg->inverse, m);
+			prune[m].inverse = premove(arg->cube, m);
+			prune[m].m = m;
+			arg->nodes_visited++;
+		}
+	}
 
-	/* Preliminary corner probing */
+	/* We'll never get a bound higher than base + 3 */
+	if (target > arg->base + 3)
+		return;
 
-	if (get_h48_cdata(arg->cube, arg->cocsepdata, &data) > target ||
-	    get_h48_cdata(arg->inverse, arg->cocsepdata, &data_inv) > target)
-		return true;
+	/* Stage 1: cdata and prefetch inverse */
+	for (m = 0; m < NMOVES; m++) {
+		if (prune[m].stop)
+			continue;
 
-	/* Inverse probing */
+		p = get_h48_cdata(prune[m].inverse, arg->cocsepdata, &cdata);
+		if (p > target) {
+			prune[m].stop = 1;
+			continue;
+		}
+		if (prune[m].pi == 0) {
+			prune[m].coord = coord_h48_edges(prune[m].inverse,
+			    COCLASS(cdata), TTREP(cdata), arg->h);
+			i = H48_INDEX(H48_LINE_EXT(prune[m].coord));
+			prefetch(arg->h48data, i);
+		}
+	}
 
-	if (!arg->use_lb_inverse) {
-		arg->table_lookups++;
-		arg->use_lb_inverse = true;
-		coord = coord_h48_edges(
-		    arg->inverse, COCLASS(data_inv), TTREP(data_inv), arg->h);
-		pval = get_h48_pval_and_min(arg->h48data, coord, &pval_min);
+	/* Stage 2: get pval from inverse, prefetch normal */
+	for (m = 0; m < NMOVES; m++) {
+		if (prune[m].stop)
+			continue;
 
-		if (pval == 0) {
-			arg->table_fallbacks++;
+		if (prune[m].pi == 0) {
+			prune[m].pi = h48_prune_lookup(
+			    prune[m].coord, prune[m].inverse, arg);
+			if (prune[m].pi > target) {
+				prune[m].stop = 1;
+				continue;
+			}
+		}
 
-			pval_eoesep = get_eoesep_pval_cube(
-			    arg->h48data_fallback_eoesep, arg->inverse);
-			pval = MAX(pval_min, pval_eoesep);
-		} else {
-			pval += arg->base;
+		p = get_h48_cdata(prune[m].cube, arg->cocsepdata, &cdata);
+		if (p > target) {
+			prune[m].stop = 1;
+			continue;
 		}
+		prune[m].coord = coord_h48_edges(
+		    prune[m].cube, COCLASS(cdata), TTREP(cdata), arg->h);
+		i = H48_INDEX(H48_LINE_EXT(prune[m].coord));
+		prefetch(arg->h48data, i);
+	}
+
+	/* Stage 3: get pval from normal */
+	for (m = 0; m < NMOVES; m++) {
+		if (prune[m].stop)
+			continue;
 
-		arg->lb_inverse = pval;
+		prune[m].pn = h48_prune_lookup(
+		    prune[m].coord, prune[m].cube, arg);
+		prune[m].stop = prune[m].pn > target;
 	}
+}
 
-	if (arg->lb_inverse > target)
-		return true;
-	nh = arg->lb_inverse == target;
-	arg->movemask_normal = nh * MM18_NOHALFTURNS + (1-nh) * MM18_ALLMOVES;
+STATIC_INLINE void
+h48_prune_restore_normal(
+	const h48_prune_t prune[static 1],
+	dfsarg_solve_h48_t arg[static 1],
+	uint8_t target
+)
+{
+	uint8_t nm;
 
-	/* Normal probing */
+	arg->cube = prune->cube;
+	arg->inverse = prune->inverse;
+	arg->lb_inverse = prune->pi;
+	arg->lb_normal = prune->pn;
 
-	if (!arg->use_lb_normal) {
-		arg->table_lookups++;
-		arg->use_lb_normal = true;
-		coord = coord_h48_edges(
-		    arg->cube, COCLASS(data), TTREP(data), arg->h);
-		pval = get_h48_pval_and_min(arg->h48data, coord, &pval_min);
+	nm = arg->solution_moves->nmoves;
+	arg->solution_moves->moves[nm-1] = prune->m;
+	arg->movemask_normal = allowedmask[movebase(prune->m)];
 
-		if (pval == 0) {
-			arg->table_fallbacks++;
+	if (arg->lb_inverse == target)
+		arg->movemask_normal &= MM18_NOHALFTURNS;
+	if (arg->lb_normal == target)
+		arg->movemask_inverse &= MM18_NOHALFTURNS;
+}
 
-			pval_eoesep = get_eoesep_pval_cube(
-			    arg->h48data_fallback_eoesep, arg->cube);
-			pval = MAX(pval_min, pval_eoesep);
-		} else {
-			pval += arg->base;
-		}
+STATIC_INLINE void
+h48_prune_restore_inverse(
+	const h48_prune_t prune[static 1],
+	dfsarg_solve_h48_t arg[static 1],
+	uint8_t target
+)
+{
+	uint8_t nm;
 
-		arg->lb_normal = pval;
-	}
+	arg->cube = prune->inverse;
+	arg->inverse = prune->cube;
+	arg->lb_inverse = prune->pn;
+	arg->lb_normal = prune->pi;
 
-	if (arg->lb_normal > target)
-		return true;
-	nh = arg->lb_normal == target;
-	arg->movemask_inverse = nh * MM18_NOHALFTURNS + (1-nh) * MM18_ALLMOVES;
+	nm = arg->solution_moves->npremoves;
+	arg->solution_moves->premoves[nm-1] = prune->m;
+	arg->movemask_inverse = allowedmask[movebase(prune->m)];
 
-	return false;
+	if (arg->lb_inverse == target)
+		arg->movemask_normal &= MM18_NOHALFTURNS;
+	if (arg->lb_normal == target)
+		arg->movemask_inverse &= MM18_NOHALFTURNS;
 }
 
 STATIC int64_t
 solve_h48_dfs(dfsarg_solve_h48_t arg[static 1])
 {
 	int64_t ret, n;
-	uint8_t m, nm, lbn, lbi, t;
+	uint8_t m, nm, nn, ni, target;
 	uint64_t mm_normal, mm_inverse;
-	bool ulbi, ulbn;
-	cube_t backup_cube, backup_inverse;
-
-	nm = arg->solution_moves->nmoves + arg->solution_moves->npremoves;
-	if (equal(arg->cube, SOLVED_CUBE)) {
-		if (arg->target_depth != nm)
-			return 0;
-		wrapthread_mutex_lock(arg->solutions_mutex);
-		ret = appendsolution(arg->solution_moves, H48_STARTING_MOVES,
-		    arg->tmask, arg->solution_settings, arg->solution_list);
-		wrapthread_mutex_unlock(arg->solutions_mutex);
-		return ret;
-	}
+	cube_t cube, backup_cube, backup_inverse;
+	h48_prune_t prune[NMOVES];
 
-	if (solve_h48_stop(arg))
+	if (equal(arg->cube, SOLVED_CUBE) || /* Solved before target depth */
+	    arg->solution_list->nsols >= arg->solution_settings->maxsolutions)
 		return 0;
 
-	t = arg->solution_list->shortest_sol + arg->solution_settings->optimal;
-	if (nm + 1 > MIN(t, arg->target_depth) ||
-	    arg->solution_list->nsols >= arg->solution_settings->maxsolutions)
+	nn = arg->solution_moves->nmoves;
+	ni = arg->solution_moves->npremoves;
+	nm = nn + ni;
+	target = arg->target_depth - (nm + 1);
+	mm_normal = arg->movemask_normal;
+	mm_inverse = arg->movemask_inverse;
+	if (target == 0) { /* Last move */
+		arg->solution_moves->nmoves++;
+		for (m = 0; m < NMOVES; m++) {
+			if (!(mm_normal & mm_inverse & MM_SINGLE(m)))
+				continue;
+			cube = move(arg->cube, m);
+			arg->solution_moves->moves[nn] = m;
+			arg->nodes_visited++;
+			if (!equal(cube, SOLVED_CUBE))
+				continue;
+			wrapthread_mutex_lock(arg->solutions_mutex);
+			ret = appendsolution(arg->solution_moves,
+			    H48_STARTING_MOVES, arg->tmask,
+			    arg->solution_settings, arg->solution_list);
+			wrapthread_mutex_unlock(arg->solutions_mutex);
+			arg->solution_moves->nmoves--;
+			return ret;
+		}
+		arg->solution_moves->nmoves--;
 		return 0;
+	}
 
 	backup_cube = arg->cube;
 	backup_inverse = arg->inverse;
-	lbn = arg->lb_normal;
-	lbi = arg->lb_inverse;
-	ulbn = arg->use_lb_normal;
-	ulbi = arg->use_lb_inverse;
 
 	ret = 0;
-	mm_normal = arg->movemask_normal;
-	if (arg->solution_moves->nmoves > 0) {
-		m = arg->solution_moves->moves[arg->solution_moves->nmoves-1];
-		mm_normal &= allowedmask[movebase(m)];
-	}
-	mm_inverse = arg->movemask_inverse;
-	if (arg->solution_moves->npremoves > 0) {
-		m = arg->solution_moves->premoves[arg->solution_moves->npremoves-1];
-		mm_inverse &= allowedmask[movebase(m)];
-	}
 	if (popcount_u32(mm_normal) <= popcount_u32(mm_inverse)) {
+		h48_prune_pipeline(arg, prune, target, true);
 		arg->solution_moves->nmoves++;
-		for (m = 0; m < 18; m++) {
-			if (!(mm_normal & MM_SINGLE(m)))
+		for (m = 0; m < NMOVES; m++) {
+			if (prune[m].stop)
 				continue;
-			arg->solution_moves->moves[
-			    arg->solution_moves->nmoves-1] = m;
-			arg->cube = move(backup_cube, m);
-			arg->inverse = premove(backup_inverse, m);
-			arg->lb_inverse = lbi;
-			arg->use_lb_normal = false;
-			arg->use_lb_inverse = ulbi && m % 3 == 1;
+			arg->movemask_normal = mm_normal;
+			arg->movemask_inverse = mm_inverse;
+			h48_prune_restore_normal(&prune[m], arg, target);
 			n = solve_h48_dfs(arg);
 			if (n < 0)
 				return n;
@@ -256,17 +359,14 @@ solve_h48_dfs(dfsarg_solve_h48_t arg[static 1])
 		}
 		arg->solution_moves->nmoves--;
 	} else {
+		h48_prune_pipeline(arg, prune, target, false);
 		arg->solution_moves->npremoves++;
-		for (m = 0; m < 18; m++) {
-			if(!(mm_inverse & MM_SINGLE(m)))
+		for (m = 0; m < NMOVES; m++) {
+			if (prune[m].stop)
 				continue;
-			arg->solution_moves->premoves[
-			    arg->solution_moves->npremoves-1] = m;
-			arg->inverse = move(backup_inverse, m);
-			arg->cube = premove(backup_cube, m);
-			arg->lb_normal = lbn;
-			arg->use_lb_inverse = false;
-			arg->use_lb_normal = ulbn && m % 3 == 1;
+			arg->movemask_normal = mm_normal;
+			arg->movemask_inverse = mm_inverse;
+			h48_prune_restore_inverse(&prune[m], arg, target);
 			n = solve_h48_dfs(arg);
 			if (n < 0)
 				return n;
@@ -277,6 +377,8 @@ solve_h48_dfs(dfsarg_solve_h48_t arg[static 1])
 
 	arg->cube = backup_cube;
 	arg->inverse = backup_inverse;
+	arg->movemask_normal = mm_normal;
+	arg->movemask_inverse = mm_inverse;
 
 	return ret;
 }
@@ -286,7 +388,7 @@ solve_h48_runthread(void *arg)
 {
 	int i, j;
 	uint8_t lastmove;
-	int64_t nprev;
+	int64_t d, f, nprev;
 	dfsarg_solve_h48_t *dfsarg;
 
 	dfsarg = (dfsarg_solve_h48_t *)arg;
@@ -309,11 +411,15 @@ solve_h48_runthread(void *arg)
 			   move(dfsarg->cube, dfsarg->tasks[i].moves[j]);
 		dfsarg->inverse = inverse(dfsarg->cube);
 
+		dfsarg->nodes_visited++;
+		if (dfsarg->tasks[i].pval + H48_STARTING_MOVES
+		    > dfsarg->target_depth)
+			continue;
+
 		dfsarg->lb_normal = 0;
 		dfsarg->lb_inverse = 0;
-		dfsarg->use_lb_normal = false;
-		dfsarg->use_lb_inverse = false;
-		dfsarg->movemask_normal = MM18_ALLMOVES;
+		dfsarg->movemask_normal = allowedmask[
+		    movebase(dfsarg->tasks[i].moves[H48_STARTING_MOVES-1])];
 		dfsarg->movemask_inverse = MM18_ALLMOVES;
 		dfsarg->tmask = dfsarg->tasks[i].tmask;
 
@@ -331,8 +437,9 @@ solve_h48_runthread(void *arg)
 		inspired by Andrew Skalski's vcube.
 		*/
 		lastmove = dfsarg->tasks[i].moves[H48_STARTING_MOVES-1];
-		dfsarg->tasks[i].rank = (dfsarg->nodes_visited - nprev) *
-		    (movebase(lastmove) % 2 == 0 ? 47525 : 58206);
+		d = (int64_t)dfsarg->nodes_visited - nprev;
+		f = movebase(lastmove) % 2 == 0 ? 47525 : 58206;
+		dfsarg->tasks[i].rank = d * f;
 		nprev = dfsarg->nodes_visited;
 	}
 
@@ -374,6 +481,8 @@ solve_h48_maketasks(
 
 	if (mtarg->nmoves == H48_STARTING_MOVES) {
 		tasks[*ntasks].cube = mtarg->cube;
+		tasks[*ntasks].pval =
+		    h48_prune_lookup_nocoord(mtarg->cube, solve_arg);
 		memcpy(tasks[*ntasks].moves, mtarg->moves,
 		    H48_STARTING_MOVES * sizeof(uint8_t));
 		memcpy(tasks[*ntasks].tmask, mtarg->tmask,
@@ -393,7 +502,7 @@ solve_h48_maketasks(
 
 	mtarg->nmoves++;
 	backup_cube = mtarg->cube;
-	for (m = 0; m < 18; m++) {
+	for (m = 0; m < NMOVES; m++) {
 		if (!(mm & MM_SINGLE(m)))
 			continue;
 
@@ -514,7 +623,7 @@ solve_h48(
 			.base = info.base,
 			.cocsepdata = cocsepdata,
 			.h48data = h48data,
-			.h48data_fallback_eoesep = eoesep,
+			.eoesepdata = eoesep,
 			.solution_moves = &solution_moves[i],
 			.solution_settings = &settings,
 			.solution_list = &sollist,
diff --git a/src/solvers/solutions.h b/src/solvers/solutions.h
@@ -108,7 +108,7 @@ last_solution_is_duplicate(const solution_list_t l[static 1])
 		j--;
 		for (i = l->used-2; l->buf[i] == l->buf[j]; i--, j--) {
 			if (l->buf[i-1] == '\n') {
-				if (l->buf[j-1] == '\n' || j == 0)
+				if (j == 0 || l->buf[j-1] == '\n')
 					return true;
 				else break;
 			}
diff --git a/src/utils/prefetch.h b/src/utils/prefetch.h
@@ -0,0 +1,15 @@
+#if defined(AVX2)
+
+#define prefetch(a, i) _mm_prefetch(a+i, _MM_HINT_T0)
+
+#else
+#if defined(__GNUC__) || defined(__clang__)
+
+#define prefetch(a, i) __builtin_prefetch(a+i, 0, 0)
+
+#else
+
+#define prefetch(a, i) (void)i
+
+#endif
+#endif
diff --git a/src/utils/utils.h b/src/utils/utils.h
@@ -4,3 +4,4 @@
 #include "math.h"
 #include "sleep.h"
 #include "wrapthread.h"
+#include "prefetch.h"

	nissy-core The "engine" of nissy, including the H48 optimal solver.
	git clone https://git.tronto.net/nissy-core
	Download \| Log \| Files \| Refs \| README \| LICENSE

M	benchmarks/benchmarks.md	\|	104	++++++++++++++++++++++++++++++++++++++++---------------------------------------
M	benchmarks/img/17moves16threads.png	\|	0
M	benchmarks/img/17moves1thread.png	\|	0
M	benchmarks/img/17moves4threads.png	\|	0
M	benchmarks/img/18moves16threads.png	\|	0
M	benchmarks/img/18moves1thread.png	\|	0
M	benchmarks/img/18moves4threads.png	\|	0
M	benchmarks/img/19moves16threads.png	\|	0
M	benchmarks/img/19moves1thread.png	\|	0
M	benchmarks/img/19moves4threads.png	\|	0
M	benchmarks/img/20moves16threads.png	\|	0
M	benchmarks/img/20moves1thread.png	\|	0
M	benchmarks/img/20moves4threads.png	\|	0
M	benchmarks/results_h48.py	\|	48	++++++++++++++++++++++++------------------------
M	doc/h48.md	\|	35	++++++++++++++++++++++++++---------
M	src/solvers/h48/solve.h	\|	379	+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M	src/solvers/solutions.h	\|	2	+-
A	src/utils/prefetch.h	\|	15	+++++++++++++++
M	src/utils/utils.h	\|	1	+