This change is part of [IBD] - Tracking PR for speeding up Initial Block Download
Summary
The in-memory representation of the UTXO set uses (salted) SipHash for avoiding key collision attacks.
Hashing uint256
keys is performed frequently throughout the codebase. Previously, specialized optimizations existed as standalone functions (SipHashUint256
and SipHashUint256Extra
), but the constant salting operations (C0-C3 XOR with keys) were recomputed on every call.
This PR introduces PresaltedSipHasher
, a class that caches the initial SipHash state (v0-v3 after XORing constants with keys), eliminating redundant constant computations when hashing multiple values with the same keys. The optimization is applied uniformly across:
- All
Salted*Hasher
classes (SaltedUint256Hasher
,SaltedTxidHasher
,SaltedWtxidHasher
,SaltedOutpointHasher
) CBlockHeaderAndShortTxIDs
for compact block short ID computation
Details
The change replaces standalone SipHashUint256
and SipHashUint256Extra
functions with PresaltedSipHasher
class methods that cache the constant-salted state. This is particularly beneficial for hash map operations where the same salt is used repeatedly (as suggested by Sipa in #30442 (comment)).
Measurements
0diff --git a/src/bench/crypto_hash.cpp b/src/bench/crypto_hash.cpp
1--- a/src/bench/crypto_hash.cpp (revision 9b1a7c3e8dd78d97fbf47c2d056d043b05969176)
2+++ b/src/bench/crypto_hash.cpp (revision e1b4f056b3097e7e34b0eda31f57826d81c9d810)
3@@ -2,7 +2,6 @@
4 // Distributed under the MIT software license, see the accompanying
5 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
6
7-
8 #include <bench/bench.h>
9 #include <crypto/muhash.h>
10 #include <crypto/ripemd160.h>
11@@ -12,9 +11,11 @@
12 #include <crypto/sha512.h>
13 #include <crypto/siphash.h>
14 #include <random.h>
15-#include <span.h>
16 #include <tinyformat.h>
17 #include <uint256.h>
18+#include <primitives/transaction.h>
19+#include <util/hasher.h>
20+#include <unordered_set>
21
22 #include <cstdint>
23 #include <vector>
24@@ -205,6 +206,98 @@
25 });
26 }
27
28+static void SaltedOutpointHasherBench_hash(benchmark::Bench& bench)
29+{
30+ FastRandomContext rng{/*fDeterministic=*/true};
31+ constexpr size_t size{1000};
32+
33+ std::vector<COutPoint> outpoints(size);
34+ for (auto& outpoint : outpoints) {
35+ outpoint = {Txid::FromUint256(rng.rand256()), rng.rand32()};
36+ }
37+
38+ const SaltedOutpointHasher hasher;
39+ bench.batch(size).run([&] {
40+ size_t result{0};
41+ for (const auto& outpoint : outpoints) {
42+ result ^= hasher(outpoint);
43+ }
44+ ankerl::nanobench::doNotOptimizeAway(result);
45+ });
46+}
47+
48+static void SaltedOutpointHasherBench_match(benchmark::Bench& bench)
49+{
50+ FastRandomContext rng{/*fDeterministic=*/true};
51+ constexpr size_t size{1000};
52+
53+ std::unordered_set<COutPoint, SaltedOutpointHasher> values;
54+ std::vector<COutPoint> value_vector;
55+ values.reserve(size);
56+ value_vector.reserve(size);
57+
58+ for (size_t i{0}; i < size; ++i) {
59+ COutPoint outpoint{Txid::FromUint256(rng.rand256()), rng.rand32()};
60+ values.emplace(outpoint);
61+ value_vector.push_back(outpoint);
62+ assert(values.contains(outpoint));
63+ }
64+
65+ bench.batch(size).run([&] {
66+ bool result{true};
67+ for (const auto& outpoint : value_vector) {
68+ result ^= values.contains(outpoint);
69+ }
70+ ankerl::nanobench::doNotOptimizeAway(result);
71+ });
72+}
73+
74+static void SaltedOutpointHasherBench_mismatch(benchmark::Bench& bench)
75+{
76+ FastRandomContext rng{/*fDeterministic=*/true};
77+ constexpr size_t size{1000};
78+
79+ std::unordered_set<COutPoint, SaltedOutpointHasher> values;
80+ std::vector<COutPoint> missing_value_vector;
81+ values.reserve(size);
82+ missing_value_vector.reserve(size);
83+
84+ for (size_t i{0}; i < size; ++i) {
85+ values.emplace(Txid::FromUint256(rng.rand256()), rng.rand32());
86+ COutPoint missing_outpoint{Txid::FromUint256(rng.rand256()), rng.rand32()};
87+ missing_value_vector.push_back(missing_outpoint);
88+ assert(!values.contains(missing_outpoint));
89+ }
90+
91+ bench.batch(size).run([&] {
92+ bool result{false};
93+ for (const auto& outpoint : missing_value_vector) {
94+ result ^= values.contains(outpoint);
95+ }
96+ ankerl::nanobench::doNotOptimizeAway(result);
97+ });
98+}
99+
100+static void SaltedOutpointHasherBench_create_set(benchmark::Bench& bench)
101+{
102+ FastRandomContext rng{/*fDeterministic=*/true};
103+ constexpr size_t size{1000};
104+
105+ std::vector<COutPoint> outpoints(size);
106+ for (auto& outpoint : outpoints) {
107+ outpoint = {Txid::FromUint256(rng.rand256()), rng.rand32()};
108+ }
109+
110+ bench.batch(size).run([&] {
111+ std::unordered_set<COutPoint, SaltedOutpointHasher> set;
112+ set.reserve(size);
113+ for (const auto& outpoint : outpoints) {
114+ set.emplace(outpoint);
115+ }
116+ ankerl::nanobench::doNotOptimizeAway(set.size());
117+ });
118+}
119+
120 static void MuHash(benchmark::Bench& bench)
121 {
122 MuHash3072 acc;
123@@ -276,6 +369,10 @@
124 BENCHMARK(SHA256_32b_AVX2, benchmark::PriorityLevel::HIGH);
125 BENCHMARK(SHA256_32b_SHANI, benchmark::PriorityLevel::HIGH);
126 BENCHMARK(SipHash_32b, benchmark::PriorityLevel::HIGH);
127+BENCHMARK(SaltedOutpointHasherBench_hash, benchmark::PriorityLevel::HIGH);
128+BENCHMARK(SaltedOutpointHasherBench_match, benchmark::PriorityLevel::HIGH);
129+BENCHMARK(SaltedOutpointHasherBench_mismatch, benchmark::PriorityLevel::HIGH);
130+BENCHMARK(SaltedOutpointHasherBench_create_set, benchmark::PriorityLevel::HIGH);
131 BENCHMARK(SHA256D64_1024_STANDARD, benchmark::PriorityLevel::HIGH);
132 BENCHMARK(SHA256D64_1024_SSE4, benchmark::PriorityLevel::HIGH);
133 BENCHMARK(SHA256D64_1024_AVX2, benchmark::PriorityLevel::HIGH);
cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake –build build -j$(nproc) && build/bin/bench_bitcoin -filter=‘SaltedOutpointHasherBench’ -min-time=10000
Before:
ns/op | op/s | err% | total | benchmark |
---|---|---|---|---|
58.60 | 17,065,922.04 | 0.3% | 11.02 | SaltedOutpointHasherBench_create_set |
11.97 | 83,576,684.83 | 0.1% | 11.01 | SaltedOutpointHasherBench_hash |
14.50 | 68,985,850.12 | 0.3% | 10.96 | SaltedOutpointHasherBench_match |
13.90 | 71,942,033.47 | 0.4% | 11.03 | SaltedOutpointHasherBench_mismatch |
After:
ns/op | op/s | err% | total | benchmark |
---|---|---|---|---|
57.27 | 17,462,299.19 | 0.1% | 11.02 | SaltedOutpointHasherBench_create_set |
11.24 | 88,997,888.48 | 0.3% | 11.04 | SaltedOutpointHasherBench_hash |
13.91 | 71,902,014.20 | 0.2% | 11.01 | SaltedOutpointHasherBench_match |
13.29 | 75,230,390.31 | 0.1% | 11.00 | SaltedOutpointHasherBench_mismatch |
compared to master:
0create_set - 17,462,299.19 / 17,065,922.04 - 2.3% faster
1hash - 88,997,888.48 / 83,576,684.83 - 6.4% faster
2match - 71,902,014.20 / 68,985,850.12 - 4.2% faster
3mismatch - 75,230,390.31 / 71,942,033.47 - 4.5% faster
C++ compiler …………………….. GNU 13.3.0
Before:
ns/op | op/s | err% | ins/op | cyc/op | IPC | bra/op | miss% | total | benchmark |
---|---|---|---|---|---|---|---|---|---|
136.76 | 7,312,133.16 | 0.0% | 1,086.67 | 491.12 | 2.213 | 119.54 | 1.1% | 11.01 | SaltedOutpointHasherBench_create_set |
23.82 | 41,978,882.62 | 0.0% | 252.01 | 85.57 | 2.945 | 4.00 | 0.0% | 11.00 | SaltedOutpointHasherBench_hash |
60.42 | 16,549,695.42 | 0.1% | 460.51 | 217.04 | 2.122 | 21.00 | 1.4% | 10.99 | SaltedOutpointHasherBench_match |
78.66 | 12,713,595.35 | 0.1% | 555.59 | 282.52 | 1.967 | 20.19 | 2.2% | 10.74 | SaltedOutpointHasherBench_mismatch |
After:
ns/op | op/s | err% | ins/op | cyc/op | IPC | bra/op | miss% | total | benchmark |
---|---|---|---|---|---|---|---|---|---|
135.38 | 7,386,349.49 | 0.0% | 1,078.19 | 486.16 | 2.218 | 119.56 | 1.1% | 11.00 | SaltedOutpointHasherBench_create_set |
23.67 | 42,254,558.08 | 0.0% | 247.01 | 85.01 | 2.906 | 4.00 | 0.0% | 11.00 | SaltedOutpointHasherBench_hash |
58.95 | 16,962,220.14 | 0.1% | 446.55 | 211.74 | 2.109 | 20.86 | 1.4% | 11.01 | SaltedOutpointHasherBench_match |
76.98 | 12,991,047.69 | 0.1% | 548.93 | 276.50 | 1.985 | 20.25 | 2.3% | 10.72 | SaltedOutpointHasherBench_mismatch |
0compared to master:
1create_set - 7,386,349.49 / 7,312,133.16 - 1.0% faster
2hash - 42,254,558.08 / 41,978,882.62 - 0.6% faster
3match - 16,962,220.14 / 16,549,695.42 - 2.4% faster
4mismatch - 12,991,047.69 / 12,713,595.35 - 2.1% faster