This change is part of [IBD] - Tracking PR for speeding up Initial Block Download
Summary
The in-memory representation of the UTXO set uses (salted) SipHash for avoiding key collision attacks.
Hashing a uint256
key is done so often that a specialized optimization was extracted to SipHashUint256Extra. The constant salting operations were already extracted in the general case, this PR adjust the main specialization similarly.
Details
Previously, only k0
and k1
were stored, causing the constant xor operations to be recomputed in every call to SipHashUint256Extra
.
This commit adds a dedicated Uint256ExtraSipHasher
class that caches the initial state (v0-v3) and to perform the SipHash
computation on a uint256
(with an extra parameter), hiding the constant computation details from higher-level code and improving efficiency (as suggested by Sipa in #30442 (comment)).
Measurements
0diff --git a/src/bench/crypto_hash.cpp b/src/bench/crypto_hash.cpp
1--- a/src/bench/crypto_hash.cpp (revision 9b1a7c3e8dd78d97fbf47c2d056d043b05969176)
2+++ b/src/bench/crypto_hash.cpp (revision e1b4f056b3097e7e34b0eda31f57826d81c9d810)
3@@ -2,7 +2,6 @@
4 // Distributed under the MIT software license, see the accompanying
5 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
6
7-
8 #include <bench/bench.h>
9 #include <crypto/muhash.h>
10 #include <crypto/ripemd160.h>
11@@ -12,9 +11,11 @@
12 #include <crypto/sha512.h>
13 #include <crypto/siphash.h>
14 #include <random.h>
15-#include <span.h>
16 #include <tinyformat.h>
17 #include <uint256.h>
18+#include <primitives/transaction.h>
19+#include <util/hasher.h>
20+#include <unordered_set>
21
22 #include <cstdint>
23 #include <vector>
24@@ -205,6 +206,98 @@
25 });
26 }
27
28+static void SaltedOutpointHasherBench_hash(benchmark::Bench& bench)
29+{
30+ FastRandomContext rng{/*fDeterministic=*/true};
31+ constexpr size_t size{1000};
32+
33+ std::vector<COutPoint> outpoints(size);
34+ for (auto& outpoint : outpoints) {
35+ outpoint = {Txid::FromUint256(rng.rand256()), rng.rand32()};
36+ }
37+
38+ const SaltedOutpointHasher hasher;
39+ bench.batch(size).run([&] {
40+ size_t result{0};
41+ for (const auto& outpoint : outpoints) {
42+ result ^= hasher(outpoint);
43+ }
44+ ankerl::nanobench::doNotOptimizeAway(result);
45+ });
46+}
47+
48+static void SaltedOutpointHasherBench_match(benchmark::Bench& bench)
49+{
50+ FastRandomContext rng{/*fDeterministic=*/true};
51+ constexpr size_t size{1000};
52+
53+ std::unordered_set<COutPoint, SaltedOutpointHasher> values;
54+ std::vector<COutPoint> value_vector;
55+ values.reserve(size);
56+ value_vector.reserve(size);
57+
58+ for (size_t i{0}; i < size; ++i) {
59+ COutPoint outpoint{Txid::FromUint256(rng.rand256()), rng.rand32()};
60+ values.emplace(outpoint);
61+ value_vector.push_back(outpoint);
62+ assert(values.contains(outpoint));
63+ }
64+
65+ bench.batch(size).run([&] {
66+ bool result{true};
67+ for (const auto& outpoint : value_vector) {
68+ result ^= values.contains(outpoint);
69+ }
70+ ankerl::nanobench::doNotOptimizeAway(result);
71+ });
72+}
73+
74+static void SaltedOutpointHasherBench_mismatch(benchmark::Bench& bench)
75+{
76+ FastRandomContext rng{/*fDeterministic=*/true};
77+ constexpr size_t size{1000};
78+
79+ std::unordered_set<COutPoint, SaltedOutpointHasher> values;
80+ std::vector<COutPoint> missing_value_vector;
81+ values.reserve(size);
82+ missing_value_vector.reserve(size);
83+
84+ for (size_t i{0}; i < size; ++i) {
85+ values.emplace(Txid::FromUint256(rng.rand256()), rng.rand32());
86+ COutPoint missing_outpoint{Txid::FromUint256(rng.rand256()), rng.rand32()};
87+ missing_value_vector.push_back(missing_outpoint);
88+ assert(!values.contains(missing_outpoint));
89+ }
90+
91+ bench.batch(size).run([&] {
92+ bool result{false};
93+ for (const auto& outpoint : missing_value_vector) {
94+ result ^= values.contains(outpoint);
95+ }
96+ ankerl::nanobench::doNotOptimizeAway(result);
97+ });
98+}
99+
100+static void SaltedOutpointHasherBench_create_set(benchmark::Bench& bench)
101+{
102+ FastRandomContext rng{/*fDeterministic=*/true};
103+ constexpr size_t size{1000};
104+
105+ std::vector<COutPoint> outpoints(size);
106+ for (auto& outpoint : outpoints) {
107+ outpoint = {Txid::FromUint256(rng.rand256()), rng.rand32()};
108+ }
109+
110+ bench.batch(size).run([&] {
111+ std::unordered_set<COutPoint, SaltedOutpointHasher> set;
112+ set.reserve(size);
113+ for (const auto& outpoint : outpoints) {
114+ set.emplace(outpoint);
115+ }
116+ ankerl::nanobench::doNotOptimizeAway(set.size());
117+ });
118+}
119+
120 static void MuHash(benchmark::Bench& bench)
121 {
122 MuHash3072 acc;
123@@ -276,6 +369,10 @@
124 BENCHMARK(SHA256_32b_AVX2, benchmark::PriorityLevel::HIGH);
125 BENCHMARK(SHA256_32b_SHANI, benchmark::PriorityLevel::HIGH);
126 BENCHMARK(SipHash_32b, benchmark::PriorityLevel::HIGH);
127+BENCHMARK(SaltedOutpointHasherBench_hash, benchmark::PriorityLevel::HIGH);
128+BENCHMARK(SaltedOutpointHasherBench_match, benchmark::PriorityLevel::HIGH);
129+BENCHMARK(SaltedOutpointHasherBench_mismatch, benchmark::PriorityLevel::HIGH);
130+BENCHMARK(SaltedOutpointHasherBench_create_set, benchmark::PriorityLevel::HIGH);
131 BENCHMARK(SHA256D64_1024_STANDARD, benchmark::PriorityLevel::HIGH);
132 BENCHMARK(SHA256D64_1024_SSE4, benchmark::PriorityLevel::HIGH);
133 BENCHMARK(SHA256D64_1024_AVX2, benchmark::PriorityLevel::HIGH);
cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake –build build -j$(nproc) && build/bin/bench_bitcoin -filter=‘SaltedOutpointHasherBench’ -min-time=10000
Before:
ns/op | op/s | err% | total | benchmark |
---|---|---|---|---|
58.60 | 17,065,922.04 | 0.3% | 11.02 | SaltedOutpointHasherBench_create_set |
11.97 | 83,576,684.83 | 0.1% | 11.01 | SaltedOutpointHasherBench_hash |
14.50 | 68,985,850.12 | 0.3% | 10.96 | SaltedOutpointHasherBench_match |
13.90 | 71,942,033.47 | 0.4% | 11.03 | SaltedOutpointHasherBench_mismatch |
After:
ns/op | op/s | err% | total | benchmark |
---|---|---|---|---|
57.27 | 17,462,299.19 | 0.1% | 11.02 | SaltedOutpointHasherBench_create_set |
11.24 | 88,997,888.48 | 0.3% | 11.04 | SaltedOutpointHasherBench_hash |
13.91 | 71,902,014.20 | 0.2% | 11.01 | SaltedOutpointHasherBench_match |
13.29 | 75,230,390.31 | 0.1% | 11.00 | SaltedOutpointHasherBench_mismatch |
compared to master:
0create_set - 17,462,299.19 / 17,065,922.04 - 2.3% faster
1hash - 88,997,888.48 / 83,576,684.83 - 6.4% faster
2match - 71,902,014.20 / 68,985,850.12 - 4.2% faster
3mismatch - 75,230,390.31 / 71,942,033.47 - 4.5% faster
C++ compiler …………………….. GNU 13.3.0
Before:
ns/op | op/s | err% | ins/op | cyc/op | IPC | bra/op | miss% | total | benchmark |
---|---|---|---|---|---|---|---|---|---|
136.76 | 7,312,133.16 | 0.0% | 1,086.67 | 491.12 | 2.213 | 119.54 | 1.1% | 11.01 | SaltedOutpointHasherBench_create_set |
23.82 | 41,978,882.62 | 0.0% | 252.01 | 85.57 | 2.945 | 4.00 | 0.0% | 11.00 | SaltedOutpointHasherBench_hash |
60.42 | 16,549,695.42 | 0.1% | 460.51 | 217.04 | 2.122 | 21.00 | 1.4% | 10.99 | SaltedOutpointHasherBench_match |
78.66 | 12,713,595.35 | 0.1% | 555.59 | 282.52 | 1.967 | 20.19 | 2.2% | 10.74 | SaltedOutpointHasherBench_mismatch |
After:
ns/op | op/s | err% | ins/op | cyc/op | IPC | bra/op | miss% | total | benchmark |
---|---|---|---|---|---|---|---|---|---|
135.38 | 7,386,349.49 | 0.0% | 1,078.19 | 486.16 | 2.218 | 119.56 | 1.1% | 11.00 | SaltedOutpointHasherBench_create_set |
23.67 | 42,254,558.08 | 0.0% | 247.01 | 85.01 | 2.906 | 4.00 | 0.0% | 11.00 | SaltedOutpointHasherBench_hash |
58.95 | 16,962,220.14 | 0.1% | 446.55 | 211.74 | 2.109 | 20.86 | 1.4% | 11.01 | SaltedOutpointHasherBench_match |
76.98 | 12,991,047.69 | 0.1% | 548.93 | 276.50 | 1.985 | 20.25 | 2.3% | 10.72 | SaltedOutpointHasherBench_mismatch |
0compared to master:
1create_set - 7,386,349.49 / 7,312,133.16 - 1.0% faster
2hash - 42,254,558.08 / 41,978,882.62 - 0.6% faster
3match - 16,962,220.14 / 16,549,695.42 - 2.4% faster
4mismatch - 12,991,047.69 / 12,713,595.35 - 2.1% faster