Is it worth revisiting LTO compilation?
I did some experimentation with LTO compilation and the results look promising :-)
Binary size results (non-stripped binaries):
- bench_bitcoin shrank from 74 678 800 to 39 695 288 bytes (-47 %)
- bitcoin-cli shrank from 4 837 744 to 2 918 544 bytes (-40 %)
- bitcoin-tx shrank from 15 206 720 to 7 717 608 bytes (-49 %)
- bitcoind shrank from 102 004 960 to 70 706 000 bytes (-31 %)
- test_bitcoin shrank from 161 739 656 to 100 838 072 bytes (-38 %)
- test_bitcoin_fuzzy shrank from 15 929 968 to 6 036 176 bytes (-62 %)
Binary size results (stripped binaries):
- bench_bitcoin shrank from 5 632 272 to 3 722 720 bytes (-34 %)
- bitcoin-cli shrank from 383 216 to 260 288 bytes (-32 %)
- bitcoin-tx shrank from 1 399 112 to 936 080 bytes (-33 %)
- bitcoind shrank from 6 639 336 to 6 044 520 bytes (-9 %)
- test_bitcoin shrank from 12 067 056 to 10 853 616 bytes (-10 %)
- test_bitcoin_fuzzy shrank from 1 468 976 to 428 160 bytes (-71 %)
Benchmark results (insignificant relative changes omitted to reduce noise):
- Runtime of benchmark FastRandom_1bit changed -7.9 % when enabling LTO
- Runtime of benchmark FastRandom_32bit changed -6.7 % when enabling LTO
- Runtime of benchmark MatchGCSFilter changed -11.5 % when enabling LTO
- Runtime of benchmark MempoolEviction changed -13.3 % when enabling LTO
- Runtime of benchmark PrevectorDeserializeNontrivial changed -58.1 % when enabling LTO
- Runtime of benchmark RollingBloom changed -15.0 % when enabling LTO
Below is the log from my experimentation.
Let me know if anything can be improved. Feedback appreciated.
# Build Bitcoin without LTO (baseline)
$ git clone https://github.com/bitcoin/bitcoin bitcoin-without-lto
$ cd bitcoin-without-lto
$ export CC="clang"
$ export CXX="clang++"
$ export RANLIB="/usr/lib/llvm-6.0/bin/llvm-ranlib"
$ ./autogen.sh
$ ./configure
$ make
$ cd ..

# Build Bitcoin with LTO
$ git clone https://github.com/bitcoin/bitcoin bitcoin-with-lto
$ cd bitcoin-with-lto
$ PREFIX=${PWD}/binutils-bin/
$ mkdir binutils-bin
$ apt install texinfo bison
$ git clone --depth 1 git://sourceware.org/git/binutils-gdb.git binutils
$ mkdir binutils-build
$ cd binutils-build
$ export CC="clang"
$ export CXX="clang++"
$ unset RANLIB
$ ../binutils/configure --enable-gold --enable-plugins --disable-werror --prefix=${PREFIX}
$ make all-gold
$ make install
$ cd ..
$ ${PREFIX}/bin/ld.gold -plugin 2>&1 | grep -q "plugin: missing argument" && echo "ld.gold has plugin support" || echo "ERROR: ld.gold lacks plugin support"
$ cp /usr/lib/llvm-6.0/lib/LLVMgold.so ${PREFIX}/lib/
$ export PATH="${PREFIX}/bin:${PATH}"
$ export CC="clang -flto"
$ export CXX="clang++ -flto"
$ export RANLIB="/usr/lib/llvm-6.0/bin/llvm-ranlib"
$ ./autogen.sh
$ ./configure
$ make
$ cd ..

# Check binary sizes
$ ls -Sl bitcoin-*-lto/src/bitcoind \
    bitcoin-*-lto/src/bitcoin-tx \
    bitcoin-*-lto/src/bench/bench_bitcoin \
    bitcoin-*-lto/src/bitcoin-cli \
    bitcoin-*-lto/src/test/test_bitcoin \
    bitcoin-*-lto/src/test/test_bitcoin_fuzzy
-rwxr-xr-x 1 root root 161739656 Sep 20 11:57 bitcoin-without-lto/src/test/test_bitcoin
-rwxr-xr-x 1 root root 102004960 Sep 20 11:57 bitcoin-without-lto/src/bitcoind
-rwxr-xr-x 1 root root 100838072 Sep 20 12:12 bitcoin-with-lto/src/test/test_bitcoin
-rwxr-xr-x 1 root root 74678800 Sep 20 11:57 bitcoin-without-lto/src/bench/bench_bitcoin
-rwxr-xr-x 1 root root 70706000 Sep 20 12:11 bitcoin-with-lto/src/bitcoind
-rwxr-xr-x 1 root root 39695288 Sep 20 12:10 bitcoin-with-lto/src/bench/bench_bitcoin
-rwxr-xr-x 1 root root 15929968 Sep 20 11:57 bitcoin-without-lto/src/test/test_bitcoin_fuzzy
-rwxr-xr-x 1 root root 15206720 Sep 20 11:57 bitcoin-without-lto/src/bitcoin-tx
-rwxr-xr-x 1 root root 7717608 Sep 20 12:09 bitcoin-with-lto/src/bitcoin-tx
-rwxr-xr-x 1 root root 6036176 Sep 20 12:09 bitcoin-with-lto/src/test/test_bitcoin_fuzzy
-rwxr-xr-x 1 root root 4837744 Sep 20 11:57 bitcoin-without-lto/src/bitcoin-cli
-rwxr-xr-x 1 root root 2918544 Sep 20 12:08 bitcoin-with-lto/src/bitcoin-cli
$ strip bitcoin-*-lto/src/bitcoind \
    bitcoin-*-lto/src/bitcoin-tx \
    bitcoin-*-lto/src/bench/bench_bitcoin \
    bitcoin-*-lto/src/bitcoin-cli \
    bitcoin-*-lto/src/test/test_bitcoin \
    bitcoin-*-lto/src/test/test_bitcoin_fuzzy
$ ls -Sl bitcoin-*-lto/src/bitcoind \
    bitcoin-*-lto/src/bitcoin-tx \
    bitcoin-*-lto/src/bench/bench_bitcoin \
    bitcoin-*-lto/src/bitcoin-cli \
    bitcoin-*-lto/src/test/test_bitcoin \
    bitcoin-*-lto/src/test/test_bitcoin_fuzzy
-rwxr-xr-x 1 root root 12067056 Sep 20 15:54 bitcoin-without-lto/src/test/test_bitcoin
-rwxr-xr-x 1 root root 10853616 Sep 20 15:54 bitcoin-with-lto/src/test/test_bitcoin
-rwxr-xr-x 1 root root 6639336 Sep 20 15:54 bitcoin-without-lto/src/bitcoind
-rwxr-xr-x 1 root root 6044520 Sep 20 15:54 bitcoin-with-lto/src/bitcoind
-rwxr-xr-x 1 root root 5632272 Sep 20 15:54 bitcoin-without-lto/src/bench/bench_bitcoin
-rwxr-xr-x 1 root root 3722720 Sep 20 15:54 bitcoin-with-lto/src/bench/bench_bitcoin
-rwxr-xr-x 1 root root 1468976 Sep 20 15:54 bitcoin-without-lto/src/test/test_bitcoin_fuzzy
-rwxr-xr-x 1 root root 1399112 Sep 20 15:54 bitcoin-without-lto/src/bitcoin-tx
-rwxr-xr-x 1 root root 936080 Sep 20 15:54 bitcoin-with-lto/src/bitcoin-tx
-rwxr-xr-x 1 root root 428160 Sep 20 15:54 bitcoin-with-lto/src/test/test_bitcoin_fuzzy
-rwxr-xr-x 1 root root 383216 Sep 20 15:54 bitcoin-without-lto/src/bitcoin-cli
-rwxr-xr-x 1 root root 260288 Sep 20 15:54 bitcoin-with-lto/src/bitcoin-cli

# Gather performance measurements until ^C is pressed
$ while true; do for SWITCH in with without; do echo "# $SWITCH"; \
    bitcoin-${SWITCH}-lto/src/bench/bench_bitcoin; done; done 2>&1 | \
    tee bench_bitcoin-lto-vs-non-lto

# Summarize results
$ ./parse_lto.py < bench_bitcoin-lto-vs-non-lto
* Runtime of benchmark FastRandom_1bit changed -7.9 % when enabling LTO. Median total time was 4.4 seconds without LTO and 4.1 seconds with LTO. Based on 14 independent runs of bench_bitcoin.
* Runtime of benchmark FastRandom_32bit changed -6.7 % when enabling LTO. Median total time was 5.8 seconds without LTO and 5.4 seconds with LTO. Based on 14 independent runs of bench_bitcoin.
* Runtime of benchmark MatchGCSFilter changed -11.5 % when enabling LTO. Median total time was 8.3 seconds without LTO and 7.3 seconds with LTO. Based on 14 independent runs of bench_bitcoin.
* Runtime of benchmark MempoolEviction changed -13.3 % when enabling LTO. Median total time was 4.6 seconds without LTO and 4.0 seconds with LTO. Based on 14 independent runs of bench_bitcoin.
* Runtime of benchmark PrevectorDeserializeNontrivial changed -58.1 % when enabling LTO. Median total time was 8.2 seconds without LTO and 3.4 seconds with LTO. Based on 14 independent runs of bench_bitcoin.
* Runtime of benchmark RollingBloom changed -15.0 % when enabling LTO. Median total time was 4.4 seconds without LTO and 3.7 seconds with LTO. Based on 14 independent runs of bench_bitcoin.

# Environment
$ clang++ --version | head -2
clang version 6.0.0-1ubuntu2 (tags/RELEASE_600/final)
Target: x86_64-pc-linux-gnu
$ dpkg -S $(which clang++)
clang: /usr/bin/clang++
$ dpkg -S /usr/lib/llvm-6.0/bin/llvm-ranlib
llvm-6.0: /usr/lib/llvm-6.0/bin/llvm-ranlib
$ dpkg -S /usr/lib/llvm-6.0/lib/LLVMgold.so
llvm-6.0-dev: /usr/lib/llvm-6.0/lib/LLVMgold.so
$ cat /etc/lsb-release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=18.04
DISTRIB_CODENAME=bionic
DISTRIB_DESCRIPTION="Ubuntu 18.04.1 LTS"
This is the content of parse_lto.py:
#!/usr/bin/env python3
"""Summarize bench_bitcoin timings recorded with and without LTO.

The log is read from standard input. Lines of the form "# with" and
"# without" mark which build produced the measurements that follow, lines
starting with "# Benchmark" (the column header) are skipped, and every other
non-blank line is a ", "-separated record whose first field is the benchmark
name and whose fourth field is the total time.

For every benchmark whose median total time differs by at least 5 % between
the two builds, one summary line is printed.
"""

import collections
import statistics
import sys

# Relative changes smaller than this are treated as noise and not reported.
NOISE_THRESHOLD = 0.05

REPORT_TEMPLATE = (
    "* Runtime of benchmark {} changed {:.1f} % when enabling LTO. "
    "Median total time was {:.1f} seconds without LTO and {:.1f} seconds with LTO. "
    "Based on {} independent runs of bench_bitcoin."
)


def parse_results(lines):
    """Collect the total times per benchmark for both build configurations.

    Args:
        lines: iterable of log lines (trailing newlines are tolerated).

    Returns:
        A ``(results_lto, results_nonlto)`` tuple of dicts mapping a benchmark
        name to the list of total times in the order they were read.

    Raises:
        ValueError: if a measurement is not preceded by a "# with" or
            "# without" marker, has fewer than four fields, or carries a
            total time that is not a number.
    """
    results = {
        "with": collections.defaultdict(list),
        "without": collections.defaultdict(list),
    }
    lto_status = None
    for line_number, raw_line in enumerate(lines, start=1):
        line = raw_line.rstrip("\r\n")
        if not line or line.startswith("# Benchmark"):
            continue
        if line.startswith("#"):
            # Marker line: everything after "# " names the configuration.
            lto_status = line[2:]
            continue
        # Explicit check instead of `assert`, which `python -O` would strip.
        if lto_status not in results:
            raise ValueError(
                "line {}: measurement is not preceded by a '# with' or "
                "'# without' marker (last marker: {!r})".format(
                    line_number, lto_status
                )
            )
        fields = line.split(", ", 4)
        if len(fields) < 4:
            raise ValueError(
                "line {}: expected at least 4 comma-separated fields, "
                "got {!r}".format(line_number, line)
            )
        benchmark, total_time = fields[0], float(fields[3])
        results[lto_status][benchmark].append(total_time)
    return results["with"], results["without"]


def summarize(lines, threshold=NOISE_THRESHOLD):
    """Return the report lines for a benchmark log.

    Args:
        lines: iterable of log lines, as accepted by ``parse_results``.
        threshold: minimum absolute relative change of the median for a
            benchmark to be reported.

    Returns:
        A list of report strings, ordered by benchmark name.

    Raises:
        ValueError: if a benchmark was measured under only one configuration
            or the median time without LTO is zero.
    """
    results_lto, results_nonlto = parse_results(lines)

    unpaired = sorted(set(results_lto) ^ set(results_nonlto))
    if unpaired:
        raise ValueError(
            "benchmarks measured under only one configuration: {}".format(
                ", ".join(unpaired)
            )
        )

    report = []
    for benchmark in sorted(results_lto):
        # The log may end in the middle of a round, so compare only as many
        # observations as both configurations have.
        runs = min(len(results_lto[benchmark]), len(results_nonlto[benchmark]))
        median_lto = statistics.median(results_lto[benchmark][:runs])
        median_nonlto = statistics.median(results_nonlto[benchmark][:runs])
        if median_nonlto == 0:
            raise ValueError(
                "benchmark {}: median time without LTO is zero".format(benchmark)
            )
        change = median_lto / median_nonlto - 1
        if abs(change) < threshold:
            continue
        report.append(
            REPORT_TEMPLATE.format(
                benchmark, 100 * change, median_nonlto, median_lto, runs
            )
        )
    return report


def main():
    """Read the benchmark log from stdin and print the summary to stdout."""
    for report_line in summarize(sys.stdin):
        print(report_line)


if __name__ == "__main__":
    main()