Intermittent timeout in tsan feature_init.py #30586

issue maflcko opened this issue on August 5, 2024
  1. maflcko commented at 9:39 AM on August 5, 2024: member

    This happens intermittently with something like https://cirrus-ci.com/task/6520727065591808?logs=ci#L3416

    Remaining jobs: [feature_init.py]
    ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
    Timed out!
    

    I presume it is conceptually a duplicate of a similar report in valgrind, where it ignores the terminate signal: #30011 (comment)

  2. maflcko added the label Tests on Aug 5, 2024
  3. maflcko added the label CI failed on Aug 5, 2024
  4. maflcko commented at 10:30 AM on October 2, 2024: member

    Closing for now, but this can be reopened the next time it happens.

  5. maflcko closed this on Oct 2, 2024

  6. maflcko commented at 10:37 AM on October 7, 2024: member
  7. maflcko reopened this on Oct 7, 2024

  8. maflcko commented at 1:42 PM on November 4, 2024: member
  9. maflcko commented at 5:19 PM on May 6, 2025: member

    Not sure why CI runs into this so rarely. This reproduces easily locally on a fresh install of Ubuntu 25.04 Plucky:

        1  apt update && apt install -y git ccache build-essential cmake pkg-config python3-zmq libevent-dev libboost-dev libzmq3-dev libsqlite3-dev  libc++abi-dev libc++-dev clang llvm 
        2  git clone https://github.com/bitcoin/bitcoin --depth=1 ./bitcoin-core && cd bitcoin-core/
        8  cmake -B ./bld -DENABLE_IPC=OFF -DBUILD_TESTS=OFF -DBUILD_BENCH=OFF -DBUILD_FUZZ_BINARY=OFF -DWERROR=ON -DBUILD_GUI=OFF -DWITH_USDT=ON -DWITH_CCACHE=ON -DSANITIZERS=thread -DCMAKE_C_COMPILER="clang" -DCMAKE_CXX_COMPILER="clang++;-stdlib=libc++"  
        9  cmake --build ./bld -j $( nproc )  
    
    # TSAN_OPTIONS="suppressions=$(pwd)/test/sanitizer_suppressions/tsan:halt_on_error=1" ./bld/test/functional/feature_init.py --timeout-factor=4 
    2025-05-06T16:41:13.549000Z TestFramework (INFO): PRNG seed is: 5866693335731863023
    2025-05-06T16:41:13.550000Z TestFramework (INFO): Initializing test directory /tmp/bitcoin_func_test_iszlunlv
    2025-05-06T16:41:14.135000Z TestFramework (INFO): Test specifying custom pid file via -pid command line option
    2025-05-06T16:41:14.135000Z TestFramework (INFO): -> path relative to datadir (my_fancy_bitcoin_pid_file.foobar)
    2025-05-06T16:41:14.957000Z TestFramework (INFO): -> absolute path (/tmp/bitcoin_func_test_iszlunlv/my_fancy_bitcoin_pid_file.foobar)
    2025-05-06T16:41:15.625000Z TestFramework (INFO): Starting node and will terminate after line b'Validating signatures for all blocks'
    2025-05-06T16:41:17.063000Z TestFramework (INFO): Starting node and will terminate after line b'scheduler thread start'
    2025-05-06T16:41:17.484000Z TestFramework (INFO): Starting node and will terminate after line b'Starting HTTP server'
    2025-05-06T16:41:17.894000Z TestFramework (INFO): Starting node and will terminate after line b'Loading P2P addresses'
    2025-05-06T16:41:18.308000Z TestFramework (INFO): Starting node and will terminate after line b'Loading banlist'
    2025-05-06T16:41:18.722000Z TestFramework (INFO): Starting node and will terminate after line b'Loading block index'
    2025-05-06T16:41:19.132000Z TestFramework (INFO): Starting node and will terminate after line b'Checking all blk files are present'
    2025-05-06T16:41:19.554000Z TestFramework (INFO): Starting node and will terminate after line b'Loaded best chain:'
    2025-05-06T16:41:20.995000Z TestFramework (INFO): Starting node and will terminate after line b'init message: Verifying blocks'
    2025-05-06T16:41:21.420000Z TestFramework (INFO): Starting node and will terminate after line b'init message: Starting network threads'
    2025-05-06T16:41:21.979000Z TestFramework (INFO): Starting node and will terminate after line b'net thread start'
    2025-05-06T16:41:22.539000Z TestFramework (INFO): Starting node and will terminate after line b'addcon thread start'
    2025-05-06T16:41:24.112000Z TestFramework (INFO): Starting node and will terminate after line b'initload thread start'
    2025-05-06T16:41:24.675000Z TestFramework (INFO): Starting node and will terminate after line b'txindex thread start'
    2025-05-06T16:41:26.252000Z TestFramework (INFO): Starting node and will terminate after line b'block filter index thread start'
    2025-05-06T16:41:26.816000Z TestFramework (INFO): Starting node and will terminate after line b'coinstatsindex thread start'
      
    ( does not terminate )
    
    
    
  10. maflcko commented at 8:08 PM on May 6, 2025: member

    A workaround seems to be:

    diff --git a/test/functional/feature_init.py b/test/functional/feature_init.py
    index 15b3e85..1fef8e1 100755
    --- a/test/functional/feature_init.py
    +++ b/test/functional/feature_init.py
    @@ -44,7 +44,7 @@ class InitTest(BitcoinTestFramework):
                     # bitcoind to perform any shutdown logic.
                     os.kill(node.process.pid, signal.CTRL_BREAK_EVENT)
                 else:
    -                node.process.terminate()
    +                node.process.kill()
                 node.process.wait()
     
             def start_expecting_error(err_fragment):
    

    However, that will (obviously) change what is being tested in the test.

  11. fanquake commented at 12:42 PM on February 27, 2026: member

    Checked that this still reproduces on Ubuntu 26.04, using master https://github.com/bitcoin/bitcoin/commit/3a8b4e89f6db024e82f897cb88b1cbd00a0941cf.

  12. shuv-amp commented at 2:20 PM on February 27, 2026: none

    I’m preparing a small feature_init.py patch to avoid the intermittent TSAN timeout without changing the intended interrupt behavior. I’ll keep the diff narrow and include clear test results in the PR.

  13. shuv-amp commented at 3:17 PM on February 27, 2026: none

    I have a small patch ready for this.

    It changes feature_init.py to use a bounded wait (10 * timeout_factor) after sending SIGTERM in the interrupt stress helper, instead of waiting indefinitely.

    If that wait times out and TSAN is active (TSAN_OPTIONS is set), it falls back to SIGKILL and waits for exit. Outside TSAN, timeout stays a hard failure.

    I could not reproduce the original hang on macOS TSAN, but repeated local TSAN and non-TSAN runs pass with this change. If this direction looks good for the Ubuntu CI case in this issue, I can open a PR.

  14. shuv-amp commented at 5:02 PM on February 27, 2026: none

    I reran this in fresh Ubuntu 26.04 TSAN containers with a 180s per-run timeout, focused on the interrupt path:

    • upstream (interrupt-only): 6/8 passed, 2/8 timed out (rc=124)
    • patched (interrupt-only): 8/8 passed, 0 timed out
    • I also replayed the two seeds from the failing upstream runs on patched, and both passed

    One caveat from this environment: full feature_init.py is currently failing in break_wait_test() with RemoteDisconnected on both upstream and patched, so that looks unrelated to this change.

    If this direction looks reasonable, I can open a narrow PR with this patch and the test evidence above.

  15. maflcko commented at 5:22 PM on February 27, 2026: member

    @shuv-amp Yeah, I think it is trivial to work around the issue, but I'd like to understand it better first. I am not even sure if running under tsan is supported this way, since we are not instrumenting libcxx with tsan. C.f. commit b09af2ce508, which fixes it for the CI container. I guess the issue won't reproduce in the container on 26.04?

  16. shuv-amp commented at 5:26 PM on February 27, 2026: none

    Thanks, that makes sense.

    My local runs were in a plain Ubuntu 26.04 Docker environment with a local clang TSAN build, not in the CI TSAN setup with instrumented libc++ from b09af2c, so I agree the comparison is not exact.

    In that setup I can still hit the interrupt timeout intermittently, but I also get break_wait_test() failures (RemoteDisconnected) on both upstream and patched, which suggests the environment is noisy.

    Given that, I’ll hold off on opening a PR and first try to reproduce in the same TSAN container/config used by CI.

  17. maflcko commented at 5:58 PM on February 27, 2026: member

    C.f. commit b09af2c, which fixes it for the CI container. I guess the issue won't reproduce in the container on 26.04?

    Indeed. When I revert the commit via:

    * Unmerged path .cirrus.yml
    diff --git a/ci/test/00_setup_env_native_tsan.sh b/ci/test/00_setup_env_native_tsan.sh
    index 29dd06b..ebd91c7 100755
    --- a/ci/test/00_setup_env_native_tsan.sh
    +++ b/ci/test/00_setup_env_native_tsan.sh
    @@ -8,12 +8,10 @@ export LC_ALL=C.UTF-8
     
     export CONTAINER_NAME=ci_native_tsan
     export CI_IMAGE_NAME_TAG="mirror.gcr.io/ubuntu:24.04"
    -export APT_LLVM_V="22"
    -LIBCXX_DIR="/cxx_build/"
    -LIBCXX_FLAGS="-fsanitize=thread -nostdinc++ -nostdlib++ -isystem ${LIBCXX_DIR}include/c++/v1 -L${LIBCXX_DIR}lib -Wl,-rpath,${LIBCXX_DIR}lib -lc++ -lc++abi -lpthread -Wno-unused-command-line-argument"
    -export PACKAGES="clang-${APT_LLVM_V} llvm-${APT_LLVM_V} llvm-${APT_LLVM_V}-dev libclang-${APT_LLVM_V}-dev libclang-rt-${APT_LLVM_V}-dev python3-zmq python3-pip"
    +export APT_LLVM_VVV="21"
    +export PACKAGES="python3-pip clang-${APT_LLVM_VVV} llvm-${APT_LLVM_VVV} llvm-${APT_LLVM_VVV}-dev libclang-${APT_LLVM_VVV}-dev libclang-rt-${APT_LLVM_VVV}-dev libc++abi-${APT_LLVM_VVV}-dev libc++-${APT_LLVM_VVV}-dev python3-zmq"
     export PIP_PACKAGES="--break-system-packages pycapnp"
    -export DEP_OPTS="CC=clang CXX=clang++ CXXFLAGS='${LIBCXX_FLAGS}' NO_QT=1"
    +export DEP_OPTS="CC=clang CXX='clang++ -stdlib=libc++' NO_QT=1"
     export GOAL="install"
     export CI_LIMIT_STACK_SIZE=1
     export BITCOIN_CONFIG="\
    @@ -22,4 +20,3 @@ export BITCOIN_CONFIG="\
       -DSANITIZERS=thread \
       -DAPPEND_CPPFLAGS='-DARENA_DEBUG -DDEBUG_LOCKCONTENTION -D_LIBCPP_REMOVE_TRANSITIVE_INCLUDES' \
     "
    -export USE_INSTRUMENTED_LIBCPP="Thread"
    

    I get the bug when running this inside a 26.04 container:

    # TSAN_OPTIONS="suppressions=$(pwd)/test/sanitizer_suppressions/tsan:halt_on_error=1" /ci_container_base/ci/scratch/build-x86_64-pc-linux-gnu/test/functional/feature_init.py --timeout-factor=4  
    2026-02-27T17:54:51.102463Z TestFramework (INFO): PRNG seed is: 5189495325087692089
    2026-02-27T17:54:51.103297Z TestFramework (INFO): Initializing test directory /tmp/bitcoin_func_test_vgivgmyr
    2026-02-27T17:54:51.762085Z TestFramework (INFO): Test specifying custom pid file via -pid command line option
    2026-02-27T17:54:51.762318Z TestFramework (INFO): -> path relative to datadir (my_fancy_bitcoin_pid_file.foobar)
    2026-02-27T17:54:52.530073Z TestFramework (INFO): -> absolute path (/tmp/bitcoin_func_test_vgivgmyr/my_fancy_bitcoin_pid_file.foobar)
    2026-02-27T17:54:55.633405Z TestFramework (INFO): Starting node and will terminate after line b'Validating signatures for all blocks'
    2026-02-27T17:54:56.967267Z TestFramework (INFO): Starting node and will terminate after line b'scheduler thread start'
    2026-02-27T17:54:57.285640Z TestFramework (INFO): Starting node and will terminate after line b'Starting HTTP server'
    2026-02-27T17:54:57.614265Z TestFramework (INFO): Starting node and will terminate after line b'Loading P2P addresses'
    2026-02-27T17:54:57.935721Z TestFramework (INFO): Starting node and will terminate after line b'Loading banlist'
    2026-02-27T17:54:58.252494Z TestFramework (INFO): Starting node and will terminate after line b'Loading block index'
    2026-02-27T17:54:58.578925Z TestFramework (INFO): Starting node and will terminate after line b'Checking all blk files are present'
    2026-02-27T17:54:58.881776Z TestFramework (INFO): Starting node and will terminate after line b'Loaded best chain:'
    2026-02-27T17:54:59.178582Z TestFramework (INFO): Starting node and will terminate after line b'init message: Verifying blocks'
    2026-02-27T17:54:59.503738Z TestFramework (INFO): Starting node and will terminate after line b'init message: Starting network threads'
    2026-02-27T17:54:59.936891Z TestFramework (INFO): Starting node and will terminate after line b'net thread start'
    2026-02-27T17:55:01.391691Z TestFramework (INFO): Starting node and will terminate after line b'addcon thread start'
    2026-02-27T17:55:02.856179Z TestFramework (INFO): Starting node and will terminate after line b'initload thread start'
    2026-02-27T17:55:03.295662Z TestFramework (INFO): Starting node and will terminate after line b'txindex thread start'
    2026-02-27T17:55:04.740460Z TestFramework (INFO): Starting node and will terminate after line b'block filter index thread start'
    2026-02-27T17:55:06.186958Z TestFramework (INFO): Starting node and will terminate after line b'coinstatsindex thread start'
    
    (dead: the test hangs here and does not terminate)
    
    

    edit: Actually, even with

    diff --git a/ci/test/00_setup_env_native_tsan.sh b/ci/test/00_setup_env_native_tsan.sh
    index 29dd06b..88bbbe1 100755
    --- a/ci/test/00_setup_env_native_tsan.sh
    +++ b/ci/test/00_setup_env_native_tsan.sh
    @@ -7,8 +7,8 @@
     export LC_ALL=C.UTF-8
     
     export CONTAINER_NAME=ci_native_tsan
    -export CI_IMAGE_NAME_TAG="mirror.gcr.io/ubuntu:24.04"
    -export APT_LLVM_V="22"
    +export CI_IMAGE_NAME_TAG="mirror.gcr.io/ubuntu:26.04"
    +export APT_LLVM_V="21"
     LIBCXX_DIR="/cxx_build/"
     LIBCXX_FLAGS="-fsanitize=thread -nostdinc++ -nostdlib++ -isystem ${LIBCXX_DIR}include/c++/v1 -L${LIBCXX_DIR}lib -Wl,-rpath,${LIBCXX_DIR}lib -lc++ -lc++abi -lpthread -Wno-unused-command-line-argument"
     export PACKAGES="clang-${APT_LLVM_V} llvm-${APT_LLVM_V} llvm-${APT_LLVM_V}-dev libclang-${APT_LLVM_V}-dev libclang-rt-${APT_LLVM_V}-dev python3-zmq python3-pip"
    diff --git a/ci/test/01_base_install.sh b/ci/test/01_base_install.sh
    index f8d337f..ab848ae 100755
    --- a/ci/test/01_base_install.sh
    +++ b/ci/test/01_base_install.sh
    @@ -21,17 +21,6 @@ if [ -n "$DPKG_ADD_ARCH" ]; then
       dpkg --add-architecture "$DPKG_ADD_ARCH"
     fi
     
    -if [ -n "${APT_LLVM_V}" ]; then
    -  ${CI_RETRY_EXE} apt-get update
    -  ${CI_RETRY_EXE} apt-get install curl -y
    -  curl "https://apt.llvm.org/llvm-snapshot.gpg.key" | tee "/etc/apt/trusted.gpg.d/apt.llvm.org.asc"
    -  (
    -    # shellcheck disable=SC2034
    -    source /etc/os-release
    -    echo "deb http://apt.llvm.org/${VERSION_CODENAME}/ llvm-toolchain-${VERSION_CODENAME}-${APT_LLVM_V} main" > "/etc/apt/sources.list.d/llvm-toolchain-${VERSION_CODENAME}-${APT_LLVM_V}.list"
    -  )
    -fi
    -
     if command -v apk >/dev/null 2>&1; then
       ${CI_RETRY_EXE} apk update
       # shellcheck disable=SC2086
    @@ -57,7 +46,7 @@ if [ -n "$PIP_PACKAGES" ]; then
     fi
     
     if [[ -n "${USE_INSTRUMENTED_LIBCPP}" ]]; then
    -  ${CI_RETRY_EXE} git clone --depth=1 https://github.com/llvm/llvm-project -b "llvmorg-22.1.0" /llvm-project
    +  ${CI_RETRY_EXE} git clone --depth=1 https://github.com/llvm/llvm-project -b "llvmorg-21.1.0" /llvm-project
     
       cmake -G Ninja -B /cxx_build/ \
         -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \
    

    This fails.

    And even on 24.04 it passes with:

    diff --git a/ci/test/00_setup_env_native_tsan.sh b/ci/test/00_setup_env_native_tsan.sh
    index 29dd06b..1171286 100755
    --- a/ci/test/00_setup_env_native_tsan.sh
    +++ b/ci/test/00_setup_env_native_tsan.sh
    @@ -9,11 +9,9 @@ export LC_ALL=C.UTF-8
     export CONTAINER_NAME=ci_native_tsan
     export CI_IMAGE_NAME_TAG="mirror.gcr.io/ubuntu:24.04"
     export APT_LLVM_V="22"
    -LIBCXX_DIR="/cxx_build/"
    -LIBCXX_FLAGS="-fsanitize=thread -nostdinc++ -nostdlib++ -isystem ${LIBCXX_DIR}include/c++/v1 -L${LIBCXX_DIR}lib -Wl,-rpath,${LIBCXX_DIR}lib -lc++ -lc++abi -lpthread -Wno-unused-command-line-argument"
    -export PACKAGES="clang-${APT_LLVM_V} llvm-${APT_LLVM_V} llvm-${APT_LLVM_V}-dev libclang-${APT_LLVM_V}-dev libclang-rt-${APT_LLVM_V}-dev python3-zmq python3-pip"
    +export PACKAGES="python3-pip clang-${APT_LLVM_V} llvm-${APT_LLVM_V} llvm-${APT_LLVM_V}-dev libclang-${APT_LLVM_V}-dev libclang-rt-${APT_LLVM_V}-dev libc++abi-${APT_LLVM_V}-dev libc++-${APT_LLVM_V}-dev python3-zmq"
     export PIP_PACKAGES="--break-system-packages pycapnp"
    -export DEP_OPTS="CC=clang CXX=clang++ CXXFLAGS='${LIBCXX_FLAGS}' NO_QT=1"
    +export DEP_OPTS="CC=clang CXX='clang++ -stdlib=libc++' NO_QT=1"
     export GOAL="install"
     export CI_LIMIT_STACK_SIZE=1
     export BITCOIN_CONFIG="\
    @@ -22,4 +20,3 @@ export BITCOIN_CONFIG="\
       -DSANITIZERS=thread \
       -DAPPEND_CPPFLAGS='-DARENA_DEBUG -DDEBUG_LOCKCONTENTION -D_LIBCPP_REMOVE_TRANSITIVE_INCLUDES' \
     "
    -export USE_INSTRUMENTED_LIBCPP="Thread"
    

    So I guess it could be glibc?

  18. shuv-amp commented at 6:13 PM on February 27, 2026: none

    Thanks for checking and for the detailed repro, this is very helpful.

    Understood on the CI-container context and the b09af2c dependency. I’ll pause on opening a PR for now while this is being understood further.

    If useful, I can run additional targeted checks on my side and share logs.

  19. shuv-amp commented at 7:47 PM on February 27, 2026: none

    I reran this in fresh containerized CI-wrapper runs and still could not reproduce the hang locally.

    Matrix (all with --timeout-factor=4 and sequential feature_init.py x8):

    • current master TSAN setup (instrumented libc++ path): 8/8 passed
    • non-instrumented libc++ variant (revert-style env): 8/8 passed
    • same non-instrumented variant + my local feature_init.py patch: 8/8 passed

    One caveat: on this arm64 host, the exact llvm-21 package set from the snippet was not available in ubuntu:24.04, so I used llvm-20 for the non-instrumented variant.

    If you can share the exact failing CI combo (arch/image/toolchain), I can rerun in that exact setup.

  20. maflcko commented at 7:55 PM on February 27, 2026: member

    If you can share the exact failing CI combo (arch/image/toolchain), I can rerun in that exact setup.

    I haven't tried on aarch64, I think, but #30586 (comment) has the exact steps to reproduce on 25.04+ Ubuntu x86_64.

  21. shuv-amp commented at 7:57 PM on February 27, 2026: none

    Thanks, that helps.

    I’ll rerun using the exact #30586 comment steps on Ubuntu 25.04+ x86_64 and report back with full command output (including seed and where it stalls) before proposing any patch.

  22. shuv-amp commented at 10:39 PM on February 27, 2026: none

    I reran the repro in an Ubuntu 25.04 amd64 container, but on an ARM host this path is not reliable for TSAN.

    With default Docker seccomp, TSAN-related setup fails due to blocked personality(). With seccomp unconfined, the build proceeds further but then fails in the IPC/capnp generation step with:

    FATAL: ThreadSanitizer: memory layout is incompatible, even though ASLR is disabled.

    So I can’t get a trustworthy feature_init.py TSAN result from this emulated setup. I’ll rerun the exact steps on native x86_64 Ubuntu and report back.

  23. maflcko commented at 6:39 AM on February 28, 2026: member

    FATAL: ThreadSanitizer: memory layout is incompatible, even though ASLR is disabled.

    Have you tried following the ci Readme?

    ci/README.md:sudo sysctl -w vm.mmap_rnd_bits=28
    
  24. shuv-amp commented at 7:24 AM on February 28, 2026: none

    I only have macOS arm64 locally, and my amd64-emulated TSAN runs have shown environment-related failures, so I can’t validate this reliably in that setup. I’ll pause here until I can rerun on native Ubuntu x86_64 with vm.mmap_rnd_bits=28.

  25. maflcko commented at 11:43 AM on February 28, 2026: member

    I guess it could be https://github.com/golang/go/issues/18717?

    https://github.com/golang/go/blob/ff023a334125bb4edb43db352d2c6a5eb414d2e4/src/cmd/cgo/internal/testsanitizers/testdata/tsan10.go https://github.com/golang/go/blob/ff023a334125bb4edb43db352d2c6a5eb414d2e4/src/cmd/cgo/internal/testsanitizers/testdata/tsan11.go

    <details><summary>c++ code</summary>

    root@36e68ea6efaa:/b-c-ci# clang++ -std=c++11 -g -O3 -pthread mwe.cpp -o mwe -fsanitize=thread && ./mwe
    Sending SIGTERM specifically to the worker thread...
    Wait a bit to see if the signal handler is ever allowed to run
    --- HANG CONFIRMED ---
    TSan trapped the signal in the worker thread's local queue.
    The loop continues forever because the handler never ran.
    
    root@36e68ea6efaa:/b-c-ci# clang++ -std=c++11 -g -O3 -pthread mwe.cpp -o mwe && ./mwe
    Sending SIGTERM specifically to the worker thread...
    Wait a bit to see if the signal handler is ever allowed to run
    Worker: Signal handler ran -> Woke up and exiting! [109994
    Success: Signal handler executed.
    
    root@36e68ea6efaa:/b-c-ci# clang++ -std=c++11 -g -O3 -pthread mwe_2.cpp -o mwe -fsanitize=thread && ./mwe
    Sending SIGTERM specifically to worker thread...
    HANG CONFIRMED: TSan trapped the signal in the worker thread.
    
    root@36e68ea6efaa:/b-c-ci# clang++ -std=c++11 -g -O3 -pthread mwe_2.cpp -o mwe && ./mwe
    Sending SIGTERM specifically to worker thread...
    Success: Handler executed!
    
    #include <iostream>
    #include <thread>
    #include <csignal>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <atomic>
    
    // Raised by the signal handler so the main thread can report whether it ran.
    std::atomic<bool> flag{false};
    // Pipe descriptors; the handler writes to fds[1] and main reads from fds[0].
    int fds[2];
    
    // Signal handler: records that it ran, then pushes a single 'x' byte into
    // the pipe so that a reader blocked on the other end has data to return.
    void handle_signal(int sig) {
        flag.store(true);
        const char token = 'x';
        write(fds[1], &token, 1);
    }
    
    // Reproducer: installs handle_signal for SIGUSR1, blocks the main thread in a
    // raw read() syscall on the pipe, and has a helper thread send SIGUSR1 to this
    // process after two seconds. If the handler runs, its write to the pipe gives
    // the blocked read a byte to return.
    int main() {
        // Create the pipe (return value is not checked) and register the handler.
        pipe(fds);
        signal(SIGUSR1, handle_signal);
    
        // Helper thread: wait two seconds, announce, then signal our own pid.
        std::thread t([]() {
            sleep(2);
            std::cout << "[Child] Sending SIGUSR1..." << std::endl;
            kill(getpid(), SIGUSR1);
        });
    
        std::cout << "[Main] Blocking in raw read syscall..." << std::endl;
        char buf;
        // Use raw syscall to avoid libc interceptors if possible
        long res = syscall(SYS_read, fds[0], &buf, 1);
        
        // res > 0 means a byte was read from the pipe; anything else is reported
        // as a failed or interrupted read.
        if (res > 0) {
            std::cout << "[Main] Woke up! buf=" << buf << " flag=" << flag << std::endl;
        } else {
            std::cout << "[Main] Read failed or interrupted." << std::endl;
        }
    
        // Wait for the helper thread before exiting.
        t.join();
        return 0;
    }
    
    #include <iostream>
    #include <csignal>
    #include <pthread.h>
    #include <unistd.h>
    
    // Signal handler: prints a success line to standard output and then ends the
    // process immediately with exit status 0.
    void handler(int) {
        const char success_line[] = "Success: Handler executed!\n";
        // The result of write() is captured and discarded on purpose.
        const auto written = write(STDOUT_FILENO, success_line, sizeof(success_line) - 1);
        static_cast<void>(written);
        _exit(0);
    }
    
    // Thread entry point for the reproducer: loops forever issuing syscall
    // number 34 through inline assembly, so the loop body itself makes no libc
    // calls. The argument is unused, and the trailing return is never reached
    // because the loop has no exit.
    void* worker(void*) {
        // This loop bypasses libc entirely using raw assembly.
        // It perfectly mimics the 'Blind Spot' created by modern glibc's
        // internal syscall restarting loops (__syscall_cancel_arch).
        while (true) {
            long ret;
            // SYS_pause (34) sleeps the thread until a signal arrives.
            // Operands: the "a" register carries the syscall number in (34,
            // tied to operand 0) and the result out (ret); rcx, r11 and memory
            // are declared as clobbered.
            asm volatile (
                "syscall"
                : "=a" (ret)
                : "0" (34)
                : "rcx", "r11", "memory"
            );
            // Even when the syscall returns, we loop without calling any
            // TSAN-instrumented libc functions, so TSAN never checks its
            // deferred signal queue.
        }
        return nullptr;
    }
    
    // Reproducer driver: installs `handler` for SIGTERM, starts the worker
    // thread, directs SIGTERM at that thread, and reports a hang if the process
    // is still alive three seconds later.
    int main() {
        // Plain handler: empty signal mask, no flags.
        struct sigaction action = {};
        action.sa_flags = 0;
        sigemptyset(&action.sa_mask);
        action.sa_handler = handler;
        sigaction(SIGTERM, &action, nullptr);

        pthread_t worker_thread;
        pthread_create(&worker_thread, nullptr, worker, nullptr);

        // Give the worker a moment to reach its assembly loop.
        sleep(1);

        std::cout << "Sending SIGTERM specifically to worker thread..." << std::endl;
        pthread_kill(worker_thread, SIGTERM);

        // `handler` ends the process with _exit(0) when it runs, so getting
        // past this sleep means it was never invoked.
        sleep(3);

        std::cout << "HANG CONFIRMED: TSan trapped the signal in the worker thread." << std::endl;
        _exit(1);
    }
    
    #include <atomic>
    #include <chrono>
    #include <condition_variable>
    #include <csignal>
    #include <iostream>
    #include <mutex>
    #include <thread>
    #include <unistd.h>
    
    // We use volatile to ensure a pure memory access loop.
    // This prevents TSan from using the loop condition as a 'synchronization point'
    // that might accidentally flush the signal queue.
    volatile bool shutdown_requested{false};

    // SIGTERM handler: does nothing except raise the shutdown flag.
    void handle_sigterm(int /*signum*/)
    {
      shutdown_requested = true;
    }
    
    // Start-up handshake: begins set, and the worker clears it on entry so that
    // main's test_and_set() spin loop can tell the worker is running.
    // NOTE(review): constructing std::atomic_flag from `true` is not a
    // constructor the standard specifies — confirm the target standard library
    // accepts it.
    std::atomic_flag g_worker_waiting{true};

    // Worker thread body: signals that it has started, then spins on the
    // volatile `shutdown_requested` flag, counting iterations, and prints the
    // final count once the flag becomes true.
    void worker_loop() {
      // A purely CPU-bound loop mimicking heavy math/crypto.
      // Because this loop contains no libc calls (no printf, no sleep, no mutex),
      // TSan's deferred signal queue will never be checked.
      g_worker_waiting.clear();
      unsigned c = 0;
      while (!shutdown_requested) {
        c++;
      }

      // The iteration count is printed in brackets; the closing bracket was
      // missing from the original message.
      std::cout << "Worker: Signal handler ran -> Woke up and exiting! [" << c
                << "]" << std::endl;
    }
    
    // Reproducer driver: installs the SIGTERM handler, starts the busy-looping
    // worker, sends SIGTERM to that thread, then checks after three seconds
    // whether the handler managed to set `shutdown_requested`.
    int main() {
      // Register handle_sigterm for SIGTERM (empty mask, no flags).
      struct sigaction action = {};
      action.sa_flags = 0;
      sigemptyset(&action.sa_mask);
      action.sa_handler = handle_sigterm;
      sigaction(SIGTERM, &action, nullptr);

      // Launch the worker and spin until it has cleared the handshake flag,
      // i.e. until it is about to enter its tight loop.
      std::thread worker_thread(worker_loop);
      while (g_worker_waiting.test_and_set()) {
        std::this_thread::yield();
      }

      std::cout << "Sending SIGTERM specifically to the worker thread..."
                << std::endl;
      // The signal is aimed at the worker thread itself, not at the process,
      // so that it lands in that thread's own TSan queue.
      pthread_kill(worker_thread.native_handle(), SIGTERM);

      std::cout << "Wait a bit to see if the signal handler is ever allowed to run"
                << std::endl;
      std::this_thread::sleep_for(std::chrono::seconds(3));

      if (shutdown_requested) {
        std::cout << "Success: Signal handler executed." << std::endl;
        worker_thread.join();
        _exit(0);
      }

      // Flag still clear: the handler never ran and the worker is still spinning.
      std::cout << "--- HANG CONFIRMED ---" << std::endl;
      std::cout << "TSan trapped the signal in the worker thread's local queue."
                << std::endl;
      std::cout << "The loop continues forever because the handler never ran."
                << std::endl;
      _exit(1);
    }
    

    </details>

  26. maflcko commented at 7:03 PM on March 3, 2026: member

    A workaround could be to restore the legacy polling behavior removed in commit cd03513dc2fcccaa142e9632a28b38efd0056436. However, I don't understand why the signal handler would run far enough to set the atomic flag, but then fail to write to the pipe.

    Sample diff (this keeps the condition variable for Windows; if we go down that route, it may be better to remove the Windows-specific code as well):

    diff --git a/src/util/signalinterrupt.cpp b/src/util/signalinterrupt.cpp
    index 76db559..5e9026c 100644
    --- a/src/util/signalinterrupt.cpp
    +++ b/src/util/signalinterrupt.cpp
    @@ -9,3 +9,4 @@
     #else
    -#include <util/tokenpipe.h>
    +#include <util/time.h>
    +#include <chrono>
     #endif
    @@ -19,8 +20,2 @@ SignalInterrupt::SignalInterrupt() : m_flag{false}
     {
    -#ifndef WIN32
    -    std::optional<TokenPipe> pipe = TokenPipe::Make();
    -    if (!pipe) throw std::ios_base::failure("Could not create TokenPipe");
    -    m_pipe_r = pipe->TakeReadEnd();
    -    m_pipe_w = pipe->TakeWriteEnd();
    -#endif
     }
    @@ -34,5 +29,2 @@ bool SignalInterrupt::reset()
     {
    -    // Cancel existing interrupt by waiting for it, this will reset condition flags and remove
    -    // the token from the pipe.
    -    if (*this && !wait()) return false;
         m_flag = false;
    @@ -48,12 +40,3 @@ bool SignalInterrupt::operator()()
     #else
    -    // This must be reentrant and safe for calling in a signal handler, so using a condition variable is not safe.
    -    // Make sure that the token is only written once even if multiple threads call this concurrently or in
    -    // case of a reentrant signal.
    -    if (!m_flag.exchange(true)) {
    -        // Write an arbitrary byte to the write end of the pipe.
    -        int res = m_pipe_w.TokenWrite('x');
    -        if (res != 0) {
    -            return false;
    -        }
    -    }
    +    m_flag = true;
     #endif
    @@ -68,5 +51,4 @@ bool SignalInterrupt::wait()
     #else
    -    int res = m_pipe_r.TokenRead();
    -    if (res != 'x') {
    -        return false;
    +    while (!m_flag.load()) {
    +        UninterruptibleSleep(std::chrono::milliseconds{10});
         }
    diff --git a/src/util/signalinterrupt.h b/src/util/signalinterrupt.h
    index 027dd15..129dad9 100644
    --- a/src/util/signalinterrupt.h
    +++ b/src/util/signalinterrupt.h
    @@ -10,4 +10,2 @@
     #include <mutex>
    -#else
    -#include <util/tokenpipe.h>
     #endif
    @@ -39,7 +37,3 @@ private:
     
    -#ifndef WIN32
    -    // On UNIX-like operating systems use the self-pipe trick.
    -    TokenPipeEnd m_pipe_r;
    -    TokenPipeEnd m_pipe_w;
    -#else
    +#ifdef WIN32
         // On windows use a condition variable, since we don't have any signals there
    

github-metadata-mirror

This is a metadata mirror of the GitHub repository bitcoin/bitcoin. This site is not affiliated with GitHub. Content is generated from a GitHub metadata backup.
generated: 2026-04-24 09:13 UTC

This site is hosted by @0xB10C
More mirrored repositories can be found on mirror.b10c.me