From e6442ebea5bb6c4d3068a51a0ffeabb11a8b8f05 Mon Sep 17 00:00:00 2001 From: rgurunathan Date: Wed, 20 May 2026 15:22:32 -0400 Subject: [PATCH 1/2] #89 - Make Manager::shutdown() idempotent to prevent post-finalize log crash Every Manager backend's shutdown() begins with a DAQIRI_LOG_INFO call, and is invoked twice during the typical bench lifecycle: 1. Explicitly from main() via daqiri::shutdown(). 2. Again from the manager's destructor (or, for SocketMgr in RoCE mode, a destructor cascade into RdmaMgr::shutdown()) during C++ __cxa_finalize. By the time the destructor cascade fires, spdlog's default logger -- a function-local static created lazily on the first DAQIRI_LOG_INFO -- has already been destroyed. The DAQIRI_LOG_INFO at the top of the second shutdown() call then crashes inside spdlog::sink_it_. Repro on DGX Spark: daqiri_bench_rdma --mode both against examples/daqiri_bench_rdma_tx_rx_spark.yaml segfaults immediately after the legitimate shutdown completes. Backtrace shows __cxa_finalize -> ~SocketMgr -> SocketMgr::shutdown -> RdmaMgr::shutdown -> daqiri::log_formatted_message -> spdlog::logger::log_ -> spdlog::logger::sink_it_ -> SIGSEGV. Fix: short-circuit shutdown() on subsequent calls by returning early when initialized_ is false. Applied symmetrically to RdmaMgr, DpdkMgr, and SocketMgr -- the log-first body-second pattern is identical in all three. DpdkMgr's existing num_init reference-counted body is preserved; the guard only activates after the body has cleared initialized_ in the final shutdown. Verified by repeated daqiri_bench_rdma --mode both runs from a bash-parent shell. Pre-fix: SIGSEGV 100% reproducible. Post-fix: clean exit 0, all destructor markers run in order. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: rgurunathan --- src/managers/dpdk/daqiri_dpdk_mgr.cpp | 5 +++++ src/managers/rdma/daqiri_rdma_mgr.cpp | 6 ++++++ src/managers/socket/daqiri_socket_mgr.cpp | 6 ++++++ 3 files changed, 17 insertions(+) diff --git a/src/managers/dpdk/daqiri_dpdk_mgr.cpp b/src/managers/dpdk/daqiri_dpdk_mgr.cpp index de695dc..fc6151a 100644 --- a/src/managers/dpdk/daqiri_dpdk_mgr.cpp +++ b/src/managers/dpdk/daqiri_dpdk_mgr.cpp @@ -4403,6 +4403,11 @@ Status DpdkMgr::send_tx_burst(BurstParams* burst) { } void DpdkMgr::shutdown() { + // Idempotency guard: shutdown() may be invoked a second time via ~DpdkMgr + // during C++ __cxa_finalize, by which point the spdlog default logger has + // already been destroyed and any DAQIRI_LOG_INFO here crashes inside + // spdlog::sink_it_. Skip the body (and the log calls) if already torn down. + if (!initialized_) { return; } DAQIRI_LOG_INFO("daqiri DPDK manager shutdown called {}", num_init); if (--num_init == 0) { diff --git a/src/managers/rdma/daqiri_rdma_mgr.cpp b/src/managers/rdma/daqiri_rdma_mgr.cpp index df88152..e525eeb 100644 --- a/src/managers/rdma/daqiri_rdma_mgr.cpp +++ b/src/managers/rdma/daqiri_rdma_mgr.cpp @@ -1508,6 +1508,12 @@ void RdmaMgr::initialize() { } void RdmaMgr::shutdown() { + // Idempotency guard: shutdown() runs explicitly from the caller AND again + // from ~RdmaMgr / ~SocketMgr during C++ __cxa_finalize. By the second call + // the spdlog default logger (a function-local static created lazily on the + // first DAQIRI_LOG_INFO) has already been destroyed, so any logging here + // crashes inside spdlog::sink_it_. Skip the whole body on subsequent calls. + if (!initialized_) { return; } DAQIRI_LOG_INFO("RDMA manager shutting down"); rdma_force_quit.store(true); diff --git a/src/managers/socket/daqiri_socket_mgr.cpp b/src/managers/socket/daqiri_socket_mgr.cpp index 3ccec89..bca262a 100644 --- a/src/managers/socket/daqiri_socket_mgr.cpp +++ b/src/managers/socket/daqiri_socket_mgr.cpp @@ -466,6 +466,12 @@ void SocketMgr::clear_rx_queues() { } void SocketMgr::shutdown() { + // Idempotency guard: shutdown() may be invoked a second time via + // ~SocketMgr during C++ __cxa_finalize, after the spdlog default logger + // has been destroyed. Any DAQIRI_LOG_INFO from the cascade (here, the + // RoCE branch, or the manager method this delegates to) would then crash + // inside spdlog::sink_it_. Skip the whole body on subsequent calls. + if (!initialized_) { return; } if (is_roce_protocol()) { #if DAQIRI_MGR_RDMA if (roce_mgr_ != nullptr) { From cf0a6ca258420ce3d7ac52bb2a895d7916a532c1 Mon Sep 17 00:00:00 2001 From: rgurunathan Date: Wed, 20 May 2026 17:19:13 -0400 Subject: [PATCH 2/2] #89 - Tighten SocketMgr::shutdown() guard to preserve init-failure cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Greptile review of the original idempotency commit flagged the secondary `if (!initialized_ && !running_.load()) { return; }` in SocketMgr::shutdown() as having a dead `!initialized_` clause, since the new top-of-function guard `if (!initialized_) { return; }` already covers that state. Investigation surfaced the deeper concern: the top guard was too aggressive. SocketMgr::initialize() sets initialized_=false and running_=true before running setup, then sets initialized_=true on success. If setup_tcp_endpoint or setup_udp_endpoint throws after spawning an accept_thread or io_thread, the catch-block shutdown() call entered with initialized_=false and running_=true. Under the original top guard the cleanup body was skipped, leaving the worker threads joinable on the EndpointState — the destructor cascade would then std::terminate on an unjoined std::thread. Tighten the top guard to require both flags cleared. The post-shutdown re-entry from __cxa_finalize still fires (both flags cleared at the end of the body) while the init-failure cleanup path (running_=true) falls through and joins its threads. The pre-existing secondary check is now fully redundant and removed. DpdkMgr and RdmaMgr keep the simpler `if (!initialized_) { return; }` — neither has an init-failure shutdown() caller, so the asymmetry is intentional and isolated to the manager whose initialize() partially spawns threads before setting initialized_=true. Verified manually with both the existing DPDK / socket-udp / socket-tcp normal-shutdown smokes and a new 2-endpoint UDP init-failure repro (malformed remote IP on endpoint 2 → parse_ipv4_addr throws after endpoint 1's io_thread is spawned): rc=1, no SIGSEGV / SIGABRT / "terminate called" in stderr. Co-Authored-By: Claude Opus 4.7 Signed-off-by: rgurunathan --- src/managers/socket/daqiri_socket_mgr.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/managers/socket/daqiri_socket_mgr.cpp b/src/managers/socket/daqiri_socket_mgr.cpp index bca262a..9f16a66 100644 --- a/src/managers/socket/daqiri_socket_mgr.cpp +++ b/src/managers/socket/daqiri_socket_mgr.cpp @@ -471,7 +471,16 @@ void SocketMgr::shutdown() { // has been destroyed. Any DAQIRI_LOG_INFO from the cascade (here, the // RoCE branch, or the manager method this delegates to) would then crash // inside spdlog::sink_it_. Skip the whole body on subsequent calls. - if (!initialized_) { return; } + // + // The guard checks BOTH flags because initialize() sets initialized_=false + // and running_=true before running setup, then sets initialized_=true on + // success. If setup throws, the catch block calls shutdown() with + // initialized_=false and running_=true to clean up any threads/sockets + // that were spawned partway. Guarding on initialized_ alone would skip + // that cleanup. Both flags are only cleared together after a successful + // shutdown() body, so the post-shutdown re-entry from __cxa_finalize is + // the only case where the guard fires. + if (!initialized_ && !running_.load()) { return; } if (is_roce_protocol()) { #if DAQIRI_MGR_RDMA if (roce_mgr_ != nullptr) { @@ -482,8 +491,6 @@ void SocketMgr::shutdown() { return; } - if (!initialized_ && !running_.load()) { return; } - running_.store(false); for (auto& ep : endpoints_) {