From 5df72dd6c7deb39ef22b4cffa47ec3c2575a0cd1 Mon Sep 17 00:00:00 2001 From: "jinyong.choi" Date: Fri, 24 Apr 2026 21:21:21 +0900 Subject: [PATCH 1/2] engine: ignore duplicate STOP to prevent shutdown spin When FLB_ENGINE_STOP arrives more than once in quick succession (e.g. an input plugin's internal flb_engine_exit() followed by an external SIGTERM), the second invocation re-enters the STOP handler block in flb_engine_start() and resets config->event_shutdown.status to MK_EVENT_NONE while the shutdown timerfd is still registered in the kernel's epoll set. The event loop dispatcher then drops the timer event because of the 'status != MK_EVENT_NONE' guard in flb_event_load_bucket_queue(), but the level-triggered timerfd keeps reporting EPOLLIN. The pipeline thread busy-loops in epoll_wait() at 100% CPU, grace_count never advances, and the process fails to terminate. Swallow duplicate STOP messages at the flb_engine_manager() boundary once shutdown is already in progress (config->is_shutting_down is set by flb_engine_stop_ingestion() during the first STOP). The first STOP arms the shutdown timer and drives the grace flow; any further STOPs would only corrupt existing event state without benefit. Periodic work during shutdown (flushing, task draining, grace counter) is already handled by the 1s tick in the FLB_ENGINE_SHUTDOWN branch, so swallowing the duplicate is safe. 
Signed-off-by: jinyong.choi --- src/flb_engine.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/flb_engine.c b/src/flb_engine.c index c2786a1a247..5252a506d92 100644 --- a/src/flb_engine.c +++ b/src/flb_engine.c @@ -688,6 +688,16 @@ static inline int flb_engine_manager(flb_pipefd_t fd, struct flb_config *config) /* Flush all remaining data */ if (type == 1) { /* Engine type */ if (key == FLB_ENGINE_STOP) { + /* + * Re-entering the STOP handler in flb_engine_start() would reset + * config->event_shutdown.status while the shutdown timerfd is + * still registered, so the dispatcher drops the timer and the + * pipeline thread busy-loops on epoll. + */ + if (config->is_shutting_down) { + flb_debug("[engine] duplicate STOP ignored"); + return 0; + } flb_trace("[engine] flush enqueued data"); flb_engine_flush(config, NULL); return FLB_ENGINE_STOP; From 00a1c2b7faa829d873e430250b4689e46e7f3255 Mon Sep 17 00:00:00 2001 From: "jinyong.choi" Date: Fri, 24 Apr 2026 21:21:32 +0900 Subject: [PATCH 2/2] tests: runtime: add regression test for duplicate STOP shutdown spin Add core_shutdown_spin.c covering the duplicate-FLB_ENGINE_STOP busy-spin bug fixed in the previous commit. The test builds a minimal lib-input -> null-output pipeline, invokes flb_engine_exit() twice in quick succession, and asserts that flb_stop() returns within the grace period. A SIGALRM watchdog (SHUTDOWN_WATCHDOG_SEC=10) bounds the wait: if the guard regresses, pthread_join on the spinning worker never returns, the handler aborts the process with a visible FAIL message and exit code 1. This avoids relying on CTest's per-test timeout (1500s default) and surfaces the regression quickly regardless of how the binary is invoked. 
Signed-off-by: jinyong.choi --- tests/runtime/CMakeLists.txt | 4 ++ tests/runtime/core_shutdown_spin.c | 125 +++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 tests/runtime/core_shutdown_spin.c diff --git a/tests/runtime/CMakeLists.txt b/tests/runtime/CMakeLists.txt index 7bc6a7d7153..e0489e5250d 100644 --- a/tests/runtime/CMakeLists.txt +++ b/tests/runtime/CMakeLists.txt @@ -27,6 +27,10 @@ endmacro() FLB_RT_CORE_TEST(FLB_COROUTINE_TIMEOUT "core-timeout.c") FLB_RT_CORE_TEST(FLB_INTERNAL_LOGGER "core_internal_logger.c") FLB_RT_CORE_TEST(FLB_DOWNSTREAM_ACCEPT_TIMEOUT "core_accept_timeout.c") +# Uses POSIX alarm()/sigaction() for a watchdog; skip on Windows. +if(NOT FLB_SYSTEM_WINDOWS) + FLB_RT_CORE_TEST(FLB_CORE_SHUTDOWN_SPIN "core_shutdown_spin.c") +endif() FLB_RT_CORE_TEST(1 "http_client_chunked.c") FLB_RT_TEST(FLB_CHUNK_TRACE "core_chunk_trace.c") diff --git a/tests/runtime/core_shutdown_spin.c b/tests/runtime/core_shutdown_spin.c new file mode 100644 index 00000000000..961fb941366 --- /dev/null +++ b/tests/runtime/core_shutdown_spin.c @@ -0,0 +1,125 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2026 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <fluent-bit.h>
+#include <signal.h>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "flb_tests_runtime.h"
+
+#define SHUTDOWN_TIME_LIMIT_SEC 5   /* grace=2 + safety margin */
+#define SHUTDOWN_WATCHDOG_SEC   10
+
+/*
+ * SIGALRM watchdog handler, reached when flb_stop() has not returned within
+ * SHUTDOWN_WATCHDOG_SEC. It reports the failure on stderr and terminates the
+ * process with status 1, using only write() and _exit().
+ */
+static void timeout_abort(int sig)
+{
+    static const char msg[] =
+        "\nFAIL: flb_test_duplicate_stop_no_spin timed out; "
+        "shutdown spin regression likely present.\n";
+    (void) sig;
+    (void) write(STDERR_FILENO, msg, sizeof(msg) - 1);
+    _exit(1);
+}
+
+/*
+ * Regression: two back-to-back STOPs must not cause a shutdown busy-loop.
+ *
+ * Starts a lib -> null pipeline, calls flb_engine_exit() twice and checks
+ * that flb_stop() returns within SHUTDOWN_TIME_LIMIT_SEC. A failed
+ * flb_create() or flb_start() ends the test early.
+ */
+void flb_test_duplicate_stop_no_spin(void)
+{
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+    int64_t ret;
+    time_t start;
+    time_t elapsed;
+    struct sigaction sa;
+
+    ctx = flb_create();
+    if (!TEST_CHECK(ctx != NULL)) {
+        return;
+    }
+
+    TEST_CHECK(flb_service_set(ctx,
+                               "Flush", "1",
+                               "Grace", "2",
+                               "Log_Level", "info",
+                               NULL) == 0);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    TEST_CHECK(flb_input_set(ctx, in_ffd, "tag", "test", NULL) == 0);
+
+    out_ffd = flb_output(ctx, (char *) "null", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    TEST_CHECK(flb_output_set(ctx, out_ffd, "match", "*", NULL) == 0);
+
+    ret = flb_start(ctx);
+    if (!TEST_CHECK_(ret == 0, "starting engine")) {
+        /* Engine did not start: skip the STOP sequence, release the context. */
+        flb_destroy(ctx);
+        return;
+    }
+
+    /* Let the engine enter its main event loop. */
+    sleep(1);
+
+    TEST_CHECK(flb_engine_exit(ctx->config) >= 0);
+
+    /* Let the first STOP be processed before the second arrives. */
+    usleep(100 * 1000);
+
+    TEST_CHECK(flb_engine_exit(ctx->config) >= 0);
+
+    /* Bound flb_stop() so a regression fails fast. */
+    memset(&sa, 0, sizeof(sa));
+    sa.sa_handler = timeout_abort;
+    sigemptyset(&sa.sa_mask);
+    TEST_CHECK(sigaction(SIGALRM, &sa, NULL) == 0);
+    alarm(SHUTDOWN_WATCHDOG_SEC);
+
+    start = time(NULL);
+    ret = flb_stop(ctx);
+    elapsed = time(NULL) - start;
+
+    alarm(0);
+
+    TEST_CHECK_(ret == 0, "flb_stop returned %lld", (long long) ret);
+    TEST_CHECK_(elapsed <= SHUTDOWN_TIME_LIMIT_SEC,
+                "shutdown took %lds; expected <= %ds (shutdown spin?)",
+                (long) elapsed, SHUTDOWN_TIME_LIMIT_SEC);
+
+    flb_destroy(ctx);
+}
+
+/* Test list */
+TEST_LIST = {
+    {"duplicate_stop_no_spin", flb_test_duplicate_stop_no_spin},
+    {NULL, NULL}
+};