From 0242f958543361f63b87d59e70669151767901db Mon Sep 17 00:00:00 2001 From: Nelson Boss Date: Fri, 22 May 2026 16:39:13 +0800 Subject: [PATCH] [improvement](executor) use real elapsed time to compute workload group metrics refresh interval Replace the fixed config-based interval with the actual monotonic time delta between two refreshes when calculating per-second CPU and scan IO rates in WorkloadGroupMetrics, so the rates stay accurate even when the refresh thread is delayed or the configured interval is changed at runtime. Also add a guard against division by zero when two refreshes happen within less than one second, and add unit tests covering: - Real elapsed time rate computation - Sub-second interval safety (no division by zero) - Proportional rate vs interval relationship - Memory metrics correctness - First-refresh boundary behavior --- .../workload_group/workload_group_metrics.cpp | 9 +- .../workload_group/workload_group_metrics.h | 1 + .../workload_group_metrics_test.cpp | 159 ++++++++++++++++++ 3 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 be/test/runtime/workload_group/workload_group_metrics_test.cpp diff --git a/be/src/runtime/workload_group/workload_group_metrics.cpp b/be/src/runtime/workload_group/workload_group_metrics.cpp index 41da5af1e788e8..c951cb7c44b1e7 100644 --- a/be/src/runtime/workload_group/workload_group_metrics.cpp +++ b/be/src/runtime/workload_group/workload_group_metrics.cpp @@ -23,6 +23,7 @@ #include "runtime/workload_group/workload_group.h" #include "runtime/workload_management/io_throttle.h" #include "storage/olap_common.h" +#include "util/time.h" namespace doris { @@ -83,7 +84,13 @@ void WorkloadGroupMetrics::update_remote_scan_io_bytes(uint64_t delta_io_bytes) } void WorkloadGroupMetrics::refresh_metrics() { - int interval_second = config::workload_group_metrics_interval_ms / 1000; + uint64_t current_time_ms = MonotonicMillis(); + uint64_t interval_second = (current_time_ms - _last_refresh_time_ms) / 1000; + _last_refresh_time_ms = current_time_ms; + + if (interval_second == 0) { + return; + } // cpu uint64_t _current_cpu_time_nanos = _cpu_time_nanos.load(); diff --git a/be/src/runtime/workload_group/workload_group_metrics.h b/be/src/runtime/workload_group/workload_group_metrics.h index 67085d8374fa1d..ca04e0929ae2f8 100644 --- a/be/src/runtime/workload_group/workload_group_metrics.h +++ b/be/src/runtime/workload_group/workload_group_metrics.h @@ -73,6 +73,7 @@ class WorkloadGroupMetrics { std::atomic _cpu_time_nanos {0}; std::atomic _last_cpu_time_nanos {0}; + std::atomic _last_refresh_time_ms {0}; std::atomic _per_sec_cpu_time_nanos {0}; // used for system table std::atomic _per_sec_local_scan_bytes {0}; diff --git a/be/test/runtime/workload_group/workload_group_metrics_test.cpp b/be/test/runtime/workload_group/workload_group_metrics_test.cpp new file mode 100644 index 00000000000000..ec10e0a0e31c0b --- /dev/null +++ b/be/test/runtime/workload_group/workload_group_metrics_test.cpp @@ -0,0 +1,159 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "runtime/workload_group/workload_group_metrics.h" + +#include + +#include +#include + +#include "runtime/workload_group/workload_group.h" +#include "util/time.h" + +namespace doris { + +class WorkloadGroupMetricsTest : public testing::Test { +protected: + void SetUp() override { + WorkloadGroupInfo wg_info {.id = 1, .name = "test_wg"}; + _wg = std::make_shared(wg_info); + _metrics = std::make_unique(_wg.get()); + } + + void TearDown() override { + _metrics.reset(); + _wg.reset(); + } + + std::shared_ptr _wg; + std::unique_ptr _metrics; +}; + +// Test that refresh_metrics uses real elapsed time to compute per-second rates. +// After sleeping for a known interval, the per-second CPU rate should reflect +// the actual elapsed time rather than the config-based fixed interval. +TEST_F(WorkloadGroupMetricsTest, refresh_uses_real_elapsed_time) { + // First call to refresh_metrics to initialize _last_refresh_time_ms + _metrics->refresh_metrics(); + + // Add known CPU time: 2,000,000,000 nanos = 2 seconds of CPU + const uint64_t cpu_delta_nanos = 2000000000ULL; + _metrics->update_cpu_time_nanos(cpu_delta_nanos); + + // Sleep for ~2 seconds so the real interval is ~2 seconds + std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + + // Refresh metrics — should compute rate based on real ~2 second interval + _metrics->refresh_metrics(); + + // Expected: 2,000,000,000 nanos / 2 seconds = 1,000,000,000 nanos per second + // Allow tolerance for timing imprecision (the sleep may not be exactly 2s) + uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second(); + // With a 2 second interval and 2B nanos of CPU time added, + // the rate should be approximately 1B nanos/sec. + // Allow 50% tolerance due to scheduling variance. + EXPECT_GT(cpu_per_sec, 500000000ULL) + << "CPU per-second rate too low: " << cpu_per_sec + << " (expected ~1,000,000,000 with 2s interval)"; + EXPECT_LT(cpu_per_sec, 2500000000ULL) + << "CPU per-second rate too high: " << cpu_per_sec + << " (expected ~1,000,000,000 with 2s interval)"; +} + +// Test that when interval is less than 1 second, refresh_metrics does not +// cause division by zero and preserves previous rates. +TEST_F(WorkloadGroupMetricsTest, refresh_skips_when_interval_less_than_one_second) { + // First call to initialize _last_refresh_time_ms + _metrics->refresh_metrics(); + + // Add some CPU time + _metrics->update_cpu_time_nanos(1000000000ULL); // 1B nanos + + // Call refresh immediately (< 1 second elapsed) — should not crash + // and should not update per-second rates (interval_second == 0 → early return) + _metrics->refresh_metrics(); + + // Per-second rate should still be 0 (from the initial state) + // because the sub-second refresh skips the rate calculation + uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second(); + EXPECT_EQ(cpu_per_sec, 0) + << "CPU per-second rate should remain unchanged when interval < 1s"; +} + +// Test that different real intervals produce proportionally different rates. +// A shorter interval with the same delta should yield a higher per-second rate. +TEST_F(WorkloadGroupMetricsTest, shorter_interval_yields_higher_rate) { + // --- First measurement: 1 second interval --- + _metrics->refresh_metrics(); + _metrics->update_cpu_time_nanos(1000000000ULL); // 1B nanos + + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + _metrics->refresh_metrics(); + + uint64_t rate_1s = _metrics->get_cpu_time_nanos_per_second(); + + // --- Second measurement: add same delta, wait 2 seconds --- + _metrics->update_cpu_time_nanos(1000000000ULL); // another 1B nanos + + std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + _metrics->refresh_metrics(); + + uint64_t rate_2s = _metrics->get_cpu_time_nanos_per_second(); + + // With the same absolute delta (1B nanos) but double the interval, + // the per-second rate should be roughly half. + // rate_1s ~ 1B/1s = 1B + // rate_2s ~ 1B/2s = 500M + // Allow generous tolerance for timing jitter + EXPECT_GT(rate_1s, rate_2s) + << "1s interval rate (" << rate_1s << ") should be higher than 2s interval rate (" + << rate_2s << ")"; +} + +// Test that memory metrics are correctly reported +TEST_F(WorkloadGroupMetricsTest, memory_used_reported_correctly) { + const int64_t mem_used = 1024L * 1024 * 512; // 512 MB + _metrics->update_memory_used_bytes(mem_used); + _metrics->refresh_metrics(); + + // Need to wait > 1 second for refresh to take effect + std::this_thread::sleep_for(std::chrono::milliseconds(1100)); + _metrics->refresh_metrics(); + + EXPECT_EQ(_metrics->get_memory_used(), mem_used); +} + +// Test that the first refresh (from _last_refresh_time_ms == 0) does not produce +// unreasonable rates since the interval is very large (time since boot). +TEST_F(WorkloadGroupMetricsTest, first_refresh_produces_near_zero_rate) { + // Add some CPU time before the first refresh + _metrics->update_cpu_time_nanos(5000000000ULL); // 5B nanos + + // First refresh: interval = current_time_ms / 1000 (time since boot in seconds) + // For a system with uptime > 5 seconds, rate = 5B / uptime_seconds + // This should be small relative to the delta + _metrics->refresh_metrics(); + + uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second(); + // With system uptime of at least 60 seconds (reasonable assumption), + // rate = 5B / 60+ < 84M nanos/sec + EXPECT_LT(cpu_per_sec, 1000000000ULL) + << "First refresh rate should be modest since interval is system uptime"; +} + +} // namespace doris