From 0242f958543361f63b87d59e70669151767901db Mon Sep 17 00:00:00 2001
From: Nelson Boss <bosswnx@qq.com>
Date: Fri, 22 May 2026 16:39:13 +0800
Subject: [PATCH] [improvement](executor) use real elapsed time to compute
 workload group metrics refresh interval

  Replace the fixed config-based interval with the actual monotonic time delta
  between two refreshes when calculating per-second CPU and scan IO rates in
  WorkloadGroupMetrics, so the rates stay accurate even when the refresh thread
  is delayed or the configured interval is changed at runtime.

  Also add a guard against division by zero when two refreshes happen within
  less than one second, and add unit tests covering:
  - Real elapsed time rate computation
  - Sub-second interval safety (no division by zero)
  - Proportional rate vs interval relationship
  - Memory metrics correctness
  - First-refresh boundary behavior
---
 .../workload_group/workload_group_metrics.cpp |   9 +-
 .../workload_group/workload_group_metrics.h   |   1 +
 .../workload_group_metrics_test.cpp           | 159 ++++++++++++++++++
 3 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 be/test/runtime/workload_group/workload_group_metrics_test.cpp

diff --git a/be/src/runtime/workload_group/workload_group_metrics.cpp b/be/src/runtime/workload_group/workload_group_metrics.cpp
index 41da5af1e788e8..c951cb7c44b1e7 100644
--- a/be/src/runtime/workload_group/workload_group_metrics.cpp
+++ b/be/src/runtime/workload_group/workload_group_metrics.cpp
@@ -23,6 +23,7 @@
 #include "runtime/workload_group/workload_group.h"
 #include "runtime/workload_management/io_throttle.h"
 #include "storage/olap_common.h"
+#include "util/time.h"
 
 namespace doris {
 
@@ -83,7 +84,13 @@ void WorkloadGroupMetrics::update_remote_scan_io_bytes(uint64_t delta_io_bytes)
 }
 
 void WorkloadGroupMetrics::refresh_metrics() {
-    int interval_second = config::workload_group_metrics_interval_ms / 1000;
+    uint64_t current_time_ms = MonotonicMillis();
+    uint64_t interval_second = (current_time_ms - _last_refresh_time_ms) / 1000;
+    _last_refresh_time_ms = current_time_ms;
+
+    if (interval_second == 0) {
+        return;
+    }
 
     // cpu
     uint64_t _current_cpu_time_nanos = _cpu_time_nanos.load();
diff --git a/be/src/runtime/workload_group/workload_group_metrics.h b/be/src/runtime/workload_group/workload_group_metrics.h
index 67085d8374fa1d..ca04e0929ae2f8 100644
--- a/be/src/runtime/workload_group/workload_group_metrics.h
+++ b/be/src/runtime/workload_group/workload_group_metrics.h
@@ -73,6 +73,7 @@ class WorkloadGroupMetrics {
 
     std::atomic<uint64_t> _cpu_time_nanos {0};
     std::atomic<uint64_t> _last_cpu_time_nanos {0};
+    std::atomic<uint64_t> _last_refresh_time_ms {0};
     std::atomic<uint64_t> _per_sec_cpu_time_nanos {0}; // used for system table
 
     std::atomic<uint64_t> _per_sec_local_scan_bytes {0};
diff --git a/be/test/runtime/workload_group/workload_group_metrics_test.cpp b/be/test/runtime/workload_group/workload_group_metrics_test.cpp
new file mode 100644
index 00000000000000..ec10e0a0e31c0b
--- /dev/null
+++ b/be/test/runtime/workload_group/workload_group_metrics_test.cpp
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/workload_group/workload_group_metrics.h"
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <thread>
+
+#include "runtime/workload_group/workload_group.h"
+#include "util/time.h"
+
+namespace doris {
+
+class WorkloadGroupMetricsTest : public testing::Test {
+protected:
+    void SetUp() override {
+        WorkloadGroupInfo wg_info {.id = 1, .name = "test_wg"};
+        _wg = std::make_shared<WorkloadGroup>(wg_info);
+        _metrics = std::make_unique<WorkloadGroupMetrics>(_wg.get());
+    }
+
+    void TearDown() override {
+        _metrics.reset();
+        _wg.reset();
+    }
+
+    std::shared_ptr<WorkloadGroup> _wg;
+    std::unique_ptr<WorkloadGroupMetrics> _metrics;
+};
+
+// Test that refresh_metrics uses real elapsed time to compute per-second rates.
+// After sleeping for a known interval, the per-second CPU rate should reflect
+// the actual elapsed time rather than the config-based fixed interval.
+TEST_F(WorkloadGroupMetricsTest, refresh_uses_real_elapsed_time) {
+    // First call to refresh_metrics to initialize _last_refresh_time_ms
+    _metrics->refresh_metrics();
+
+    // Add known CPU time: 2,000,000,000 nanos = 2 seconds of CPU
+    const uint64_t cpu_delta_nanos = 2000000000ULL;
+    _metrics->update_cpu_time_nanos(cpu_delta_nanos);
+
+    // Sleep for ~2 seconds so the real interval is ~2 seconds
+    std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+
+    // Refresh metrics — should compute rate based on real ~2 second interval
+    _metrics->refresh_metrics();
+
+    // Expected: 2,000,000,000 nanos / 2 seconds = 1,000,000,000 nanos per second
+    // Allow tolerance for timing imprecision (the sleep may not be exactly 2s)
+    uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second();
+    // With a 2 second interval and 2B nanos of CPU time added,
+    // the rate should be approximately 1B nanos/sec.
+    // Allow 50% tolerance due to scheduling variance.
+    EXPECT_GT(cpu_per_sec, 500000000ULL)
+            << "CPU per-second rate too low: " << cpu_per_sec
+            << " (expected ~1,000,000,000 with 2s interval)";
+    EXPECT_LT(cpu_per_sec, 2500000000ULL)
+            << "CPU per-second rate too high: " << cpu_per_sec
+            << " (expected ~1,000,000,000 with 2s interval)";
+}
+
+// Test that when interval is less than 1 second, refresh_metrics does not
+// cause division by zero and preserves previous rates.
+TEST_F(WorkloadGroupMetricsTest, refresh_skips_when_interval_less_than_one_second) {
+    // First call to initialize _last_refresh_time_ms
+    _metrics->refresh_metrics();
+
+    // Add some CPU time
+    _metrics->update_cpu_time_nanos(1000000000ULL); // 1B nanos
+
+    // Call refresh immediately (< 1 second elapsed) — should not crash
+    // and should not update per-second rates (interval_second == 0 → early return)
+    _metrics->refresh_metrics();
+
+    // Per-second rate should still be 0 (from the initial state)
+    // because the sub-second refresh skips the rate calculation
+    uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second();
+    EXPECT_EQ(cpu_per_sec, 0)
+            << "CPU per-second rate should remain unchanged when interval < 1s";
+}
+
+// Test that different real intervals produce proportionally different rates.
+// A shorter interval with the same delta should yield a higher per-second rate.
+TEST_F(WorkloadGroupMetricsTest, shorter_interval_yields_higher_rate) {
+    // --- First measurement: 1 second interval ---
+    _metrics->refresh_metrics();
+    _metrics->update_cpu_time_nanos(1000000000ULL); // 1B nanos
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+    _metrics->refresh_metrics();
+
+    uint64_t rate_1s = _metrics->get_cpu_time_nanos_per_second();
+
+    // --- Second measurement: add same delta, wait 2 seconds ---
+    _metrics->update_cpu_time_nanos(1000000000ULL); // another 1B nanos
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+    _metrics->refresh_metrics();
+
+    uint64_t rate_2s = _metrics->get_cpu_time_nanos_per_second();
+
+    // With the same absolute delta (1B nanos) but double the interval,
+    // the per-second rate should be roughly half.
+    // rate_1s ~ 1B/1s = 1B
+    // rate_2s ~ 1B/2s = 500M
+    // Allow generous tolerance for timing jitter
+    EXPECT_GT(rate_1s, rate_2s)
+            << "1s interval rate (" << rate_1s << ") should be higher than 2s interval rate ("
+            << rate_2s << ")";
+}
+
+// Test that memory metrics are correctly reported
+TEST_F(WorkloadGroupMetricsTest, memory_used_reported_correctly) {
+    const int64_t mem_used = 1024L * 1024 * 512; // 512 MB
+    _metrics->update_memory_used_bytes(mem_used);
+    _metrics->refresh_metrics();
+
+    // Need to wait > 1 second for refresh to take effect
+    std::this_thread::sleep_for(std::chrono::milliseconds(1100));
+    _metrics->refresh_metrics();
+
+    EXPECT_EQ(_metrics->get_memory_used(), mem_used);
+}
+
+// Test that the first refresh (from _last_refresh_time_ms == 0) does not produce
+// unreasonable rates since the interval is very large (time since boot).
+TEST_F(WorkloadGroupMetricsTest, first_refresh_produces_near_zero_rate) {
+    // Add some CPU time before the first refresh
+    _metrics->update_cpu_time_nanos(5000000000ULL); // 5B nanos
+
+    // First refresh: interval = current_time_ms / 1000 (time since boot in seconds)
+    // For a system with uptime > 5 seconds, rate = 5B / uptime_seconds
+    // This should be small relative to the delta
+    _metrics->refresh_metrics();
+
+    uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second();
+    // With system uptime of at least 60 seconds (reasonable assumption),
+    // rate = 5B / 60+ < 84M nanos/sec
+    EXPECT_LT(cpu_per_sec, 1000000000ULL)
+            << "First refresh rate should be modest since interval is system uptime";
+}
+
+} // namespace doris