From 1168c35460daa06c4a65fc06d5985b8f9607c42d Mon Sep 17 00:00:00 2001 From: Brian Marks Date: Wed, 27 May 2026 14:26:17 -0400 Subject: [PATCH 01/27] log-injection: capture thread dump of forked process on smoketest timeout (#11400) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit log-injection: capture thread dump of forked process on timeout When `waitForTraceCountAlive` times out, the existing diagnostic dumps the tail of the forked process's captured stdout. In all 49 observed `traceCount=0` failures since #11075, that dump has come back empty — the JVM is alive (RC polls=130+) but the output-capture thread appears starved, so we can't see what the agent is doing. Capture a thread dump via `jstack` instead. Its output is read by the smoketest JVM directly, bypassing the tested-process capture thread. No raw `kill -3` fallback — PID reuse on shared CI hosts could cause us to signal an unrelated process if the child has exited since the surrounding liveness check. Diagnostic-only, fires only on the timeout failure path. log-injection: harden dumpThreadStacks against masking errors Two issues in the previous commit's diagnostic path: 1. The body of dumpThreadStacks had no top-level try/catch — an unexpected throw (reflection error, NPE, etc.) would propagate out and replace the original AssertionError with a less useful one. Wrap the whole body and return a "(thread dump failed: ...)" string instead. 2. PID-reuse race: between the alive check at the call site and the jstack invocation, the child can exit and be reaped; if the OS reuses the PID we'd jstack an unrelated process and attach misleading diagnostics to the failure. Re-check isAlive() immediately before invoking jstack to narrow the race window. log-injection: spotless formatting log-injection: spotless fix for return-closure single-line Revert spotless single-line closure change The single-line closure format CI flagged earlier was a transient CI runner state (likely stale Groovy-Eclipse formatter cache); subsequent runs flagged the opposite direction. Reverting to the master format, which matches what the current CI runners accept. log-injection: print full thread dump to stdout, filter inline message CI Visibility caps error.message at ~5000 chars, which truncates the full jstack output mid-thread. Two changes to make sure no dump info is ever lost: 1. Print the full thread dump to stdout via println(). Gradle's test reporter captures stdout per-test, surfacing it in both the build log (visible in GitLab CI) and the test report XML. This becomes the source of truth when the inline dump is incomplete. 2. Filter the inline dump (in error.message) to prioritize the threads that explain a hang: main, dd-*/datadog-* agent threads, OkHttp threads, and anything BLOCKED. Known JVM boilerplate (Reference Handler, Finalizer, compiler threads, GC, etc.) is dropped. The result is capped at 4200 chars with an elision marker noting where to find the full dump. log-injection: drop misleading stdout-dump path The earlier println-the-full-dump-to-stdout idea was supposed to give us a fallback when CI Visibility truncates error.message at ~5000 chars. Verified in CI: that println goes to Gradle's per-test stdout capture, which lands in the test report XML — but the XML is not uploaded as a GitLab artifact, and CI Visibility doesn't capture test stdout either. So the "full dump" claim was misleading. Drop the println and the "(full dump on stdout)" marker. Bump the inline filter cap from 4200 to 4700 to use the full error.message budget. The filtered dump retains main + all dd-* agent threads + any BLOCKED thread — exactly what's needed to diagnose a wedged tracer under traceCount=0. Co-authored-by: devflow.devflow-routing-intake --- .../smoketest/LogInjectionSmokeTest.groovy | 201 +++++++++++++++++- 1 file changed, 200 insertions(+), 1 deletion(-) diff --git a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy index 5a640da4ae8..0e35061d8ee 100644 --- a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy +++ b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy @@ -3,6 +3,7 @@ package datadog.smoketest import com.squareup.moshi.Moshi import com.squareup.moshi.Types import datadog.environment.JavaVirtualMachine +import datadog.environment.OperatingSystem import datadog.trace.agent.test.server.http.TestHttpServer.HandlerApi.RequestApi import datadog.trace.api.config.GeneralConfig import datadog.trace.test.util.Flaky @@ -378,15 +379,213 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { // The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state def alive = testedProcess?.isAlive() def lastLines = tailProcessLog(30) + def threadDump = alive ? dumpThreadStacks() : "(process not alive, skipping thread dump)" throw new AssertionError( "Timed out waiting for ${count} traces after ${defaultPoll.timeout}s. " + "traceCount=${traceCount.get()}, process.alive=${alive}, " + "RC polls received: ${rcClientMessages.size()}.\n" + - "Last process output:\n${lastLines}", e) + "Last process output:\n${lastLines}\n" + + "Thread dump:\n${threadDump}", e) } traceCount.get() } + /** + * Capture a thread dump of the forked process via {@code jstack}. jstack's output is captured by + * the smoketest JVM directly, bypassing the tested-process output-capture thread that has been + * observed to be starved at timeout (which makes the SIGQUIT-via-stderr approach unreliable for + * exactly the failures we want to diagnose). + * + *

No raw {@code kill -3} fallback: PID reuse on shared CI hosts could cause us to signal an + * unrelated process if the child has exited since the surrounding liveness check. + */ + private String dumpThreadStacks() { + try { + if (testedProcess == null) { + return "(no tested process)" + } + if (OperatingSystem.isWindows()) { + return "(thread dump not supported on Windows)" + } + long pid = getTestedProcessPid() + if (pid <= 0) { + return "(could not determine pid)" + } + // Re-check liveness immediately before invoking jstack. The earlier check that gates this + // method runs ~1 statement away, but if the child has exited and been reaped since then, + // the OS may have reused the PID — jstack-ing the wrong process would attach misleading + // diagnostics to the test failure. + if (!testedProcess.isAlive()) { + return "(process exited between liveness check and dump; skipping to avoid PID reuse)" + } + String jstackOut = runJstack(pid) + if (jstackOut == null) { + return "(jstack not available or failed)" + } + return filterThreadDump(jstackOut) + } catch (Throwable t) { + // Never let a diagnostic failure mask the original AssertionError. + return "(thread dump failed: ${t.getClass().simpleName}: ${t.message})" + } + } + + // Approximate budget for the inline dump in error.message. Datadog CI Visibility caps + // error.message at ~5000 chars; this leaves a few hundred for the "Timed out waiting..." + // prefix and the elision marker. + private static final int INLINE_DUMP_CAP = 4700 + + /** + * Reduce a jstack thread dump to the threads most likely to explain a hang: the main thread, + * dd-trace agent threads (dd-*, datadog-*), OkHttp threads, and anything BLOCKED. Drops known + * JVM boilerplate (compiler/GC/reference handler/etc). Truncates to {@link #INLINE_DUMP_CAP} + * with an elision marker. + */ + private String filterThreadDump(String fullDump) { + int firstBlockIdx = fullDump.indexOf('\n"') + if (firstBlockIdx < 0) { + // No recognizable thread blocks — return the original, truncated if needed + return fullDump.length() > INLINE_DUMP_CAP + ? fullDump.substring(0, INLINE_DUMP_CAP) + "\n(truncated)" + : fullDump + } + String header = fullDump.substring(0, firstBlockIdx + 1) + String rest = fullDump.substring(firstBlockIdx + 1) + + List blocks = [] + int i = 0 + while (i < rest.length()) { + int next = rest.indexOf('\n"', i) + if (next < 0) { + blocks.add(rest.substring(i)) + break + } + blocks.add(rest.substring(i, next + 1)) + i = next + 1 + } + + List highPriority = [] + List lowPriority = [] + int boilerplateDropped = 0 + for (String block : blocks) { + def m = block =~ /^"([^"]+)"/ + String name = m.find() ? m.group(1) : '' + if (isBoilerplateThread(name)) { + boilerplateDropped++ + } else if (isHighPriorityThread(name, block)) { + highPriority.add(block) + } else { + lowPriority.add(block) + } + } + + StringBuilder out = new StringBuilder(header) + int elided = 0 + for (String block : highPriority + lowPriority) { + if (out.length() + block.length() + 120 > INLINE_DUMP_CAP) { + elided++ + continue + } + out.append(block) + } + if (boilerplateDropped > 0 || elided > 0) { + out.append("\n(elided ${boilerplateDropped} JVM-boilerplate thread(s)") + if (elided > 0) { + out.append(", elided ${elided} other thread(s) for size") + } + out.append(")") + } + return out.toString() + } + + private static boolean isBoilerplateThread(String name) { + if (name in [ + "Reference Handler", "Finalizer", "Signal Dispatcher", "Common-Cleaner", + "Service Thread", "Monitor Deflation Thread", "Notification Thread", + "Attach Listener", "process reaper", "Sweeper thread", "VM Thread", "VM Periodic Task Thread" + ]) { + return true + } + return name.startsWith("C1 CompilerThread") || + name.startsWith("C2 CompilerThread") || + name.startsWith("GC Thread") || + name.startsWith("G1 ") || + name.startsWith("ParGC ") || + name.startsWith("CMS ") + } + + private static boolean isHighPriorityThread(String name, String block) { + if (name == "main") { + return true + } + if (name.startsWith("dd-") || name.startsWith("datadog-")) { + return true + } + if (name.startsWith("OkHttp") || name.contains("okhttp")) { + return true + } + return block.contains("java.lang.Thread.State: BLOCKED") + } + + private long getTestedProcessPid() { + try { + return (long) testedProcess.getClass().getMethod("pid").invoke(testedProcess) + } catch (Throwable ignored) { + try { + // UNIXProcess's private 'pid' field, for JDK 8 compatibility + def field = testedProcess.getClass().getDeclaredField("pid") + field.setAccessible(true) + return field.getInt(testedProcess) as long + } catch (Throwable ignored2) { + return -1L + } + } + } + + private String runJstack(long pid) { + def candidates = [] + // java.home is always set by the JVM and points to the active JDK/JRE; prefer it over the + // JAVA_HOME env var which is frequently absent in CI runners even when Java is present. + String javaHome = System.getProperty("java.home") + if (javaHome) { + candidates.add(javaHome + "/bin/jstack") + } + String javaHomeEnv = System.getenv("JAVA_HOME") + if (javaHomeEnv && javaHomeEnv != javaHome) { + candidates.add(javaHomeEnv + "/bin/jstack") + } + candidates.add("jstack") + for (String cmd : candidates) { + File tmp = null + try { + tmp = File.createTempFile("jstack", ".txt") + // Redirect output to a file to avoid pipe-buffer deadlock — a full thread dump can + // exceed the OS pipe buffer (typically 64 KB) before waitFor returns. + Process p = new ProcessBuilder(cmd, String.valueOf(pid)) + .redirectErrorStream(true) + .redirectOutput(tmp) + .start() + if (!p.waitFor(5, SECONDS)) { + p.destroyForcibly() + p.waitFor(2, SECONDS) + continue + } + if (p.exitValue() == 0) { + String output = tmp.getText("UTF-8") + if (output) { + return output + } + } + } catch (Throwable ignored) { + // try next candidate + } finally { + if (tmp != null) { + tmp.delete() + } + } + } + return null + } + private String tailProcessLog(int lines) { try { def logFile = new File(logFilePath) From ff45ab072b32fa5e5bd01d0d4542aa7699847d35 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 May 2026 19:05:26 +0000 Subject: [PATCH 02/27] chore(ci): bump the gh-actions-packages group with 2 updates (#11464) chore(ci): bump the gh-actions-packages group with 2 updates Bumps the gh-actions-packages group with 2 updates: [github/codeql-action](https://github.com/github/codeql-action) and [actions/stale](https://github.com/actions/stale). Updates `github/codeql-action` from 4.35.5 to 4.36.0 - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/9e0d7b8d25671d64c341c19c0152d693099fb5ba...7211b7c8077ea37d8641b6271f6a365a22a5fbfa) Updates `actions/stale` from 10.2.0 to 10.3.0 - [Release notes](https://github.com/actions/stale/releases) - [Changelog](https://github.com/actions/stale/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/stale/compare/b5d41d4e1d5dceea10e7104786b73624c18a190f...eb5cf3af3ac0a1aa4c9c45633dd1ae542a27a899) --- updated-dependencies: - dependency-name: github/codeql-action dependency-version: 4.36.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: gh-actions-packages - dependency-name: actions/stale dependency-version: 10.3.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: gh-actions-packages ... Signed-off-by: dependabot[bot] Co-authored-by: devflow.devflow-routing-intake --- .github/workflows/analyze-changes.yaml | 6 +++--- .github/workflows/prune-old-pull-requests.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/analyze-changes.yaml b/.github/workflows/analyze-changes.yaml index 8622431686a..51f652b2b4b 100644 --- a/.github/workflows/analyze-changes.yaml +++ b/.github/workflows/analyze-changes.yaml @@ -30,7 +30,7 @@ jobs: ${{ runner.os }}-gradle- - name: Initialize CodeQL - uses: github/codeql-action/init@9e0d7b8d25671d64c341c19c0152d693099fb5ba # v4.35.5 + uses: github/codeql-action/init@7211b7c8077ea37d8641b6271f6a365a22a5fbfa # v4.36.0 with: languages: 'java' build-mode: 'manual' @@ -43,7 +43,7 @@ jobs: ./gradlew clean :dd-java-agent:shadowJar --build-cache --parallel --stacktrace --no-daemon --max-workers=4 - name: Perform CodeQL Analysis and upload results to GitHub Security tab - uses: github/codeql-action/analyze@9e0d7b8d25671d64c341c19c0152d693099fb5ba # v4.35.5 + uses: github/codeql-action/analyze@7211b7c8077ea37d8641b6271f6a365a22a5fbfa # v4.36.0 trivy: name: Analyze changes with Trivy @@ -102,7 +102,7 @@ jobs: TRIVY_JAVA_DB_REPOSITORY: ghcr.io/aquasecurity/trivy-java-db,public.ecr.aws/aquasecurity/trivy-java-db - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@9e0d7b8d25671d64c341c19c0152d693099fb5ba # v4.35.5 + uses: github/codeql-action/upload-sarif@7211b7c8077ea37d8641b6271f6a365a22a5fbfa # v4.36.0 if: always() with: sarif_file: 'trivy-results.sarif' diff --git a/.github/workflows/prune-old-pull-requests.yaml b/.github/workflows/prune-old-pull-requests.yaml index e5633a39415..db92774ea0c 100644 --- a/.github/workflows/prune-old-pull-requests.yaml +++ b/.github/workflows/prune-old-pull-requests.yaml @@ -13,7 +13,7 @@ jobs: pull-requests: write steps: - name: Prune old pull requests - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v10.2.0 + uses: actions/stale@eb5cf3af3ac0a1aa4c9c45633dd1ae542a27a899 # v10.3.0 with: days-before-stale: -1 # Disable general stale bot days-before-pr-stale: 90 # Only enable stale bot for PRs with no activity for 90 days From 00e6aa4ebbd32c3d6079d8ac763cdab75bdcb66c Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Wed, 27 May 2026 22:15:38 +0200 Subject: [PATCH 03/27] Bump java-profiler to 1.43.0 for critical stability fixes (#11475) Bump java-profiler to 1.43.0 for critical stability fixes Co-authored-by: jaroslav.bachorik --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 6abe7896aab..2014db87f13 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -8,7 +8,7 @@ gradle-tooling-api = "8.14.5" spotbugs_annotations = "4.9.8" # DataDog libs and forks -ddprof = "1.42.0" +ddprof = "1.43.0" dogstatsd = "4.4.5" okhttp = "3.12.15" # Datadog fork to support Java 7 From 56c6e875af15ce09c7b52f3d4ae1170e4907530d Mon Sep 17 00:00:00 2001 From: Sarah Chen Date: Wed, 27 May 2026 18:03:35 -0400 Subject: [PATCH 04/27] Clean up dependency_age script (#11477) Clean up dependency age script Co-authored-by: sarah.chen --- .github/scripts/dependency_age.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/scripts/dependency_age.py b/.github/scripts/dependency_age.py index 62e27422e2e..9c0a77a8bc2 100644 --- a/.github/scripts/dependency_age.py +++ b/.github/scripts/dependency_age.py @@ -452,8 +452,10 @@ def build_validation_summary( if old_gav not in seen: seen.add(old_gav) if new_gav in baseline_coords: - continue # no-op downgrade — replacement matches baseline - lines.append(f"- `{old_gav}` is {hours_remaining}h away from meeting {min_age_hours}h cooldown, updated to `{new_gav}`") + lines.append(f"- `{old_gav}` is {hours_remaining}h away from meeting {min_age_hours}h cooldown, reverted") + else: + new_version = new_gav.rsplit(":", 1)[1] + lines.append(f"- `{old_gav}` is {hours_remaining}h away from meeting {min_age_hours}h cooldown, updated to `{new_version}`") for entries in violations_by_file.values(): for gav, kind, hours_remaining in entries: if gav not in seen: @@ -462,8 +464,6 @@ def build_validation_summary( lines.append(f"- `{gav}` — cannot verify age, reverted") else: lines.append(f"- `{gav}` is {hours_remaining}h away from meeting {min_age_hours}h cooldown, reverted") - if len(lines) == 2: - return "" # only header, no entries after filtering return "\n".join(lines) From 8130646e6607d13448179401d2a9e0c37781a484 Mon Sep 17 00:00:00 2001 From: Brice Dutheil Date: Thu, 28 May 2026 10:47:08 +0200 Subject: [PATCH 05/27] Fix runner dashboard image filters (#11467) Fix runner dashboard image filters Co-authored-by: brice.dutheil --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8f19a760870..7188f670f3a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -161,8 +161,8 @@ default: fi echo -e "${TEXT_BOLD}${TEXT_YELLOW}Runner dashboard, these are live (limited to a 1-hour window, and expire after 36 hours)${TEXT_CLEAR}" - echo -e "${TEXT_BOLD}${TEXT_YELLOW} Containers:${TEXT_CLEAR} https://app.datadoghq.com/containers?${TIME_PARAMS}query=image_name%3A%2A%2Fdatadog%2Fdd-trace-java-docker-build%20AND%20pod_name%3A${POD_NAME}&live=false" - echo -e "${TEXT_BOLD}${TEXT_YELLOW} Processes:${TEXT_CLEAR} https://app.datadoghq.com/process?${TIME_PARAMS}query=image_name%3A%2A%2Fdatadog%2Fdd-trace-java-docker-build%20AND%20pod_name%3A${POD_NAME}&live=false" + echo -e "${TEXT_BOLD}${TEXT_YELLOW} Containers:${TEXT_CLEAR} https://app.datadoghq.com/containers?${TIME_PARAMS}query=image_name%3A%2A%2Fdd-trace-java-docker-build%20AND%20pod_name%3A${POD_NAME}&live=false" + echo -e "${TEXT_BOLD}${TEXT_YELLOW} Processes:${TEXT_CLEAR} https://app.datadoghq.com/process?${TIME_PARAMS}query=image_name%3A%2A%2Fdd-trace-java-docker-build%20AND%20pod_name%3A${POD_NAME}&live=false" .gitlab_base_ref_params: &gitlab_base_ref_params - | From ff6fca7d22e1594ccf11222a6975af0af2e21d91 Mon Sep 17 00:00:00 2001 From: Brice Dutheil Date: Thu, 28 May 2026 11:14:36 +0200 Subject: [PATCH 06/27] chore: add VisibleForTesting annotation (#11473) chore: add VisibleForTesting annotation Co-authored-by: brice.dutheil --- .../serialization/FlushingBuffer.java | 3 ++- components/annotations/build.gradle.kts | 1 + .../trace/api/internal/VisibleForTesting.java | 10 ++++++++++ components/environment/build.gradle.kts | 4 ++++ .../datadog/environment/JavaVirtualMachine.java | 3 ++- .../java/datadog/environment/JvmOptions.java | 3 ++- .../trace/bootstrap/InstrumentationErrors.java | 3 ++- .../trace/bootstrap/WeakMapContextStore.java | 4 +++- .../blocking/BlockingActionHelper.java | 3 ++- .../instrumentation/dbm/SharedDBCommenter.java | 3 ++- .../datadog/crashtracking/ConfigManager.java | 7 ++++--- .../datadog/crashtracking/CrashUploader.java | 11 ++++++----- .../CrashUploaderScriptInitializer.java | 5 +++-- .../OOMENotifierScriptInitializer.java | 3 ++- .../crashtracking/parsers/RedactUtils.java | 3 ++- .../debugger/agent/ConfigurationUpdater.java | 3 ++- .../com/datadog/debugger/sink/DebuggerSink.java | 3 ++- .../debugger/uploader/BatchUploader.java | 8 +++----- .../trace/logging/PrintStreamWrapper.java | 3 ++- .../shim/logs/OtelLogRecordBuilder.java | 4 ++-- .../ddprof/DatadogProfilerOngoingRecording.java | 3 ++- .../openjdk/OpenJdkOngoingRecording.java | 3 ++- .../openjdk/OpenJdkRecordingData.java | 3 ++- .../openjdk/events/SmapEntryCache.java | 7 ++++--- .../oracle/OracleJdkOngoingRecording.java | 3 ++- .../profiling/controller/ProfilingSystem.java | 3 ++- .../profiling/ddprof/DatadogProfiler.java | 3 ++- .../ddprof/DatadogProfilerRecording.java | 5 +++-- .../profiling/uploader/ProfileUploader.java | 11 +++-------- .../profiling/agent/CompositeController.java | 3 ++- .../agent/ScrubRecordingDataListener.java | 3 ++- dd-java-agent/build.gradle | 1 + .../datadog/trace/bootstrap/AgentPreCheck.java | 5 +++-- .../common/writer/TraceStructureWriter.java | 3 ++- .../common/writer/ddagent/TraceMapperV0_5.java | 5 +++-- .../java/datadog/trace/core/CoreTracer.java | 3 ++- .../main/java/datadog/trace/core/DDSpan.java | 5 +++-- .../trace/core/LongRunningTracesTracker.java | 5 +++-- .../java/datadog/trace/core/PendingTrace.java | 17 +++++++++-------- .../datadog/trace/core/PendingTraceBuffer.java | 7 ++++--- .../trace/core/propagation/W3CHttpCodec.java | 8 ++++---- .../tagprocessor/HttpEndpointPostProcessor.java | 4 ++-- .../tagprocessor/PeerServiceCalculator.java | 3 ++- internal-api/build.gradle.kts | 1 + .../java/datadog/trace/api/ProcessTags.java | 10 +++++----- .../api/civisibility/config/BazelMode.java | 5 +++-- .../trace/api/datastreams/TransactionInfo.java | 3 ++- .../api/profiling/ProfilerFlareLogger.java | 9 +++++---- .../api/remoteconfig/ServiceNameCollector.java | 3 ++- .../datadog/trace/util/ProcessSupervisor.java | 3 ++- .../datadog/trace/util/TempLocationManager.java | 7 ++++--- settings.gradle.kts | 1 + utils/config-utils/build.gradle.kts | 1 + .../trace/api/env/CapturedEnvironment.java | 16 ++++------------ .../src/main/java/datadog/logging/IOLogger.java | 3 ++- .../java/datadog/logging/RatelimitedLogger.java | 3 ++- 56 files changed, 156 insertions(+), 109 deletions(-) create mode 100644 components/annotations/build.gradle.kts create mode 100644 components/annotations/src/main/java/datadog/trace/api/internal/VisibleForTesting.java diff --git a/communication/src/main/java/datadog/communication/serialization/FlushingBuffer.java b/communication/src/main/java/datadog/communication/serialization/FlushingBuffer.java index 332434ad46a..e3087bd5f27 100644 --- a/communication/src/main/java/datadog/communication/serialization/FlushingBuffer.java +++ b/communication/src/main/java/datadog/communication/serialization/FlushingBuffer.java @@ -1,5 +1,6 @@ package datadog.communication.serialization; +import datadog.trace.api.internal.VisibleForTesting; import java.nio.ByteBuffer; public final class FlushingBuffer implements StreamingBuffer { @@ -106,7 +107,7 @@ public void reset() { mark = 0; } - // for tests only + @VisibleForTesting int getMessageCount() { return messageCount; } diff --git a/components/annotations/build.gradle.kts b/components/annotations/build.gradle.kts new file mode 100644 index 00000000000..b8314ee7f33 --- /dev/null +++ b/components/annotations/build.gradle.kts @@ -0,0 +1 @@ +apply(from = "$rootDir/gradle/java.gradle") diff --git a/components/annotations/src/main/java/datadog/trace/api/internal/VisibleForTesting.java b/components/annotations/src/main/java/datadog/trace/api/internal/VisibleForTesting.java new file mode 100644 index 00000000000..60e3cd2c33a --- /dev/null +++ b/components/annotations/src/main/java/datadog/trace/api/internal/VisibleForTesting.java @@ -0,0 +1,10 @@ +package datadog.trace.api.internal; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.SOURCE) +@Target({ElementType.CONSTRUCTOR, ElementType.FIELD, ElementType.METHOD, ElementType.TYPE}) +public @interface VisibleForTesting {} diff --git a/components/environment/build.gradle.kts b/components/environment/build.gradle.kts index 7818fef2463..82c834d5ef3 100644 --- a/components/environment/build.gradle.kts +++ b/components/environment/build.gradle.kts @@ -5,6 +5,10 @@ plugins { apply(from = "$rootDir/gradle/java.gradle") +dependencies { + compileOnly(project(":components:annotations")) +} + /* * Add an addition gradle configuration to be consumed by bootstrap only. * "datadog.trace." prefix is required to be excluded from Jacoco instrumentation. diff --git a/components/environment/src/main/java/datadog/environment/JavaVirtualMachine.java b/components/environment/src/main/java/datadog/environment/JavaVirtualMachine.java index a71d66859bc..d6fe120c98b 100644 --- a/components/environment/src/main/java/datadog/environment/JavaVirtualMachine.java +++ b/components/environment/src/main/java/datadog/environment/JavaVirtualMachine.java @@ -1,5 +1,6 @@ package datadog.environment; +import datadog.trace.api.internal.VisibleForTesting; import java.util.List; import java.util.Locale; import javax.annotation.Nullable; @@ -212,7 +213,7 @@ public Runtime() { SystemProperties.get("java.vendor.version")); } - // Only visible for testing + @VisibleForTesting Runtime(String javaVer, String rtVer, String name, String vendor, String vendorVersion) { this.name = name == null ? "" : name; this.vendor = vendor == null ? "" : vendor; diff --git a/components/environment/src/main/java/datadog/environment/JvmOptions.java b/components/environment/src/main/java/datadog/environment/JvmOptions.java index ef549bc4d73..7dd77e29ff9 100644 --- a/components/environment/src/main/java/datadog/environment/JvmOptions.java +++ b/components/environment/src/main/java/datadog/environment/JvmOptions.java @@ -5,6 +5,7 @@ import static java.util.Collections.emptyList; import static java.util.Collections.singletonList; +import datadog.trace.api.internal.VisibleForTesting; import de.thetaphi.forbiddenapis.SuppressForbidden; import java.io.BufferedReader; import java.io.IOException; @@ -93,7 +94,7 @@ private List findVmOptions() { // Be aware that when running a native image, the command line in /proc/self/cmdline is just the // executable - // Visible for testing + @VisibleForTesting List findVmOptionsFromProcFs(String[] procfsCmdline) { // Create the list of VM options List vmOptions = new ArrayList<>(); diff --git a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/InstrumentationErrors.java b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/InstrumentationErrors.java index c18534a73e9..b2bc1702805 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/InstrumentationErrors.java +++ b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/InstrumentationErrors.java @@ -1,5 +1,6 @@ package datadog.trace.bootstrap; +import datadog.trace.api.internal.VisibleForTesting; import java.io.PrintWriter; import java.io.StringWriter; import java.util.List; @@ -30,7 +31,7 @@ public static Throwable recordError(Throwable error) { return error; // keep throwable at top of the stack } - // Visible for testing + @VisibleForTesting public static void resetErrors() { COUNTER.set(0); if (detailed) { diff --git a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/WeakMapContextStore.java b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/WeakMapContextStore.java index a3926412889..1750b7f2b9c 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/WeakMapContextStore.java +++ b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/WeakMapContextStore.java @@ -1,5 +1,7 @@ package datadog.trace.bootstrap; +import datadog.trace.api.internal.VisibleForTesting; + /** * Weak {@link ContextStore} that acts as a fall-back when field-injection isn't possible. * @@ -85,7 +87,7 @@ public V remove(final K key) { return (V) map.remove(key); } - // Package reachable for testing + @VisibleForTesting int size() { return map.size(); } diff --git a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/blocking/BlockingActionHelper.java b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/blocking/BlockingActionHelper.java index fbe5e150367..e72a86a7098 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/blocking/BlockingActionHelper.java +++ b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/blocking/BlockingActionHelper.java @@ -6,6 +6,7 @@ import datadog.appsec.api.blocking.BlockingContentType; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; @@ -187,7 +188,7 @@ private static String nextMediaRange(String s, int[] pos, float[] quality) { return mediaRangeMatcher.group(1); } - // public for testing + @VisibleForTesting public static void reset(Config config) { TEMPLATE_HTML = null; TEMPLATE_JSON = null; diff --git a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/dbm/SharedDBCommenter.java b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/dbm/SharedDBCommenter.java index 161e2f8ad11..3dbf916362c 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/dbm/SharedDBCommenter.java +++ b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/dbm/SharedDBCommenter.java @@ -4,6 +4,7 @@ import datadog.trace.api.BaseHash; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.instrumentation.api.AgentSpan; import datadog.trace.bootstrap.instrumentation.api.Tags; import java.io.UnsupportedEncodingException; @@ -88,7 +89,7 @@ private static void ensureStaticPrefixComputed() { staticPrefixComputed = true; } - // @VisibleForTesting + @VisibleForTesting public static void resetStaticPrefixForTesting() { staticPrefixComputed = false; } diff --git a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/ConfigManager.java b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/ConfigManager.java index 02c10016d83..99e6235a6b2 100644 --- a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/ConfigManager.java +++ b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/ConfigManager.java @@ -8,6 +8,7 @@ import datadog.trace.api.Config; import datadog.trace.api.ProcessTags; import datadog.trace.api.WellKnownTags; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.util.PidHelper; import datadog.trace.util.RandomUtils; import java.io.BufferedReader; @@ -137,7 +138,7 @@ public Builder extendedInfoEnabled(boolean extendedInfoEnabled) { return this; } - // @VisibleForTesting + @VisibleForTesting Builder reportUUID(String reportUUID) { this.reportUUID = reportUUID; return this; @@ -170,7 +171,7 @@ private static String getBaseName(File file) { return filename.substring(0, dotIndex); } - // @VisibleForTesting + @VisibleForTesting static String getMergedTagsForSerialization(Config config) { return config.getMergedCrashTrackingTags().entrySet().stream() .filter(e -> e.getValue() != null) @@ -195,7 +196,7 @@ public static void writeConfigToPath(File scriptFile, String... additionalEntrie writeConfigToFile(Config.get(), cfgFile, additionalEntries); } - // @VisibleForTesting + @VisibleForTesting static void writeConfigToFile(Config config, File cfgFile, String... additionalEntries) { final WellKnownTags wellKnownTags = config.getWellKnownTags(); diff --git a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploader.java b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploader.java index 76fd0e45f8c..5c37b7c5908 100644 --- a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploader.java +++ b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploader.java @@ -24,6 +24,7 @@ import datadog.environment.SystemProperties; import datadog.trace.api.Config; import datadog.trace.api.DDTags; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.config.provider.ConfigProvider; import datadog.trace.util.AgentThreadFactory; import datadog.trace.util.PidHelper; @@ -187,7 +188,7 @@ public void notifyCrashStarted(String error) { } } - // @VisibleForTesting + @VisibleForTesting void sendPingToTelemetry(String error) { // send a ping message to the telemetry to notify that the crash report started try (Buffer buf = new Buffer(); @@ -207,7 +208,7 @@ void sendPingToTelemetry(String error) { } } - // @VisibleForTesting + @VisibleForTesting void sendPingToErrorTracking(String error) { try { final CrashLog ping = @@ -253,7 +254,7 @@ public void upload(@Nonnull Path file) { } } - // @VisibleForTesting + @VisibleForTesting void remoteUpload( @Nonnull String fileContent, boolean sendToTelemetry, boolean sendToErrorTracking) { final String uuid = storedConfig.reportUUID; @@ -316,7 +317,7 @@ void uploadToLogs(@Nonnull String message, @Nonnull PrintStream out) throws IOEx } } - // @VisibleForTesting + @VisibleForTesting @SuppressForbidden static String extractErrorKind(String fileContent) { Matcher matcher = ERROR_MESSAGE_PATTERN.matcher(fileContent); @@ -348,7 +349,7 @@ static String extractErrorKind(String fileContent) { "$"), Pattern.DOTALL | Pattern.MULTILINE); - // @VisibleForTesting + @VisibleForTesting @SuppressForbidden static String extractErrorMessage(String fileContent) { Matcher matcher = ERROR_MESSAGE_PATTERN.matcher(fileContent); diff --git a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploaderScriptInitializer.java b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploaderScriptInitializer.java index 0e810694be1..5677d10f220 100644 --- a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploaderScriptInitializer.java +++ b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploaderScriptInitializer.java @@ -8,6 +8,7 @@ import static java.util.Locale.ROOT; import datadog.environment.SystemProperties; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.util.PidHelper; import datadog.trace.util.Strings; import java.io.BufferedReader; @@ -25,12 +26,12 @@ public final class CrashUploaderScriptInitializer { private CrashUploaderScriptInitializer() {} - // @VisibleForTests + @VisibleForTesting static void initialize(String onErrorVal, String onErrorFile) { initialize(onErrorVal, onErrorFile, null); } - // @VisibleForTests + @VisibleForTesting static void initialize(String onErrorVal, String onErrorFile, String javacorePath) { if (onErrorVal == null || onErrorVal.isEmpty()) { LOG.debug( diff --git a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/OOMENotifierScriptInitializer.java b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/OOMENotifierScriptInitializer.java index c558d3a83a5..5d37b975b98 100644 --- a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/OOMENotifierScriptInitializer.java +++ b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/OOMENotifierScriptInitializer.java @@ -8,6 +8,7 @@ import static datadog.crashtracking.Initializer.pidFromSpecialFileName; import static datadog.trace.api.telemetry.LogCollector.SEND_TELEMETRY; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.util.PidHelper; import java.io.File; import java.io.FileOutputStream; @@ -20,7 +21,7 @@ public final class OOMENotifierScriptInitializer { private OOMENotifierScriptInitializer() {} - // @VisibleForTests + @VisibleForTesting static void initialize(String onOutOfMemoryVal) { if (onOutOfMemoryVal == null || onOutOfMemoryVal.isEmpty()) { LOG.debug( diff --git a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/parsers/RedactUtils.java b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/parsers/RedactUtils.java index 7be12817525..6665d0f412d 100644 --- a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/parsers/RedactUtils.java +++ b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/parsers/RedactUtils.java @@ -1,5 +1,6 @@ package datadog.crashtracking.parsers; +import datadog.trace.api.internal.VisibleForTesting; import de.thetaphi.forbiddenapis.SuppressForbidden; import java.util.function.Function; import java.util.regex.Matcher; @@ -215,7 +216,7 @@ static String redactNmethodClass(String line) { * java.lang.Class} oop) use {@link #redactRegisterToMemoryMapping} which detects the oop type * automatically. */ - // @VisibleForTesting + @VisibleForTesting static String redactDottedClassOopRef(String line) { return redactStringOopRef(line, false); } diff --git a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java index c392ab3c663..3caa2a149ab 100644 --- a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java +++ b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java @@ -15,6 +15,7 @@ import datadog.environment.JavaVirtualMachine; import datadog.logging.RatelimitedLogger; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.debugger.DebuggerContext; import datadog.trace.bootstrap.debugger.ProbeId; import datadog.trace.bootstrap.debugger.ProbeImplementation; @@ -445,7 +446,7 @@ private void removeCurrentTransformer() { currentTransformer = null; } - // only visible for tests + @VisibleForTesting Map getAppliedDefinitions() { if (currentTransformer == null) { return Collections.emptyMap(); diff --git a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/sink/DebuggerSink.java b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/sink/DebuggerSink.java index 5c91e110acb..becf47b0337 100644 --- a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/sink/DebuggerSink.java +++ b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/sink/DebuggerSink.java @@ -5,6 +5,7 @@ import com.datadog.debugger.uploader.BatchUploader; import com.datadog.debugger.util.DebuggerMetrics; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.debugger.DebuggerContext.SkipCause; import datadog.trace.bootstrap.debugger.ProbeId; import datadog.trace.util.AgentTaskScheduler; @@ -159,7 +160,7 @@ private void lowRateReschedule() { TimeUnit.MILLISECONDS); } - // visible for testing + @VisibleForTesting void lowRateFlush(DebuggerSink ignored) { symbolSink.flush(); probeStatusSink.flush(tags); diff --git a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/uploader/BatchUploader.java b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/uploader/BatchUploader.java index 63ff0ef8dcc..993d8194668 100644 --- a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/uploader/BatchUploader.java +++ b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/uploader/BatchUploader.java @@ -7,6 +7,7 @@ import datadog.communication.http.OkHttpUtils; import datadog.logging.RatelimitedLogger; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.util.AgentThreadFactory; import java.io.IOException; import java.time.Duration; @@ -124,7 +125,7 @@ public BatchUploader(String name, Config config, String endpoint, RetryPolicy re ContainerInfo.getEntityId()); } - // Visible for testing + @VisibleForTesting BatchUploader( String name, Config config, @@ -220,10 +221,7 @@ private void doUpload(Runnable makeRequest) { } } - /** - * Note that this method is only visible for testing and should not be used from outside this - * class. - */ + @VisibleForTesting OkHttpClient getClient() { return client; } diff --git a/dd-java-agent/agent-logging/src/main/java/datadog/trace/logging/PrintStreamWrapper.java b/dd-java-agent/agent-logging/src/main/java/datadog/trace/logging/PrintStreamWrapper.java index cd1e1b4a9c9..c71390a8e23 100644 --- a/dd-java-agent/agent-logging/src/main/java/datadog/trace/logging/PrintStreamWrapper.java +++ b/dd-java-agent/agent-logging/src/main/java/datadog/trace/logging/PrintStreamWrapper.java @@ -1,6 +1,7 @@ package datadog.trace.logging; import datadog.environment.OperatingSystem; +import datadog.trace.api.internal.VisibleForTesting; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.OutputStream; @@ -17,7 +18,7 @@ public PrintStreamWrapper(PrintStream ps) { super(ps); } - // use for tests only + @VisibleForTesting public OutputStream getOriginalPrintStream() { return super.out; } diff --git a/dd-java-agent/agent-otel/otel-shim/src/main/java/datadog/opentelemetry/shim/logs/OtelLogRecordBuilder.java b/dd-java-agent/agent-otel/otel-shim/src/main/java/datadog/opentelemetry/shim/logs/OtelLogRecordBuilder.java index 93554fdc8c5..81cfb7c875e 100644 --- a/dd-java-agent/agent-otel/otel-shim/src/main/java/datadog/opentelemetry/shim/logs/OtelLogRecordBuilder.java +++ b/dd-java-agent/agent-otel/otel-shim/src/main/java/datadog/opentelemetry/shim/logs/OtelLogRecordBuilder.java @@ -3,6 +3,7 @@ import static datadog.opentelemetry.shim.trace.OtelExtractedContext.extract; import static io.opentelemetry.api.common.AttributeKey.stringKey; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.time.SystemTimeSource; import datadog.trace.api.time.TimeSource; import datadog.trace.bootstrap.otel.logs.data.OtelLogRecordProcessor; @@ -23,8 +24,7 @@ @ParametersAreNonnullByDefault final class OtelLogRecordBuilder implements LogRecordBuilder { - // package-visible for testing - static TimeSource TIME_SOURCE = SystemTimeSource.INSTANCE; + @VisibleForTesting static TimeSource TIME_SOURCE = SystemTimeSource.INSTANCE; private static final AttributeKey EXCEPTION_TYPE_KEY = stringKey("exception.type"); private static final AttributeKey EXCEPTION_MESSAGE_KEY = stringKey("exception.message"); diff --git a/dd-java-agent/agent-profiling/profiling-controller-ddprof/src/main/java/com/datadog/profiling/controller/ddprof/DatadogProfilerOngoingRecording.java b/dd-java-agent/agent-profiling/profiling-controller-ddprof/src/main/java/com/datadog/profiling/controller/ddprof/DatadogProfilerOngoingRecording.java index 7976f7f81a9..26ffc13e972 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-ddprof/src/main/java/com/datadog/profiling/controller/ddprof/DatadogProfilerOngoingRecording.java +++ b/dd-java-agent/agent-profiling/profiling-controller-ddprof/src/main/java/com/datadog/profiling/controller/ddprof/DatadogProfilerOngoingRecording.java @@ -20,6 +20,7 @@ import com.datadog.profiling.controller.UnsupportedEnvironmentException; import com.datadog.profiling.ddprof.DatadogProfiler; import datadog.environment.JavaVirtualMachine; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilingSnapshot; import datadog.trace.api.profiling.RecordingData; import java.time.Instant; @@ -52,7 +53,7 @@ public RecordingData stop() { return recording.stop(); } - // @VisibleForTesting + @VisibleForTesting final RecordingData snapshot(final Instant start) { return snapshot(start, ProfilingSnapshot.Kind.PERIODIC); } diff --git a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkOngoingRecording.java b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkOngoingRecording.java index ddac86e663f..4b9d71e3a26 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkOngoingRecording.java +++ b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkOngoingRecording.java @@ -3,6 +3,7 @@ import com.datadog.profiling.controller.ControllerContext; import com.datadog.profiling.controller.OngoingRecording; import com.datadog.profiling.utils.ProfilingMode; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilingSnapshot; import datadog.trace.api.profiling.RecordingData; import datadog.trace.bootstrap.config.provider.ConfigProvider; @@ -135,7 +136,7 @@ public RecordingData stop() { return new OpenJdkRecordingData(recording, ProfilingSnapshot.Kind.PERIODIC); } - // @VisibleForTesting + @VisibleForTesting final RecordingData snapshot(@Nonnull final Instant start) { return snapshot(start, ProfilingSnapshot.Kind.PERIODIC); } diff --git a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkRecordingData.java b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkRecordingData.java index 6cedf96691b..da022c226ed 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkRecordingData.java +++ b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkRecordingData.java @@ -15,6 +15,7 @@ */ package com.datadog.profiling.controller.openjdk; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.RecordingData; import datadog.trace.api.profiling.RecordingInputStream; import java.io.IOException; @@ -54,7 +55,7 @@ public String getName() { return recording.getName(); } - // Visible for testing + @VisibleForTesting Recording getRecording() { return recording; } diff --git a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/events/SmapEntryCache.java b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/events/SmapEntryCache.java index 4b03fff0396..b890904a461 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/events/SmapEntryCache.java +++ b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/events/SmapEntryCache.java @@ -1,6 +1,7 @@ package com.datadog.profiling.controller.openjdk.events; import datadog.environment.JavaVirtualMachine; +import datadog.trace.api.internal.VisibleForTesting; import de.thetaphi.forbiddenapis.SuppressForbidden; import java.io.BufferedReader; import java.io.IOException; @@ -60,7 +61,7 @@ static class AnnotatedRegion { this.smapsPath = smapsPath; } - // @VisibleForTesting + @VisibleForTesting void invalidate() { UPDATER.getAndSet(this, System.nanoTime() - (2 * ttl)); } @@ -81,7 +82,7 @@ public List getEvents() { return (List) events[index]; } - // accessible for testing + @VisibleForTesting static AnnotatedRegion fromAnnotatedEntry(String line, int javaVersion) { boolean isRegion = line.startsWith("0x"); if (isRegion) { @@ -149,7 +150,7 @@ private static boolean isSmapHeader(String line) { return ((firstChar >= '0' && firstChar <= '9') || (firstChar >= 'a' && firstChar <= 'f')); } - // accessible for testing + @VisibleForTesting static void readEvents(BufferedReader br, List events) throws IOException { String line = br.readLine(); while (line != null) { diff --git a/dd-java-agent/agent-profiling/profiling-controller-oracle/src/main/java/com/datadog/profiling/controller/oracle/OracleJdkOngoingRecording.java b/dd-java-agent/agent-profiling/profiling-controller-oracle/src/main/java/com/datadog/profiling/controller/oracle/OracleJdkOngoingRecording.java index 18f104e4e6f..2edbc1a369e 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-oracle/src/main/java/com/datadog/profiling/controller/oracle/OracleJdkOngoingRecording.java +++ b/dd-java-agent/agent-profiling/profiling-controller-oracle/src/main/java/com/datadog/profiling/controller/oracle/OracleJdkOngoingRecording.java @@ -1,6 +1,7 @@ package com.datadog.profiling.controller.oracle; import com.datadog.profiling.controller.OngoingRecording; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilingSnapshot; import java.io.IOException; import java.time.Duration; @@ -52,7 +53,7 @@ public OracleJdkRecordingData stop() { } } - // @VisibleForTesting + @VisibleForTesting final OracleJdkRecordingData snapshot(@Nonnull final Instant start) { return snapshot(start, ProfilingSnapshot.Kind.PERIODIC); } diff --git a/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java b/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java index 7f57b356d99..bfb3df0f333 100644 --- a/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java +++ b/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java @@ -22,6 +22,7 @@ import static datadog.trace.util.AgentThreadFactory.AgentThread.PROFILER_RECORDING_SCHEDULER; import datadog.environment.JavaVirtualMachine; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilerFlareLogger; import datadog.trace.api.profiling.ProfilingSnapshot; import datadog.trace.api.profiling.RecordingData; @@ -238,7 +239,7 @@ public boolean isStarted() { return started; } - /** VisibleForTesting */ + @VisibleForTesting final Duration getStartupDelay() { return startupDelay; } diff --git a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java index b1e07b08c32..9aedde9e49f 100644 --- a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java +++ b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java @@ -38,6 +38,7 @@ import datadog.environment.JavaVirtualMachine; import datadog.libs.ddprof.DdprofLibraryLoader; import datadog.trace.api.config.ProfilingConfig; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.RecordingData; import datadog.trace.bootstrap.config.provider.ConfigProvider; import datadog.trace.bootstrap.instrumentation.api.TaskWrapper; @@ -115,7 +116,7 @@ private DatadogProfiler(ConfigProvider configProvider) { this(configProvider, getContextAttributes(configProvider)); } - // visible for testing + @VisibleForTesting DatadogProfiler(ConfigProvider configProvider, Set contextAttributes) { this.configProvider = configProvider; this.profiler = DdprofLibraryLoader.javaProfiler().getComponent(); diff --git a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfilerRecording.java b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfilerRecording.java index 8154a8a444e..82bf3d5b868 100644 --- a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfilerRecording.java +++ b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfilerRecording.java @@ -1,6 +1,7 @@ package com.datadog.profiling.ddprof; import com.datadog.profiling.controller.OngoingRecording; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilingSnapshot; import datadog.trace.api.profiling.RecordingData; import java.io.IOException; @@ -37,7 +38,7 @@ public RecordingData stop() { recordingFile, started, Instant.now(), ProfilingSnapshot.Kind.ON_SHUTDOWN); } - // @VisibleForTesting + @VisibleForTesting final RecordingData snapshot(@Nonnull Instant start) { return snapshot(start, ProfilingSnapshot.Kind.PERIODIC); } @@ -62,7 +63,7 @@ public void close() { } } - // used for tests only + @VisibleForTesting Path getRecordingFile() { return recordingFile; } diff --git a/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java b/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java index 80afc760feb..86ea3f6e3e0 100644 --- a/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java +++ b/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java @@ -31,6 +31,7 @@ import datadog.trace.api.ProcessTags; import datadog.trace.api.git.GitInfo; import datadog.trace.api.git.GitInfoProvider; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.RecordingData; import datadog.trace.api.profiling.RecordingType; import datadog.trace.bootstrap.config.provider.ConfigProvider; @@ -134,10 +135,7 @@ public ProfileUploader(final Config config, final ConfigProvider configProvider) this(config, configProvider, new IOLogger(log), TERMINATION_TIMEOUT_SEC); } - /** - * Note that this method is only visible for testing and should not be used from outside this - * class. - */ + @VisibleForTesting ProfileUploader( final Config config, final ConfigProvider configProvider, @@ -443,10 +441,7 @@ private List tagsToList(final Map tags) { .collect(Collectors.toList()); } - /** - * Note that this method is only visible for testing and should not be used from outside this - * class. - */ + @VisibleForTesting OkHttpClient getClient() { return client; } diff --git a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/CompositeController.java b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/CompositeController.java index 5cec160a754..236bf52b1ae 100644 --- a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/CompositeController.java +++ b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/CompositeController.java @@ -17,6 +17,7 @@ import datadog.trace.api.Config; import datadog.trace.api.Platform; import datadog.trace.api.config.ProfilingConfig; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilerFlareLogger; import datadog.trace.api.profiling.ProfilingSnapshot; import datadog.trace.api.profiling.RecordingData; @@ -50,7 +51,7 @@ public CompositeController(List controllers) { this.controllers = controllers; } - // visible for testing + @VisibleForTesting public List getControllers() { return Collections.unmodifiableList(controllers); } diff --git a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ScrubRecordingDataListener.java b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ScrubRecordingDataListener.java index 6e195a0b7a7..211d0e23c64 100644 --- a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ScrubRecordingDataListener.java +++ b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ScrubRecordingDataListener.java @@ -4,6 +4,7 @@ import com.datadog.profiling.scrubber.DefaultScrubDefinition; import com.datadog.profiling.scrubber.JfrScrubber; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.RecordingData; import datadog.trace.api.profiling.RecordingDataListener; import datadog.trace.api.profiling.RecordingInputStream; @@ -45,7 +46,7 @@ static RecordingDataListener wrap( this(delegate, scrubber, failOpen, null); } - // visible for testing + @VisibleForTesting ScrubRecordingDataListener( RecordingDataListener delegate, JfrScrubber scrubber, boolean failOpen, Path tempDir) { this.delegate = delegate; diff --git a/dd-java-agent/build.gradle b/dd-java-agent/build.gradle index 58ad7540df2..a2001cb4ee8 100644 --- a/dd-java-agent/build.gradle +++ b/dd-java-agent/build.gradle @@ -60,6 +60,7 @@ tasks.named("compileJava") { dependencies { implementation sourceSets.main_java11.output main_java11CompileOnly libs.forbiddenapis + main_java6CompileOnly project(':components:annotations') main_java6CompileOnly libs.forbiddenapis testImplementation sourceSets.main_java6.output } diff --git a/dd-java-agent/src/main/java6/datadog/trace/bootstrap/AgentPreCheck.java b/dd-java-agent/src/main/java6/datadog/trace/bootstrap/AgentPreCheck.java index de49e86ec73..e9ea4fc7b7a 100644 --- a/dd-java-agent/src/main/java6/datadog/trace/bootstrap/AgentPreCheck.java +++ b/dd-java-agent/src/main/java6/datadog/trace/bootstrap/AgentPreCheck.java @@ -1,5 +1,6 @@ package datadog.trace.bootstrap; +import datadog.trace.api.internal.VisibleForTesting; import de.thetaphi.forbiddenapis.SuppressForbidden; import java.io.BufferedReader; import java.io.IOException; @@ -91,9 +92,9 @@ private static boolean compatible() { return compatible(javaVersion, javaHome, System.err); } - // Reachable for testing // System.getenv usage is necessary since class is designed to be Java 6 compatible, while // Environment component is for Java 8+ + @VisibleForTesting @SuppressForbidden static boolean compatible(String javaVersion, String javaHome, PrintStream output) { int majorJavaVersion = parseJavaMajorVersion(javaVersion); @@ -114,7 +115,7 @@ static boolean compatible(String javaVersion, String javaHome, PrintStream outpu return false; } - // Reachable for testing + @VisibleForTesting static int parseJavaMajorVersion(String javaVersion) { int major = 0; if (javaVersion == null || javaVersion.isEmpty()) { diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/TraceStructureWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/TraceStructureWriter.java index afeaebf2772..f5cb0d2a978 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/TraceStructureWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/TraceStructureWriter.java @@ -3,6 +3,7 @@ import datadog.environment.OperatingSystem; import datadog.trace.api.DDSpanId; import datadog.trace.api.DDTraceId; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.core.DDSpan; import de.thetaphi.forbiddenapis.SuppressForbidden; import java.io.FileOutputStream; @@ -82,7 +83,7 @@ private static String[] parseArgs(String outputFile) { return parseArgs(outputFile, OperatingSystem.isWindows()); } - // package visibility for testing + @VisibleForTesting static String[] parseArgs(String outputFile, boolean windows) { String[] args = ARGS_DELIMITER.split(outputFile); // Check Windows absolute paths (:) as column is used as arg delimiter diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/TraceMapperV0_5.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/TraceMapperV0_5.java index b8d1c77afdc..a993fdb3c26 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/TraceMapperV0_5.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/TraceMapperV0_5.java @@ -9,6 +9,7 @@ import datadog.communication.serialization.msgpack.MsgPackWriter; import datadog.trace.api.TagMap; import datadog.trace.api.TagMap.EntryReader; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.instrumentation.api.InstrumentationTags; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; import datadog.trace.common.writer.Payload; @@ -125,12 +126,12 @@ public String endpoint() { return "v0.5"; } - // Visible for tests + @VisibleForTesting Map getEncoding() { return encoding; } - // Visible for tests + @VisibleForTesting GrowableBuffer getDictionary() { return dictionary; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java b/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java index 55db6962976..73e3c2063e0 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java @@ -53,6 +53,7 @@ import datadog.trace.api.interceptor.MutableSpan; import datadog.trace.api.interceptor.TraceInterceptor; import datadog.trace.api.internal.TraceSegment; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.metrics.SpanMetricRegistry; import datadog.trace.api.naming.SpanNaming; import datadog.trace.api.remoteconfig.ServiceNameCollector; @@ -1391,7 +1392,7 @@ public String getSpanId(AgentSpan span) { return "0"; } - // @VisibleForTesting + @VisibleForTesting TraceInterceptors getInterceptors() { return interceptors; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java b/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java index d404343bb96..8ffcc77b49c 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java @@ -19,6 +19,7 @@ import datadog.trace.api.debugger.DebuggerConfigBridge; import datadog.trace.api.gateway.Flow; import datadog.trace.api.gateway.RequestContext; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.metrics.SpanMetricRegistry; import datadog.trace.api.metrics.SpanMetrics; import datadog.trace.api.sampling.PrioritySampling; @@ -125,7 +126,7 @@ static DDSpan create( * @param timestampMicro if greater than zero, use this time instead of the current time * @param context the context used for the span */ - // @VisibleForTesting + @VisibleForTesting DDSpan( @Nonnull String instrumentationName, final long timestampMicro, @@ -681,7 +682,7 @@ public long getDurationNano() { return durationNano; } - // @VisibleForTesting + @VisibleForTesting void setDurationNano(long duration) { DURATION_NANO_UPDATER.set(this, duration); } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java b/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java index dbbd57b0c48..6002fce5415 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java @@ -3,6 +3,7 @@ import datadog.communication.ddagent.DDAgentFeaturesDiscovery; import datadog.communication.ddagent.SharedCommunicationObjects; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.core.monitor.HealthMetrics; import java.util.ArrayList; import java.util.List; @@ -140,12 +141,12 @@ private void flushStats() { expired = 0; } - // @VisibleForTesting + @VisibleForTesting int trackedCount() { return traceArray.size(); } - // @VisibleForTesting + @VisibleForTesting int getDropped() { return dropped; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java b/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java index 5a0d1e992a4..11f8566cc48 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java @@ -2,6 +2,7 @@ import datadog.metrics.api.Recording; import datadog.trace.api.DDTraceId; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.time.TimeSource; import datadog.trace.bootstrap.instrumentation.api.AgentScope; import datadog.trace.core.CoreTracer.ConfigSnapshot; @@ -139,7 +140,7 @@ public PendingTrace create(@Nonnull DDTraceId traceId, ConfigSnapshot traceConfi private static final AtomicLongFieldUpdater LAST_REFERENCED = AtomicLongFieldUpdater.newUpdater(PendingTrace.class, "lastReferenced"); - // @VisibleForTesting + @VisibleForTesting PendingTrace( @Nonnull CoreTracer tracer, @Nonnull DDTraceId traceId, @@ -224,37 +225,37 @@ boolean compareAndSetLongRunningState(int expected, int newState) { return LONG_RUNNING_STATE.compareAndSet(this, expected, newState); } - // @VisibleForTesting + @VisibleForTesting int getLongRunningTrackedState() { return longRunningTrackedState; } - // @VisibleForTesting + @VisibleForTesting void setLongRunningTrackedState(int state) { LONG_RUNNING_STATE.set(this, state); } - // @VisibleForTesting + @VisibleForTesting int getPendingReferenceCount() { return pendingReferenceCount; } - // @VisibleForTesting + @VisibleForTesting PendingTraceBuffer getPendingTraceBuffer() { return pendingTraceBuffer; } - // @VisibleForTesting + @VisibleForTesting DDTraceId getTraceId() { return traceId; } - // @VisibleForTesting + @VisibleForTesting boolean isRootSpanWritten() { return rootSpanWritten; } - // @VisibleForTesting + @VisibleForTesting int getIsEnqueued() { return isEnqueued; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/PendingTraceBuffer.java b/dd-trace-core/src/main/java/datadog/trace/core/PendingTraceBuffer.java index 9cd7f73eb2e..c0e8de95a5c 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/PendingTraceBuffer.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/PendingTraceBuffer.java @@ -11,6 +11,7 @@ import datadog.communication.ddagent.SharedCommunicationObjects; import datadog.trace.api.Config; import datadog.trace.api.flare.TracerFlare; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.time.TimeSource; import datadog.trace.common.writer.TraceDumpJsonExporter; import datadog.trace.core.monitor.HealthMetrics; @@ -304,17 +305,17 @@ public DelayingPendingTraceBuffer( : null; } - // @VisibleForTesting + @VisibleForTesting LongRunningTracesTracker getRunningTracesTracker() { return runningTracesTracker; } - // @VisibleForTesting + @VisibleForTesting Thread getWorker() { return worker; } - // @VisibleForTesting + @VisibleForTesting MessagePassingBlockingQueue getQueue() { return queue; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/propagation/W3CHttpCodec.java b/dd-trace-core/src/main/java/datadog/trace/core/propagation/W3CHttpCodec.java index 52037859cb2..f8372f0d76c 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/propagation/W3CHttpCodec.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/propagation/W3CHttpCodec.java @@ -16,6 +16,7 @@ import datadog.trace.api.DDTraceId; import datadog.trace.api.TraceConfig; import datadog.trace.api.TracePropagationStyle; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.internal.util.LongStringUtils; import datadog.trace.api.propagation.W3CTraceParent; import datadog.trace.api.sampling.PrioritySampling; @@ -40,10 +41,9 @@ class W3CHttpCodec { private static final int TRACE_PARENT_FLAGS_SAMPLED = 1; private static final int TRACE_PARENT_LENGTH = TRACE_PARENT_FLAGS_START + 2; - // Package-protected for testing - static final String TRACE_PARENT_KEY = "traceparent"; - static final String TRACE_STATE_KEY = "tracestate"; - static final String OT_BAGGAGE_PREFIX = "ot-baggage-"; + @VisibleForTesting static final String TRACE_PARENT_KEY = "traceparent"; + @VisibleForTesting static final String TRACE_STATE_KEY = "tracestate"; + @VisibleForTesting static final String OT_BAGGAGE_PREFIX = "ot-baggage-"; static final String E2E_START_KEY = OT_BAGGAGE_PREFIX + DDTags.TRACE_START_TIME; private W3CHttpCodec() { diff --git a/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/HttpEndpointPostProcessor.java b/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/HttpEndpointPostProcessor.java index 340a5d82649..c2e0dd72761 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/HttpEndpointPostProcessor.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/HttpEndpointPostProcessor.java @@ -6,6 +6,7 @@ import datadog.trace.api.TagMap; import datadog.trace.api.endpoint.EndpointResolver; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.instrumentation.api.AppendableSpanLinks; import datadog.trace.core.DDSpanContext; import org.slf4j.Logger; @@ -45,10 +46,9 @@ public HttpEndpointPostProcessor() { /** * Creates a new HttpEndpointPostProcessor with the given endpoint resolver. * - *

Visible for testing. - * * @param endpointResolver the resolver to use for endpoint inference */ + @VisibleForTesting HttpEndpointPostProcessor(EndpointResolver endpointResolver) { this.endpointResolver = endpointResolver; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/PeerServiceCalculator.java b/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/PeerServiceCalculator.java index ed1fee8b2d6..198e2c78f1c 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/PeerServiceCalculator.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/PeerServiceCalculator.java @@ -3,6 +3,7 @@ import datadog.trace.api.Config; import datadog.trace.api.DDTags; import datadog.trace.api.TagMap; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.naming.NamingSchema; import datadog.trace.api.naming.SpanNaming; import datadog.trace.bootstrap.instrumentation.api.AppendableSpanLinks; @@ -22,7 +23,7 @@ public PeerServiceCalculator() { this(SpanNaming.instance().namingSchema().peerService(), Config.get().getPeerServiceMapping()); } - // Visible for testing + @VisibleForTesting PeerServiceCalculator( @Nonnull final NamingSchema.ForPeerService peerServiceNaming, @Nonnull final Map peerServiceMapping) { diff --git a/internal-api/build.gradle.kts b/internal-api/build.gradle.kts index b78e5b38b14..fc95dd9e1f1 100644 --- a/internal-api/build.gradle.kts +++ b/internal-api/build.gradle.kts @@ -260,6 +260,7 @@ dependencies { // references TraceScope and Continuation from public api api(project(":dd-trace-api")) api(libs.slf4j) + compileOnlyApi(project(":components:annotations")) api(project(":components:context")) api(project(":components:environment")) api(project(":components:json")) diff --git a/internal-api/src/main/java/datadog/trace/api/ProcessTags.java b/internal-api/src/main/java/datadog/trace/api/ProcessTags.java index 932ec19c565..40b23faa45c 100644 --- a/internal-api/src/main/java/datadog/trace/api/ProcessTags.java +++ b/internal-api/src/main/java/datadog/trace/api/ProcessTags.java @@ -3,6 +3,7 @@ import datadog.environment.EnvironmentVariables; import datadog.environment.SystemProperties; import datadog.trace.api.env.CapturedEnvironment; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; import datadog.trace.util.TraceUtils; import java.nio.file.Path; @@ -30,8 +31,7 @@ public class ProcessTags { public static final String ENTRYPOINT_BASEDIR = "entrypoint.basedir"; public static final String ENTRYPOINT_WORKDIR = "entrypoint.workdir"; - // visible for testing - static Function envGetter = EnvironmentVariables::get; + @VisibleForTesting static Function envGetter = EnvironmentVariables::get; private static class Lazy { // the tags are used to compute a hash for dsm hence that map must be sorted. @@ -269,7 +269,7 @@ public static UTF8BytesString getTagsForSerialization() { return Lazy.serializedForm; } - /** Visible for testing. */ + @VisibleForTesting static void empty() { synchronized (Lazy.TAGS) { Lazy.TAGS.clear(); @@ -279,12 +279,12 @@ static void empty() { } } - /** Visible for testing. */ + @VisibleForTesting static void reset() { reset(Config.get()); } - /** Visible for testing. */ + @VisibleForTesting public static void reset(Config config) { synchronized (Lazy.TAGS) { empty(); diff --git a/internal-api/src/main/java/datadog/trace/api/civisibility/config/BazelMode.java b/internal-api/src/main/java/datadog/trace/api/civisibility/config/BazelMode.java index 898dd76129b..56698fe7085 100644 --- a/internal-api/src/main/java/datadog/trace/api/civisibility/config/BazelMode.java +++ b/internal-api/src/main/java/datadog/trace/api/civisibility/config/BazelMode.java @@ -3,6 +3,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.config.inversion.ConfigHelper; import datadog.trace.util.Strings; import java.io.BufferedReader; @@ -51,7 +52,7 @@ public static BazelMode get() { return INSTANCE; } - // visible for testing + @VisibleForTesting BazelMode(Config config) { String manifestRloc = config.getTestOptimizationManifestFile(); if (Strings.isNotBlank(manifestRloc)) { @@ -331,7 +332,7 @@ private static String lookupInRunfilesManifest(String manifestFile, String rloca return null; } - // visible for testing + @VisibleForTesting static void reset() { INSTANCE = null; } diff --git a/internal-api/src/main/java/datadog/trace/api/datastreams/TransactionInfo.java b/internal-api/src/main/java/datadog/trace/api/datastreams/TransactionInfo.java index 9f09186d30d..b4755d58342 100644 --- a/internal-api/src/main/java/datadog/trace/api/datastreams/TransactionInfo.java +++ b/internal-api/src/main/java/datadog/trace/api/datastreams/TransactionInfo.java @@ -1,5 +1,6 @@ package datadog.trace.api.datastreams; +import datadog.trace.api.internal.VisibleForTesting; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.charset.StandardCharsets; @@ -71,7 +72,7 @@ public byte[] getBytes() { return buffer.array(); } - // @VisibleForTesting + @VisibleForTesting static synchronized void resetCache() { CACHE.clear(); CACHE_BYTES = new byte[0]; diff --git a/internal-api/src/main/java/datadog/trace/api/profiling/ProfilerFlareLogger.java b/internal-api/src/main/java/datadog/trace/api/profiling/ProfilerFlareLogger.java index 2672d8e1567..6fc57ae641d 100644 --- a/internal-api/src/main/java/datadog/trace/api/profiling/ProfilerFlareLogger.java +++ b/internal-api/src/main/java/datadog/trace/api/profiling/ProfilerFlareLogger.java @@ -1,6 +1,7 @@ package datadog.trace.api.profiling; import datadog.trace.api.flare.TracerFlare; +import datadog.trace.api.internal.VisibleForTesting; import java.io.IOException; import java.time.Instant; import java.time.ZoneOffset; @@ -23,7 +24,7 @@ private static final class Singleton { private final List flareReportLines = new ArrayList<>(); private int usedReportCapacity = 0; - // @VisibleForTesting + @VisibleForTesting ProfilerFlareLogger() { TracerFlare.addReporter(this); } @@ -76,17 +77,17 @@ public void cleanupAfterFlare() { cleanup(); } - // @VisibleForTesting + @VisibleForTesting int getUsedReportCapacity() { return usedReportCapacity; } - // @VisibleForTesting + @VisibleForTesting int getMaxReportCapacity() { return REPORT_CAPACITY; } - // @VisibleForTesting + @VisibleForTesting int linesSize() { synchronized (flareReportLines) { return flareReportLines.size(); diff --git a/internal-api/src/main/java/datadog/trace/api/remoteconfig/ServiceNameCollector.java b/internal-api/src/main/java/datadog/trace/api/remoteconfig/ServiceNameCollector.java index b7c0a52a63f..664e1f1fe6f 100644 --- a/internal-api/src/main/java/datadog/trace/api/remoteconfig/ServiceNameCollector.java +++ b/internal-api/src/main/java/datadog/trace/api/remoteconfig/ServiceNameCollector.java @@ -3,6 +3,7 @@ import static datadog.trace.api.telemetry.LogCollector.SEND_TELEMETRY; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import java.util.ArrayList; import java.util.List; import java.util.Set; @@ -73,7 +74,7 @@ public void clear() { services.clear(); } - // Visible for testing + @VisibleForTesting static void setInstance(ServiceNameCollector instance) { INSTANCE = instance; } diff --git a/internal-api/src/main/java/datadog/trace/util/ProcessSupervisor.java b/internal-api/src/main/java/datadog/trace/util/ProcessSupervisor.java index 686c35ce65b..006d921bec9 100644 --- a/internal-api/src/main/java/datadog/trace/util/ProcessSupervisor.java +++ b/internal-api/src/main/java/datadog/trace/util/ProcessSupervisor.java @@ -7,6 +7,7 @@ import static datadog.trace.util.ProcessSupervisor.Health.INTERRUPTED; import static datadog.trace.util.ProcessSupervisor.Health.READY_TO_START; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.instrumentation.api.AgentTracer; import datadog.trace.context.TraceScope; import java.io.Closeable; @@ -146,7 +147,7 @@ public void close() { } } - // Package reachable for testing + @VisibleForTesting Process getCurrentProcess() { return currentProcess; } diff --git a/internal-api/src/main/java/datadog/trace/util/TempLocationManager.java b/internal-api/src/main/java/datadog/trace/util/TempLocationManager.java index 7574c9c5edc..94c74c9f5f8 100644 --- a/internal-api/src/main/java/datadog/trace/util/TempLocationManager.java +++ b/internal-api/src/main/java/datadog/trace/util/TempLocationManager.java @@ -2,6 +2,7 @@ import datadog.environment.SystemProperties; import datadog.trace.api.config.ProfilingConfig; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilerFlareLogger; import datadog.trace.api.time.SystemTimeSource; import datadog.trace.api.time.TimeSource; @@ -307,7 +308,7 @@ private TempLocationManager() { createTempDir(tempDir); } - // @VisibleForTesting + @VisibleForTesting static String getBaseTempDirName() { String userName = SystemProperties.get("user.name"); // unlikely, but fall-back to system env based user name @@ -399,7 +400,7 @@ boolean cleanup(long timeout, TimeUnit unit) { return false; } - // accessible for tests + @VisibleForTesting boolean waitForCleanup(long timeout, TimeUnit unit) { try { return cleanupTask.await(timeout, unit); @@ -416,7 +417,7 @@ boolean waitForCleanup(long timeout, TimeUnit unit) { return false; } - // accessible for tests + @VisibleForTesting void createDirStructure() throws IOException { Files.createDirectories(baseTempDir); } diff --git a/settings.gradle.kts b/settings.gradle.kts index e6c9469b9c6..dda1f432e6f 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -112,6 +112,7 @@ include( include( ":communication", + ":components:annotations", ":components:context", ":components:environment", ":components:http:http-api", diff --git a/utils/config-utils/build.gradle.kts b/utils/config-utils/build.gradle.kts index 84f494596d8..b1bb146231e 100644 --- a/utils/config-utils/build.gradle.kts +++ b/utils/config-utils/build.gradle.kts @@ -55,6 +55,7 @@ val excludedClassesInstructionCoverage by extra( ) dependencies { + compileOnly(project(":components:annotations")) implementation(project(":components:environment")) implementation(project(":dd-trace-api")) api(project(":utils:filesystem-utils")) diff --git a/utils/config-utils/src/main/java/datadog/trace/api/env/CapturedEnvironment.java b/utils/config-utils/src/main/java/datadog/trace/api/env/CapturedEnvironment.java index 44e55b6fd18..2db02edd88a 100644 --- a/utils/config-utils/src/main/java/datadog/trace/api/env/CapturedEnvironment.java +++ b/utils/config-utils/src/main/java/datadog/trace/api/env/CapturedEnvironment.java @@ -2,6 +2,7 @@ import datadog.environment.JavaVirtualMachine; import datadog.trace.api.config.GeneralConfig; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.config.inversion.ConfigHelper; import java.io.File; import java.util.HashMap; @@ -24,12 +25,7 @@ public ProcessInfo() { mainClass = JavaVirtualMachine.getMainClass(); } - /** - * Visible for testing - * - * @param mainClass - * @param jarFile - */ + @VisibleForTesting ProcessInfo(String mainClass, File jarFile) { this.mainClass = mainClass; this.jarFile = jarFile; @@ -55,17 +51,13 @@ public ProcessInfo getProcessInfo() { return processInfo; } - // Testing purposes + @VisibleForTesting static void useFixedEnv(final Map props) { INSTANCE.properties.clear(); INSTANCE.properties.putAll(props); } - /** - * For testing purposes. - * - * @param processInfo - */ + @VisibleForTesting static void useFixedProcessInfo(final ProcessInfo processInfo) { INSTANCE.processInfo = processInfo; } diff --git a/utils/logging-utils/src/main/java/datadog/logging/IOLogger.java b/utils/logging-utils/src/main/java/datadog/logging/IOLogger.java index 407f7509454..af21f4a0c57 100644 --- a/utils/logging-utils/src/main/java/datadog/logging/IOLogger.java +++ b/utils/logging-utils/src/main/java/datadog/logging/IOLogger.java @@ -2,6 +2,7 @@ import static datadog.trace.api.telemetry.LogCollector.EXCLUDE_TELEMETRY; +import datadog.trace.api.internal.VisibleForTesting; import java.util.concurrent.TimeUnit; import org.slf4j.Logger; @@ -15,7 +16,7 @@ public IOLogger(final Logger log) { this(log, new RatelimitedLogger(log, 5, TimeUnit.MINUTES)); } - // Visible for testing + @VisibleForTesting IOLogger(final Logger log, final RatelimitedLogger ratelimitedLogger) { this.log = log; this.ratelimitedLogger = ratelimitedLogger; diff --git a/utils/logging-utils/src/main/java/datadog/logging/RatelimitedLogger.java b/utils/logging-utils/src/main/java/datadog/logging/RatelimitedLogger.java index 3e5728b6bb7..b88505c1e59 100644 --- a/utils/logging-utils/src/main/java/datadog/logging/RatelimitedLogger.java +++ b/utils/logging-utils/src/main/java/datadog/logging/RatelimitedLogger.java @@ -1,5 +1,6 @@ package datadog.logging; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.time.SystemTimeSource; import datadog.trace.api.time.TimeSource; import java.util.Locale; @@ -25,7 +26,7 @@ public RatelimitedLogger(final Logger log, final int delay, final TimeUnit timeU this(log, delay, timeUnit, SystemTimeSource.INSTANCE); } - // Visible for testing + @VisibleForTesting RatelimitedLogger( final Logger log, final int delay, final TimeUnit timeUnit, final TimeSource timeSource) { this.log = log; From d1403671c6fdd4b7820e1323c09b8538fd9c55d7 Mon Sep 17 00:00:00 2001 From: Brice Dutheil Date: Thu, 28 May 2026 12:18:41 +0200 Subject: [PATCH 07/27] Reduce wait time for test job to run as soon as their needed build job is finished (#11470) Limit GitLab test build dependencies fix(ci): clarify build test matrix needs Co-authored-by: brice.dutheil --- .gitlab-ci.yml | 54 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7188f670f3a..4081c4b7123 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -548,7 +548,15 @@ check_debugger: muzzle: extends: .gradle_build - needs: [ build_tests ] + # needs:parallel:matrix limits this job to a specific build_tests combination. + # Keep matrix vars exact and in build_tests declaration order: + # https://docs.gitlab.com/ci/yaml/#needsparallelmatrix + needs: &needs_build_tests_inst + - job: build_tests + parallel: + matrix: + - GRADLE_TARGET: ":instrumentationTest" + CACHE_TYPE: "inst" stage: tests rules: - if: '$CI_COMMIT_BRANCH =~ /^mq-working-branch-/' @@ -586,7 +594,7 @@ muzzle: muzzle-dep-report: extends: .gradle_build - needs: [ build_tests ] + needs: *needs_build_tests_inst stage: tests rules: - if: '$CI_COMMIT_BRANCH =~ /^mq-working-branch-/' @@ -737,6 +745,15 @@ agent_integration_tests: test_base: extends: .test_job + # needs:parallel:matrix limits this job to a specific build_tests combination. + # Keep matrix vars exact and in build_tests declaration order: + # https://docs.gitlab.com/ci/yaml/#needsparallelmatrix + needs: + - job: build_tests + parallel: + matrix: + - GRADLE_TARGET: ":baseTest" + CACHE_TYPE: "base" variables: GRADLE_TARGET: ":baseTest" CACHE_TYPE: "base" @@ -748,6 +765,7 @@ test_base: test_inst: extends: .test_job_with_test_agent + needs: *needs_build_tests_inst variables: GRADLE_TARGET: ":instrumentationTest" CACHE_TYPE: "inst" @@ -756,6 +774,15 @@ test_inst: test_inst_latest: extends: .test_job_with_test_agent + # needs:parallel:matrix limits this job to a specific build_tests combination. + # Keep matrix vars exact and in build_tests declaration order: + # https://docs.gitlab.com/ci/yaml/#needsparallelmatrix + needs: + - job: build_tests + parallel: + matrix: + - GRADLE_TARGET: ":instrumentationLatestDepTest" + CACHE_TYPE: "latestdep" variables: GRADLE_TARGET: ":instrumentationLatestDepTest" CACHE_TYPE: "latestdep" @@ -786,6 +813,7 @@ test_flaky: test_flaky_inst: extends: .test_job + needs: *needs_build_tests_inst variables: GRADLE_TARGET: ":instrumentationTest" GRADLE_PARAMS: "-PrunFlakyTests" @@ -800,6 +828,15 @@ test_flaky_inst: test_profiling: extends: .test_job + # needs:parallel:matrix limits this job to a specific build_tests combination. + # Keep matrix vars exact and in build_tests declaration order: + # https://docs.gitlab.com/ci/yaml/#needsparallelmatrix + needs: + - job: build_tests + parallel: + matrix: + - GRADLE_TARGET: ":profilingTest" + CACHE_TYPE: "profiling" variables: GRADLE_TARGET: ":profilingTest" CACHE_TYPE: "profiling" @@ -819,6 +856,16 @@ test_debugger: test_smoke: extends: .test_job + # needs:parallel:matrix limits this job to a specific build_tests combination. + # Keep matrix vars exact and in build_tests declaration order: + # https://docs.gitlab.com/ci/yaml/#needsparallelmatrix + needs: &needs_build_tests_smoke + - job: build_tests + parallel: + matrix: + - GRADLE_TARGET: ":smokeTest" + CACHE_TYPE: "smoke" + MAVEN_OPTS: "-Xms256M -Xmx1024M" variables: GRADLE_TARGET: "stageMainDist :smokeTest" GRADLE_PARAMS: "-PskipFlakyTests" @@ -828,6 +875,7 @@ test_smoke: test_ssi_smoke: extends: .test_job + needs: *needs_build_tests_smoke rules: - if: $CI_COMMIT_BRANCH == "master" when: on_success @@ -845,6 +893,7 @@ test_ssi_smoke: test_smoke_graalvm: extends: .test_job + needs: *needs_build_tests_smoke tags: [ "arch:amd64" ] variables: GRADLE_TARGET: "stageMainDist :dd-smoke-test:spring-boot-3.0-native:test" @@ -857,6 +906,7 @@ test_smoke_graalvm: test_smoke_semeru8_debugger: extends: .test_job + needs: *needs_build_tests_smoke tags: [ "arch:amd64" ] variables: GRADLE_TARGET: "stageMainDist dd-smoke-tests:debugger-integration-tests:test" From 1d12490fa0add682d60469e2f2c21658f4a42d99 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 09:54:02 -0400 Subject: [PATCH 08/27] Add MpscRingBuffer for pre-allocated, recyclable slot rings Bounded multi-producer / single-consumer ring buffer of long-lived T instances. Producers mutate slots in place via callbacks; the consumer reads them the same way. No allocation per write/read after construction. BiConsumer/TriConsumer variants take context object(s) before the slot, matching the TagMap.forEach / Hashtable.forEach convention so callers can use static final non-capturing lambdas. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/util/concurrent/MpscRingBuffer.java | 196 ++++++++++++++ .../util/concurrent/MpscRingBufferTest.java | 247 ++++++++++++++++++ 2 files changed, 443 insertions(+) create mode 100644 internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java create mode 100644 internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java diff --git a/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java new file mode 100644 index 00000000000..f6b9ee998ed --- /dev/null +++ b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java @@ -0,0 +1,196 @@ +package datadog.trace.util.concurrent; + +import datadog.trace.api.function.TriConsumer; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicLongArray; +import java.util.function.BiConsumer; +import java.util.function.Consumer; +import java.util.function.Supplier; + +/** + * Bounded multi-producer / single-consumer ring buffer of pre-allocated {@code T} instances. + * + *

Each slot is a long-lived {@code T} instance created at construction time via the supplied + * factory and recycled forever. Producers mutate slots in place via callbacks; the consumer reads + * them the same way. No allocation occurs per write/read after construction, which is the entire + * point of this class over a queue of references. + * + *

The {@code BiConsumer} and {@link TriConsumer} variants take their context object(s) + * before the slot, matching the convention of {@code TagMap.forEach} and {@code + * Hashtable.forEach}. That ordering lets callers declare the callback as a {@code static final} + * non-capturing lambda and pass per-call context at the call site without allocating a closure. + * + *

Producer cursor is CAS-claimed; visibility of a claimed slot to the consumer is gated by a + * per-slot publication-sequence array. Consumer cursor is updated with a volatile write so + * producers observe space being freed. + */ +public final class MpscRingBuffer { + + private final T[] slots; + + /** + * Per-slot publication sequence. Producers write the claimed sequence here as the last step of a + * publish (release write via {@link AtomicLongArray#set}); the consumer reads it (acquire read) + * to determine whether the slot at the next position is ready. A slot is considered published for + * sequence {@code s} iff {@code sequences[s & mask] == s}. + */ + private final AtomicLongArray publishedSequences; + + private final int capacity; + private final int mask; + + /** Next sequence to claim. Producers increment via CAS. */ + private final AtomicLong producerCursor = new AtomicLong(-1L); + + /** + * Highest sequence consumed. Volatile so producers see space freed up; only the consumer thread + * writes to it, so a plain field with a manual volatile-store would also work. + */ + private volatile long consumerCursor = -1L; + + @SuppressWarnings("unchecked") + public MpscRingBuffer(final Supplier factory, final int capacityHint) { + if (capacityHint < 1) { + throw new IllegalArgumentException("capacityHint must be >= 1, got " + capacityHint); + } + this.capacity = nextPowerOfTwo(capacityHint); + if (this.capacity < 1) { + throw new IllegalArgumentException("capacity overflow for hint " + capacityHint); + } + this.mask = capacity - 1; + this.slots = (T[]) new Object[capacity]; + this.publishedSequences = new AtomicLongArray(capacity); + for (int i = 0; i < capacity; i++) { + slots[i] = factory.get(); + // Initial: sentinel "no sequence published here yet" -- anything < 0 works since + // sequences are 0-based and monotonically increasing. + publishedSequences.set(i, Long.MIN_VALUE); + } + } + + public int capacity() { + return capacity; + } + + /** Approximate count of slots holding unread items. May briefly exceed capacity under race. */ + public int size() { + final long p = producerCursor.get(); + final long c = consumerCursor; + final long diff = p - c; + if (diff <= 0) return 0; + if (diff > capacity) return capacity; + return (int) diff; + } + + public boolean isEmpty() { + return producerCursor.get() == consumerCursor; + } + + /** {@code true} if the slot was filled and published; {@code false} if the ring is full. */ + public boolean tryWrite(final Consumer filler) { + final long seq = claim(); + if (seq < 0L) return false; + filler.accept(slots[(int) (seq & mask)]); + publish(seq); + return true; + } + + public boolean tryWrite(final C context, final BiConsumer filler) { + final long seq = claim(); + if (seq < 0L) return false; + filler.accept(context, slots[(int) (seq & mask)]); + publish(seq); + return true; + } + + public boolean tryWrite( + final C1 context1, + final C2 context2, + final TriConsumer filler) { + final long seq = claim(); + if (seq < 0L) return false; + filler.accept(context1, context2, slots[(int) (seq & mask)]); + publish(seq); + return true; + } + + /** Drains all currently-available slots. Returns the count processed. */ + public int drain(final Consumer handler) { + long cursor = consumerCursor; + int count = 0; + while (true) { + final long nextSeq = cursor + 1L; + final int idx = (int) (nextSeq & mask); + if (publishedSequences.get(idx) != nextSeq) break; + handler.accept(slots[idx]); + cursor = nextSeq; + count++; + } + if (count > 0) consumerCursor = cursor; + return count; + } + + public int drain(final C context, final BiConsumer handler) { + long cursor = consumerCursor; + int count = 0; + while (true) { + final long nextSeq = cursor + 1L; + final int idx = (int) (nextSeq & mask); + if (publishedSequences.get(idx) != nextSeq) break; + handler.accept(context, slots[idx]); + cursor = nextSeq; + count++; + } + if (count > 0) consumerCursor = cursor; + return count; + } + + public int drain( + final C1 context1, + final C2 context2, + final TriConsumer handler) { + long cursor = consumerCursor; + int count = 0; + while (true) { + final long nextSeq = cursor + 1L; + final int idx = (int) (nextSeq & mask); + if (publishedSequences.get(idx) != nextSeq) break; + handler.accept(context1, context2, slots[idx]); + cursor = nextSeq; + count++; + } + if (count > 0) consumerCursor = cursor; + return count; + } + + /** CAS-claim the next sequence, or return {@code -1} if the ring is full. */ + private long claim() { + while (true) { + final long current = producerCursor.get(); + // Stale read of consumerCursor is fine: a false "full" reading just causes a drop, and + // a real "full" reading is correctly identified because consumerCursor only advances. + final long consumed = consumerCursor; + if (current - consumed >= capacity) { + return -1L; + } + final long next = current + 1L; + if (producerCursor.compareAndSet(current, next)) { + return next; + } + // CAS failure -> another producer claimed; retry. + } + } + + /** Mark sequence {@code seq} as published. Release semantics via {@link AtomicLongArray#set}. */ + private void publish(final long seq) { + publishedSequences.set((int) (seq & mask), seq); + } + + private static int nextPowerOfTwo(final int n) { + if (n <= 1) return 1; + // 32 - leadingZeros(n-1) gives the number of bits needed to represent n-1; shifting 1 by that + // gives the smallest power of two >= n. + final int bits = 32 - Integer.numberOfLeadingZeros(n - 1); + return 1 << bits; + } +} diff --git a/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java b/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java new file mode 100644 index 00000000000..43b89271621 --- /dev/null +++ b/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java @@ -0,0 +1,247 @@ +package datadog.trace.util.concurrent; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import datadog.trace.api.function.TriConsumer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiConsumer; +import org.junit.jupiter.api.Test; + +class MpscRingBufferTest { + + /** Mutable slot used by the tests; replaces the per-publish allocation a real consumer avoids. */ + static final class Slot { + int value; + String tag; + } + + // ============ Construction ============ + + @Test + void capacityRoundsUpToPowerOfTwo() { + assertEquals(1, new MpscRingBuffer<>(Slot::new, 1).capacity()); + assertEquals(2, new MpscRingBuffer<>(Slot::new, 2).capacity()); + assertEquals(4, new MpscRingBuffer<>(Slot::new, 3).capacity()); + assertEquals(16, new MpscRingBuffer<>(Slot::new, 10).capacity()); + assertEquals(1024, new MpscRingBuffer<>(Slot::new, 1024).capacity()); + assertEquals(2048, new MpscRingBuffer<>(Slot::new, 1025).capacity()); + } + + @Test + void rejectsNonPositiveCapacityHint() { + assertThrows(IllegalArgumentException.class, () -> new MpscRingBuffer<>(Slot::new, 0)); + assertThrows(IllegalArgumentException.class, () -> new MpscRingBuffer<>(Slot::new, -1)); + } + + @Test + void slotsArePreAllocatedFromFactory() { + AtomicInteger calls = new AtomicInteger(); + MpscRingBuffer ring = + new MpscRingBuffer<>( + () -> { + calls.incrementAndGet(); + return new Slot(); + }, + 8); + assertEquals(8, ring.capacity()); + assertEquals(8, calls.get(), "factory must run capacity times during construction"); + } + + // ============ Basic produce / consume ============ + + @Test + void emptyRingIsEmpty() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + assertTrue(ring.isEmpty()); + assertEquals(0, ring.size()); + assertEquals(0, ring.drain(s -> {})); + } + + @Test + void singleWriteThenDrain() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + assertTrue( + ring.tryWrite( + s -> { + s.value = 42; + s.tag = "hello"; + })); + assertEquals(1, ring.size()); + + List seen = new ArrayList<>(); + int drained = ring.drain(s -> seen.add(s.value)); + assertEquals(1, drained); + assertEquals(Arrays.asList(42), seen); + assertTrue(ring.isEmpty()); + } + + @Test + void writesPreserveFIFOOrder() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + for (int i = 0; i < 6; i++) { + final int v = i; + assertTrue(ring.tryWrite(s -> s.value = v)); + } + List seen = new ArrayList<>(); + ring.drain(s -> seen.add(s.value)); + assertEquals(Arrays.asList(0, 1, 2, 3, 4, 5), seen); + } + + // ============ Full / drop behavior ============ + + @Test + void writesBeyondCapacityFail() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + for (int i = 0; i < 4; i++) { + assertTrue(ring.tryWrite(s -> s.value = 0)); + } + assertFalse(ring.tryWrite(s -> s.value = 0), "5th write must fail when capacity == 4"); + assertEquals(4, ring.size()); + } + + @Test + void writesResumeAfterDrain() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + for (int i = 0; i < 4; i++) { + final int v = i; + assertTrue(ring.tryWrite(s -> s.value = v)); + } + assertFalse(ring.tryWrite(s -> {})); + + AtomicInteger sum = new AtomicInteger(); + ring.drain(s -> sum.addAndGet(s.value)); + assertEquals(0 + 1 + 2 + 3, sum.get()); + + // Now should accept another full round. + for (int i = 100; i < 104; i++) { + final int v = i; + assertTrue(ring.tryWrite(s -> s.value = v)); + } + assertFalse(ring.tryWrite(s -> {})); + } + + // ============ Context-passing variants (context first, slot last) ============ + + @Test + void writeAndDrainWithOneContext() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + BiConsumer filler = (ctx, s) -> s.value = ctx; + assertTrue(ring.tryWrite(7, filler)); + assertTrue(ring.tryWrite(8, filler)); + + List seen = new ArrayList<>(); + BiConsumer, Slot> reader = (sink, s) -> sink.add(s.value); + assertEquals(2, ring.drain(seen, reader)); + assertEquals(Arrays.asList(7, 8), seen); + } + + @Test + void writeAndDrainWithTwoContexts() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + TriConsumer filler = + (v, t, s) -> { + s.value = v; + s.tag = t; + }; + assertTrue(ring.tryWrite(1, "a", filler)); + assertTrue(ring.tryWrite(2, "b", filler)); + + List tags = new ArrayList<>(); + List vals = new ArrayList<>(); + TriConsumer, List, Slot> reader = + (ts, vs, s) -> { + ts.add(s.tag); + vs.add(s.value); + }; + assertEquals(2, ring.drain(tags, vals, reader)); + assertEquals(Arrays.asList("a", "b"), tags); + assertEquals(Arrays.asList(1, 2), vals); + } + + // ============ Concurrency ============ + + @Test + void manyProducersSingleConsumerSeesEveryWrittenValue() throws InterruptedException { + final int producers = 8; + final int perProducer = 50_000; + final int total = producers * perProducer; + + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 1024); + + ExecutorService producerPool = Executors.newFixedThreadPool(producers); + AtomicInteger dropped = new AtomicInteger(); + AtomicInteger written = new AtomicInteger(); + CountDownLatch start = new CountDownLatch(1); + + BiConsumer filler = (v, s) -> s.value = v; + + for (int p = 0; p < producers; p++) { + final int base = p * perProducer; + producerPool.submit( + () -> { + try { + start.await(); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + return; + } + for (int i = 0; i < perProducer; i++) { + int v = base + i; + // Spin until the consumer makes room. We're testing correctness, not throughput. + while (!ring.tryWrite(v, filler)) { + dropped.incrementAndGet(); + Thread.yield(); + } + written.incrementAndGet(); + } + }); + } + + Set seen = new HashSet<>(total); + BiConsumer, Slot> reader = (sink, s) -> sink.add(s.value); + + Thread consumer = + new Thread( + () -> { + while (seen.size() < total) { + if (ring.drain(seen, reader) == 0) Thread.yield(); + } + }, + "ring-consumer"); + consumer.start(); + + start.countDown(); + producerPool.shutdown(); + assertTrue(producerPool.awaitTermination(30, TimeUnit.SECONDS), "producers timed out"); + assertTrue(consumer.isAlive() || seen.size() == total); + consumer.join(30_000); + assertFalse(consumer.isAlive(), "consumer timed out"); + + assertEquals(total, written.get(), "every producer call should eventually succeed"); + assertEquals(total, seen.size(), "consumer must see every value exactly once"); + } + + @Test + void sizeReflectsOutstandingWrites() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + assertEquals(0, ring.size()); + ring.tryWrite(s -> {}); + assertEquals(1, ring.size()); + ring.tryWrite(s -> {}); + assertEquals(2, ring.size()); + ring.drain(s -> {}); + assertEquals(0, ring.size()); + } +} From 16b2ec6f0c472e497819af25c0773b6949827379 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 09:58:25 -0400 Subject: [PATCH 09/27] Add throughput benchmarks for MpscRingBuffer Five benchmarks: producer-only throughput at 1/8/16 threads with a background drainer, plus an e2e @Group bench pairing 8 producers with 1 consumer for system throughput. Ring capacity is a @Param so runs can sweep capacities without recompiling. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../concurrent/MpscRingBufferBenchmark.java | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 internal-api/src/jmh/java/datadog/trace/util/concurrent/MpscRingBufferBenchmark.java diff --git a/internal-api/src/jmh/java/datadog/trace/util/concurrent/MpscRingBufferBenchmark.java b/internal-api/src/jmh/java/datadog/trace/util/concurrent/MpscRingBufferBenchmark.java new file mode 100644 index 00000000000..e9d084f3334 --- /dev/null +++ b/internal-api/src/jmh/java/datadog/trace/util/concurrent/MpscRingBufferBenchmark.java @@ -0,0 +1,146 @@ +package datadog.trace.util.concurrent; + +import static java.util.concurrent.TimeUnit.SECONDS; + +import java.util.function.BiConsumer; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Group; +import org.openjdk.jmh.annotations.GroupThreads; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Throughput benchmarks for {@link MpscRingBuffer}. + * + *

+ * + *

Run with {@code -p capacity=...} to override the default ring capacity. + */ +@State(Scope.Benchmark) +@Warmup(iterations = 2, time = 15, timeUnit = SECONDS) +@Measurement(iterations = 5, time = 15, timeUnit = SECONDS) +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(SECONDS) +@Fork(value = 1) +public class MpscRingBufferBenchmark { + + /** + * Static filler so the lambda is non-capturing and the JIT can hoist it past the {@code tryWrite} + * call. Context arg comes first, slot last — matches {@code TagMap.forEach} convention. + */ + private static final BiConsumer FILLER = (v, slot) -> slot.value = v; + + /** Mutable slot. Replicates the per-publish allocation a real producer wants to avoid. */ + public static final class Slot { + long value; + } + + @Param({"1024"}) + public int capacity; + + /** + * Shared ring for the {@code write_*} benches. A background drainer keeps space available so + * producer benchmarks measure write throughput rather than full-ring drop throughput. + */ + MpscRingBuffer ring; + + private volatile boolean stopDrainer; + private Thread drainerThread; + + /** + * Separate ring for the {@code e2e_*} group benches. JMH drives both sides directly so we don't + * want our own background drainer for those. + */ + MpscRingBuffer e2eRing; + + @Setup(Level.Trial) + public void setup() { + ring = new MpscRingBuffer<>(Slot::new, capacity); + e2eRing = new MpscRingBuffer<>(Slot::new, capacity); + stopDrainer = false; + drainerThread = + new Thread( + () -> { + while (!stopDrainer) { + if (ring.drain((Slot s) -> {}) == 0) Thread.yield(); + } + }, + "MpscRingBufferBenchmark-drainer"); + drainerThread.setDaemon(true); + drainerThread.start(); + } + + @TearDown(Level.Trial) + public void teardown() throws InterruptedException { + stopDrainer = true; + drainerThread.join(5_000); + } + + @State(Scope.Thread) + public static class ThreadState { + long counter; + } + + // ============ Write throughput with background drainer ============ + + @Threads(1) + @Benchmark + public boolean write_1p(ThreadState ts) { + return ring.tryWrite(ts.counter++, FILLER); + } + + @Threads(8) + @Benchmark + public boolean write_8p(ThreadState ts) { + return ring.tryWrite(ts.counter++, FILLER); + } + + @Threads(16) + @Benchmark + public boolean write_16p(ThreadState ts) { + return ring.tryWrite(ts.counter++, FILLER); + } + + // ============ End-to-end producer/consumer pairing ============ + // + // JMH runs both methods in the same trial: 8 producer threads + 1 consumer thread. Throughput + // is reported as ops/sec aggregated across all 9 threads, but the consumer's drain count + // dwarfs the producer ops since one call processes many slots -- in practice the bottleneck + // is the producer side (CAS contention), and that's what the number reflects. + + private static final BiConsumer CONSUMER = (bh, slot) -> bh.consume(slot.value); + + @Group("e2e_8p") + @GroupThreads(8) + @Benchmark + public boolean e2e_producer(ThreadState ts) { + return e2eRing.tryWrite(ts.counter++, FILLER); + } + + @Group("e2e_8p") + @GroupThreads(1) + @Benchmark + public int e2e_consumer(Blackhole bh) { + int drained = e2eRing.drain(bh, CONSUMER); + if (drained == 0) Thread.yield(); + return drained; + } +} From e67dde72a50c9ea4e5870fe209d0164db737ffab Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 10:19:54 -0400 Subject: [PATCH 10/27] Add RingVsQueueBenchmark for MpscRingBuffer vs MpscArrayQueue Head-to-head benches in dd-trace-core's jmh source set (which already depends on jctools): MpscRingBuffer mutating pre-allocated slots vs MpscArrayQueue with a fresh Slot allocated per publish -- the latter mirrors the existing SpanSnapshot pattern in the CSS code. write_ring_8p / write_queue_8p compare publish cost with a background drainer; e2e_ring_8p / e2e_queue_8p use @Group to pair 8 producers with 1 consumer for end-to-end throughput. Run with -prof gc to see per-op allocation rate where the ring's win shows up loudest. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../util/concurrent/RingVsQueueBenchmark.java | 191 ++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 dd-trace-core/src/jmh/java/datadog/trace/util/concurrent/RingVsQueueBenchmark.java diff --git a/dd-trace-core/src/jmh/java/datadog/trace/util/concurrent/RingVsQueueBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/util/concurrent/RingVsQueueBenchmark.java new file mode 100644 index 00000000000..6a1e3d7c802 --- /dev/null +++ b/dd-trace-core/src/jmh/java/datadog/trace/util/concurrent/RingVsQueueBenchmark.java @@ -0,0 +1,191 @@ +package datadog.trace.util.concurrent; + +import static java.util.concurrent.TimeUnit.SECONDS; + +import java.util.function.BiConsumer; +import org.jctools.queues.MpscArrayQueue; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Group; +import org.openjdk.jmh.annotations.GroupThreads; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Head-to-head comparison of {@link MpscRingBuffer} (mutable pre-allocated slots) against the + * conventional approach of a jctools {@link MpscArrayQueue} with a fresh {@code Slot} allocated per + * publish. The latter is the pattern the current CSS code uses for {@code SpanSnapshot} on the + * producer side, so the delta between the two measures the actual allocation/handoff saving of the + * ring-buffer rewrite. + * + *

+ * + *

Run with {@code -prof gc} to also see per-op allocation rate — that's where the ring's win is + * loudest, since the queue allocates one {@code Slot} per publish and the ring allocates none. + */ +@State(Scope.Benchmark) +@Warmup(iterations = 2, time = 15, timeUnit = SECONDS) +@Measurement(iterations = 5, time = 15, timeUnit = SECONDS) +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(SECONDS) +@Fork(value = 1) +public class RingVsQueueBenchmark { + + /** + * Shared slot type. {@code MpscRingBuffer} pre-allocates these and the producer mutates in place; + * the queue path allocates a fresh one per publish and offers the reference. Two constructors so + * both styles read naturally. + */ + public static final class Slot { + long value; + + Slot() {} + + Slot(final long value) { + this.value = value; + } + } + + // Static (non-capturing) handlers. Passing ts/bh as context lets the JIT keep these as + // singleton functions and avoid per-call lambda allocation. + private static final BiConsumer RING_FILLER = + (ts, slot) -> { + slot.value = ts.counter; + ts.counter++; + }; + + private static final BiConsumer RING_CONSUMER = + (bh, slot) -> bh.consume(slot.value); + + @Param({"1024"}) + public int capacity; + + /** Write-side benchmark structures. Drained by a background thread so they never fill. */ + MpscRingBuffer ring; + + MpscArrayQueue queue; + + /** E2e benchmark structures. JMH drives both sides via {@code @Group}; no background drainer. */ + MpscRingBuffer e2eRing; + + MpscArrayQueue e2eQueue; + + private volatile boolean stopDrainers; + private Thread ringDrainer; + private Thread queueDrainer; + + @Setup(Level.Trial) + public void setup() { + ring = new MpscRingBuffer<>(Slot::new, capacity); + queue = new MpscArrayQueue<>(capacity); + e2eRing = new MpscRingBuffer<>(Slot::new, capacity); + e2eQueue = new MpscArrayQueue<>(capacity); + + stopDrainers = false; + ringDrainer = + new Thread( + () -> { + while (!stopDrainers) { + if (ring.drain((Slot s) -> {}) == 0) Thread.yield(); + } + }, + "RingVsQueueBenchmark-ringDrainer"); + ringDrainer.setDaemon(true); + ringDrainer.start(); + + queueDrainer = + new Thread( + () -> { + while (!stopDrainers) { + Slot s = queue.poll(); + if (s == null) Thread.yield(); + } + }, + "RingVsQueueBenchmark-queueDrainer"); + queueDrainer.setDaemon(true); + queueDrainer.start(); + } + + @TearDown(Level.Trial) + public void teardown() throws InterruptedException { + stopDrainers = true; + ringDrainer.join(5_000); + queueDrainer.join(5_000); + } + + @State(Scope.Thread) + public static class ThreadState { + long counter; + } + + // ============ Write-side throughput ============ + + @Threads(8) + @Benchmark + public boolean write_ring_8p(final ThreadState ts) { + return ring.tryWrite(ts, RING_FILLER); + } + + /** Mirrors the SpanSnapshot pattern: allocate a fresh instance per publish, offer it. */ + @Threads(8) + @Benchmark + public boolean write_queue_8p(final ThreadState ts) { + return queue.offer(new Slot(ts.counter++)); + } + + // ============ End-to-end producer/consumer ============ + + @Group("e2e_ring_8p") + @GroupThreads(8) + @Benchmark + public boolean e2e_ring_producer(final ThreadState ts) { + return e2eRing.tryWrite(ts, RING_FILLER); + } + + @Group("e2e_ring_8p") + @GroupThreads(1) + @Benchmark + public int e2e_ring_consumer(final Blackhole bh) { + int drained = e2eRing.drain(bh, RING_CONSUMER); + if (drained == 0) Thread.yield(); + return drained; + } + + @Group("e2e_queue_8p") + @GroupThreads(8) + @Benchmark + public boolean e2e_queue_producer(final ThreadState ts) { + return e2eQueue.offer(new Slot(ts.counter++)); + } + + @Group("e2e_queue_8p") + @GroupThreads(1) + @Benchmark + public int e2e_queue_consumer(final Blackhole bh) { + int drained = 0; + Slot slot; + while ((slot = e2eQueue.poll()) != null) { + bh.consume(slot.value); + drained++; + } + if (drained == 0) Thread.yield(); + return drained; + } +} From 7ec3f116ce25a6f9a7464dd6f6d95ad11e62bcf4 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 11:07:38 -0400 Subject: [PATCH 11/27] Swap CSS inbox for MpscRingBuffer of mutable SpanSnapshot slots Producer/consumer split allocated one SpanSnapshot per metrics-eligible span and handed the reference through a jctools MpscArrayQueue. At tight heap (Xmx64m petclinic) the in-flight snapshots overflow G1 survivor regions, triggering To-space Exhausted -> Full GC storms. The data channel now uses MpscRingBuffer with pre-allocated, recyclable slots: the producer mutates a slot in place via fillSlot and publishes; the aggregator reads the slot during drain. No per-publish allocation. Control messages (REPORT / STOP / CLEAR) move to a small side-channel MpscArrayQueue. The data ring only holds one element type (the slot), and routing signals through a separate channel lets the aggregator service them ahead of the data backlog without the InboxItem-marker type-switch dispatch. Aggregator.run() now drains the data ring first, then polls signals. REPORT/STOP handling does an inline data drain before flushing so the ordering guarantee holds: any publish that happens-before signalInbox.offer is visible to the report's flush. SpanSnapshot fields are non-final; the no-arg constructor is what the ring uses to pre-populate slots. The 15-arg constructor is retained for tests that built immutable snapshots inline. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/common/metrics/Aggregator.java | 136 ++++++++++-------- .../metrics/ConflatingMetricsAggregator.java | 128 +++++++++++------ .../trace/common/metrics/SpanSnapshot.java | 47 +++--- .../ConflatingMetricAggregatorTest.groovy | 2 +- 4 files changed, 187 insertions(+), 126 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index d809d452522..25574e8e154 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -5,8 +5,10 @@ import datadog.trace.common.metrics.SignalItem.ClearSignal; import datadog.trace.common.metrics.SignalItem.StopSignal; import datadog.trace.core.monitor.HealthMetrics; +import datadog.trace.util.concurrent.MpscRingBuffer; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.util.concurrent.TimeUnit; +import java.util.function.BiConsumer; import org.jctools.queues.MessagePassingQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,7 +19,8 @@ final class Aggregator implements Runnable { private static final Logger log = LoggerFactory.getLogger(Aggregator.class); - private final MessagePassingQueue inbox; + private final MpscRingBuffer dataInbox; + private final MessagePassingQueue signalInbox; private final AggregateTable aggregates; private final MetricWriter writer; private final HealthMetrics healthMetrics; @@ -42,9 +45,22 @@ final class Aggregator implements Runnable { justification = "the field is confined to the agent thread running the Aggregator") private boolean dirty; + @SuppressFBWarnings( + value = "AT_STALE_THREAD_WRITE_OF_PRIMITIVE", + justification = "the field is confined to the agent thread running the Aggregator") + private boolean stopped; + + /** + * Static-singleton snapshot handler. Reads from the slot via {@code this} (passed as the context) + * and avoids per-drain lambda capture. + */ + private static final BiConsumer SNAPSHOT_HANDLER = + Aggregator::handleSnapshot; + Aggregator( MetricWriter writer, - MessagePassingQueue inbox, + MpscRingBuffer dataInbox, + MessagePassingQueue signalInbox, int maxAggregates, long reportingInterval, TimeUnit reportingIntervalTimeUnit, @@ -52,7 +68,8 @@ final class Aggregator implements Runnable { Runnable onReportCycle) { this( writer, - inbox, + dataInbox, + signalInbox, maxAggregates, reportingInterval, reportingIntervalTimeUnit, @@ -63,7 +80,8 @@ final class Aggregator implements Runnable { Aggregator( MetricWriter writer, - MessagePassingQueue inbox, + MpscRingBuffer dataInbox, + MessagePassingQueue signalInbox, int maxAggregates, long reportingInterval, TimeUnit reportingIntervalTimeUnit, @@ -71,7 +89,8 @@ final class Aggregator implements Runnable { HealthMetrics healthMetrics, Runnable onReportCycle) { this.writer = writer; - this.inbox = inbox; + this.dataInbox = dataInbox; + this.signalInbox = signalInbox; this.aggregates = new AggregateTable(maxAggregates); this.reportingIntervalNanos = reportingIntervalTimeUnit.toNanos(reportingInterval); this.sleepMillis = sleepMillis; @@ -82,12 +101,19 @@ final class Aggregator implements Runnable { @Override public void run() { Thread currentThread = Thread.currentThread(); - Drainer drainer = new Drainer(); - while (!currentThread.isInterrupted() && !drainer.stopped) { + while (!currentThread.isInterrupted() && !stopped) { try { - if (!inbox.isEmpty()) { - inbox.drain(drainer); - } else { + int drainedData = dataInbox.drain(this, SNAPSHOT_HANDLER); + // Signals are processed after the data drain. REPORT/STOP additionally drain any + // snapshots that arrived between the drain above and the signal's enqueue -- the + // signal-channel offer happens-before the poll here, so any publish that completed + // before report()/stop() is guaranteed visible to the inline drain in handleSignal. + SignalItem signal; + while (!stopped && (signal = signalInbox.poll()) != null) { + handleSignal(signal); + } + if (stopped) break; + if (drainedData == 0 && signalInbox.isEmpty()) { Thread.sleep(sleepMillis); } } catch (InterruptedException e) { @@ -99,57 +125,45 @@ public void run() { log.debug("metrics aggregator exited"); } - private final class Drainer implements MessagePassingQueue.Consumer { - - boolean stopped = false; - - @Override - public void accept(InboxItem item) { - if (item == ClearSignal.CLEAR) { - // ClearSignal is routed through the inbox (rather than letting the caller mutate - // AggregateTable directly) so the aggregator thread stays the sole writer. AggregateTable - // is not thread-safe; a direct clear() from e.g. the OkHttpSink callback thread would - // race with Drainer.accept on this thread. - // - // We deliberately do NOT call inbox.clear() here. Doing so would erase any queued STOP - // (or REPORT) signals that happen to sit behind CLEAR -- a real concern when a - // downgrade is followed quickly by close(), where the trampled STOP leaves the - // aggregator thread spinning until thread.join times out. features.supportsMetrics() is - // already false by the time CLEAR was offered, so producers have stopped publishing; - // any in-flight snapshots will drain naturally into the just-cleared table, get - // re-aggregated, and flushed on the next report -- where the agent rejects them again, - // triggering another DOWNGRADED -> disable() -> CLEAR cycle. Worst case: one extra - // reporting cycle of wasted work, which we accept for the safety of preserving STOP. - if (!stopped) { - aggregates.clear(); - // Clear dirty too -- without this, the next report() would see dirty=true, run - // expungeStaleAggregates against the (now-empty) table, find isEmpty()=true, and skip - // the flush anyway. Same observable outcome, but resetting here keeps the invariant - // "dirty implies there's data to flush" honest. - dirty = false; - } - ((SignalItem) item).complete(); - } else if (item instanceof SignalItem) { - SignalItem signal = (SignalItem) item; - if (!stopped) { - report(wallClockTime(), signal); - stopped = item instanceof StopSignal; - if (stopped) { - signal.complete(); - } - } else { - signal.ignore(); - } - } else if (item instanceof SpanSnapshot && !stopped) { - SpanSnapshot snapshot = (SpanSnapshot) item; - AggregateEntry entry = aggregates.findOrInsert(snapshot); - if (entry != null) { - entry.recordOneDuration(snapshot.tagAndDuration); - dirty = true; - } else { - // table at cap with no stale entry available to evict - healthMetrics.onStatsAggregateDropped(); - } + private static void handleSnapshot(Aggregator agg, SpanSnapshot snapshot) { + AggregateEntry entry = agg.aggregates.findOrInsert(snapshot); + if (entry != null) { + entry.recordOneDuration(snapshot.tagAndDuration); + agg.dirty = true; + } else { + // table at cap with no stale entry available to evict + agg.healthMetrics.onStatsAggregateDropped(); + } + } + + private void handleSignal(SignalItem signal) { + if (signal == ClearSignal.CLEAR) { + // ClearSignal is routed through the signal channel (rather than letting the caller mutate + // AggregateTable directly) so the aggregator thread stays the sole writer. AggregateTable + // is not thread-safe; a direct clear() from e.g. the OkHttpSink callback thread would race + // with the data-drain on this thread. + // + // We do NOT clear the snapshot data ring here. Any in-flight snapshots will drain + // naturally into the just-cleared table, get re-aggregated, and flushed on the next + // report -- where the agent rejects them again, triggering another DOWNGRADED -> disable() + // -> CLEAR cycle. Worst case: one extra reporting cycle of wasted work. + aggregates.clear(); + // Clear dirty too -- without this, the next report() would see dirty=true, run + // expungeStaleAggregates against the (now-empty) table, find isEmpty()=true, and skip + // the flush anyway. Same observable outcome, but resetting here keeps the invariant + // "dirty implies there's data to flush" honest. + dirty = false; + signal.complete(); + } else { + // STOP or REPORT: catch up on any data that arrived between the loop's main drain and the + // signal being enqueued. Producer's signalInbox.offer happens-after its dataInbox publish, + // so by the time we observe the signal, every snapshot the producer had published is + // visible to drain() here. This is what gives report() bucket-boundary determinism. + dataInbox.drain(this, SNAPSHOT_HANDLER); + report(wallClockTime(), signal); + if (signal instanceof StopSignal) { + stopped = true; + signal.complete(); } } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 895ee434854..b008cef3d85 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -28,6 +28,7 @@ import datadog.trace.core.SpanKindFilter; import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.util.AgentTaskScheduler; +import datadog.trace.util.concurrent.MpscRingBuffer; import java.util.Collections; import java.util.List; import java.util.Map; @@ -63,9 +64,35 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve private static final SpanKindFilter INTERNAL_KIND = SpanKindFilter.builder().includeInternal().build(); + /** Capacity of the small MPSC queue carrying control signals (REPORT / STOP / CLEAR). */ + private static final int SIGNAL_INBOX_CAPACITY = 16; + private final Set ignoredResources; private final Thread thread; - private final MessagePassingQueue inbox; + + /** + * Producer/consumer data channel: a {@link MpscRingBuffer} of pre-allocated, recyclable {@link + * SpanSnapshot} slots. Producers mutate slots in place via {@link #slotFiller} -- no per-publish + * allocation. + */ + private final MpscRingBuffer dataInbox; + + /** + * Separate small queue for control signals. Kept off the data ring because the ring only holds + * one element type (the slot), and routing signals through a side channel lets the aggregator + * service them ahead of the data backlog each loop iteration. + */ + private final MessagePassingQueue signalInbox; + + /** + * Slot filler captured as an instance field so the lambda is allocated once at construction. The + * method reference closes over {@code this}, but the field is per-aggregator so context lives in + * the {@code (CoreSpan, isTopLevel)} arguments passed by the producer at each call -- no + * per-publish capture. + */ + private final datadog.trace.api.function.TriConsumer, Boolean, SpanSnapshot> + slotFiller = this::fillSlot; + private final Sink sink; private final Aggregator aggregator; private final long reportingInterval; @@ -173,14 +200,16 @@ public ConflatingMetricsAggregator( boolean includeEndpointInMetrics) { this.ignoredResources = ignoredResources; this.includeEndpointInMetrics = includeEndpointInMetrics; - this.inbox = Queues.mpscArrayQueue(queueSize); + this.dataInbox = new MpscRingBuffer<>(SpanSnapshot::new, queueSize); + this.signalInbox = Queues.mpscArrayQueue(SIGNAL_INBOX_CAPACITY); this.features = features; this.healthMetrics = healthMetric; this.sink = sink; this.aggregator = new Aggregator( metricWriter, - inbox, + dataInbox, + signalInbox, maxAggregates, reportingInterval, timeUnit, @@ -218,11 +247,11 @@ public boolean report() { boolean published; int attempts = 0; do { - published = inbox.offer(REPORT); + published = signalInbox.offer(REPORT); ++attempts; } while (!published && attempts < 10); if (!published) { - log.debug("Skipped metrics reporting because the queue is full"); + log.debug("Skipped metrics reporting because the signal queue is full"); } return published; } @@ -245,7 +274,7 @@ public Future forceReport() { ReportSignal reportSignal = new ReportSignal(); boolean published = false; while (thread.isAlive() && !published) { - published = inbox.offer(reportSignal); + published = signalInbox.offer(reportSignal); if (!published) { try { Thread.sleep(10); @@ -296,17 +325,32 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { // Error decision drives force-keep sampling regardless of whether the snapshot gets queued. boolean error = span.getError() > 0; - // Fast-path the inbox-full case before any tag extraction or snapshot allocation. size() is - // approximate on jctools' MPSC queue but that's fine: if we under-estimate, we fall through - // and let inbox.offer be the source of truth (existing behavior); if we over-estimate, we - // drop a snapshot that would have fit -- acceptable, onStatsInboxFull was going to fire - // imminently anyway. - if (inbox.size() >= inbox.capacity()) { + // Fast-path the inbox-full case before tag extraction. size() is approximate but that's fine: + // a false "full" reading just causes a drop, and a real one is correctly identified. + if (dataInbox.size() >= dataInbox.capacity()) { healthMetrics.onStatsInboxFull(); return error; } - // Extract HTTP method and endpoint only if the feature is enabled + // tryWrite claims a slot, runs slotFiller to populate it in place, and publishes. No + // SpanSnapshot is allocated -- the slot is one of the ring's pre-allocated instances. + if (!dataInbox.tryWrite(span, isTopLevel, slotFiller)) { + healthMetrics.onStatsInboxFull(); + } + return error; + } + + /** + * Producer-side slot fill. Runs inside {@link MpscRingBuffer#tryWrite} after the producer has + * claimed a sequence but before the slot is visible to the aggregator. Reads from {@code span} + * and instance state, writes every field on {@code slot} (including {@code null} where + * applicable) so stale values from a prior occupant of this slot don't bleed through. + */ + private void fillSlot(CoreSpan span, Boolean isTopLevelBoxed, SpanSnapshot slot) { + final boolean isTopLevel = isTopLevelBoxed; + final boolean error = span.getError() > 0; + + // Extract HTTP method and endpoint only if the feature is enabled. String httpMethod = null; String httpEndpoint = null; if (includeEndpointInMetrics) { @@ -338,28 +382,21 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { peerTagSchema = null; } - SpanSnapshot snapshot = - new SpanSnapshot( - span.getResourceName(), - span.getServiceName(), - span.getOperationName(), - span.getServiceNameSource(), - spanType, - span.getHttpStatusCode(), - isSynthetic(span), - span.getParentId() == 0, - spanKind, - peerTagSchema, - peerTagValues, - httpMethod, - httpEndpoint, - grpcStatusCode, - tagAndDuration); - if (!inbox.offer(snapshot)) { - healthMetrics.onStatsInboxFull(); - } - // force keep keys if there are errors - return error; + slot.resourceName = span.getResourceName(); + slot.serviceName = span.getServiceName(); + slot.operationName = span.getOperationName(); + slot.serviceNameSource = span.getServiceNameSource(); + slot.spanType = spanType; + slot.httpStatusCode = span.getHttpStatusCode(); + slot.synthetic = isSynthetic(span); + slot.traceRoot = span.getParentId() == 0; + slot.spanKind = spanKind; + slot.peerTagSchema = peerTagSchema; + slot.peerTagValues = peerTagValues; + slot.httpMethod = httpMethod; + slot.httpEndpoint = httpEndpoint; + slot.grpcStatusCode = grpcStatusCode; + slot.tagAndDuration = tagAndDuration; } /** @@ -474,7 +511,7 @@ public void stop() { if (null != cancellation) { cancellation.cancel(); } - inbox.offer(STOP); + signalInbox.offer(STOP); } @Override @@ -512,17 +549,18 @@ private void disable() { features.discover(); if (!features.supportsMetrics()) { log.debug("Disabling metric reporting because an agent downgrade was detected"); - // Route the clear through the inbox so the aggregator thread is the only writer. + // Route the clear through the signal channel so the aggregator thread is the only writer. // AggregateTable is not thread-safe; mutating it directly from this thread would race - // with Drainer.accept on the aggregator thread. + // with snapshot processing on the aggregator thread. // - // Best-effort single offer rather than the retry-loop pattern in report(). If the inbox is - // full at downgrade time the clear is dropped, but the system self-heals: features.discover() - // already flipped supportsMetrics() false, so producer publish() calls now skip the inbox; - // the aggregator drains existing snapshots and ships them on the next report cycle; the - // sink rejects that payload and fires DOWNGRADED again, which retries disable() against a - // now-empty inbox. Worst case: one extra reporting cycle of stale data. - inbox.offer(CLEAR); + // Best-effort single offer rather than the retry-loop pattern in report(). If the signal + // queue is full at downgrade time the clear is dropped, but the system self-heals: + // features.discover() already flipped supportsMetrics() false, so producer publish() calls + // now skip the data ring; the aggregator drains existing snapshots and ships them on the + // next report cycle; the sink rejects that payload and fires DOWNGRADED again, which + // retries disable() against a now-empty signal queue. Worst case: one extra reporting + // cycle of stale data. + signalInbox.offer(CLEAR); } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java index 152ac42bb55..e030a9f68cd 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java @@ -1,46 +1,55 @@ package datadog.trace.common.metrics; /** - * Immutable per-span value posted from the producer to the aggregator thread. Carries the raw - * inputs the aggregator needs to look up or build an {@link AggregateEntry} and update its - * counters. + * Per-span value posted from the producer to the aggregator thread. Carries the raw inputs the + * aggregator needs to look up or build an {@link AggregateEntry} and update its counters. + * + *

Fields are mutable so this class can serve as a slot in {@link + * datadog.trace.util.concurrent.MpscRingBuffer}: the producer claims a slot, writes its fields, and + * publishes; the aggregator reads them. There is exactly one writer per outstanding sequence (the + * producer that claimed it) and one reader (the aggregator), so no synchronization on fields is + * required. * *

All cache-canonicalization (service-name, span-kind, peer-tag string interning) happens on the * aggregator thread; the producer just shuffles references. */ -final class SpanSnapshot implements InboxItem { +final class SpanSnapshot { - final CharSequence resourceName; - final String serviceName; - final CharSequence operationName; - final CharSequence serviceNameSource; - final CharSequence spanType; - final short httpStatusCode; - final boolean synthetic; - final boolean traceRoot; - final String spanKind; + CharSequence resourceName; + String serviceName; + CharSequence operationName; + CharSequence serviceNameSource; + CharSequence spanType; + short httpStatusCode; + boolean synthetic; + boolean traceRoot; + String spanKind; /** * Schema for {@link #peerTagValues}. {@code null} when the span has no peer tags. The schema * carries the names in parallel-array form; {@code peerTagValues} holds the per-span tag values * at the same indices. */ - final PeerTagSchema peerTagSchema; + PeerTagSchema peerTagSchema; /** * Peer tag values captured from the span, parallel to {@code peerTagSchema.names}. A {@code null} * entry means the span didn't have that peer tag set. {@code null} (the whole array) when {@link * #peerTagSchema} is {@code null}. */ - final String[] peerTagValues; + String[] peerTagValues; - final String httpMethod; - final String httpEndpoint; - final String grpcStatusCode; + String httpMethod; + String httpEndpoint; + String grpcStatusCode; /** Duration in nanoseconds, OR-ed with {@code ERROR_TAG} / {@code TOP_LEVEL_TAG} as needed. */ - final long tagAndDuration; + long tagAndDuration; + + /** No-arg constructor used by {@link MpscRingBuffer} to pre-allocate empty slots. */ + SpanSnapshot() {} + /** Convenience constructor retained for tests that built immutable snapshots inline. */ SpanSnapshot( CharSequence resourceName, String serviceName, diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy index 00bd706b8fb..271578c8689 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy @@ -1718,7 +1718,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { def waitUntilEmpty(ConflatingMetricsAggregator aggregator) { int i = 0 - while (!aggregator.inbox.isEmpty() && i++ < 100) { + while ((!aggregator.dataInbox.isEmpty() || !aggregator.signalInbox.isEmpty()) && i++ < 100) { Thread.sleep(10) } } From cec6abf082cdd018fc2bb73dc348de00cbcc16b0 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 11:16:24 -0400 Subject: [PATCH 12/27] Document MpscRingBuffer thread-safety contract; publish on filler throw Spell out the contract that slot users rely on: plain (non-volatile) fields, no retention past handler return, slot-reference-not-shared. Producer fillers must not throw if possible -- and if they do, the slot is now published anyway (try/finally) so the consumer can't deadlock waiting for an unfinished sequence. Test covers the throw-then-recover path; the ring's cursors stay healthy. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/util/concurrent/MpscRingBuffer.java | 61 +++++++++++++++++-- .../util/concurrent/MpscRingBufferTest.java | 32 ++++++++++ 2 files changed, 87 insertions(+), 6 deletions(-) diff --git a/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java index f6b9ee998ed..121c5717a1d 100644 --- a/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java +++ b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java @@ -20,6 +20,44 @@ * Hashtable.forEach}. That ordering lets callers declare the callback as a {@code static final} * non-capturing lambda and pass per-call context at the call site without allocating a closure. * + *

Thread safety contract

+ * + *

The ring buffer is thread-safe for any number of producer threads plus exactly one consumer + * thread. Calling {@code drain} from multiple threads concurrently is not supported and will + * corrupt state. + * + *

For the slot type {@code T}: + * + *

+ * + *

For producer fillers: + * + *

+ * + *

Implementation

+ * *

Producer cursor is CAS-claimed; visibility of a claimed slot to the consumer is gated by a * per-slot publication-sequence array. Consumer cursor is updated with a volatile write so * producers observe space being freed. @@ -90,16 +128,24 @@ public boolean isEmpty() { public boolean tryWrite(final Consumer filler) { final long seq = claim(); if (seq < 0L) return false; - filler.accept(slots[(int) (seq & mask)]); - publish(seq); + // publish in finally so a throwing filler doesn't leave the slot un-published -- the + // consumer would otherwise wait at that sequence forever. See class javadoc. + try { + filler.accept(slots[(int) (seq & mask)]); + } finally { + publish(seq); + } return true; } public boolean tryWrite(final C context, final BiConsumer filler) { final long seq = claim(); if (seq < 0L) return false; - filler.accept(context, slots[(int) (seq & mask)]); - publish(seq); + try { + filler.accept(context, slots[(int) (seq & mask)]); + } finally { + publish(seq); + } return true; } @@ -109,8 +155,11 @@ public boolean tryWrite( final TriConsumer filler) { final long seq = claim(); if (seq < 0L) return false; - filler.accept(context1, context2, slots[(int) (seq & mask)]); - publish(seq); + try { + filler.accept(context1, context2, slots[(int) (seq & mask)]); + } finally { + publish(seq); + } return true; } diff --git a/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java b/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java index 43b89271621..d48c872883f 100644 --- a/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java +++ b/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java @@ -2,6 +2,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertSame; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -244,4 +245,35 @@ void sizeReflectsOutstandingWrites() { ring.drain(s -> {}); assertEquals(0, ring.size()); } + + @Test + void throwingFillerStillPublishesSoConsumerDoesntHang() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + + // First write succeeds. + assertTrue(ring.tryWrite(s -> s.value = 1)); + + // Second write throws midway through filling. Slot must still be published so the consumer's + // drain can advance past it. + RuntimeException boom = new RuntimeException("boom"); + RuntimeException thrown = + assertThrows( + RuntimeException.class, + () -> + ring.tryWrite( + s -> { + s.value = 2; + throw boom; + })); + assertSame(boom, thrown); + + // Third write proves the ring's cursors are still healthy after the throw. + assertTrue(ring.tryWrite(s -> s.value = 3)); + + // Consumer drains all three: it must not hang on the partially-filled slot. + List seen = new ArrayList<>(); + int drained = ring.drain(s -> seen.add(s.value)); + assertEquals(3, drained, "consumer must advance past the throwing slot"); + assertEquals(Arrays.asList(1, 2, 3), seen, "throwing slot keeps whatever filler had written"); + } } From 9162a1ef472e5e4a09db85a72f6512ac346d48d0 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 11:25:10 -0400 Subject: [PATCH 13/27] Use AtomicLongFieldUpdater for the producer cursor Saves one wrapper allocation per ring instance: producerCursor becomes a volatile long on the instance, paired with a static AtomicLongFieldUpdater for CAS. Same memory ordering as the prior AtomicLong (volatile field + field-updater CAS), but no per-instance wrapper object. publishedSequences stays AtomicLongArray -- the field updater approach doesn't apply to array element access. consumerCursor was already a plain volatile long with no wrapper. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/util/concurrent/MpscRingBuffer.java | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java index 121c5717a1d..77cc8d08435 100644 --- a/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java +++ b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java @@ -1,8 +1,8 @@ package datadog.trace.util.concurrent; import datadog.trace.api.function.TriConsumer; -import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLongArray; +import java.util.concurrent.atomic.AtomicLongFieldUpdater; import java.util.function.BiConsumer; import java.util.function.Consumer; import java.util.function.Supplier; @@ -77,12 +77,17 @@ public final class MpscRingBuffer { private final int capacity; private final int mask; - /** Next sequence to claim. Producers increment via CAS. */ - private final AtomicLong producerCursor = new AtomicLong(-1L); + /** Next sequence to claim. Producers increment via CAS through {@link #PRODUCER_CURSOR}. */ + @SuppressWarnings("unused") // accessed via PRODUCER_CURSOR field updater + private volatile long producerCursor = -1L; + + @SuppressWarnings("rawtypes") // AtomicLongFieldUpdater cannot reference a parameterized type + private static final AtomicLongFieldUpdater PRODUCER_CURSOR = + AtomicLongFieldUpdater.newUpdater(MpscRingBuffer.class, "producerCursor"); /** * Highest sequence consumed. Volatile so producers see space freed up; only the consumer thread - * writes to it, so a plain field with a manual volatile-store would also work. + * writes to it. */ private volatile long consumerCursor = -1L; @@ -112,7 +117,7 @@ public int capacity() { /** Approximate count of slots holding unread items. May briefly exceed capacity under race. */ public int size() { - final long p = producerCursor.get(); + final long p = producerCursor; final long c = consumerCursor; final long diff = p - c; if (diff <= 0) return 0; @@ -121,7 +126,7 @@ public int size() { } public boolean isEmpty() { - return producerCursor.get() == consumerCursor; + return producerCursor == consumerCursor; } /** {@code true} if the slot was filled and published; {@code false} if the ring is full. */ @@ -215,7 +220,7 @@ public int drain( /** CAS-claim the next sequence, or return {@code -1} if the ring is full. */ private long claim() { while (true) { - final long current = producerCursor.get(); + final long current = producerCursor; // Stale read of consumerCursor is fine: a false "full" reading just causes a drop, and // a real "full" reading is correctly identified because consumerCursor only advances. final long consumed = consumerCursor; @@ -223,7 +228,7 @@ private long claim() { return -1L; } final long next = current + 1L; - if (producerCursor.compareAndSet(current, next)) { + if (PRODUCER_CURSOR.compareAndSet(this, current, next)) { return next; } // CAS failure -> another producer claimed; retry. From e8ead019ba9cfd9f88fc07284654690c9123701d Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 13:44:44 -0400 Subject: [PATCH 14/27] Pad MpscRingBuffer cursors and stride publishedSequences Two related cache-line fixes for the producer hot path under heavy contention: 1. Stride publishedSequences by 8 longs (one cache line). Without this, adjacent logical slots share cache lines and concurrent producers writing nearby sequences ping-pong the same line between cores. The array grows by 8x but the upfront cost is bounded by the ring's capacity (e.g. 8 MB at the CSS default cap=131072). 2. Cache-line-pad the producerCursor and consumerCursor against each other using the standard Disruptor class-hierarchy pattern. Every consumer-side advance of consumerCursor would otherwise invalidate the line producers read for producerCursor (and vice versa). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/util/concurrent/MpscRingBuffer.java | 82 +++++++++++++------ 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java index 77cc8d08435..b84ee54922f 100644 --- a/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java +++ b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java @@ -7,6 +7,35 @@ import java.util.function.Consumer; import java.util.function.Supplier; +// Disruptor-style cache-line padding around the cursors. The two cursors live on different cache +// lines so consumer-side writes to consumerCursor don't invalidate the line producers read for +// producerCursor (and vice versa). Each padding class declares 7 longs (56 bytes); combined with +// the cursor's own 8 bytes plus the JVM object header, each cursor + its surrounding pad fills +// a 64-byte cache line. The HotSpot field-layout strategy preserves the declaration order across +// the class hierarchy, so this pattern is reliable on all production JVMs we target. + +abstract class MpscRingBufferPad0 { + long p01, p02, p03, p04, p05, p06, p07; +} + +abstract class MpscRingBufferProducerCursor extends MpscRingBufferPad0 { + /** Next sequence to claim. Producers increment via CAS through {@code PRODUCER_CURSOR}. */ + volatile long producerCursor = -1L; +} + +abstract class MpscRingBufferPad1 extends MpscRingBufferProducerCursor { + long p11, p12, p13, p14, p15, p16, p17; +} + +abstract class MpscRingBufferConsumerCursor extends MpscRingBufferPad1 { + /** Highest sequence consumed. Only the consumer thread writes; producers read volatile. */ + volatile long consumerCursor = -1L; +} + +abstract class MpscRingBufferPad2 extends MpscRingBufferConsumerCursor { + long p21, p22, p23, p24, p25, p26, p27; +} + /** * Bounded multi-producer / single-consumer ring buffer of pre-allocated {@code T} instances. * @@ -60,37 +89,38 @@ * *

Producer cursor is CAS-claimed; visibility of a claimed slot to the consumer is gated by a * per-slot publication-sequence array. Consumer cursor is updated with a volatile write so - * producers observe space being freed. + * producers observe space being freed. Cursors are cache-line-padded against each other (see the + * {@code MpscRingBufferPad*} hierarchy at the top of this file) and the publication-sequence array + * is strided so each logical entry occupies a distinct cache line. */ -public final class MpscRingBuffer { +public final class MpscRingBuffer extends MpscRingBufferPad2 { + + /** + * Cache line size in {@code long}-units. 64-byte cache lines on every common CPU we ship to (x86, + * ARM); 8 bytes per long. Each logical slot in {@link #publishedSequences} is spread out by this + * stride so adjacent logical sequences don't share a cache line and don't ping-pong between + * producer cores under heavy contention. + */ + private static final int CACHE_LINE_LONGS = 8; + + @SuppressWarnings("rawtypes") // AtomicLongFieldUpdater can't take a parameterized class + private static final AtomicLongFieldUpdater PRODUCER_CURSOR = + AtomicLongFieldUpdater.newUpdater(MpscRingBufferProducerCursor.class, "producerCursor"); private final T[] slots; /** - * Per-slot publication sequence. Producers write the claimed sequence here as the last step of a - * publish (release write via {@link AtomicLongArray#set}); the consumer reads it (acquire read) - * to determine whether the slot at the next position is ready. A slot is considered published for - * sequence {@code s} iff {@code sequences[s & mask] == s}. + * Per-slot publication sequence, strided by {@link #CACHE_LINE_LONGS} to avoid false sharing. + * Producers write the claimed sequence here as the last step of a publish (release write via + * {@link AtomicLongArray#set}); the consumer reads it (acquire read) to determine whether the + * slot at the next position is ready. A slot is considered published for sequence {@code s} iff + * {@code publishedSequences[(s & mask) * CACHE_LINE_LONGS] == s}. */ private final AtomicLongArray publishedSequences; private final int capacity; private final int mask; - /** Next sequence to claim. Producers increment via CAS through {@link #PRODUCER_CURSOR}. */ - @SuppressWarnings("unused") // accessed via PRODUCER_CURSOR field updater - private volatile long producerCursor = -1L; - - @SuppressWarnings("rawtypes") // AtomicLongFieldUpdater cannot reference a parameterized type - private static final AtomicLongFieldUpdater PRODUCER_CURSOR = - AtomicLongFieldUpdater.newUpdater(MpscRingBuffer.class, "producerCursor"); - - /** - * Highest sequence consumed. Volatile so producers see space freed up; only the consumer thread - * writes to it. - */ - private volatile long consumerCursor = -1L; - @SuppressWarnings("unchecked") public MpscRingBuffer(final Supplier factory, final int capacityHint) { if (capacityHint < 1) { @@ -102,12 +132,12 @@ public MpscRingBuffer(final Supplier factory, final int capacityHint) { } this.mask = capacity - 1; this.slots = (T[]) new Object[capacity]; - this.publishedSequences = new AtomicLongArray(capacity); + this.publishedSequences = new AtomicLongArray(capacity * CACHE_LINE_LONGS); for (int i = 0; i < capacity; i++) { slots[i] = factory.get(); // Initial: sentinel "no sequence published here yet" -- anything < 0 works since // sequences are 0-based and monotonically increasing. - publishedSequences.set(i, Long.MIN_VALUE); + publishedSequences.set(i * CACHE_LINE_LONGS, Long.MIN_VALUE); } } @@ -175,7 +205,7 @@ public int drain(final Consumer handler) { while (true) { final long nextSeq = cursor + 1L; final int idx = (int) (nextSeq & mask); - if (publishedSequences.get(idx) != nextSeq) break; + if (publishedSequences.get(idx * CACHE_LINE_LONGS) != nextSeq) break; handler.accept(slots[idx]); cursor = nextSeq; count++; @@ -190,7 +220,7 @@ public int drain(final C context, final BiConsumer han while (true) { final long nextSeq = cursor + 1L; final int idx = (int) (nextSeq & mask); - if (publishedSequences.get(idx) != nextSeq) break; + if (publishedSequences.get(idx * CACHE_LINE_LONGS) != nextSeq) break; handler.accept(context, slots[idx]); cursor = nextSeq; count++; @@ -208,7 +238,7 @@ public int drain( while (true) { final long nextSeq = cursor + 1L; final int idx = (int) (nextSeq & mask); - if (publishedSequences.get(idx) != nextSeq) break; + if (publishedSequences.get(idx * CACHE_LINE_LONGS) != nextSeq) break; handler.accept(context1, context2, slots[idx]); cursor = nextSeq; count++; @@ -237,7 +267,7 @@ private long claim() { /** Mark sequence {@code seq} as published. Release semantics via {@link AtomicLongArray#set}. */ private void publish(final long seq) { - publishedSequences.set((int) (seq & mask), seq); + publishedSequences.set(((int) (seq & mask)) * CACHE_LINE_LONGS, seq); } private static int nextPowerOfTwo(final int n) { From ef046be94ed404d7a80423efbf87c0f62c14dfc6 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 13:47:00 -0400 Subject: [PATCH 15/27] Shrink tracer.metrics defaults at Xmx<128m for the ring buffer MpscRingBuffer pre-allocates one SpanSnapshot per slot at construction. At the default pending=2048 (logical) * 64 (LEGACY_BATCH_SIZE) = 131072 slots * ~120B = ~15 MB resident before any traffic arrives. At Xmx<128m that's 25%+ of the heap up front, starving Spring Boot / Tomcat and sending the JVM into a Full-GC death spiral (observed catastrophically at Xmx64m petclinic: 0 throughput, JVM unresponsive to readiness probes). Tighter defaults at <128m heap: maxAggregates 2048 -> 256, maxPending 2048 -> 64 (logical 64 * 64 batch = 4096 slots = ~500 KB upfront). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/main/java/datadog/trace/api/Config.java | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index 07f10672273..56b72ff24b6 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -2184,7 +2184,19 @@ private Config(final ConfigProvider configProvider, final InstrumenterConfig ins configProvider.getBoolean(TRACE_STATS_COMPUTATION_IGNORE_AGENT_VERSION, false); tracerMetricsBufferingEnabled = configProvider.getBoolean(TRACER_METRICS_BUFFERING_ENABLED, false); - tracerMetricsMaxAggregates = configProvider.getInteger(TRACER_METRICS_MAX_AGGREGATES, 2048); + // The MpscRingBuffer pre-allocates one SpanSnapshot per slot at construction; at the default + // pending=2048 (logical) * 64 (LEGACY_BATCH_SIZE) = 131072 slots * ~120 bytes/SpanSnapshot, + // that's a ~15 MB resident footprint before any traffic arrives. At Xmx<128 MB this + // consumes 25%+ of the heap up front and starves Spring Boot / Tomcat, sending the JVM into + // a Full-GC death spiral (observed at Xmx64m petclinic). Shrink the defaults at tight heap + // so the upfront ring footprint stays under 1 MB. + final boolean tightHeap = Runtime.getRuntime().maxMemory() < 128L * 1024 * 1024; + final int defaultMaxAggregates = tightHeap ? 256 : 2048; + // 64 logical * 64 LEGACY_BATCH_SIZE = 4096 slots -> ~500 KB upfront SpanSnapshot footprint. + final int defaultMaxPending = tightHeap ? 64 : 2048; + + tracerMetricsMaxAggregates = + configProvider.getInteger(TRACER_METRICS_MAX_AGGREGATES, defaultMaxAggregates); /* * TRACER_METRICS_MAX_PENDING historically counted conflating Batch slots (~64 spans per batch * via Batch.MAX_BATCH_SIZE). The inbox now holds 1 SpanSnapshot per metrics-eligible span, so @@ -2199,7 +2211,8 @@ private Config(final ConfigProvider configProvider, final InstrumenterConfig ins * see java.util.ArraysSupport.SOFT_MAX_ARRAY_LENGTH for the same convention. */ long requestedMaxPending = - (long) configProvider.getInteger(TRACER_METRICS_MAX_PENDING, 2048) * LEGACY_BATCH_SIZE; + (long) configProvider.getInteger(TRACER_METRICS_MAX_PENDING, defaultMaxPending) + * LEGACY_BATCH_SIZE; tracerMetricsMaxPending = (int) Math.min(requestedMaxPending, MAX_SAFE_ARRAY_SIZE); reportHostName = From 525599ebb3e93670e0e9d15fb81b9a9fdf44558a Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 13:50:44 -0400 Subject: [PATCH 16/27] Cut default tracer.metrics.max.pending from 2048 to 128 The historical default (2048 logical * 64 LEGACY_BATCH_SIZE = 131072 slots) was sized for the on-demand batch-allocation model where slot memory was only realized under burst load. MpscRingBuffer pre-allocates every slot at construction, so 131072 means a ~15 MB upfront pin -- vs the sub-second of consumer-stall buffer it's actually expected to absorb (~80 ms of GC pause at 10K spans/s ~ 800 slots worth). New default: 128 logical * 64 = 8192 slots = ~1 MB upfront, ~0.8 s of buffer at typical load. Tight-heap default already at 64 logical (~500 KB / ~0.4 s). Explicit customer overrides on TRACER_METRICS_MAX_PENDING are unaffected -- the LEGACY_BATCH_SIZE multiplier still applies to them. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../main/java/datadog/trace/api/Config.java | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index 56b72ff24b6..b90fcb0fd87 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -2184,16 +2184,20 @@ private Config(final ConfigProvider configProvider, final InstrumenterConfig ins configProvider.getBoolean(TRACE_STATS_COMPUTATION_IGNORE_AGENT_VERSION, false); tracerMetricsBufferingEnabled = configProvider.getBoolean(TRACER_METRICS_BUFFERING_ENABLED, false); - // The MpscRingBuffer pre-allocates one SpanSnapshot per slot at construction; at the default - // pending=2048 (logical) * 64 (LEGACY_BATCH_SIZE) = 131072 slots * ~120 bytes/SpanSnapshot, - // that's a ~15 MB resident footprint before any traffic arrives. At Xmx<128 MB this - // consumes 25%+ of the heap up front and starves Spring Boot / Tomcat, sending the JVM into - // a Full-GC death spiral (observed at Xmx64m petclinic). Shrink the defaults at tight heap - // so the upfront ring footprint stays under 1 MB. + // The MpscRingBuffer pre-allocates one SpanSnapshot per slot at construction, so capacity + // becomes a resident-RSS cost. The old TRACER_METRICS_MAX_PENDING default of 2048 (logical) + // was sized for the previous on-demand batch-allocation model; with the ring it would pin + // ~15 MB upfront for what only needs to absorb sub-second consumer stalls. Cut the default + // accordingly: + // - normal heap: 128 logical * 64 LEGACY_BATCH_SIZE = 8192 slots, ~1 MB upfront, ~0.8 s + // of buffer at 10K spans/s. Plenty of margin for GC pauses. + // - tight heap (<128 MB): 64 logical * 64 = 4096 slots, ~500 KB upfront. Observed + // catastrophic at Xmx64m with the prior 131072-slot default (Full-GC death spiral). + // Customers who explicitly configured TRACER_METRICS_MAX_PENDING keep their value (the + // LEGACY_BATCH_SIZE multiplier still applies to it) -- only the implicit default shrinks. final boolean tightHeap = Runtime.getRuntime().maxMemory() < 128L * 1024 * 1024; final int defaultMaxAggregates = tightHeap ? 256 : 2048; - // 64 logical * 64 LEGACY_BATCH_SIZE = 4096 slots -> ~500 KB upfront SpanSnapshot footprint. - final int defaultMaxPending = tightHeap ? 64 : 2048; + final int defaultMaxPending = tightHeap ? 64 : 128; tracerMetricsMaxAggregates = configProvider.getInteger(TRACER_METRICS_MAX_AGGREGATES, defaultMaxAggregates); From c31715f49cc2362ac1c88f64e19b3348fef04212 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 15:06:52 -0400 Subject: [PATCH 17/27] Reuse peerTagValues array across publishes via slot-owned scratch Producer-side fillSlot now treats slot.peerTagValues as a reusable scratch buffer owned by the ring: it's allocated only on first use or when the peer-tag schema's length changes. Subsequent tag-firing publishes on the same slot write into the existing array. When a publish has no peer tags, peerTagSchema is cleared but the scratch array is left in place for future reuse. For petclinic with ~50 unique span signatures and JDBC traffic, this eliminates ~30K String[] allocations/sec from the producer hot path after warmup. Consumer side: - AggregateEntry constructor copies peerTagValues by value at insert (Arrays.copyOf) so the entry's identity doesn't drift when the slot is reclaimed. Cache hits on findOrInsert pay no allocation. - AggregateEntry.matches() and hashOf() both gate the peerTagValues comparison/hash on peerTagSchema -- stale scratch carried across a no-tag publish is inert. Tests cover the slot-reuse scenario (stale scratch must not break match or hash) and the producer-array-reuse semantics (entry must snapshot peer tag values, not retain a live reference). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/common/metrics/AggregateEntry.java | 22 ++++++- .../metrics/ConflatingMetricsAggregator.java | 53 +++++++-------- .../common/metrics/AggregateTableTest.java | 65 +++++++++++++++++++ 3 files changed, 108 insertions(+), 32 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index 8d6dc6b72d0..6b05ba46974 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -157,7 +157,11 @@ final class AggregateEntry extends Hashtable.Entry { this.synthetic = s.synthetic; this.traceRoot = s.traceRoot; this.peerTagNames = s.peerTagSchema == null ? null : s.peerTagSchema.names; - this.peerTagValues = s.peerTagValues; + // The slot's peerTagValues is a reusable scratch buffer owned by the ring -- the producer + // overwrites it on every claim of this slot. Snapshot it here by value so this entry's + // identity doesn't drift when the slot is reclaimed. + this.peerTagValues = + s.peerTagSchema == null ? null : Arrays.copyOf(s.peerTagValues, s.peerTagValues.length); this.peerTags = materializePeerTags(this.peerTagNames, this.peerTagValues); } @@ -242,6 +246,11 @@ void clear() { boolean matches(SpanSnapshot s) { String[] snapshotNames = s.peerTagSchema == null ? null : s.peerTagSchema.names; + // peerTagValues is meaningful only when peerTagNames is non-null. When the snapshot's slot + // had no peer tags for this publish (peerTagSchema=null), s.peerTagValues may still hold + // stale scratch from a prior tag-firing publish on the same slot -- skip the values check + // in that case. The names check above already rejects entries whose peer-tag presence + // doesn't match the snapshot. return httpStatusCode == s.httpStatusCode && synthetic == s.synthetic && traceRoot == s.traceRoot @@ -252,7 +261,7 @@ && contentEquals(serviceSource, s.serviceNameSource) && contentEquals(type, s.spanType) && contentEquals(spanKind, s.spanKind) && Arrays.equals(peerTagNames, snapshotNames) - && Arrays.equals(peerTagValues, s.peerTagValues) + && (peerTagNames == null || Arrays.equals(peerTagValues, s.peerTagValues)) && contentEquals(httpMethod, s.httpMethod) && contentEquals(httpEndpoint, s.httpEndpoint) && contentEquals(grpcStatusCode, s.grpcStatusCode); @@ -292,10 +301,17 @@ static long hashOf(SpanSnapshot s) { // Object[].hashCode is identity-based, which would let two snapshots with content-equal but // distinct PeerTagSchema instances hash to different buckets. Null inputs hash to 0 here, // distinct from {@code Arrays.hashCode(empty)} = 1 or any non-empty array. + // + // peerTagValues is gated by peerTagSchema: the slot's peerTagValues is a reusable scratch + // buffer that may carry stale contents from a prior tag-firing publish when this publish had + // no peer tags. Hash it only when the schema says it's meaningful, matching the matches() + // contract. h = LongHashingUtils.addToHash( h, s.peerTagSchema == null ? 0 : Arrays.hashCode(s.peerTagSchema.names)); - h = LongHashingUtils.addToHash(h, Arrays.hashCode(s.peerTagValues)); + h = + LongHashingUtils.addToHash( + h, s.peerTagSchema == null ? 0 : Arrays.hashCode(s.peerTagValues)); h = LongHashingUtils.addToHash(h, s.httpMethod); h = LongHashingUtils.addToHash(h, s.httpEndpoint); h = LongHashingUtils.addToHash(h, s.grpcStatusCode); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index b008cef3d85..713f1f97141 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -373,13 +373,31 @@ private void fillSlot(CoreSpan span, Boolean isTopLevelBoxed, SpanSnapshot sl long tagAndDuration = span.getDurationNano() | (error ? ERROR_TAG : 0L) | (isTopLevel ? TOP_LEVEL_TAG : 0L); + // Fill the slot's reusable peerTagValues scratch in place. The slot owns its array across + // publishes; we only allocate when the slot has none or the schema's length changed. When + // no tags fire, leave the scratch in place and clear peerTagSchema -- consumers gate on + // peerTagSchema (see AggregateEntry.matches / hashOf), so stale scratch contents are inert. PeerTagSchema peerTagSchema = peerTagSchemaFor(span); - String[] peerTagValues = - peerTagSchema == null ? null : capturePeerTagValues(span, peerTagSchema); - if (peerTagValues == null) { - // No tags fired -- drop the schema reference so the consumer doesn't bother iterating an - // all-null array. - peerTagSchema = null; + if (peerTagSchema != null) { + final String[] names = peerTagSchema.names; + final int n = names.length; + String[] valuesArr = slot.peerTagValues; + if (valuesArr == null || valuesArr.length != n) { + valuesArr = new String[n]; + slot.peerTagValues = valuesArr; + } + boolean anyHit = false; + for (int i = 0; i < n; i++) { + Object v = span.unsafeGetTag(names[i]); + String tag = v == null ? null : v.toString(); + valuesArr[i] = tag; + anyHit |= (tag != null); + } + if (!anyHit) { + // Schema fired no tags -- treat like the no-peer-tag case. Keep valuesArr alive on the + // slot for the next publish; nullifying peerTagSchema makes downstream gates skip it. + peerTagSchema = null; + } } slot.resourceName = span.getResourceName(); @@ -392,7 +410,6 @@ private void fillSlot(CoreSpan span, Boolean isTopLevelBoxed, SpanSnapshot sl slot.traceRoot = span.getParentId() == 0; slot.spanKind = spanKind; slot.peerTagSchema = peerTagSchema; - slot.peerTagValues = peerTagValues; slot.httpMethod = httpMethod; slot.httpEndpoint = httpEndpoint; slot.grpcStatusCode = grpcStatusCode; @@ -481,28 +498,6 @@ private void reconcilePeerTagSchema() { } } - /** - * Captures the span's peer-tag values into a {@code String[]} parallel to {@code schema.names}. - * Slots remain {@code null} for tags the span didn't set; the array itself is lazily allocated on - * the first hit so spans that fire no peer tags pay zero allocation. Returns {@code null} when - * none of the configured peer tags are set on the span. - */ - private static String[] capturePeerTagValues(CoreSpan span, PeerTagSchema schema) { - String[] names = schema.names; - int n = names.length; - String[] values = null; - for (int i = 0; i < n; i++) { - Object v = span.unsafeGetTag(names[i]); - if (v != null) { - if (values == null) { - values = new String[n]; - } - values[i] = v.toString(); - } - } - return values; - } - private static boolean isSynthetic(CoreSpan span) { return span.getOrigin() != null && SYNTHETICS_ORIGIN.equals(span.getOrigin().toString()); } diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java index 618ead2ab43..0f12e9b6a3a 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java @@ -266,6 +266,71 @@ void nullServiceAndSpanKindDoNotNpeAndCollapseWithEmpty() { assertEquals(0, first.getSpanKind().length(), "null spanKind should canonicalize to EMPTY"); } + @Test + void noPeerTagSnapshotMatchesEntryEvenWithStaleScratchPeerTagValues() { + // Models the ring-buffer slot-reuse pattern: a slot whose previous publish fired peer tags + // ends up reused for a span with no peer tags. The producer clears peerTagSchema but leaves + // peerTagValues populated (the array is the slot's reusable scratch). The consumer must + // gate on peerTagSchema and ignore stale scratch values for both hashOf and matches -- + // otherwise the slot's stale state would steer the snapshot to a different bucket or fail + // the field-by-field comparison, producing a duplicate aggregate. + AggregateTable table = new AggregateTable(8); + SpanSnapshot cleanNoPeerTags = snapshot("svc", "op", "internal"); + AggregateEntry inserted = table.findOrInsert(cleanNoPeerTags); + + SpanSnapshot withStaleScratch = snapshot("svc", "op", "internal"); + // Simulate stale scratch: same logical contents (no peer tags -> schema null), but the + // values array carries leftover data the slot's prior tag-firing publish wrote into it. + withStaleScratch.peerTagSchema = null; + withStaleScratch.peerTagValues = new String[] {"leftover.host", "leftover.service"}; + + AggregateEntry hit = table.findOrInsert(withStaleScratch); + assertSame(inserted, hit, "stale peerTagValues must be ignored when peerTagSchema is null"); + assertEquals(1, table.size()); + } + + @Test + void aggregateEntryCopiesPeerTagValuesSoSlotReuseDoesntCorruptIt() { + // The AggregateEntry must snapshot peerTagValues by value at insert time -- otherwise + // when the producer reuses the slot's array on the next publish, the entry's identity + // would silently change. Verify by mutating the source array after insert and confirming + // the entry still matches its original-content snapshot. + AggregateTable table = new AggregateTable(8); + String[] values = new String[] {"host1", "svc1"}; + SpanSnapshot s1 = snapshotWithPeerTags("svc", "op", "client", values); + AggregateEntry inserted = table.findOrInsert(s1); + + // Producer "reuses" the array: same reference, new contents. + values[0] = "host2"; + values[1] = "svc2"; + + // A FRESH snapshot with the original peer tag values should still match the entry. + SpanSnapshot s2 = snapshotWithPeerTags("svc", "op", "client", new String[] {"host1", "svc1"}); + AggregateEntry hit = table.findOrInsert(s2); + assertSame(inserted, hit, "entry's peer tag values must be a snapshot, not a live ref"); + } + + private static SpanSnapshot snapshotWithPeerTags( + String service, String operation, String spanKind, String[] peerTagValues) { + PeerTagSchema schema = PeerTagSchema.testSchema(new String[] {"peer.host", "peer.service"}); + return new SpanSnapshot( + "resource", + service, + operation, + null, + "web", + (short) 200, + false, + true, + spanKind, + schema, + peerTagValues, + null, + null, + null, + 0L); + } + private static SpanSnapshot nullServiceKindSnapshot(String service, String spanKind) { return new SpanSnapshot( "resource", From 7e6f49713b17ea9e3dc53ea2496610e392f1a232 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 16:17:22 -0400 Subject: [PATCH 18/27] Add MpscRingBuffer.tryClaim(n) batch-claim API A single CAS claims n contiguous slots, returning a Batch handle that the caller fills via fillAndPublish(slot...). Designed for callers whose work has a natural batch boundary (e.g. CSS publishing a trace's worth of metrics-eligible spans in one shot): cuts producer-cursor contention from O(N) CASes to O(1) per call. All-or-nothing: tryClaim(n) returns null if the ring can't fit the whole batch. The Batch is single-threaded (owned by the claiming thread), short-lived (scoped to one publish call), and has no thread-shutdown hazard -- the batch is fully consumed before returning. Filler-throw safety matches the existing tryWrite contract: the slot is published in a finally block so the consumer can advance, and the batch's published counter increments either way. Tests cover: requested size, capacity rejection, all-or-nothing, three filler overloads, over-publish IllegalStateException, throw recovery, and 8-producer concurrency (200 batches/thread x 16 size = 25600 items, single consumer sees every value exactly once). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/util/concurrent/MpscRingBuffer.java | 106 ++++++++++++ .../util/concurrent/MpscRingBufferTest.java | 162 ++++++++++++++++++ 2 files changed, 268 insertions(+) diff --git a/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java index b84ee54922f..cce1b46aa6f 100644 --- a/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java +++ b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java @@ -247,6 +247,40 @@ public int drain( return count; } + /** + * Try to claim a contiguous range of {@code n} sequences in a single CAS. Returns {@code null} if + * the ring doesn't have room for the whole batch -- the caller treats that as "drop all {@code + * n}", which is the natural shape for callers that batch by a higher-level unit (e.g. one CSS + * publish per completed trace). When the caller has a list of N items to write, this amortizes + * producer-cursor contention from O(N) CASes to O(1) per call. + * + *

The returned {@link Batch} must be filled via {@link Batch#fillAndPublish} exactly {@code n} + * times. Under-publishing leaves the ring stuck at the unfilled sequence -- the consumer waits + * there forever. Over-publishing throws {@link IllegalStateException}. + * + * @throws IllegalArgumentException if {@code n < 1} + */ + public Batch tryClaim(final int n) { + if (n < 1) { + throw new IllegalArgumentException("n must be >= 1, got " + n); + } + while (true) { + final long current = producerCursor; + // Stale read of consumerCursor is fine: a false "full" reading just causes a drop, and a + // real one is correctly identified because consumerCursor only advances. + final long consumed = consumerCursor; + final long next = current + n; + if (next - consumed > capacity) { + return null; + } + if (PRODUCER_CURSOR.compareAndSet(this, current, next)) { + // Claimed sequences [current + 1, next] inclusive (== n sequences total). + return new Batch(current + 1L, n); + } + // CAS failure -> another producer claimed; retry. + } + } + /** CAS-claim the next sequence, or return {@code -1} if the ring is full. */ private long claim() { while (true) { @@ -265,6 +299,78 @@ private long claim() { } } + /** + * Handle returned by {@link MpscRingBuffer#tryClaim}. Holds a contiguous range of pre-claimed + * sequences belonging to the producer thread that called {@code tryClaim}; the caller must fill + * and publish each via {@link #fillAndPublish}. + * + *

Not thread-safe -- the producer thread owns it for the lifetime of the call. Do not + * share across threads. + */ + public final class Batch { + private final long startSeq; + private final int size; + private int published; + + Batch(final long startSeq, final int size) { + this.startSeq = startSeq; + this.size = size; + } + + /** Total slots in this batch (the {@code n} passed to {@code tryClaim}). */ + public int size() { + return size; + } + + /** Slots not yet filled. */ + public int remaining() { + return size - published; + } + + public void fillAndPublish(final Consumer filler) { + final long seq = nextSeq(); + final int idx = (int) (seq & mask); + try { + filler.accept(slots[idx]); + } finally { + publishedSequences.set(idx * CACHE_LINE_LONGS, seq); + } + } + + public void fillAndPublish(final C context, final BiConsumer filler) { + final long seq = nextSeq(); + final int idx = (int) (seq & mask); + try { + filler.accept(context, slots[idx]); + } finally { + publishedSequences.set(idx * CACHE_LINE_LONGS, seq); + } + } + + public void fillAndPublish( + final C1 context1, + final C2 context2, + final TriConsumer filler) { + final long seq = nextSeq(); + final int idx = (int) (seq & mask); + try { + filler.accept(context1, context2, slots[idx]); + } finally { + publishedSequences.set(idx * CACHE_LINE_LONGS, seq); + } + } + + private long nextSeq() { + if (published >= size) { + throw new IllegalStateException( + "Batch over-published: size=" + size + " published=" + published); + } + final long seq = startSeq + published; + published++; + return seq; + } + } + /** Mark sequence {@code seq} as published. Release semantics via {@link AtomicLongArray#set}. */ private void publish(final long seq) { publishedSequences.set(((int) (seq & mask)) * CACHE_LINE_LONGS, seq); diff --git a/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java b/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java index d48c872883f..48ab2ea4415 100644 --- a/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java +++ b/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java @@ -2,6 +2,8 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertSame; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -276,4 +278,164 @@ void throwingFillerStillPublishesSoConsumerDoesntHang() { assertEquals(3, drained, "consumer must advance past the throwing slot"); assertEquals(Arrays.asList(1, 2, 3), seen, "throwing slot keeps whatever filler had written"); } + + // ============ Batch claim (tryClaim) ============ + + @Test + void tryClaimReturnsBatchOfRequestedSize() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + MpscRingBuffer.Batch batch = ring.tryClaim(3); + + assertNotNull(batch); + assertEquals(3, batch.size()); + assertEquals(3, batch.remaining()); + } + + @Test + void tryClaimRejectsZeroOrNegative() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + assertThrows(IllegalArgumentException.class, () -> ring.tryClaim(0)); + assertThrows(IllegalArgumentException.class, () -> ring.tryClaim(-1)); + } + + @Test + void tryClaimReturnsNullWhenRingCantFitBatch() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + assertNotNull(ring.tryClaim(3)); + // Only 1 slot left; claiming 2 must fail wholesale. + assertNull(ring.tryClaim(2), "all-or-nothing: partial batches are not allowed"); + // But one more slot does fit. + assertNotNull(ring.tryClaim(1)); + // Now full. + assertNull(ring.tryClaim(1)); + } + + @Test + void tryClaimFillAndPublishDeliversAllToDrain() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + MpscRingBuffer.Batch batch = ring.tryClaim(5); + + for (int i = 0; i < 5; i++) { + final int v = i; + batch.fillAndPublish(s -> s.value = v); + } + assertEquals(0, batch.remaining()); + + List seen = new ArrayList<>(); + int drained = ring.drain(s -> seen.add(s.value)); + assertEquals(5, drained); + assertEquals(Arrays.asList(0, 1, 2, 3, 4), seen, "batch publishes in order"); + } + + @Test + void overPublishingBatchThrows() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + MpscRingBuffer.Batch batch = ring.tryClaim(2); + + batch.fillAndPublish(s -> s.value = 1); + batch.fillAndPublish(s -> s.value = 2); + assertThrows(IllegalStateException.class, () -> batch.fillAndPublish(s -> s.value = 3)); + } + + @Test + void batchSupportsContextFillers() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + MpscRingBuffer.Batch batch = ring.tryClaim(3); + + BiConsumer oneCtx = (v, s) -> s.value = v; + TriConsumer twoCtx = + (v, t, s) -> { + s.value = v; + s.tag = t; + }; + + batch.fillAndPublish(s -> s.value = 1); + batch.fillAndPublish(2, oneCtx); + batch.fillAndPublish(3, "three", twoCtx); + + List seen = new ArrayList<>(); + ring.drain(s -> seen.add(s.value + "/" + s.tag)); + assertEquals(Arrays.asList("1/null", "2/null", "3/three"), seen); + } + + @Test + void batchFillerThrowStillPublishesAndAdvances() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + MpscRingBuffer.Batch batch = ring.tryClaim(3); + + batch.fillAndPublish(s -> s.value = 1); + RuntimeException boom = new RuntimeException("boom"); + assertThrows( + RuntimeException.class, + () -> + batch.fillAndPublish( + s -> { + s.value = 2; + throw boom; + })); + // The throwing slot's sequence has already been consumed; published counter advanced. + assertEquals(1, batch.remaining(), "throwing slot still counts as published"); + batch.fillAndPublish(s -> s.value = 3); + + List seen = new ArrayList<>(); + int drained = ring.drain(s -> seen.add(s.value)); + assertEquals(3, drained); + assertEquals(Arrays.asList(1, 2, 3), seen); + } + + @Test + void concurrentBatchClaimsAreOrderedAndDontInterleave() throws InterruptedException { + final int producers = 8; + final int batchesPerProducer = 200; + final int batchSize = 16; + final int total = producers * batchesPerProducer * batchSize; + + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 256); + ExecutorService pool = Executors.newFixedThreadPool(producers); + AtomicInteger writes = new AtomicInteger(); + CountDownLatch start = new CountDownLatch(1); + + for (int p = 0; p < producers; p++) { + final int base = p * batchesPerProducer * batchSize; + pool.submit( + () -> { + try { + start.await(); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + return; + } + for (int b = 0; b < batchesPerProducer; b++) { + MpscRingBuffer.Batch batch; + while ((batch = ring.tryClaim(batchSize)) == null) { + Thread.yield(); + } + for (int i = 0; i < batchSize; i++) { + final int v = base + b * batchSize + i; + batch.fillAndPublish(s -> s.value = v); + } + writes.addAndGet(batchSize); + } + }); + } + + Set seen = new HashSet<>(total); + Thread consumer = + new Thread( + () -> { + while (seen.size() < total) { + if (ring.drain((Slot s) -> seen.add(s.value)) == 0) Thread.yield(); + } + }, + "ring-batch-consumer"); + consumer.start(); + + start.countDown(); + pool.shutdown(); + assertTrue(pool.awaitTermination(30, TimeUnit.SECONDS), "producers timed out"); + consumer.join(30_000); + assertFalse(consumer.isAlive(), "consumer timed out"); + assertEquals(total, writes.get()); + assertEquals(total, seen.size(), "consumer must see every value exactly once"); + } } From 3324952112502de8d6caa10230fcee9ef5b7c59a Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 16:19:51 -0400 Subject: [PATCH 19/27] Use Batch claim for CSS publish(trace) Leverage PendingTrace's natural batching: the producer pre-counts metrics-eligible spans in trace order (with the ignored-resource short-circuit), then claims all of their ring slots in a single CAS via MpscRingBuffer.tryClaim(n). A second pass fills the batch. Cuts producer-cursor contention from O(N) CASes per trace to O(1). For typical traces with 3-5 metrics-eligible spans, that's a 3-5x reduction in atomic ops on the hot field -- bigger for traces with many spans. All-or-nothing semantics: a trace either gets its metrics queued or doesn't. Slight semantic change from before (previously, a partially full inbox could publish some of a trace's spans before dropping the rest); the new behavior is cleaner for aggregation since partial-trace metrics are misleading. No ThreadLocal, no claim leak: the batch is fully consumed before publish() returns. forceKeep is now computed during the count pass (any-span-error) so the publish behavior on inbox-full mirrors the prior force-keep semantics for callers. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../metrics/ConflatingMetricsAggregator.java | 75 +++++++++++-------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 713f1f97141..13498f62f09 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -293,24 +293,54 @@ public Future forceReport() { @Override public boolean publish(List> trace) { + if (!features.supportsMetrics()) { + return false; + } + + // First pass: count metrics-eligible spans, detect the ignored-resource short-circuit, and + // compute forceKeep (any-error). Doing this pre-claim lets us claim the whole trace's slots + // in a single CAS instead of one per span. boolean forceKeep = false; - int counted = 0; - if (features.supportsMetrics()) { - for (CoreSpan span : trace) { - boolean isTopLevel = span.isTopLevel(); - if (shouldComputeMetric(span, isTopLevel)) { - final CharSequence resourceName = span.getResourceName(); - if (resourceName != null && ignoredResources.contains(resourceName.toString())) { - // skip publishing all children - forceKeep = false; - break; + int eligibleCount = 0; + for (CoreSpan span : trace) { + boolean isTopLevel = span.isTopLevel(); + if (shouldComputeMetric(span, isTopLevel)) { + final CharSequence resourceName = span.getResourceName(); + if (resourceName != null && ignoredResources.contains(resourceName.toString())) { + // Ignored-resource span: drop metrics for the whole trace. + forceKeep = false; + eligibleCount = 0; + break; + } + eligibleCount++; + forceKeep |= span.getError() > 0; + } + } + + if (eligibleCount > 0) { + // All-or-nothing claim. If the ring can't fit the whole trace's worth of slots, we drop + // all of them rather than partially publish -- partial-trace metrics would be misleading + // anyway, and atomic claim cuts producer-cursor contention from O(eligibleCount) to O(1). + final MpscRingBuffer.Batch batch = dataInbox.tryClaim(eligibleCount); + if (batch == null) { + healthMetrics.onStatsInboxFull(); + } else { + // Second pass: fill the claimed batch. Same filter conditions as the first pass. + for (CoreSpan span : trace) { + boolean isTopLevel = span.isTopLevel(); + if (shouldComputeMetric(span, isTopLevel)) { + batch.fillAndPublish(span, isTopLevel, slotFiller); + if (batch.remaining() == 0) { + // Stop early in the unusual case that the second pass would see more eligible + // spans than the first (e.g. if shouldComputeMetric changed semantics mid-trace). + break; + } } - counted++; - forceKeep |= publish(span, isTopLevel); } } - healthMetrics.onClientStatTraceComputed(counted, trace.size(), !forceKeep); } + + healthMetrics.onClientStatTraceComputed(eligibleCount, trace.size(), !forceKeep); return forceKeep; } @@ -321,25 +351,6 @@ private boolean shouldComputeMetric(CoreSpan span, boolean isTopLevel) { && span.getDurationNano() > 0; } - private boolean publish(CoreSpan span, boolean isTopLevel) { - // Error decision drives force-keep sampling regardless of whether the snapshot gets queued. - boolean error = span.getError() > 0; - - // Fast-path the inbox-full case before tag extraction. size() is approximate but that's fine: - // a false "full" reading just causes a drop, and a real one is correctly identified. - if (dataInbox.size() >= dataInbox.capacity()) { - healthMetrics.onStatsInboxFull(); - return error; - } - - // tryWrite claims a slot, runs slotFiller to populate it in place, and publishes. No - // SpanSnapshot is allocated -- the slot is one of the ring's pre-allocated instances. - if (!dataInbox.tryWrite(span, isTopLevel, slotFiller)) { - healthMetrics.onStatsInboxFull(); - } - return error; - } - /** * Producer-side slot fill. Runs inside {@link MpscRingBuffer#tryWrite} after the producer has * claimed a sequence but before the slot is visible to the aggregator. Reads from {@code span} From 780a20428af3690682b25c104b990d52e230f196 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 16:28:52 -0400 Subject: [PATCH 20/27] Add low-level tryClaimRange/slotAt/publish primitives to MpscRingBuffer The Batch handle from tryClaim was supposed to be scalar-replaced by escape analysis, but JMH measurements showed it's not -- the inner-class implicit this$0 plus the CAS-retry inside tryClaim block scalarization on HotSpot. Result: ~24 bytes of Batch + cursor state allocated per publish on the hot path, ~50% throughput drop on single-element claims in CSS-style benches. Add three sequence-based primitives that callers manage directly: long tryClaimRange(int n) -> start sequence or -1L T slotAt(long seq) -> slot for that sequence void publish(long seq) -> release the slot to the consumer No per-call allocation, no callback dispatch. Callers handle the sequence arithmetic themselves and trade safety (forget to publish -> ring stuck) for hot-path predictability. The Batch API stays for safer use cases. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/util/concurrent/MpscRingBuffer.java | 52 ++++++++++++++++- .../util/concurrent/MpscRingBufferTest.java | 57 +++++++++++++++++++ 2 files changed, 107 insertions(+), 2 deletions(-) diff --git a/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java index cce1b46aa6f..4cefa43cac0 100644 --- a/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java +++ b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java @@ -371,8 +371,56 @@ private long nextSeq() { } } - /** Mark sequence {@code seq} as published. Release semantics via {@link AtomicLongArray#set}. */ - private void publish(final long seq) { + // ============ Low-level primitives ============ + // + // These three methods (tryClaimRange / slotAt / publish) expose the producer-side machinery + // directly. Callers manage the claimed sequence range themselves -- no per-call handle + // allocated, no callback dispatched. This is intended for hot paths where the higher-level + // tryWrite/tryClaim+Batch APIs would allocate per call (Iterator on the iterable being + // batched, or the Batch object itself) that escape analysis doesn't eliminate. + // + // Misuse hazards: every claimed sequence MUST be published exactly once; sequences must be + // published in [start, start+n) range only; the slot returned by slotAt() must not escape past + // the next publish() of any sequence (the producer "owns" the slot until publish). + + /** + * Try to claim a contiguous range of {@code n} sequences in a single CAS. Returns the + * start sequence on success (a non-negative long) or {@code -1L} if the ring doesn't have + * room for the whole batch. The caller must publish each sequence in {@code [start, start + n)} + * exactly once via {@link #publish(long)}. + * + * @throws IllegalArgumentException if {@code n < 1} + */ + public long tryClaimRange(final int n) { + if (n < 1) { + throw new IllegalArgumentException("n must be >= 1, got " + n); + } + while (true) { + final long current = producerCursor; + final long consumed = consumerCursor; + final long next = current + n; + if (next - consumed > capacity) { + return -1L; + } + if (PRODUCER_CURSOR.compareAndSet(this, current, next)) { + return current + 1L; + } + } + } + + /** + * Slot for sequence {@code seq}. Only safe when {@code seq} is in a range currently owned by the + * caller (claimed via {@link #tryClaimRange} and not yet published). + */ + public T slotAt(final long seq) { + return slots[(int) (seq & mask)]; + } + + /** + * Publish sequence {@code seq}, making the slot at {@code slotAt(seq)} visible to the consumer. + * Release semantics via {@link AtomicLongArray#set}. + */ + public void publish(final long seq) { publishedSequences.set(((int) (seq & mask)) * CACHE_LINE_LONGS, seq); } diff --git a/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java b/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java index 48ab2ea4415..6aeb635abd0 100644 --- a/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java +++ b/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java @@ -383,6 +383,63 @@ void batchFillerThrowStillPublishesAndAdvances() { assertEquals(Arrays.asList(1, 2, 3), seen); } + // ============ Low-level primitives (tryClaimRange / slotAt / publish) ============ + + @Test + void tryClaimRangeReturnsStartSequence() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + long start1 = ring.tryClaimRange(3); + long start2 = ring.tryClaimRange(2); + + assertEquals(0L, start1, "first range starts at sequence 0"); + assertEquals(3L, start2, "second range begins immediately after the first"); + } + + @Test + void tryClaimRangeRejectsZeroOrNegative() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + assertThrows(IllegalArgumentException.class, () -> ring.tryClaimRange(0)); + assertThrows(IllegalArgumentException.class, () -> ring.tryClaimRange(-1)); + } + + @Test + void tryClaimRangeReturnsMinusOneWhenFull() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + assertTrue(ring.tryClaimRange(3) >= 0); + assertEquals(-1L, ring.tryClaimRange(2), "all-or-nothing"); + assertTrue(ring.tryClaimRange(1) >= 0); + assertEquals(-1L, ring.tryClaimRange(1)); + } + + @Test + void slotAtAndPublishRoundTrip() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + long start = ring.tryClaimRange(3); + assertTrue(start >= 0); + + for (int i = 0; i < 3; i++) { + long seq = start + i; + Slot slot = ring.slotAt(seq); + slot.value = (int) (seq + 100); + ring.publish(seq); + } + + List seen = new ArrayList<>(); + int drained = ring.drain(s -> seen.add(s.value)); + assertEquals(3, drained); + assertEquals(Arrays.asList(100, 101, 102), seen); + } + + @Test + void slotAtReturnsSameInstanceForSameModuloPosition() { + // After publish+drain wraps around, the slot at sequence N and sequence N+capacity are the + // same physical object (this is the whole point of the ring). + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + Slot firstSlot = ring.slotAt(0L); + Slot wrappedSlot = ring.slotAt(4L); // 4 & mask(3) == 0 + assertSame(firstSlot, wrappedSlot); + } + @Test void concurrentBatchClaimsAreOrderedAndDontInterleave() throws InterruptedException { final int producers = 8; From 32c69d11150b2d86497692c18da32bae27901ef4 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 16:32:08 -0400 Subject: [PATCH 21/27] Use MpscRingBuffer low-level primitives in publish(trace) The previous Batch handle wasn't being scalar-replaced -- visible as a ~50% throughput regression on single-element-claim benches due to per- publish allocations: - Batch object (~24 B, inner class with implicit this$0) - Iterator from the enhanced-for loop (~24 B per pass, two passes) Switch publish(trace) to: 1. Indexed iteration (trace.get(i)) -- kills the Iterator allocations. Trace lists are SpanList / singletonList, both with O(1) get(i). 2. tryClaimRange + slotAt + publish primitives -- no Batch handle. 3. fillSlot takes primitive `boolean isTopLevel` (no autoboxing now that it isn't dispatched through TriConsumer). The slotFiller TriConsumer field is removed -- direct method call from the loop is cheaper than the indirect dispatch. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../metrics/ConflatingMetricsAggregator.java | 57 ++++++++++--------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 13498f62f09..c47a393f9c4 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -84,15 +84,6 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve */ private final MessagePassingQueue signalInbox; - /** - * Slot filler captured as an instance field so the lambda is allocated once at construction. The - * method reference closes over {@code this}, but the field is per-aggregator so context lives in - * the {@code (CoreSpan, isTopLevel)} arguments passed by the producer at each call -- no - * per-publish capture. - */ - private final datadog.trace.api.function.TriConsumer, Boolean, SpanSnapshot> - slotFiller = this::fillSlot; - private final Sink sink; private final Aggregator aggregator; private final long reportingInterval; @@ -297,12 +288,18 @@ public boolean publish(List> trace) { return false; } + // Indexed iteration (vs. enhanced-for) avoids per-call Iterator allocation. trace is + // typically an ArrayList-shaped structure (SpanList) so get(i) is O(1); even + // Collections.singletonList wins here because its iterator() allocates. + final int traceSize = trace.size(); + // First pass: count metrics-eligible spans, detect the ignored-resource short-circuit, and // compute forceKeep (any-error). Doing this pre-claim lets us claim the whole trace's slots - // in a single CAS instead of one per span. + // in a single CAS via tryClaimRange instead of one per span. boolean forceKeep = false; int eligibleCount = 0; - for (CoreSpan span : trace) { + for (int i = 0; i < traceSize; i++) { + CoreSpan span = trace.get(i); boolean isTopLevel = span.isTopLevel(); if (shouldComputeMetric(span, isTopLevel)) { final CharSequence resourceName = span.getResourceName(); @@ -320,19 +317,28 @@ public boolean publish(List> trace) { if (eligibleCount > 0) { // All-or-nothing claim. If the ring can't fit the whole trace's worth of slots, we drop // all of them rather than partially publish -- partial-trace metrics would be misleading - // anyway, and atomic claim cuts producer-cursor contention from O(eligibleCount) to O(1). - final MpscRingBuffer.Batch batch = dataInbox.tryClaim(eligibleCount); - if (batch == null) { + // anyway, and the atomic range claim cuts producer-cursor contention from O(eligibleCount) + // CASes to one CAS per trace. Uses the low-level primitives so escape analysis isn't + // relied on to elide a per-publish Batch handle. + final long startSeq = dataInbox.tryClaimRange(eligibleCount); + if (startSeq < 0L) { healthMetrics.onStatsInboxFull(); } else { - // Second pass: fill the claimed batch. Same filter conditions as the first pass. - for (CoreSpan span : trace) { + // Second pass: fill each claimed slot. Same filter conditions as the first pass. + int filled = 0; + for (int i = 0; i < traceSize; i++) { + CoreSpan span = trace.get(i); boolean isTopLevel = span.isTopLevel(); if (shouldComputeMetric(span, isTopLevel)) { - batch.fillAndPublish(span, isTopLevel, slotFiller); - if (batch.remaining() == 0) { - // Stop early in the unusual case that the second pass would see more eligible - // spans than the first (e.g. if shouldComputeMetric changed semantics mid-trace). + long seq = startSeq + filled; + try { + fillSlot(span, isTopLevel, dataInbox.slotAt(seq)); + } finally { + // Always publish so a throwing fillSlot can't strand the consumer; matches the + // try/finally publish-on-throw guarantee in MpscRingBuffer.tryWrite. + dataInbox.publish(seq); + } + if (++filled == eligibleCount) { break; } } @@ -352,13 +358,12 @@ private boolean shouldComputeMetric(CoreSpan span, boolean isTopLevel) { } /** - * Producer-side slot fill. Runs inside {@link MpscRingBuffer#tryWrite} after the producer has - * claimed a sequence but before the slot is visible to the aggregator. Reads from {@code span} - * and instance state, writes every field on {@code slot} (including {@code null} where - * applicable) so stale values from a prior occupant of this slot don't bleed through. + * Producer-side slot fill. Called between a {@link MpscRingBuffer#tryClaimRange} and the matching + * {@link MpscRingBuffer#publish}: reads from {@code span} and instance state, writes every field + * on {@code slot} (including {@code null} where applicable) so stale values from a prior occupant + * of this slot don't bleed through. */ - private void fillSlot(CoreSpan span, Boolean isTopLevelBoxed, SpanSnapshot slot) { - final boolean isTopLevel = isTopLevelBoxed; + private void fillSlot(CoreSpan span, boolean isTopLevel, SpanSnapshot slot) { final boolean error = span.getError() > 0; // Extract HTTP method and endpoint only if the feature is enabled. From e4277d0ec789cca81e25954f588acb4349df4c4c Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 16:32:27 -0400 Subject: [PATCH 22/27] Fix stale Javadoc reference to removed slotFiller field Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/common/metrics/ConflatingMetricsAggregator.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index c47a393f9c4..b93c37d1a40 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -72,7 +72,8 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve /** * Producer/consumer data channel: a {@link MpscRingBuffer} of pre-allocated, recyclable {@link - * SpanSnapshot} slots. Producers mutate slots in place via {@link #slotFiller} -- no per-publish + * SpanSnapshot} slots. {@link #publish(List)} claims a range of slots in one CAS via {@link + * MpscRingBuffer#tryClaimRange} and fills each in place via {@link #fillSlot} -- no per-publish * allocation. */ private final MpscRingBuffer dataInbox; From fb14c627858ed57fa050d7ae53eba8ac0d511167 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 16:46:32 -0400 Subject: [PATCH 23/27] Overclaim + skip-sentinel in CSS publish(trace): one pass, no Batch publish(trace) used to do two passes -- count eligible spans, claim exactly that many, fill them. The count pass duplicated shouldComputeMetric (plus isTopLevel, error, ignored-resource checks) for every span, costing ~2x the producer-side work per span on single-element-trace JMH benches. Single pass instead: claim traceSize slots up front, fill the eligible ones in place, mark non-eligible slots with tagAndDuration=0 as a "skip" sentinel. The aggregator's handleSnapshot recognises 0 (which shouldComputeMetric already rejects for any real publish) and bypasses findOrInsert for those slots. For petclinic-typical traces where most spans are CSS-eligible, the slot waste is small (~20%) and the producer-side win dominates. The ConflatingMetricAggregatorTest queueSize was sized for the old per-eligible-span claim; bumped to 1024 to give multi-span tests adequate headroom. ConflatingMetricsAggregatorInboxFullTest keeps its queueSize=8 (deliberately exercises full-ring behaviour). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/common/metrics/Aggregator.java | 7 ++ .../metrics/ConflatingMetricsAggregator.java | 95 ++++++++++--------- .../ConflatingMetricAggregatorTest.groovy | 7 +- 3 files changed, 61 insertions(+), 48 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index 25574e8e154..215b8c0500c 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -126,6 +126,13 @@ public void run() { } private static void handleSnapshot(Aggregator agg, SpanSnapshot snapshot) { + // Skip-sentinel from the producer's overclaim path: the producer claimed this slot + // speculatively (one slot per span in the trace) but the span turned out to be non-eligible + // or the trace hit an ignored-resource. shouldComputeMetric guarantees duration > 0 for any + // real publish, so tagAndDuration == 0 uniquely identifies a skip. + if (snapshot.tagAndDuration == 0L) { + return; + } AggregateEntry entry = agg.aggregates.findOrInsert(snapshot); if (entry != null) { entry.recordOneDuration(snapshot.tagAndDuration); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index b93c37d1a40..160ef363e72 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -288,66 +288,67 @@ public boolean publish(List> trace) { if (!features.supportsMetrics()) { return false; } - - // Indexed iteration (vs. enhanced-for) avoids per-call Iterator allocation. trace is - // typically an ArrayList-shaped structure (SpanList) so get(i) is O(1); even - // Collections.singletonList wins here because its iterator() allocates. final int traceSize = trace.size(); + if (traceSize == 0) { + return false; + } + + // Overclaim the worst case: assume every span in the trace is metrics-eligible and claim + // that many ring slots in a single CAS. Non-eligible spans get a "skip" sentinel + // (tagAndDuration = 0L); the aggregator recognises this and bypasses findOrInsert for those + // slots. We expect most spans in a typical trace to be eligible (top-level / measured / + // server-client-producer-consumer kinds), so the slot waste is small in exchange for a + // single-pass producer loop. Indexed iteration (vs. enhanced-for) avoids per-call Iterator + // allocation -- trace is typically a SpanList, but even Collections.singletonList's + // iterator() allocates. + final long startSeq = dataInbox.tryClaimRange(traceSize); + if (startSeq < 0L) { + healthMetrics.onStatsInboxFull(); + return false; + } - // First pass: count metrics-eligible spans, detect the ignored-resource short-circuit, and - // compute forceKeep (any-error). Doing this pre-claim lets us claim the whole trace's slots - // in a single CAS via tryClaimRange instead of one per span. boolean forceKeep = false; - int eligibleCount = 0; + int filled = 0; + for (int i = 0; i < traceSize; i++) { - CoreSpan span = trace.get(i); - boolean isTopLevel = span.isTopLevel(); + final CoreSpan span = trace.get(i); + final long seq = startSeq + i; + final SpanSnapshot slot = dataInbox.slotAt(seq); + final boolean isTopLevel = span.isTopLevel(); + if (shouldComputeMetric(span, isTopLevel)) { final CharSequence resourceName = span.getResourceName(); if (resourceName != null && ignoredResources.contains(resourceName.toString())) { - // Ignored-resource span: drop metrics for the whole trace. - forceKeep = false; - eligibleCount = 0; - break; + // Ignored resource: drop metrics for the whole trace. Mark this slot AND every + // remaining slot in the claimed range as skip-sentinel so the aggregator advances past + // them. + slot.tagAndDuration = 0L; + dataInbox.publish(seq); + for (int j = i + 1; j < traceSize; j++) { + final long skipSeq = startSeq + j; + dataInbox.slotAt(skipSeq).tagAndDuration = 0L; + dataInbox.publish(skipSeq); + } + healthMetrics.onClientStatTraceComputed(0, traceSize, true); + return false; + } + try { + fillSlot(span, isTopLevel, slot); + } finally { + // Publish in finally so a throwing fillSlot can't strand the consumer at this + // sequence -- matches the publish-on-throw guarantee in MpscRingBuffer.tryWrite. + dataInbox.publish(seq); } - eligibleCount++; forceKeep |= span.getError() > 0; - } - } - - if (eligibleCount > 0) { - // All-or-nothing claim. If the ring can't fit the whole trace's worth of slots, we drop - // all of them rather than partially publish -- partial-trace metrics would be misleading - // anyway, and the atomic range claim cuts producer-cursor contention from O(eligibleCount) - // CASes to one CAS per trace. Uses the low-level primitives so escape analysis isn't - // relied on to elide a per-publish Batch handle. - final long startSeq = dataInbox.tryClaimRange(eligibleCount); - if (startSeq < 0L) { - healthMetrics.onStatsInboxFull(); + filled++; } else { - // Second pass: fill each claimed slot. Same filter conditions as the first pass. - int filled = 0; - for (int i = 0; i < traceSize; i++) { - CoreSpan span = trace.get(i); - boolean isTopLevel = span.isTopLevel(); - if (shouldComputeMetric(span, isTopLevel)) { - long seq = startSeq + filled; - try { - fillSlot(span, isTopLevel, dataInbox.slotAt(seq)); - } finally { - // Always publish so a throwing fillSlot can't strand the consumer; matches the - // try/finally publish-on-throw guarantee in MpscRingBuffer.tryWrite. - dataInbox.publish(seq); - } - if (++filled == eligibleCount) { - break; - } - } - } + // Non-eligible span: mark slot as skip-sentinel and publish so the aggregator advances. + slot.tagAndDuration = 0L; + dataInbox.publish(seq); } } - healthMetrics.onClientStatTraceComputed(eligibleCount, trace.size(), !forceKeep); + healthMetrics.onClientStatTraceComputed(filled, traceSize, !forceKeep); return forceKeep; } diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy index 271578c8689..193d46b18bd 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy @@ -27,7 +27,12 @@ class ConflatingMetricAggregatorTest extends DDSpecification { @Shared long reportingInterval = 1 @Shared - int queueSize = 256 + // The CSS overclaim path claims traceSize slots per publish (not just the eligible count), + // so tests with multi-span traces and tight ring sizes can spuriously hit capacity. 1024 leaves + // plenty of headroom for the largest test traces (~10 spans) across hundreds of iterations. + // ConflatingMetricsAggregatorInboxFullTest deliberately keeps a tiny ring to exercise full-ring + // behaviour. + int queueSize = 1024 def "should ignore traces with no measured spans"() { setup: From e29b802122f2eaad694afe2f1f04ce18400d0748 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 17:18:09 -0400 Subject: [PATCH 24/27] Use PendingTrace pre-counted eligible-span count for exact CSS ring claim PendingTrace#enqueueSpansToWrite already iterates every span as it assembles the write list. Piggyback on that iteration: call ConflatingMetricsAggregator.shouldComputeMetric per finished span, sum the eligibles, and attach the count to the SpanList via a new setMetricEligibleCount/getMetricEligibleCount pair (default -1 means unset). CSS publish(trace) now reads getMetricEligibleCount when the trace is a SpanList with a known count, and sizes its single ring claim exactly. No overclaim, no skip-sentinel writes for ineligible slots, no second pass to count. When the count is unknown (e.g. a TraceInterceptor rebuilt the list into a fresh SpanList) the path falls back to overclaim + skip-sentinel, preserving correctness. shouldComputeMetric in ConflatingMetricsAggregator is promoted from private instance method to public static -- it's a pure span-local predicate. PendingTrace calls it directly. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../metrics/ConflatingMetricsAggregator.java | 64 +++++++++++-------- .../java/datadog/trace/core/PendingTrace.java | 13 ++++ .../java/datadog/trace/core/SpanList.java | 20 +++++- 3 files changed, 71 insertions(+), 26 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 160ef363e72..1c6a83cd9bf 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -26,6 +26,7 @@ import datadog.trace.core.CoreSpan; import datadog.trace.core.DDTraceCoreInfo; import datadog.trace.core.SpanKindFilter; +import datadog.trace.core.SpanList; import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.util.AgentTaskScheduler; import datadog.trace.util.concurrent.MpscRingBuffer; @@ -293,38 +294,41 @@ public boolean publish(List> trace) { return false; } - // Overclaim the worst case: assume every span in the trace is metrics-eligible and claim - // that many ring slots in a single CAS. Non-eligible spans get a "skip" sentinel - // (tagAndDuration = 0L); the aggregator recognises this and bypasses findOrInsert for those - // slots. We expect most spans in a typical trace to be eligible (top-level / measured / - // server-client-producer-consumer kinds), so the slot waste is small in exchange for a - // single-pass producer loop. Indexed iteration (vs. enhanced-for) avoids per-call Iterator - // allocation -- trace is typically a SpanList, but even Collections.singletonList's - // iterator() allocates. - final long startSeq = dataInbox.tryClaimRange(traceSize); + // Claim ring slots in a single CAS. When PendingTrace assembled this SpanList it counted + // the eligible spans during its own iteration (free, no extra pass), and we use that exact + // number here -- no overclaim, no skip-sentinel writes for ineligible slots. If the count + // is -1 (unknown -- e.g. a TraceInterceptor rebuilt the list, or the caller isn't a + // SpanList) we fall back to overclaiming traceSize and marking non-eligible slots with the + // skip-sentinel as we iterate. + final int hintedEligibleCount = + trace instanceof SpanList ? ((SpanList) trace).getMetricEligibleCount() : -1; + final boolean precounted = hintedEligibleCount >= 0; + if (precounted && hintedEligibleCount == 0) { + // PendingTrace told us nothing in this trace would be aggregated. Nothing to claim. + healthMetrics.onClientStatTraceComputed(0, traceSize, true); + return false; + } + final int claimCount = precounted ? hintedEligibleCount : traceSize; + final long startSeq = dataInbox.tryClaimRange(claimCount); if (startSeq < 0L) { healthMetrics.onStatsInboxFull(); return false; } boolean forceKeep = false; - int filled = 0; + int nextSlot = 0; // index into the claimed range; sequence = startSeq + nextSlot + int filledMetrics = 0; for (int i = 0; i < traceSize; i++) { final CoreSpan span = trace.get(i); - final long seq = startSeq + i; - final SpanSnapshot slot = dataInbox.slotAt(seq); final boolean isTopLevel = span.isTopLevel(); if (shouldComputeMetric(span, isTopLevel)) { final CharSequence resourceName = span.getResourceName(); if (resourceName != null && ignoredResources.contains(resourceName.toString())) { - // Ignored resource: drop metrics for the whole trace. Mark this slot AND every - // remaining slot in the claimed range as skip-sentinel so the aggregator advances past - // them. - slot.tagAndDuration = 0L; - dataInbox.publish(seq); - for (int j = i + 1; j < traceSize; j++) { + // Ignored resource: drop metrics for the whole trace. Mark every still-claimed-but- + // unpublished slot as skip-sentinel so the aggregator advances past them. + for (int j = nextSlot; j < claimCount; j++) { final long skipSeq = startSeq + j; dataInbox.slotAt(skipSeq).tagAndDuration = 0L; dataInbox.publish(skipSeq); @@ -332,27 +336,37 @@ public boolean publish(List> trace) { healthMetrics.onClientStatTraceComputed(0, traceSize, true); return false; } + final long seq = startSeq + nextSlot; try { - fillSlot(span, isTopLevel, slot); + fillSlot(span, isTopLevel, dataInbox.slotAt(seq)); } finally { // Publish in finally so a throwing fillSlot can't strand the consumer at this // sequence -- matches the publish-on-throw guarantee in MpscRingBuffer.tryWrite. dataInbox.publish(seq); } forceKeep |= span.getError() > 0; - filled++; - } else { - // Non-eligible span: mark slot as skip-sentinel and publish so the aggregator advances. - slot.tagAndDuration = 0L; + nextSlot++; + filledMetrics++; + } else if (!precounted) { + // Overclaim path: this span has a claimed slot we need to publish as skip-sentinel so + // the aggregator advances. In the precounted path this slot was never claimed. + final long seq = startSeq + nextSlot; + dataInbox.slotAt(seq).tagAndDuration = 0L; dataInbox.publish(seq); + nextSlot++; } } - healthMetrics.onClientStatTraceComputed(filled, traceSize, !forceKeep); + healthMetrics.onClientStatTraceComputed(filledMetrics, traceSize, !forceKeep); return forceKeep; } - private boolean shouldComputeMetric(CoreSpan span, boolean isTopLevel) { + /** + * Whether a span's metrics should be aggregated. Exposed as static so {@code PendingTrace} can + * pre-compute the count of eligible spans during list assembly and pass it to the CSS publish + * path via {@code SpanList#getMetricEligibleCount()}, letting CSS size its ring claim exactly. + */ + public static boolean shouldComputeMetric(CoreSpan span, boolean isTopLevel) { return (span.isMeasured() || isTopLevel || span.isKind(METRICS_ELIGIBLE_KINDS)) && span.getLongRunningVersion() <= 0 // either not long-running or unpublished long-running span diff --git a/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java b/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java index 11f8566cc48..a97815fc846 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java @@ -5,6 +5,7 @@ import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.time.TimeSource; import datadog.trace.bootstrap.instrumentation.api.AgentScope; +import datadog.trace.common.metrics.ConflatingMetricsAggregator; import datadog.trace.core.CoreTracer.ConfigSnapshot; import datadog.trace.core.monitor.HealthMetrics; import java.util.ArrayList; @@ -408,6 +409,7 @@ private int write(boolean isPartial) { int enqueueSpansToWrite(List trace, boolean writeRunningSpans) { int completedSpans = 0; + int eligibleForCss = 0; boolean runningSpanSeen = false; long firstRunningSpanID = 0; long nowNano = 0; @@ -426,6 +428,12 @@ int enqueueSpansToWrite(List trace, boolean writeRunningSpans) { if (span.isFinished()) { trace.add(span); completedSpans++; + // Count CSS-eligible spans during the same iteration the list is built. The CSS + // aggregator reads this via SpanList#getEligibleCount() to size its ring claim exactly, + // avoiding both the overclaim-then-skip waste and a second pass to count. + if (ConflatingMetricsAggregator.shouldComputeMetric(span, span.isTopLevel())) { + eligibleForCss++; + } } else { // keep the running span in the list spans.add(span); @@ -437,10 +445,15 @@ int enqueueSpansToWrite(List trace, boolean writeRunningSpans) { span.setLongRunningVersion( (int) TimeUnit.NANOSECONDS.toMillis(nowNano - span.getStartTime())); trace.add(span); + // Long-running unfinished spans never pass shouldComputeMetric (longRunningVersion>0 + // and duration not yet set), so they don't contribute to eligibleForCss. } } span = spans.pollFirst(); } + if (trace instanceof SpanList) { + ((SpanList) trace).setMetricEligibleCount(eligibleForCss); + } return completedSpans; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/SpanList.java b/dd-trace-core/src/main/java/datadog/trace/core/SpanList.java index 2ce199b1768..6f568c4acba 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/SpanList.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/SpanList.java @@ -3,9 +3,17 @@ import java.util.ArrayList; /** ArrayList that exposes modCount to allow for an optimization in TraceInterceptor handling */ -final class SpanList extends ArrayList { +public final class SpanList extends ArrayList { static final SpanList EMPTY = new SpanList(0); + /** + * Hint set by {@code PendingTrace} when the list is assembled: how many spans pass the + * CSS-eligibility predicate. The CSS aggregator uses this to size its ring claim exactly instead + * of overclaiming. {@code -1} means "unknown" -- the consumer must fall back to counting itself + * (or to overclaiming based on {@code size()}). + */ + private int eligibleCount = -1; + /** * Convenience function for creating a SpanList containing a single DDSpan. Meant as replacement * for Collections.singletonList when creating a SpanList. @@ -32,4 +40,14 @@ static final SpanList of(DDSpan span) { int modCount() { return this.modCount; } + + /** See {@link #eligibleCount}; called from {@code PendingTrace} during list assembly. */ + void setMetricEligibleCount(int count) { + this.eligibleCount = count; + } + + /** {@code -1} if unset (e.g. after TraceInterceptors built a fresh SpanList). */ + public int getMetricEligibleCount() { + return eligibleCount; + } } From 21b5bd71dd4a0348cab6a30e20f6bce33a2066cb Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 17:25:40 -0400 Subject: [PATCH 25/27] Revert "Use PendingTrace pre-counted eligible-span count for exact CSS ring claim" This reverts commit e29b802122f2eaad694afe2f1f04ce18400d0748. --- .../metrics/ConflatingMetricsAggregator.java | 64 ++++++++----------- .../java/datadog/trace/core/PendingTrace.java | 13 ---- .../java/datadog/trace/core/SpanList.java | 20 +----- 3 files changed, 26 insertions(+), 71 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 1c6a83cd9bf..160ef363e72 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -26,7 +26,6 @@ import datadog.trace.core.CoreSpan; import datadog.trace.core.DDTraceCoreInfo; import datadog.trace.core.SpanKindFilter; -import datadog.trace.core.SpanList; import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.util.AgentTaskScheduler; import datadog.trace.util.concurrent.MpscRingBuffer; @@ -294,41 +293,38 @@ public boolean publish(List> trace) { return false; } - // Claim ring slots in a single CAS. When PendingTrace assembled this SpanList it counted - // the eligible spans during its own iteration (free, no extra pass), and we use that exact - // number here -- no overclaim, no skip-sentinel writes for ineligible slots. If the count - // is -1 (unknown -- e.g. a TraceInterceptor rebuilt the list, or the caller isn't a - // SpanList) we fall back to overclaiming traceSize and marking non-eligible slots with the - // skip-sentinel as we iterate. - final int hintedEligibleCount = - trace instanceof SpanList ? ((SpanList) trace).getMetricEligibleCount() : -1; - final boolean precounted = hintedEligibleCount >= 0; - if (precounted && hintedEligibleCount == 0) { - // PendingTrace told us nothing in this trace would be aggregated. Nothing to claim. - healthMetrics.onClientStatTraceComputed(0, traceSize, true); - return false; - } - final int claimCount = precounted ? hintedEligibleCount : traceSize; - final long startSeq = dataInbox.tryClaimRange(claimCount); + // Overclaim the worst case: assume every span in the trace is metrics-eligible and claim + // that many ring slots in a single CAS. Non-eligible spans get a "skip" sentinel + // (tagAndDuration = 0L); the aggregator recognises this and bypasses findOrInsert for those + // slots. We expect most spans in a typical trace to be eligible (top-level / measured / + // server-client-producer-consumer kinds), so the slot waste is small in exchange for a + // single-pass producer loop. Indexed iteration (vs. enhanced-for) avoids per-call Iterator + // allocation -- trace is typically a SpanList, but even Collections.singletonList's + // iterator() allocates. + final long startSeq = dataInbox.tryClaimRange(traceSize); if (startSeq < 0L) { healthMetrics.onStatsInboxFull(); return false; } boolean forceKeep = false; - int nextSlot = 0; // index into the claimed range; sequence = startSeq + nextSlot - int filledMetrics = 0; + int filled = 0; for (int i = 0; i < traceSize; i++) { final CoreSpan span = trace.get(i); + final long seq = startSeq + i; + final SpanSnapshot slot = dataInbox.slotAt(seq); final boolean isTopLevel = span.isTopLevel(); if (shouldComputeMetric(span, isTopLevel)) { final CharSequence resourceName = span.getResourceName(); if (resourceName != null && ignoredResources.contains(resourceName.toString())) { - // Ignored resource: drop metrics for the whole trace. Mark every still-claimed-but- - // unpublished slot as skip-sentinel so the aggregator advances past them. - for (int j = nextSlot; j < claimCount; j++) { + // Ignored resource: drop metrics for the whole trace. Mark this slot AND every + // remaining slot in the claimed range as skip-sentinel so the aggregator advances past + // them. + slot.tagAndDuration = 0L; + dataInbox.publish(seq); + for (int j = i + 1; j < traceSize; j++) { final long skipSeq = startSeq + j; dataInbox.slotAt(skipSeq).tagAndDuration = 0L; dataInbox.publish(skipSeq); @@ -336,37 +332,27 @@ public boolean publish(List> trace) { healthMetrics.onClientStatTraceComputed(0, traceSize, true); return false; } - final long seq = startSeq + nextSlot; try { - fillSlot(span, isTopLevel, dataInbox.slotAt(seq)); + fillSlot(span, isTopLevel, slot); } finally { // Publish in finally so a throwing fillSlot can't strand the consumer at this // sequence -- matches the publish-on-throw guarantee in MpscRingBuffer.tryWrite. dataInbox.publish(seq); } forceKeep |= span.getError() > 0; - nextSlot++; - filledMetrics++; - } else if (!precounted) { - // Overclaim path: this span has a claimed slot we need to publish as skip-sentinel so - // the aggregator advances. In the precounted path this slot was never claimed. - final long seq = startSeq + nextSlot; - dataInbox.slotAt(seq).tagAndDuration = 0L; + filled++; + } else { + // Non-eligible span: mark slot as skip-sentinel and publish so the aggregator advances. + slot.tagAndDuration = 0L; dataInbox.publish(seq); - nextSlot++; } } - healthMetrics.onClientStatTraceComputed(filledMetrics, traceSize, !forceKeep); + healthMetrics.onClientStatTraceComputed(filled, traceSize, !forceKeep); return forceKeep; } - /** - * Whether a span's metrics should be aggregated. Exposed as static so {@code PendingTrace} can - * pre-compute the count of eligible spans during list assembly and pass it to the CSS publish - * path via {@code SpanList#getMetricEligibleCount()}, letting CSS size its ring claim exactly. - */ - public static boolean shouldComputeMetric(CoreSpan span, boolean isTopLevel) { + private boolean shouldComputeMetric(CoreSpan span, boolean isTopLevel) { return (span.isMeasured() || isTopLevel || span.isKind(METRICS_ELIGIBLE_KINDS)) && span.getLongRunningVersion() <= 0 // either not long-running or unpublished long-running span diff --git a/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java b/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java index a97815fc846..11f8566cc48 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java @@ -5,7 +5,6 @@ import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.time.TimeSource; import datadog.trace.bootstrap.instrumentation.api.AgentScope; -import datadog.trace.common.metrics.ConflatingMetricsAggregator; import datadog.trace.core.CoreTracer.ConfigSnapshot; import datadog.trace.core.monitor.HealthMetrics; import java.util.ArrayList; @@ -409,7 +408,6 @@ private int write(boolean isPartial) { int enqueueSpansToWrite(List trace, boolean writeRunningSpans) { int completedSpans = 0; - int eligibleForCss = 0; boolean runningSpanSeen = false; long firstRunningSpanID = 0; long nowNano = 0; @@ -428,12 +426,6 @@ int enqueueSpansToWrite(List trace, boolean writeRunningSpans) { if (span.isFinished()) { trace.add(span); completedSpans++; - // Count CSS-eligible spans during the same iteration the list is built. The CSS - // aggregator reads this via SpanList#getEligibleCount() to size its ring claim exactly, - // avoiding both the overclaim-then-skip waste and a second pass to count. - if (ConflatingMetricsAggregator.shouldComputeMetric(span, span.isTopLevel())) { - eligibleForCss++; - } } else { // keep the running span in the list spans.add(span); @@ -445,15 +437,10 @@ int enqueueSpansToWrite(List trace, boolean writeRunningSpans) { span.setLongRunningVersion( (int) TimeUnit.NANOSECONDS.toMillis(nowNano - span.getStartTime())); trace.add(span); - // Long-running unfinished spans never pass shouldComputeMetric (longRunningVersion>0 - // and duration not yet set), so they don't contribute to eligibleForCss. } } span = spans.pollFirst(); } - if (trace instanceof SpanList) { - ((SpanList) trace).setMetricEligibleCount(eligibleForCss); - } return completedSpans; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/SpanList.java b/dd-trace-core/src/main/java/datadog/trace/core/SpanList.java index 6f568c4acba..2ce199b1768 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/SpanList.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/SpanList.java @@ -3,17 +3,9 @@ import java.util.ArrayList; /** ArrayList that exposes modCount to allow for an optimization in TraceInterceptor handling */ -public final class SpanList extends ArrayList { +final class SpanList extends ArrayList { static final SpanList EMPTY = new SpanList(0); - /** - * Hint set by {@code PendingTrace} when the list is assembled: how many spans pass the - * CSS-eligibility predicate. The CSS aggregator uses this to size its ring claim exactly instead - * of overclaiming. {@code -1} means "unknown" -- the consumer must fall back to counting itself - * (or to overclaiming based on {@code size()}). - */ - private int eligibleCount = -1; - /** * Convenience function for creating a SpanList containing a single DDSpan. Meant as replacement * for Collections.singletonList when creating a SpanList. @@ -40,14 +32,4 @@ static final SpanList of(DDSpan span) { int modCount() { return this.modCount; } - - /** See {@link #eligibleCount}; called from {@code PendingTrace} during list assembly. */ - void setMetricEligibleCount(int count) { - this.eligibleCount = count; - } - - /** {@code -1} if unset (e.g. after TraceInterceptors built a fresh SpanList). */ - public int getMetricEligibleCount() { - return eligibleCount; - } } From a39ede1d6fad11a0824ce8136bc26e86c4b53064 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 20:26:45 -0400 Subject: [PATCH 26/27] Park/unpark aggregator and cache PeerTagSchema.namesHash Two small wins identified by the 64m CPU profile: 1. Replace Thread.sleep(10ms) in the aggregator's idle wait with LockSupport.parkNanos + producer-side unparkIfWaiting(). Cuts the worst-case wake latency from 10ms to whenever a producer publishes. Producer-side cost is one volatile read of the waiting flag per publish; only on a true read does the producer pay an unpark. At saturating workloads where the aggregator is never parked, the volatile read is the only cost. 2. Cache Arrays.hashCode(peerTagSchema.names) on PeerTagSchema instead of recomputing per AggregateEntry.hashOf call. The schema is shared across many publishes; the per-publish recomputation was a top aggregator-thread sample. Now one field load. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/common/metrics/AggregateEntry.java | 4 +- .../trace/common/metrics/Aggregator.java | 49 +++++++++++++++++-- .../metrics/ConflatingMetricsAggregator.java | 8 +++ .../trace/common/metrics/PeerTagSchema.java | 9 ++++ 4 files changed, 63 insertions(+), 7 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index 6b05ba46974..1eefba7cf61 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -306,9 +306,7 @@ static long hashOf(SpanSnapshot s) { // buffer that may carry stale contents from a prior tag-firing publish when this publish had // no peer tags. Hash it only when the schema says it's meaningful, matching the matches() // contract. - h = - LongHashingUtils.addToHash( - h, s.peerTagSchema == null ? 0 : Arrays.hashCode(s.peerTagSchema.names)); + h = LongHashingUtils.addToHash(h, s.peerTagSchema == null ? 0 : s.peerTagSchema.namesHash); h = LongHashingUtils.addToHash( h, s.peerTagSchema == null ? 0 : Arrays.hashCode(s.peerTagValues)); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index 215b8c0500c..d70a1fbaae1 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -8,6 +8,7 @@ import datadog.trace.util.concurrent.MpscRingBuffer; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.LockSupport; import java.util.function.BiConsumer; import org.jctools.queues.MessagePassingQueue; import org.slf4j.Logger; @@ -50,6 +51,20 @@ final class Aggregator implements Runnable { justification = "the field is confined to the agent thread running the Aggregator") private boolean stopped; + /** + * True iff the aggregator thread is currently parked (or about to park) waiting for new work. + * Producers volatile-read this after publishing; on true they call {@link #unparkIfWaiting()} to + * wake the aggregator without paying the {@code sleepMillis} latency. + * + *

Volatile so the producer side sees the latest value. Set true before park, false on wake. + * The aggregator re-checks the inbox after setting this true (see {@link #run}) to close the race + * window: if a producer published just before the aggregator set the flag, the aggregator would + * have missed the unpark; the re-check catches that work and skips the park. + */ + private volatile boolean waiting; + + private volatile Thread aggregatorThread; + /** * Static-singleton snapshot handler. Reads from the slot via {@code this} (passed as the context) * and avoids per-drain lambda capture. @@ -100,7 +115,9 @@ final class Aggregator implements Runnable { @Override public void run() { - Thread currentThread = Thread.currentThread(); + final Thread currentThread = Thread.currentThread(); + aggregatorThread = currentThread; + final long parkNanos = TimeUnit.MILLISECONDS.toNanos(sleepMillis); while (!currentThread.isInterrupted() && !stopped) { try { int drainedData = dataInbox.drain(this, SNAPSHOT_HANDLER); @@ -114,10 +131,18 @@ public void run() { } if (stopped) break; if (drainedData == 0 && signalInbox.isEmpty()) { - Thread.sleep(sleepMillis); + waiting = true; + try { + // Re-check after publishing the waiting flag. A producer that published just before + // we set waiting=true won't have unparked us (they read waiting=false); the re-check + // catches that work via the volatile producerCursor. + if (dataInbox.isEmpty() && signalInbox.isEmpty()) { + LockSupport.parkNanos(this, parkNanos); + } + } finally { + waiting = false; + } } - } catch (InterruptedException e) { - currentThread.interrupt(); } catch (Throwable error) { log.debug("error aggregating metrics", error); } @@ -125,6 +150,22 @@ public void run() { log.debug("metrics aggregator exited"); } + /** + * Wake the aggregator if it's currently parked. Called from producer threads after publishing. + * + *

Cost on the producer hot path is one volatile read; only on a {@code true} read does the + * producer pay an {@code unpark} (cheap idempotent atomic). At saturating workloads where the + * aggregator is never parked, the volatile read is the only cost. + */ + void unparkIfWaiting() { + if (waiting) { + final Thread t = aggregatorThread; + if (t != null) { + LockSupport.unpark(t); + } + } + } + private static void handleSnapshot(Aggregator agg, SpanSnapshot snapshot) { // Skip-sentinel from the producer's overclaim path: the producer claimed this slot // speculatively (one slot per span in the trace) but the span turned out to be non-eligible diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 160ef363e72..9dbdefa0e33 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -330,6 +330,7 @@ public boolean publish(List> trace) { dataInbox.publish(skipSeq); } healthMetrics.onClientStatTraceComputed(0, traceSize, true); + aggregator.unparkIfWaiting(); return false; } try { @@ -349,9 +350,16 @@ public boolean publish(List> trace) { } healthMetrics.onClientStatTraceComputed(filled, traceSize, !forceKeep); + // Wake the aggregator if it parked while we were filling. Producer-side cost is one volatile + // read when the aggregator isn't parked. + aggregator.unparkIfWaiting(); return forceKeep; } + // Other site that posts work the aggregator needs to see: ignored-resource short-circuit + // returns above before reaching unparkIfWaiting. Mirror that wake-up there too so the + // aggregator picks up the skip-sentinel slots quickly. + private boolean shouldComputeMetric(CoreSpan span, boolean isTopLevel) { return (span.isMeasured() || isTopLevel || span.isKind(METRICS_ELIGIBLE_KINDS)) && span.getLongRunningVersion() diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java index 4821d1b33a4..d3a3d47d65a 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java @@ -43,6 +43,14 @@ final class PeerTagSchema { final String[] names; + /** + * Precomputed {@code Arrays.hashCode(names)}. The schema is shared across many publishes so + * recomputing it on the aggregator hot path (per-publish call to {@code AggregateEntry.hashOf}) + * was waste -- it showed up as a top aggregator-thread sample. Cached here, computed once at + * construction. + */ + final int namesHash; + /** * The {@code DDAgentFeaturesDiscovery.state()} hash this schema was built from. The aggregator * thread reads and updates this once per reporting cycle when reconciling against the latest @@ -54,6 +62,7 @@ final class PeerTagSchema { private PeerTagSchema(String[] names, String state) { this.names = names; + this.namesHash = java.util.Arrays.hashCode(names); this.state = state; } From c330d9c821e896a2439a1f339a70918e514f2931 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 28 May 2026 21:27:22 -0400 Subject: [PATCH 27/27] Canonicalize + hash on the producer; identity matches on the aggregator The 64m CPU profile showed the aggregator spending most of its time on findOrInsert's hashing chain (LongHashingUtils.addToHash, Arrays.hashCode, StringLatin1.hashCode) and contentEquals comparison. With one aggregator thread vs 8 producer threads, this concentrated CPU is the petclinic regression source. Move that work to the producer: - SpanSnapshot fields become UTF8BytesString (or @Nullable variants), canonicalized in fillSlot via the per-field DDCaches that already live on AggregateEntry (made package-private static). - SpanSnapshot.computeAndSetKeyHash precomputes the table-lookup hash at the end of fillSlot. AggregateTable.findOrInsert reads snapshot.keyHash directly; no rehashing on the aggregator thread. - AggregateEntry's constructor stops canonicalizing -- it just copies the already-canonical refs from the snapshot. - AggregateEntry.matches uses identity (==) for the UTF8BytesString fields. peerTagNames uses Arrays.equals (PeerTagSchema instances can be swapped during reconcile; the a==b fast path inside Arrays.equals makes the shared-reference case O(1)). canonicalize/canonicalizeOptional treat null and length-zero inputs as the same canonical (EMPTY for required, null for optional) so the prior null-vs-empty collapse semantics survive the move from contentEquals to identity comparison. hashOf is removed from AggregateEntry; tests build snapshots via a new AggregateEntryTestUtils.buildSnapshot helper that mirrors fillSlot's canonicalize-and-hash sequence. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/common/metrics/AggregateEntry.java | 142 +++++++----------- .../trace/common/metrics/AggregateTable.java | 6 +- .../metrics/ConflatingMetricsAggregator.java | 34 +++-- .../trace/common/metrics/SpanSnapshot.java | 133 ++++++++++------ .../common/metrics/AggregateEntryTest.java | 4 +- .../metrics/AggregateEntryTestUtils.java | 65 ++++++-- .../common/metrics/AggregateTableTest.java | 8 +- 7 files changed, 226 insertions(+), 166 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index 1eefba7cf61..a518090f93b 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -8,7 +8,6 @@ import datadog.trace.api.cache.DDCaches; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; import datadog.trace.util.Hashtable; -import datadog.trace.util.LongHashingUtils; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -62,23 +61,19 @@ final class AggregateEntry extends Hashtable.Entry { static final long ERROR_TAG = 0x8000000000000000L; static final long TOP_LEVEL_TAG = 0x4000000000000000L; - // UTF8 caches consolidated from the previous MetricKey + ConflatingMetricsAggregator split. - private static final DDCache RESOURCE_CACHE = - DDCaches.newFixedSizeCache(32); - private static final DDCache SERVICE_CACHE = - DDCaches.newFixedSizeCache(32); - private static final DDCache OPERATION_CACHE = - DDCaches.newFixedSizeCache(64); - private static final DDCache SERVICE_SOURCE_CACHE = - DDCaches.newFixedSizeCache(16); - private static final DDCache TYPE_CACHE = DDCaches.newFixedSizeCache(8); - private static final DDCache SPAN_KIND_CACHE = + // UTF8 caches. Package-private so ConflatingMetricsAggregator's fillSlot can canonicalize + // producer-side via the same caches the entry constructor used to consult. + static final DDCache RESOURCE_CACHE = DDCaches.newFixedSizeCache(32); + static final DDCache SERVICE_CACHE = DDCaches.newFixedSizeCache(32); + static final DDCache OPERATION_CACHE = DDCaches.newFixedSizeCache(64); + static final DDCache SERVICE_SOURCE_CACHE = DDCaches.newFixedSizeCache(16); - private static final DDCache HTTP_METHOD_CACHE = - DDCaches.newFixedSizeCache(8); - private static final DDCache HTTP_ENDPOINT_CACHE = + static final DDCache TYPE_CACHE = DDCaches.newFixedSizeCache(8); + static final DDCache SPAN_KIND_CACHE = DDCaches.newFixedSizeCache(16); + static final DDCache HTTP_METHOD_CACHE = DDCaches.newFixedSizeCache(8); + static final DDCache HTTP_ENDPOINT_CACHE = DDCaches.newFixedSizeCache(32); - private static final DDCache GRPC_STATUS_CODE_CACHE = + static final DDCache GRPC_STATUS_CODE_CACHE = DDCaches.newFixedSizeCache(32); /** @@ -141,18 +136,21 @@ final class AggregateEntry extends Hashtable.Entry { private int topLevelCount; private long duration; - /** Hot-path constructor for the producer/consumer flow. Builds UTF8 fields via the caches. */ - AggregateEntry(SpanSnapshot s, long keyHash) { - super(keyHash); - this.resource = canonicalize(RESOURCE_CACHE, s.resourceName); - this.service = canonicalize(SERVICE_CACHE, s.serviceName); - this.operationName = canonicalize(OPERATION_CACHE, s.operationName); - this.serviceSource = canonicalizeOptional(SERVICE_SOURCE_CACHE, s.serviceNameSource); - this.type = canonicalize(TYPE_CACHE, s.spanType); - this.spanKind = canonicalize(SPAN_KIND_CACHE, s.spanKind); - this.httpMethod = canonicalizeOptional(HTTP_METHOD_CACHE, s.httpMethod); - this.httpEndpoint = canonicalizeOptional(HTTP_ENDPOINT_CACHE, s.httpEndpoint); - this.grpcStatusCode = canonicalizeOptional(GRPC_STATUS_CODE_CACHE, s.grpcStatusCode); + /** Hot-path constructor for the producer/consumer flow. Slot fields are already canonical. */ + AggregateEntry(SpanSnapshot s) { + super(s.keyHash); + // The slot's string fields are already canonical UTF8BytesString instances (canonicalized by + // the producer in fillSlot via the same caches we used to consult here). Just copy refs; + // matches() can do identity comparison instead of contentEquals. + this.resource = s.resourceName; + this.service = s.serviceName; + this.operationName = s.operationName; + this.serviceSource = s.serviceNameSource; + this.type = s.spanType; + this.spanKind = s.spanKind; + this.httpMethod = s.httpMethod; + this.httpEndpoint = s.httpEndpoint; + this.grpcStatusCode = s.grpcStatusCode; this.httpStatusCode = s.httpStatusCode; this.synthetic = s.synthetic; this.traceRoot = s.traceRoot; @@ -245,26 +243,29 @@ void clear() { } boolean matches(SpanSnapshot s) { - String[] snapshotNames = s.peerTagSchema == null ? null : s.peerTagSchema.names; - // peerTagValues is meaningful only when peerTagNames is non-null. When the snapshot's slot - // had no peer tags for this publish (peerTagSchema=null), s.peerTagValues may still hold - // stale scratch from a prior tag-firing publish on the same slot -- skip the values check - // in that case. The names check above already rejects entries whose peer-tag presence - // doesn't match the snapshot. + // All string fields are canonical UTF8BytesString references from the per-field DDCaches. + // Same content => same canonical instance => identity comparison suffices. + // peerTagSchema is also shared by reference (producer reads cachedPeerTagSchema), so its + // names array reference is stable too. peerTagValues stays content-compared since values + // vary per publish and aren't canonicalized. return httpStatusCode == s.httpStatusCode && synthetic == s.synthetic && traceRoot == s.traceRoot - && contentEquals(resource, s.resourceName) - && contentEquals(service, s.serviceName) - && contentEquals(operationName, s.operationName) - && contentEquals(serviceSource, s.serviceNameSource) - && contentEquals(type, s.spanType) - && contentEquals(spanKind, s.spanKind) - && Arrays.equals(peerTagNames, snapshotNames) + && resource == s.resourceName + && service == s.serviceName + && operationName == s.operationName + && serviceSource == s.serviceNameSource + && type == s.spanType + && spanKind == s.spanKind + // PeerTagSchema instances aren't guaranteed to be shared by reference across all + // producers (cachedPeerTagSchema can be swapped during reconcile; tests build fresh + // instances), so compare names by content. The same-reference case short-circuits to + // O(1) via Arrays.equals's a==b fast path. + && Arrays.equals(peerTagNames, s.peerTagSchema == null ? null : s.peerTagSchema.names) && (peerTagNames == null || Arrays.equals(peerTagValues, s.peerTagValues)) - && contentEquals(httpMethod, s.httpMethod) - && contentEquals(httpEndpoint, s.httpEndpoint) - && contentEquals(grpcStatusCode, s.grpcStatusCode); + && httpMethod == s.httpMethod + && httpEndpoint == s.httpEndpoint + && grpcStatusCode == s.grpcStatusCode; } /** @@ -276,46 +277,6 @@ boolean matches(long keyHash, SpanSnapshot s) { return this.keyHash == keyHash && matches(s); } - /** - * Computes the 64-bit lookup hash for a {@link SpanSnapshot}. Chained per-field calls -- no - * varargs / Object[] allocation, no autoboxing on primitive overloads. The constructor's - * super({@code hashOf(s)}) call uses the same function so an entry built from a snapshot hashes - * to the same bucket the snapshot itself looks up. - * - *

Hashes are content-stable across {@code String} / {@code UTF8BytesString}: {@link - * UTF8BytesString#hashCode()} returns the underlying {@code String}'s hash. - */ - static long hashOf(SpanSnapshot s) { - long h = 0; - h = LongHashingUtils.addToHash(h, s.resourceName); - h = LongHashingUtils.addToHash(h, s.serviceName); - h = LongHashingUtils.addToHash(h, s.operationName); - h = LongHashingUtils.addToHash(h, s.serviceNameSource); - h = LongHashingUtils.addToHash(h, s.spanType); - h = LongHashingUtils.addToHash(h, s.httpStatusCode); - h = LongHashingUtils.addToHash(h, s.synthetic); - h = LongHashingUtils.addToHash(h, s.traceRoot); - h = LongHashingUtils.addToHash(h, s.spanKind); - // Always mix in both the schema's content hash and the values' content hash, unconditionally - // (no null-skip). Arrays.hashCode is content-based for both String[]s; the default - // Object[].hashCode is identity-based, which would let two snapshots with content-equal but - // distinct PeerTagSchema instances hash to different buckets. Null inputs hash to 0 here, - // distinct from {@code Arrays.hashCode(empty)} = 1 or any non-empty array. - // - // peerTagValues is gated by peerTagSchema: the slot's peerTagValues is a reusable scratch - // buffer that may carry stale contents from a prior tag-firing publish when this publish had - // no peer tags. Hash it only when the schema says it's meaningful, matching the matches() - // contract. - h = LongHashingUtils.addToHash(h, s.peerTagSchema == null ? 0 : s.peerTagSchema.namesHash); - h = - LongHashingUtils.addToHash( - h, s.peerTagSchema == null ? 0 : Arrays.hashCode(s.peerTagValues)); - h = LongHashingUtils.addToHash(h, s.httpMethod); - h = LongHashingUtils.addToHash(h, s.httpEndpoint); - h = LongHashingUtils.addToHash(h, s.grpcStatusCode); - return h; - } - // Accessors for SerializingMetricWriter. UTF8BytesString getResource() { return resource; @@ -380,9 +341,12 @@ List getPeerTags() { // ----- helpers ----- - private static UTF8BytesString canonicalize( + static UTF8BytesString canonicalize( DDCache cache, CharSequence charSeq) { - if (charSeq == null) { + // Treat null and length-zero as the same canonical EMPTY instance so the producer-side + // canonicalization produces identity-equal results for null/"" inputs. matches() now uses + // identity comparison, so the previous content-equals collapse needs to happen here. + if (charSeq == null || charSeq.length() == 0) { return EMPTY; } if (charSeq instanceof UTF8BytesString) { @@ -402,9 +366,11 @@ private static UTF8BytesString canonicalize( * single helper keeps the constructor consistent. */ @Nullable - private static UTF8BytesString canonicalizeOptional( + static UTF8BytesString canonicalizeOptional( DDCache cache, @Nullable CharSequence charSeq) { - if (charSeq == null) { + // Treat null and length-zero the same: both map to null (i.e. "absent"). matches() uses + // identity comparison, so the prior content-equals collapse needs to happen here. + if (charSeq == null || charSeq.length() == 0) { return null; } if (charSeq instanceof UTF8BytesString) { diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java index abadc7e5f17..b23e492ce82 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java @@ -55,7 +55,9 @@ boolean isEmpty() { * caller should drop the data point in that case. */ AggregateEntry findOrInsert(SpanSnapshot snapshot) { - long keyHash = AggregateEntry.hashOf(snapshot); + // keyHash is precomputed by the producer in SpanSnapshot.computeAndSetKeyHash, no + // re-hashing on the aggregator thread. + final long keyHash = snapshot.keyHash; for (AggregateEntry candidate = Support.bucket(buckets, keyHash); candidate != null; candidate = candidate.next()) { @@ -66,7 +68,7 @@ AggregateEntry findOrInsert(SpanSnapshot snapshot) { if (size >= maxAggregates && !evictOneStale()) { return null; } - AggregateEntry entry = new AggregateEntry(snapshot, keyHash); + AggregateEntry entry = new AggregateEntry(snapshot); Support.insertHeadEntry(buckets, keyHash, entry); size++; return entry; diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 9dbdefa0e33..dc6ea86bac7 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -426,20 +426,36 @@ private void fillSlot(CoreSpan span, boolean isTopLevel, SpanSnapshot slot) { } } - slot.resourceName = span.getResourceName(); - slot.serviceName = span.getServiceName(); - slot.operationName = span.getOperationName(); - slot.serviceNameSource = span.getServiceNameSource(); - slot.spanType = spanType; + // Canonicalize every string field to UTF8BytesString via the per-field DDCaches on + // AggregateEntry. This work used to happen on the aggregator thread; doing it here + // distributes the cost across producer threads and gives the aggregator's matches() identity + // comparisons instead of contentEquals chains. + slot.resourceName = + AggregateEntry.canonicalize(AggregateEntry.RESOURCE_CACHE, span.getResourceName()); + slot.serviceName = + AggregateEntry.canonicalize(AggregateEntry.SERVICE_CACHE, span.getServiceName()); + slot.operationName = + AggregateEntry.canonicalize(AggregateEntry.OPERATION_CACHE, span.getOperationName()); + slot.serviceNameSource = + AggregateEntry.canonicalizeOptional( + AggregateEntry.SERVICE_SOURCE_CACHE, span.getServiceNameSource()); + slot.spanType = AggregateEntry.canonicalize(AggregateEntry.TYPE_CACHE, spanType); slot.httpStatusCode = span.getHttpStatusCode(); slot.synthetic = isSynthetic(span); slot.traceRoot = span.getParentId() == 0; - slot.spanKind = spanKind; + slot.spanKind = AggregateEntry.canonicalize(AggregateEntry.SPAN_KIND_CACHE, spanKind); slot.peerTagSchema = peerTagSchema; - slot.httpMethod = httpMethod; - slot.httpEndpoint = httpEndpoint; - slot.grpcStatusCode = grpcStatusCode; + slot.httpMethod = + AggregateEntry.canonicalizeOptional(AggregateEntry.HTTP_METHOD_CACHE, httpMethod); + slot.httpEndpoint = + AggregateEntry.canonicalizeOptional(AggregateEntry.HTTP_ENDPOINT_CACHE, httpEndpoint); + slot.grpcStatusCode = + AggregateEntry.canonicalizeOptional(AggregateEntry.GRPC_STATUS_CODE_CACHE, grpcStatusCode); slot.tagAndDuration = tagAndDuration; + + // Precompute the table-lookup key hash on the producer side. AggregateTable#findOrInsert + // reads this directly instead of running the chained hashOf on the aggregator thread. + SpanSnapshot.computeAndSetKeyHash(slot); } /** diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java index e030a9f68cd..f10d60167cc 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java @@ -1,8 +1,13 @@ package datadog.trace.common.metrics; +import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import datadog.trace.util.LongHashingUtils; +import java.util.Arrays; +import javax.annotation.Nullable; + /** - * Per-span value posted from the producer to the aggregator thread. Carries the raw inputs the - * aggregator needs to look up or build an {@link AggregateEntry} and update its counters. + * Per-span value posted from the producer to the aggregator thread. Carries the canonical inputs + * the aggregator needs to look up or build an {@link AggregateEntry} and update its counters. * *

Fields are mutable so this class can serve as a slot in {@link * datadog.trace.util.concurrent.MpscRingBuffer}: the producer claims a slot, writes its fields, and @@ -10,76 +15,104 @@ * producer that claimed it) and one reader (the aggregator), so no synchronization on fields is * required. * - *

All cache-canonicalization (service-name, span-kind, peer-tag string interning) happens on the - * aggregator thread; the producer just shuffles references. + *

Producer-side canonicalization and hashing. String fields are canonicalized to {@link + * UTF8BytesString} on the producer side (see {@link AggregateEntry#canonicalize}); same-content + * strings collapse to the same canonical reference through the per-field {@code DDCache} instances + * on {@link AggregateEntry}. The producer also precomputes {@link #keyHash} via the same chain the + * consumer used to use. The aggregator then does identity comparisons in {@link + * AggregateEntry#matches} and a precomputed-hash table lookup, with all the per-field work + * distributed across producer threads where there's CPU headroom. */ final class SpanSnapshot { - CharSequence resourceName; - String serviceName; - CharSequence operationName; - CharSequence serviceNameSource; - CharSequence spanType; + /** Canonical UTF-8 form of the span's resource name. {@link UTF8BytesString#EMPTY} when null. */ + UTF8BytesString resourceName; + + /** Canonical UTF-8 form of the span's service. {@link UTF8BytesString#EMPTY} when null. */ + UTF8BytesString serviceName; + + /** Canonical UTF-8 form of the span's operation name. {@link UTF8BytesString#EMPTY} when null. */ + UTF8BytesString operationName; + + /** Canonical UTF-8 form, or {@code null} when the span had no service-name source set. */ + @Nullable UTF8BytesString serviceNameSource; + + /** Canonical UTF-8 form of the span's type. {@link UTF8BytesString#EMPTY} when null. */ + UTF8BytesString spanType; + short httpStatusCode; boolean synthetic; boolean traceRoot; - String spanKind; + + /** Canonical UTF-8 form of the span kind. {@link UTF8BytesString#EMPTY} when not set. */ + UTF8BytesString spanKind; /** * Schema for {@link #peerTagValues}. {@code null} when the span has no peer tags. The schema * carries the names in parallel-array form; {@code peerTagValues} holds the per-span tag values * at the same indices. */ - PeerTagSchema peerTagSchema; + @Nullable PeerTagSchema peerTagSchema; /** * Peer tag values captured from the span, parallel to {@code peerTagSchema.names}. A {@code null} - * entry means the span didn't have that peer tag set. {@code null} (the whole array) when {@link - * #peerTagSchema} is {@code null}. + * entry means the span didn't have that peer tag set. These are per-publish strings (typically + * not shared across calls), so they stay as {@link String} -- canonicalizing them is more + * expensive than the aggregator's content-comparison on a 1-3 element array. + * + *

The slot owns this array as a reusable scratch buffer; see {@code + * ConflatingMetricsAggregator#fillSlot}. */ - String[] peerTagValues; + @Nullable String[] peerTagValues; - String httpMethod; - String httpEndpoint; - String grpcStatusCode; + @Nullable UTF8BytesString httpMethod; + @Nullable UTF8BytesString httpEndpoint; + @Nullable UTF8BytesString grpcStatusCode; /** Duration in nanoseconds, OR-ed with {@code ERROR_TAG} / {@code TOP_LEVEL_TAG} as needed. */ long tagAndDuration; - /** No-arg constructor used by {@link MpscRingBuffer} to pre-allocate empty slots. */ + /** + * Hashtable bucket key, computed by the producer at the end of {@code fillSlot} via {@link + * #computeAndSetKeyHash}. {@code AggregateTable#findOrInsert} uses this directly instead of + * re-running the chained hash on the aggregator thread. + */ + long keyHash; + + /** No-arg constructor used by {@link datadog.trace.util.concurrent.MpscRingBuffer}. */ SpanSnapshot() {} - /** Convenience constructor retained for tests that built immutable snapshots inline. */ - SpanSnapshot( - CharSequence resourceName, - String serviceName, - CharSequence operationName, - CharSequence serviceNameSource, - CharSequence spanType, - short httpStatusCode, - boolean synthetic, - boolean traceRoot, - String spanKind, - PeerTagSchema peerTagSchema, - String[] peerTagValues, - String httpMethod, - String httpEndpoint, - String grpcStatusCode, - long tagAndDuration) { - this.resourceName = resourceName; - this.serviceName = serviceName; - this.operationName = operationName; - this.serviceNameSource = serviceNameSource; - this.spanType = spanType; - this.httpStatusCode = httpStatusCode; - this.synthetic = synthetic; - this.traceRoot = traceRoot; - this.spanKind = spanKind; - this.peerTagSchema = peerTagSchema; - this.peerTagValues = peerTagValues; - this.httpMethod = httpMethod; - this.httpEndpoint = httpEndpoint; - this.grpcStatusCode = grpcStatusCode; - this.tagAndDuration = tagAndDuration; + /** + * Computes and stores {@link #keyHash} from the snapshot's current canonical fields. The producer + * calls this at the end of {@code fillSlot}; the aggregator then uses {@code keyHash} directly + * for bucket lookup in {@code AggregateTable}. + * + *

Mixing identical to the prior {@code AggregateEntry.hashOf} so the bucket distribution is + * unchanged: {@code UTF8BytesString.hashCode} returns the underlying {@code String} hash, so + * snapshots built from {@code String} and {@code UTF8BytesString} of the same content fold to the + * same long. + */ + static void computeAndSetKeyHash(SpanSnapshot s) { + long h = 0; + h = LongHashingUtils.addToHash(h, s.resourceName); + h = LongHashingUtils.addToHash(h, s.serviceName); + h = LongHashingUtils.addToHash(h, s.operationName); + h = LongHashingUtils.addToHash(h, s.serviceNameSource); + h = LongHashingUtils.addToHash(h, s.spanType); + h = LongHashingUtils.addToHash(h, s.httpStatusCode); + h = LongHashingUtils.addToHash(h, s.synthetic); + h = LongHashingUtils.addToHash(h, s.traceRoot); + h = LongHashingUtils.addToHash(h, s.spanKind); + // peerTagValues is gated by peerTagSchema -- the slot's array is a reusable scratch buffer + // that may carry stale contents from a prior tag-firing publish when this publish had no + // peer tags. Hash it only when the schema says it's meaningful. + h = LongHashingUtils.addToHash(h, s.peerTagSchema == null ? 0 : s.peerTagSchema.namesHash); + h = + LongHashingUtils.addToHash( + h, s.peerTagSchema == null ? 0 : Arrays.hashCode(s.peerTagValues)); + h = LongHashingUtils.addToHash(h, s.httpMethod); + h = LongHashingUtils.addToHash(h, s.httpEndpoint); + h = LongHashingUtils.addToHash(h, s.grpcStatusCode); + s.keyHash = h; } } diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java index 7fd767533c7..b35dd4cd938 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java @@ -124,7 +124,7 @@ void testUtilsEqualEntriesHaveEqualHashCodes() { } private static SpanSnapshot snapshotWithPeerTags(String[] names, String[] values) { - return new SpanSnapshot( + return AggregateEntryTestUtils.buildSnapshot( "resource", "svc", "op", @@ -144,7 +144,7 @@ private static SpanSnapshot snapshotWithPeerTags(String[] names, String[] values private static AggregateEntry newEntry() { SpanSnapshot snapshot = - new SpanSnapshot( + AggregateEntryTestUtils.buildSnapshot( "resource", "svc", "op", diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTestUtils.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTestUtils.java index ed6fd5a3a7e..acc298fd436 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTestUtils.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTestUtils.java @@ -69,33 +69,76 @@ public static AggregateEntry of( schema = PeerTagSchema.testSchema(names); } SpanSnapshot syntheticSnapshot = - new SpanSnapshot( + buildSnapshot( resource, - service == null ? null : service.toString(), + service, operationName, serviceSource, type, (short) httpStatusCode, synthetic, traceRoot, - spanKind == null ? null : spanKind.toString(), + spanKind, schema, values, - httpMethod == null ? null : httpMethod.toString(), - httpEndpoint == null ? null : httpEndpoint.toString(), - grpcStatusCode == null ? null : grpcStatusCode.toString(), + httpMethod, + httpEndpoint, + grpcStatusCode, 0L); return forSnapshot(syntheticSnapshot); } /** - * Builds an {@link AggregateEntry} from {@code s} by computing its lookup hash via {@link - * AggregateEntry#hashOf(SpanSnapshot)} and calling the package-private constructor directly. - * Production callers route through {@link AggregateTable#findOrInsert} which already has the - * {@code keyHash} on hand; tests rarely do, so this helper hides the second argument. + * Test-only snapshot builder. Production fillSlot canonicalizes each field via the per-field + * DDCaches on {@link AggregateEntry} and precomputes {@link SpanSnapshot#keyHash}; this helper + * does the same so tests exercise the production matches/hash path. + */ + static SpanSnapshot buildSnapshot( + CharSequence resource, + CharSequence service, + CharSequence operationName, + @Nullable CharSequence serviceSource, + CharSequence type, + short httpStatusCode, + boolean synthetic, + boolean traceRoot, + CharSequence spanKind, + @Nullable PeerTagSchema peerTagSchema, + @Nullable String[] peerTagValues, + @Nullable CharSequence httpMethod, + @Nullable CharSequence httpEndpoint, + @Nullable CharSequence grpcStatusCode, + long tagAndDuration) { + SpanSnapshot s = new SpanSnapshot(); + s.resourceName = AggregateEntry.canonicalize(AggregateEntry.RESOURCE_CACHE, resource); + s.serviceName = AggregateEntry.canonicalize(AggregateEntry.SERVICE_CACHE, service); + s.operationName = AggregateEntry.canonicalize(AggregateEntry.OPERATION_CACHE, operationName); + s.serviceNameSource = + AggregateEntry.canonicalizeOptional(AggregateEntry.SERVICE_SOURCE_CACHE, serviceSource); + s.spanType = AggregateEntry.canonicalize(AggregateEntry.TYPE_CACHE, type); + s.httpStatusCode = httpStatusCode; + s.synthetic = synthetic; + s.traceRoot = traceRoot; + s.spanKind = AggregateEntry.canonicalize(AggregateEntry.SPAN_KIND_CACHE, spanKind); + s.peerTagSchema = peerTagSchema; + s.peerTagValues = peerTagValues; + s.httpMethod = + AggregateEntry.canonicalizeOptional(AggregateEntry.HTTP_METHOD_CACHE, httpMethod); + s.httpEndpoint = + AggregateEntry.canonicalizeOptional(AggregateEntry.HTTP_ENDPOINT_CACHE, httpEndpoint); + s.grpcStatusCode = + AggregateEntry.canonicalizeOptional(AggregateEntry.GRPC_STATUS_CODE_CACHE, grpcStatusCode); + s.tagAndDuration = tagAndDuration; + SpanSnapshot.computeAndSetKeyHash(s); + return s; + } + + /** + * Builds an {@link AggregateEntry} from {@code s}. Production callers route through {@link + * AggregateTable#findOrInsert}; tests use this helper for direct construction. */ public static AggregateEntry forSnapshot(SpanSnapshot s) { - return new AggregateEntry(s, AggregateEntry.hashOf(s)); + return new AggregateEntry(s); } /** diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java index 0f12e9b6a3a..e807dfdd338 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java @@ -313,7 +313,7 @@ void aggregateEntryCopiesPeerTagValuesSoSlotReuseDoesntCorruptIt() { private static SpanSnapshot snapshotWithPeerTags( String service, String operation, String spanKind, String[] peerTagValues) { PeerTagSchema schema = PeerTagSchema.testSchema(new String[] {"peer.host", "peer.service"}); - return new SpanSnapshot( + return AggregateEntryTestUtils.buildSnapshot( "resource", service, operation, @@ -332,7 +332,7 @@ private static SpanSnapshot snapshotWithPeerTags( } private static SpanSnapshot nullServiceKindSnapshot(String service, String spanKind) { - return new SpanSnapshot( + return AggregateEntryTestUtils.buildSnapshot( "resource", service, "op", @@ -352,7 +352,7 @@ private static SpanSnapshot nullServiceKindSnapshot(String service, String spanK private static SpanSnapshot nullableSnapshot( String resource, String operation, String type, String serviceNameSource) { - return new SpanSnapshot( + return AggregateEntryTestUtils.buildSnapshot( resource, "svc", operation, @@ -408,7 +408,7 @@ SnapshotBuilder peerTags(String... namesAndValues) { } SpanSnapshot build() { - return new SpanSnapshot( + return AggregateEntryTestUtils.buildSnapshot( "resource", service, operation,