diff --git a/.github/scripts/dependency_age.py b/.github/scripts/dependency_age.py index 62e27422e2e..9c0a77a8bc2 100644 --- a/.github/scripts/dependency_age.py +++ b/.github/scripts/dependency_age.py @@ -452,8 +452,10 @@ def build_validation_summary( if old_gav not in seen: seen.add(old_gav) if new_gav in baseline_coords: - continue # no-op downgrade — replacement matches baseline - lines.append(f"- `{old_gav}` is {hours_remaining}h away from meeting {min_age_hours}h cooldown, updated to `{new_gav}`") + lines.append(f"- `{old_gav}` is {hours_remaining}h away from meeting {min_age_hours}h cooldown, reverted") + else: + new_version = new_gav.rsplit(":", 1)[1] + lines.append(f"- `{old_gav}` is {hours_remaining}h away from meeting {min_age_hours}h cooldown, updated to `{new_version}`") for entries in violations_by_file.values(): for gav, kind, hours_remaining in entries: if gav not in seen: @@ -462,8 +464,6 @@ def build_validation_summary( lines.append(f"- `{gav}` — cannot verify age, reverted") else: lines.append(f"- `{gav}` is {hours_remaining}h away from meeting {min_age_hours}h cooldown, reverted") - if len(lines) == 2: - return "" # only header, no entries after filtering return "\n".join(lines) diff --git a/.github/workflows/analyze-changes.yaml b/.github/workflows/analyze-changes.yaml index 8622431686a..51f652b2b4b 100644 --- a/.github/workflows/analyze-changes.yaml +++ b/.github/workflows/analyze-changes.yaml @@ -30,7 +30,7 @@ jobs: ${{ runner.os }}-gradle- - name: Initialize CodeQL - uses: github/codeql-action/init@9e0d7b8d25671d64c341c19c0152d693099fb5ba # v4.35.5 + uses: github/codeql-action/init@7211b7c8077ea37d8641b6271f6a365a22a5fbfa # v4.36.0 with: languages: 'java' build-mode: 'manual' @@ -43,7 +43,7 @@ jobs: ./gradlew clean :dd-java-agent:shadowJar --build-cache --parallel --stacktrace --no-daemon --max-workers=4 - name: Perform CodeQL Analysis and upload results to GitHub Security tab - uses: github/codeql-action/analyze@9e0d7b8d25671d64c341c19c0152d693099fb5ba # v4.35.5 + uses: github/codeql-action/analyze@7211b7c8077ea37d8641b6271f6a365a22a5fbfa # v4.36.0 trivy: name: Analyze changes with Trivy @@ -102,7 +102,7 @@ jobs: TRIVY_JAVA_DB_REPOSITORY: ghcr.io/aquasecurity/trivy-java-db,public.ecr.aws/aquasecurity/trivy-java-db - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@9e0d7b8d25671d64c341c19c0152d693099fb5ba # v4.35.5 + uses: github/codeql-action/upload-sarif@7211b7c8077ea37d8641b6271f6a365a22a5fbfa # v4.36.0 if: always() with: sarif_file: 'trivy-results.sarif' diff --git a/.github/workflows/prune-old-pull-requests.yaml b/.github/workflows/prune-old-pull-requests.yaml index e5633a39415..db92774ea0c 100644 --- a/.github/workflows/prune-old-pull-requests.yaml +++ b/.github/workflows/prune-old-pull-requests.yaml @@ -13,7 +13,7 @@ jobs: pull-requests: write steps: - name: Prune old pull requests - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v10.2.0 + uses: actions/stale@eb5cf3af3ac0a1aa4c9c45633dd1ae542a27a899 # v10.3.0 with: days-before-stale: -1 # Disable general stale bot days-before-pr-stale: 90 # Only enable stale bot for PRs with no activity for 90 days diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8f19a760870..4081c4b7123 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -161,8 +161,8 @@ default: fi echo -e "${TEXT_BOLD}${TEXT_YELLOW}Runner dashboard, these are live (limited to a 1-hour window, and expire after 36 hours)${TEXT_CLEAR}" - echo -e "${TEXT_BOLD}${TEXT_YELLOW} Containers:${TEXT_CLEAR} https://app.datadoghq.com/containers?${TIME_PARAMS}query=image_name%3A%2A%2Fdatadog%2Fdd-trace-java-docker-build%20AND%20pod_name%3A${POD_NAME}&live=false" - echo -e "${TEXT_BOLD}${TEXT_YELLOW} Processes:${TEXT_CLEAR} https://app.datadoghq.com/process?${TIME_PARAMS}query=image_name%3A%2A%2Fdatadog%2Fdd-trace-java-docker-build%20AND%20pod_name%3A${POD_NAME}&live=false" + echo -e "${TEXT_BOLD}${TEXT_YELLOW} Containers:${TEXT_CLEAR} https://app.datadoghq.com/containers?${TIME_PARAMS}query=image_name%3A%2A%2Fdd-trace-java-docker-build%20AND%20pod_name%3A${POD_NAME}&live=false" + echo -e "${TEXT_BOLD}${TEXT_YELLOW} Processes:${TEXT_CLEAR} https://app.datadoghq.com/process?${TIME_PARAMS}query=image_name%3A%2A%2Fdd-trace-java-docker-build%20AND%20pod_name%3A${POD_NAME}&live=false" .gitlab_base_ref_params: &gitlab_base_ref_params - | @@ -548,7 +548,15 @@ check_debugger: muzzle: extends: .gradle_build - needs: [ build_tests ] + # needs:parallel:matrix limits this job to a specific build_tests combination. + # Keep matrix vars exact and in build_tests declaration order: + # https://docs.gitlab.com/ci/yaml/#needsparallelmatrix + needs: &needs_build_tests_inst + - job: build_tests + parallel: + matrix: + - GRADLE_TARGET: ":instrumentationTest" + CACHE_TYPE: "inst" stage: tests rules: - if: '$CI_COMMIT_BRANCH =~ /^mq-working-branch-/' @@ -586,7 +594,7 @@ muzzle: muzzle-dep-report: extends: .gradle_build - needs: [ build_tests ] + needs: *needs_build_tests_inst stage: tests rules: - if: '$CI_COMMIT_BRANCH =~ /^mq-working-branch-/' @@ -737,6 +745,15 @@ agent_integration_tests: test_base: extends: .test_job + # needs:parallel:matrix limits this job to a specific build_tests combination. + # Keep matrix vars exact and in build_tests declaration order: + # https://docs.gitlab.com/ci/yaml/#needsparallelmatrix + needs: + - job: build_tests + parallel: + matrix: + - GRADLE_TARGET: ":baseTest" + CACHE_TYPE: "base" variables: GRADLE_TARGET: ":baseTest" CACHE_TYPE: "base" @@ -748,6 +765,7 @@ test_base: test_inst: extends: .test_job_with_test_agent + needs: *needs_build_tests_inst variables: GRADLE_TARGET: ":instrumentationTest" CACHE_TYPE: "inst" @@ -756,6 +774,15 @@ test_inst: test_inst_latest: extends: .test_job_with_test_agent + # needs:parallel:matrix limits this job to a specific build_tests combination. + # Keep matrix vars exact and in build_tests declaration order: + # https://docs.gitlab.com/ci/yaml/#needsparallelmatrix + needs: + - job: build_tests + parallel: + matrix: + - GRADLE_TARGET: ":instrumentationLatestDepTest" + CACHE_TYPE: "latestdep" variables: GRADLE_TARGET: ":instrumentationLatestDepTest" CACHE_TYPE: "latestdep" @@ -786,6 +813,7 @@ test_flaky: test_flaky_inst: extends: .test_job + needs: *needs_build_tests_inst variables: GRADLE_TARGET: ":instrumentationTest" GRADLE_PARAMS: "-PrunFlakyTests" @@ -800,6 +828,15 @@ test_flaky_inst: test_profiling: extends: .test_job + # needs:parallel:matrix limits this job to a specific build_tests combination. + # Keep matrix vars exact and in build_tests declaration order: + # https://docs.gitlab.com/ci/yaml/#needsparallelmatrix + needs: + - job: build_tests + parallel: + matrix: + - GRADLE_TARGET: ":profilingTest" + CACHE_TYPE: "profiling" variables: GRADLE_TARGET: ":profilingTest" CACHE_TYPE: "profiling" @@ -819,6 +856,16 @@ test_debugger: test_smoke: extends: .test_job + # needs:parallel:matrix limits this job to a specific build_tests combination. + # Keep matrix vars exact and in build_tests declaration order: + # https://docs.gitlab.com/ci/yaml/#needsparallelmatrix + needs: &needs_build_tests_smoke + - job: build_tests + parallel: + matrix: + - GRADLE_TARGET: ":smokeTest" + CACHE_TYPE: "smoke" + MAVEN_OPTS: "-Xms256M -Xmx1024M" variables: GRADLE_TARGET: "stageMainDist :smokeTest" GRADLE_PARAMS: "-PskipFlakyTests" @@ -828,6 +875,7 @@ test_smoke: test_ssi_smoke: extends: .test_job + needs: *needs_build_tests_smoke rules: - if: $CI_COMMIT_BRANCH == "master" when: on_success @@ -845,6 +893,7 @@ test_ssi_smoke: test_smoke_graalvm: extends: .test_job + needs: *needs_build_tests_smoke tags: [ "arch:amd64" ] variables: GRADLE_TARGET: "stageMainDist :dd-smoke-test:spring-boot-3.0-native:test" @@ -857,6 +906,7 @@ test_smoke_graalvm: test_smoke_semeru8_debugger: extends: .test_job + needs: *needs_build_tests_smoke tags: [ "arch:amd64" ] variables: GRADLE_TARGET: "stageMainDist dd-smoke-tests:debugger-integration-tests:test" diff --git a/communication/src/main/java/datadog/communication/serialization/FlushingBuffer.java b/communication/src/main/java/datadog/communication/serialization/FlushingBuffer.java index 332434ad46a..e3087bd5f27 100644 --- a/communication/src/main/java/datadog/communication/serialization/FlushingBuffer.java +++ b/communication/src/main/java/datadog/communication/serialization/FlushingBuffer.java @@ -1,5 +1,6 @@ package datadog.communication.serialization; +import datadog.trace.api.internal.VisibleForTesting; import java.nio.ByteBuffer; public final class FlushingBuffer implements StreamingBuffer { @@ -106,7 +107,7 @@ public void reset() { mark = 0; } - // for tests only + @VisibleForTesting int getMessageCount() { return messageCount; } diff --git a/components/annotations/build.gradle.kts b/components/annotations/build.gradle.kts new file mode 100644 index 00000000000..b8314ee7f33 --- /dev/null +++ b/components/annotations/build.gradle.kts @@ -0,0 +1 @@ +apply(from = "$rootDir/gradle/java.gradle") diff --git a/components/annotations/src/main/java/datadog/trace/api/internal/VisibleForTesting.java b/components/annotations/src/main/java/datadog/trace/api/internal/VisibleForTesting.java new file mode 100644 index 00000000000..60e3cd2c33a --- /dev/null +++ b/components/annotations/src/main/java/datadog/trace/api/internal/VisibleForTesting.java @@ -0,0 +1,10 @@ +package datadog.trace.api.internal; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.SOURCE) +@Target({ElementType.CONSTRUCTOR, ElementType.FIELD, ElementType.METHOD, ElementType.TYPE}) +public @interface VisibleForTesting {} diff --git a/components/environment/build.gradle.kts b/components/environment/build.gradle.kts index 7818fef2463..82c834d5ef3 100644 --- a/components/environment/build.gradle.kts +++ b/components/environment/build.gradle.kts @@ -5,6 +5,10 @@ plugins { apply(from = "$rootDir/gradle/java.gradle") +dependencies { + compileOnly(project(":components:annotations")) +} + /* * Add an addition gradle configuration to be consumed by bootstrap only. * "datadog.trace." prefix is required to be excluded from Jacoco instrumentation. diff --git a/components/environment/src/main/java/datadog/environment/JavaVirtualMachine.java b/components/environment/src/main/java/datadog/environment/JavaVirtualMachine.java index a71d66859bc..d6fe120c98b 100644 --- a/components/environment/src/main/java/datadog/environment/JavaVirtualMachine.java +++ b/components/environment/src/main/java/datadog/environment/JavaVirtualMachine.java @@ -1,5 +1,6 @@ package datadog.environment; +import datadog.trace.api.internal.VisibleForTesting; import java.util.List; import java.util.Locale; import javax.annotation.Nullable; @@ -212,7 +213,7 @@ public Runtime() { SystemProperties.get("java.vendor.version")); } - // Only visible for testing + @VisibleForTesting Runtime(String javaVer, String rtVer, String name, String vendor, String vendorVersion) { this.name = name == null ? "" : name; this.vendor = vendor == null ? "" : vendor; diff --git a/components/environment/src/main/java/datadog/environment/JvmOptions.java b/components/environment/src/main/java/datadog/environment/JvmOptions.java index ef549bc4d73..7dd77e29ff9 100644 --- a/components/environment/src/main/java/datadog/environment/JvmOptions.java +++ b/components/environment/src/main/java/datadog/environment/JvmOptions.java @@ -5,6 +5,7 @@ import static java.util.Collections.emptyList; import static java.util.Collections.singletonList; +import datadog.trace.api.internal.VisibleForTesting; import de.thetaphi.forbiddenapis.SuppressForbidden; import java.io.BufferedReader; import java.io.IOException; @@ -93,7 +94,7 @@ private List findVmOptions() { // Be aware that when running a native image, the command line in /proc/self/cmdline is just the // executable - // Visible for testing + @VisibleForTesting List findVmOptionsFromProcFs(String[] procfsCmdline) { // Create the list of VM options List vmOptions = new ArrayList<>(); diff --git a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/InstrumentationErrors.java b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/InstrumentationErrors.java index c18534a73e9..b2bc1702805 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/InstrumentationErrors.java +++ b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/InstrumentationErrors.java @@ -1,5 +1,6 @@ package datadog.trace.bootstrap; +import datadog.trace.api.internal.VisibleForTesting; import java.io.PrintWriter; import java.io.StringWriter; import java.util.List; @@ -30,7 +31,7 @@ public static Throwable recordError(Throwable error) { return error; // keep throwable at top of the stack } - // Visible for testing + @VisibleForTesting public static void resetErrors() { COUNTER.set(0); if (detailed) { diff --git a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/WeakMapContextStore.java b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/WeakMapContextStore.java index a3926412889..1750b7f2b9c 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/WeakMapContextStore.java +++ b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/WeakMapContextStore.java @@ -1,5 +1,7 @@ package datadog.trace.bootstrap; +import datadog.trace.api.internal.VisibleForTesting; + /** * Weak {@link ContextStore} that acts as a fall-back when field-injection isn't possible. * @@ -85,7 +87,7 @@ public V remove(final K key) { return (V) map.remove(key); } - // Package reachable for testing + @VisibleForTesting int size() { return map.size(); } diff --git a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/blocking/BlockingActionHelper.java b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/blocking/BlockingActionHelper.java index fbe5e150367..e72a86a7098 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/blocking/BlockingActionHelper.java +++ b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/blocking/BlockingActionHelper.java @@ -6,6 +6,7 @@ import datadog.appsec.api.blocking.BlockingContentType; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; @@ -187,7 +188,7 @@ private static String nextMediaRange(String s, int[] pos, float[] quality) { return mediaRangeMatcher.group(1); } - // public for testing + @VisibleForTesting public static void reset(Config config) { TEMPLATE_HTML = null; TEMPLATE_JSON = null; diff --git a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/dbm/SharedDBCommenter.java b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/dbm/SharedDBCommenter.java index 161e2f8ad11..3dbf916362c 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/dbm/SharedDBCommenter.java +++ b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/dbm/SharedDBCommenter.java @@ -4,6 +4,7 @@ import datadog.trace.api.BaseHash; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.instrumentation.api.AgentSpan; import datadog.trace.bootstrap.instrumentation.api.Tags; import java.io.UnsupportedEncodingException; @@ -88,7 +89,7 @@ private static void ensureStaticPrefixComputed() { staticPrefixComputed = true; } - // @VisibleForTesting + @VisibleForTesting public static void resetStaticPrefixForTesting() { staticPrefixComputed = false; } diff --git a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/ConfigManager.java b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/ConfigManager.java index 02c10016d83..99e6235a6b2 100644 --- a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/ConfigManager.java +++ b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/ConfigManager.java @@ -8,6 +8,7 @@ import datadog.trace.api.Config; import datadog.trace.api.ProcessTags; import datadog.trace.api.WellKnownTags; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.util.PidHelper; import datadog.trace.util.RandomUtils; import java.io.BufferedReader; @@ -137,7 +138,7 @@ public Builder extendedInfoEnabled(boolean extendedInfoEnabled) { return this; } - // @VisibleForTesting + @VisibleForTesting Builder reportUUID(String reportUUID) { this.reportUUID = reportUUID; return this; @@ -170,7 +171,7 @@ private static String getBaseName(File file) { return filename.substring(0, dotIndex); } - // @VisibleForTesting + @VisibleForTesting static String getMergedTagsForSerialization(Config config) { return config.getMergedCrashTrackingTags().entrySet().stream() .filter(e -> e.getValue() != null) @@ -195,7 +196,7 @@ public static void writeConfigToPath(File scriptFile, String... additionalEntrie writeConfigToFile(Config.get(), cfgFile, additionalEntries); } - // @VisibleForTesting + @VisibleForTesting static void writeConfigToFile(Config config, File cfgFile, String... additionalEntries) { final WellKnownTags wellKnownTags = config.getWellKnownTags(); diff --git a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploader.java b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploader.java index 76fd0e45f8c..5c37b7c5908 100644 --- a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploader.java +++ b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploader.java @@ -24,6 +24,7 @@ import datadog.environment.SystemProperties; import datadog.trace.api.Config; import datadog.trace.api.DDTags; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.config.provider.ConfigProvider; import datadog.trace.util.AgentThreadFactory; import datadog.trace.util.PidHelper; @@ -187,7 +188,7 @@ public void notifyCrashStarted(String error) { } } - // @VisibleForTesting + @VisibleForTesting void sendPingToTelemetry(String error) { // send a ping message to the telemetry to notify that the crash report started try (Buffer buf = new Buffer(); @@ -207,7 +208,7 @@ void sendPingToTelemetry(String error) { } } - // @VisibleForTesting + @VisibleForTesting void sendPingToErrorTracking(String error) { try { final CrashLog ping = @@ -253,7 +254,7 @@ public void upload(@Nonnull Path file) { } } - // @VisibleForTesting + @VisibleForTesting void remoteUpload( @Nonnull String fileContent, boolean sendToTelemetry, boolean sendToErrorTracking) { final String uuid = storedConfig.reportUUID; @@ -316,7 +317,7 @@ void uploadToLogs(@Nonnull String message, @Nonnull PrintStream out) throws IOEx } } - // @VisibleForTesting + @VisibleForTesting @SuppressForbidden static String extractErrorKind(String fileContent) { Matcher matcher = ERROR_MESSAGE_PATTERN.matcher(fileContent); @@ -348,7 +349,7 @@ static String extractErrorKind(String fileContent) { "$"), Pattern.DOTALL | Pattern.MULTILINE); - // @VisibleForTesting + @VisibleForTesting @SuppressForbidden static String extractErrorMessage(String fileContent) { Matcher matcher = ERROR_MESSAGE_PATTERN.matcher(fileContent); diff --git a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploaderScriptInitializer.java b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploaderScriptInitializer.java index 0e810694be1..5677d10f220 100644 --- a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploaderScriptInitializer.java +++ b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploaderScriptInitializer.java @@ -8,6 +8,7 @@ import static java.util.Locale.ROOT; import datadog.environment.SystemProperties; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.util.PidHelper; import datadog.trace.util.Strings; import java.io.BufferedReader; @@ -25,12 +26,12 @@ public final class CrashUploaderScriptInitializer { private CrashUploaderScriptInitializer() {} - // @VisibleForTests + @VisibleForTesting static void initialize(String onErrorVal, String onErrorFile) { initialize(onErrorVal, onErrorFile, null); } - // @VisibleForTests + @VisibleForTesting static void initialize(String onErrorVal, String onErrorFile, String javacorePath) { if (onErrorVal == null || onErrorVal.isEmpty()) { LOG.debug( diff --git a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/OOMENotifierScriptInitializer.java b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/OOMENotifierScriptInitializer.java index c558d3a83a5..5d37b975b98 100644 --- a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/OOMENotifierScriptInitializer.java +++ b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/OOMENotifierScriptInitializer.java @@ -8,6 +8,7 @@ import static datadog.crashtracking.Initializer.pidFromSpecialFileName; import static datadog.trace.api.telemetry.LogCollector.SEND_TELEMETRY; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.util.PidHelper; import java.io.File; import java.io.FileOutputStream; @@ -20,7 +21,7 @@ public final class OOMENotifierScriptInitializer { private OOMENotifierScriptInitializer() {} - // @VisibleForTests + @VisibleForTesting static void initialize(String onOutOfMemoryVal) { if (onOutOfMemoryVal == null || onOutOfMemoryVal.isEmpty()) { LOG.debug( diff --git a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/parsers/RedactUtils.java b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/parsers/RedactUtils.java index 7be12817525..6665d0f412d 100644 --- a/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/parsers/RedactUtils.java +++ b/dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/parsers/RedactUtils.java @@ -1,5 +1,6 @@ package datadog.crashtracking.parsers; +import datadog.trace.api.internal.VisibleForTesting; import de.thetaphi.forbiddenapis.SuppressForbidden; import java.util.function.Function; import java.util.regex.Matcher; @@ -215,7 +216,7 @@ static String redactNmethodClass(String line) { * java.lang.Class} oop) use {@link #redactRegisterToMemoryMapping} which detects the oop type * automatically. */ - // @VisibleForTesting + @VisibleForTesting static String redactDottedClassOopRef(String line) { return redactStringOopRef(line, false); } diff --git a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java index c392ab3c663..3caa2a149ab 100644 --- a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java +++ b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/agent/ConfigurationUpdater.java @@ -15,6 +15,7 @@ import datadog.environment.JavaVirtualMachine; import datadog.logging.RatelimitedLogger; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.debugger.DebuggerContext; import datadog.trace.bootstrap.debugger.ProbeId; import datadog.trace.bootstrap.debugger.ProbeImplementation; @@ -445,7 +446,7 @@ private void removeCurrentTransformer() { currentTransformer = null; } - // only visible for tests + @VisibleForTesting Map getAppliedDefinitions() { if (currentTransformer == null) { return Collections.emptyMap(); diff --git a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/sink/DebuggerSink.java b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/sink/DebuggerSink.java index 5c91e110acb..becf47b0337 100644 --- a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/sink/DebuggerSink.java +++ b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/sink/DebuggerSink.java @@ -5,6 +5,7 @@ import com.datadog.debugger.uploader.BatchUploader; import com.datadog.debugger.util.DebuggerMetrics; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.debugger.DebuggerContext.SkipCause; import datadog.trace.bootstrap.debugger.ProbeId; import datadog.trace.util.AgentTaskScheduler; @@ -159,7 +160,7 @@ private void lowRateReschedule() { TimeUnit.MILLISECONDS); } - // visible for testing + @VisibleForTesting void lowRateFlush(DebuggerSink ignored) { symbolSink.flush(); probeStatusSink.flush(tags); diff --git a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/uploader/BatchUploader.java b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/uploader/BatchUploader.java index 63ff0ef8dcc..993d8194668 100644 --- a/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/uploader/BatchUploader.java +++ b/dd-java-agent/agent-debugger/src/main/java/com/datadog/debugger/uploader/BatchUploader.java @@ -7,6 +7,7 @@ import datadog.communication.http.OkHttpUtils; import datadog.logging.RatelimitedLogger; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.util.AgentThreadFactory; import java.io.IOException; import java.time.Duration; @@ -124,7 +125,7 @@ public BatchUploader(String name, Config config, String endpoint, RetryPolicy re ContainerInfo.getEntityId()); } - // Visible for testing + @VisibleForTesting BatchUploader( String name, Config config, @@ -220,10 +221,7 @@ private void doUpload(Runnable makeRequest) { } } - /** - * Note that this method is only visible for testing and should not be used from outside this - * class. - */ + @VisibleForTesting OkHttpClient getClient() { return client; } diff --git a/dd-java-agent/agent-logging/src/main/java/datadog/trace/logging/PrintStreamWrapper.java b/dd-java-agent/agent-logging/src/main/java/datadog/trace/logging/PrintStreamWrapper.java index cd1e1b4a9c9..c71390a8e23 100644 --- a/dd-java-agent/agent-logging/src/main/java/datadog/trace/logging/PrintStreamWrapper.java +++ b/dd-java-agent/agent-logging/src/main/java/datadog/trace/logging/PrintStreamWrapper.java @@ -1,6 +1,7 @@ package datadog.trace.logging; import datadog.environment.OperatingSystem; +import datadog.trace.api.internal.VisibleForTesting; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.OutputStream; @@ -17,7 +18,7 @@ public PrintStreamWrapper(PrintStream ps) { super(ps); } - // use for tests only + @VisibleForTesting public OutputStream getOriginalPrintStream() { return super.out; } diff --git a/dd-java-agent/agent-otel/otel-shim/src/main/java/datadog/opentelemetry/shim/logs/OtelLogRecordBuilder.java b/dd-java-agent/agent-otel/otel-shim/src/main/java/datadog/opentelemetry/shim/logs/OtelLogRecordBuilder.java index 93554fdc8c5..81cfb7c875e 100644 --- a/dd-java-agent/agent-otel/otel-shim/src/main/java/datadog/opentelemetry/shim/logs/OtelLogRecordBuilder.java +++ b/dd-java-agent/agent-otel/otel-shim/src/main/java/datadog/opentelemetry/shim/logs/OtelLogRecordBuilder.java @@ -3,6 +3,7 @@ import static datadog.opentelemetry.shim.trace.OtelExtractedContext.extract; import static io.opentelemetry.api.common.AttributeKey.stringKey; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.time.SystemTimeSource; import datadog.trace.api.time.TimeSource; import datadog.trace.bootstrap.otel.logs.data.OtelLogRecordProcessor; @@ -23,8 +24,7 @@ @ParametersAreNonnullByDefault final class OtelLogRecordBuilder implements LogRecordBuilder { - // package-visible for testing - static TimeSource TIME_SOURCE = SystemTimeSource.INSTANCE; + @VisibleForTesting static TimeSource TIME_SOURCE = SystemTimeSource.INSTANCE; private static final AttributeKey EXCEPTION_TYPE_KEY = stringKey("exception.type"); private static final AttributeKey EXCEPTION_MESSAGE_KEY = stringKey("exception.message"); diff --git a/dd-java-agent/agent-profiling/profiling-controller-ddprof/src/main/java/com/datadog/profiling/controller/ddprof/DatadogProfilerOngoingRecording.java b/dd-java-agent/agent-profiling/profiling-controller-ddprof/src/main/java/com/datadog/profiling/controller/ddprof/DatadogProfilerOngoingRecording.java index 7976f7f81a9..26ffc13e972 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-ddprof/src/main/java/com/datadog/profiling/controller/ddprof/DatadogProfilerOngoingRecording.java +++ b/dd-java-agent/agent-profiling/profiling-controller-ddprof/src/main/java/com/datadog/profiling/controller/ddprof/DatadogProfilerOngoingRecording.java @@ -20,6 +20,7 @@ import com.datadog.profiling.controller.UnsupportedEnvironmentException; import com.datadog.profiling.ddprof.DatadogProfiler; import datadog.environment.JavaVirtualMachine; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilingSnapshot; import datadog.trace.api.profiling.RecordingData; import java.time.Instant; @@ -52,7 +53,7 @@ public RecordingData stop() { return recording.stop(); } - // @VisibleForTesting + @VisibleForTesting final RecordingData snapshot(final Instant start) { return snapshot(start, ProfilingSnapshot.Kind.PERIODIC); } diff --git a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkOngoingRecording.java b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkOngoingRecording.java index ddac86e663f..4b9d71e3a26 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkOngoingRecording.java +++ b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkOngoingRecording.java @@ -3,6 +3,7 @@ import com.datadog.profiling.controller.ControllerContext; import com.datadog.profiling.controller.OngoingRecording; import com.datadog.profiling.utils.ProfilingMode; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilingSnapshot; import datadog.trace.api.profiling.RecordingData; import datadog.trace.bootstrap.config.provider.ConfigProvider; @@ -135,7 +136,7 @@ public RecordingData stop() { return new OpenJdkRecordingData(recording, ProfilingSnapshot.Kind.PERIODIC); } - // @VisibleForTesting + @VisibleForTesting final RecordingData snapshot(@Nonnull final Instant start) { return snapshot(start, ProfilingSnapshot.Kind.PERIODIC); } diff --git a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkRecordingData.java b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkRecordingData.java index 6cedf96691b..da022c226ed 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkRecordingData.java +++ b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkRecordingData.java @@ -15,6 +15,7 @@ */ package com.datadog.profiling.controller.openjdk; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.RecordingData; import datadog.trace.api.profiling.RecordingInputStream; import java.io.IOException; @@ -54,7 +55,7 @@ public String getName() { return recording.getName(); } - // Visible for testing + @VisibleForTesting Recording getRecording() { return recording; } diff --git a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/events/SmapEntryCache.java b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/events/SmapEntryCache.java index 4b03fff0396..b890904a461 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/events/SmapEntryCache.java +++ b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/events/SmapEntryCache.java @@ -1,6 +1,7 @@ package com.datadog.profiling.controller.openjdk.events; import datadog.environment.JavaVirtualMachine; +import datadog.trace.api.internal.VisibleForTesting; import de.thetaphi.forbiddenapis.SuppressForbidden; import java.io.BufferedReader; import java.io.IOException; @@ -60,7 +61,7 @@ static class AnnotatedRegion { this.smapsPath = smapsPath; } - // @VisibleForTesting + @VisibleForTesting void invalidate() { UPDATER.getAndSet(this, System.nanoTime() - (2 * ttl)); } @@ -81,7 +82,7 @@ public List getEvents() { return (List) events[index]; } - // accessible for testing + @VisibleForTesting static AnnotatedRegion fromAnnotatedEntry(String line, int javaVersion) { boolean isRegion = line.startsWith("0x"); if (isRegion) { @@ -149,7 +150,7 @@ private static boolean isSmapHeader(String line) { return ((firstChar >= '0' && firstChar <= '9') || (firstChar >= 'a' && firstChar <= 'f')); } - // accessible for testing + @VisibleForTesting static void readEvents(BufferedReader br, List events) throws IOException { String line = br.readLine(); while (line != null) { diff --git a/dd-java-agent/agent-profiling/profiling-controller-oracle/src/main/java/com/datadog/profiling/controller/oracle/OracleJdkOngoingRecording.java b/dd-java-agent/agent-profiling/profiling-controller-oracle/src/main/java/com/datadog/profiling/controller/oracle/OracleJdkOngoingRecording.java index 18f104e4e6f..2edbc1a369e 100644 --- a/dd-java-agent/agent-profiling/profiling-controller-oracle/src/main/java/com/datadog/profiling/controller/oracle/OracleJdkOngoingRecording.java +++ b/dd-java-agent/agent-profiling/profiling-controller-oracle/src/main/java/com/datadog/profiling/controller/oracle/OracleJdkOngoingRecording.java @@ -1,6 +1,7 @@ package com.datadog.profiling.controller.oracle; import com.datadog.profiling.controller.OngoingRecording; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilingSnapshot; import java.io.IOException; import java.time.Duration; @@ -52,7 +53,7 @@ public OracleJdkRecordingData stop() { } } - // @VisibleForTesting + @VisibleForTesting final OracleJdkRecordingData snapshot(@Nonnull final Instant start) { return snapshot(start, ProfilingSnapshot.Kind.PERIODIC); } diff --git a/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java b/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java index 7f57b356d99..bfb3df0f333 100644 --- a/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java +++ b/dd-java-agent/agent-profiling/profiling-controller/src/main/java/com/datadog/profiling/controller/ProfilingSystem.java @@ -22,6 +22,7 @@ import static datadog.trace.util.AgentThreadFactory.AgentThread.PROFILER_RECORDING_SCHEDULER; import datadog.environment.JavaVirtualMachine; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilerFlareLogger; import datadog.trace.api.profiling.ProfilingSnapshot; import datadog.trace.api.profiling.RecordingData; @@ -238,7 +239,7 @@ public boolean isStarted() { return started; } - /** VisibleForTesting */ + @VisibleForTesting final Duration getStartupDelay() { return startupDelay; } diff --git a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java index b1e07b08c32..9aedde9e49f 100644 --- a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java +++ b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfiler.java @@ -38,6 +38,7 @@ import datadog.environment.JavaVirtualMachine; import datadog.libs.ddprof.DdprofLibraryLoader; import datadog.trace.api.config.ProfilingConfig; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.RecordingData; import datadog.trace.bootstrap.config.provider.ConfigProvider; import datadog.trace.bootstrap.instrumentation.api.TaskWrapper; @@ -115,7 +116,7 @@ private DatadogProfiler(ConfigProvider configProvider) { this(configProvider, getContextAttributes(configProvider)); } - // visible for testing + @VisibleForTesting DatadogProfiler(ConfigProvider configProvider, Set contextAttributes) { this.configProvider = configProvider; this.profiler = DdprofLibraryLoader.javaProfiler().getComponent(); diff --git a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfilerRecording.java b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfilerRecording.java index 8154a8a444e..82bf3d5b868 100644 --- a/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfilerRecording.java +++ b/dd-java-agent/agent-profiling/profiling-ddprof/src/main/java/com/datadog/profiling/ddprof/DatadogProfilerRecording.java @@ -1,6 +1,7 @@ package com.datadog.profiling.ddprof; import com.datadog.profiling.controller.OngoingRecording; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilingSnapshot; import datadog.trace.api.profiling.RecordingData; import java.io.IOException; @@ -37,7 +38,7 @@ public RecordingData stop() { recordingFile, started, Instant.now(), ProfilingSnapshot.Kind.ON_SHUTDOWN); } - // @VisibleForTesting + @VisibleForTesting final RecordingData snapshot(@Nonnull Instant start) { return snapshot(start, ProfilingSnapshot.Kind.PERIODIC); } @@ -62,7 +63,7 @@ public void close() { } } - // used for tests only + @VisibleForTesting Path getRecordingFile() { return recordingFile; } diff --git a/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java b/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java index 80afc760feb..86ea3f6e3e0 100644 --- a/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java +++ b/dd-java-agent/agent-profiling/profiling-uploader/src/main/java/com/datadog/profiling/uploader/ProfileUploader.java @@ -31,6 +31,7 @@ import datadog.trace.api.ProcessTags; import datadog.trace.api.git.GitInfo; import datadog.trace.api.git.GitInfoProvider; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.RecordingData; import datadog.trace.api.profiling.RecordingType; import datadog.trace.bootstrap.config.provider.ConfigProvider; @@ -134,10 +135,7 @@ public ProfileUploader(final Config config, final ConfigProvider configProvider) this(config, configProvider, new IOLogger(log), TERMINATION_TIMEOUT_SEC); } - /** - * Note that this method is only visible for testing and should not be used from outside this - * class. - */ + @VisibleForTesting ProfileUploader( final Config config, final ConfigProvider configProvider, @@ -443,10 +441,7 @@ private List tagsToList(final Map tags) { .collect(Collectors.toList()); } - /** - * Note that this method is only visible for testing and should not be used from outside this - * class. - */ + @VisibleForTesting OkHttpClient getClient() { return client; } diff --git a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/CompositeController.java b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/CompositeController.java index 5cec160a754..236bf52b1ae 100644 --- a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/CompositeController.java +++ b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/CompositeController.java @@ -17,6 +17,7 @@ import datadog.trace.api.Config; import datadog.trace.api.Platform; import datadog.trace.api.config.ProfilingConfig; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilerFlareLogger; import datadog.trace.api.profiling.ProfilingSnapshot; import datadog.trace.api.profiling.RecordingData; @@ -50,7 +51,7 @@ public CompositeController(List controllers) { this.controllers = controllers; } - // visible for testing + @VisibleForTesting public List getControllers() { return Collections.unmodifiableList(controllers); } diff --git a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ScrubRecordingDataListener.java b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ScrubRecordingDataListener.java index 6e195a0b7a7..211d0e23c64 100644 --- a/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ScrubRecordingDataListener.java +++ b/dd-java-agent/agent-profiling/src/main/java/com/datadog/profiling/agent/ScrubRecordingDataListener.java @@ -4,6 +4,7 @@ import com.datadog.profiling.scrubber.DefaultScrubDefinition; import com.datadog.profiling.scrubber.JfrScrubber; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.RecordingData; import datadog.trace.api.profiling.RecordingDataListener; import datadog.trace.api.profiling.RecordingInputStream; @@ -45,7 +46,7 @@ static RecordingDataListener wrap( this(delegate, scrubber, failOpen, null); } - // visible for testing + @VisibleForTesting ScrubRecordingDataListener( RecordingDataListener delegate, JfrScrubber scrubber, boolean failOpen, Path tempDir) { this.delegate = delegate; diff --git a/dd-java-agent/build.gradle b/dd-java-agent/build.gradle index 58ad7540df2..a2001cb4ee8 100644 --- a/dd-java-agent/build.gradle +++ b/dd-java-agent/build.gradle @@ -60,6 +60,7 @@ tasks.named("compileJava") { dependencies { implementation sourceSets.main_java11.output main_java11CompileOnly libs.forbiddenapis + main_java6CompileOnly project(':components:annotations') main_java6CompileOnly libs.forbiddenapis testImplementation sourceSets.main_java6.output } diff --git a/dd-java-agent/src/main/java6/datadog/trace/bootstrap/AgentPreCheck.java b/dd-java-agent/src/main/java6/datadog/trace/bootstrap/AgentPreCheck.java index de49e86ec73..e9ea4fc7b7a 100644 --- a/dd-java-agent/src/main/java6/datadog/trace/bootstrap/AgentPreCheck.java +++ b/dd-java-agent/src/main/java6/datadog/trace/bootstrap/AgentPreCheck.java @@ -1,5 +1,6 @@ package datadog.trace.bootstrap; +import datadog.trace.api.internal.VisibleForTesting; import de.thetaphi.forbiddenapis.SuppressForbidden; import java.io.BufferedReader; import java.io.IOException; @@ -91,9 +92,9 @@ private static boolean compatible() { return compatible(javaVersion, javaHome, System.err); } - // Reachable for testing // System.getenv usage is necessary since class is designed to be Java 6 compatible, while // Environment component is for Java 8+ + @VisibleForTesting @SuppressForbidden static boolean compatible(String javaVersion, String javaHome, PrintStream output) { int majorJavaVersion = parseJavaMajorVersion(javaVersion); @@ -114,7 +115,7 @@ static boolean compatible(String javaVersion, String javaHome, PrintStream outpu return false; } - // Reachable for testing + @VisibleForTesting static int parseJavaMajorVersion(String javaVersion) { int major = 0; if (javaVersion == null || javaVersion.isEmpty()) { diff --git a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy index 5a640da4ae8..0e35061d8ee 100644 --- a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy +++ b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy @@ -3,6 +3,7 @@ package datadog.smoketest import com.squareup.moshi.Moshi import com.squareup.moshi.Types import datadog.environment.JavaVirtualMachine +import datadog.environment.OperatingSystem import datadog.trace.agent.test.server.http.TestHttpServer.HandlerApi.RequestApi import datadog.trace.api.config.GeneralConfig import datadog.trace.test.util.Flaky @@ -378,15 +379,213 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { // The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state def alive = testedProcess?.isAlive() def lastLines = tailProcessLog(30) + def threadDump = alive ? dumpThreadStacks() : "(process not alive, skipping thread dump)" throw new AssertionError( "Timed out waiting for ${count} traces after ${defaultPoll.timeout}s. " + "traceCount=${traceCount.get()}, process.alive=${alive}, " + "RC polls received: ${rcClientMessages.size()}.\n" + - "Last process output:\n${lastLines}", e) + "Last process output:\n${lastLines}\n" + + "Thread dump:\n${threadDump}", e) } traceCount.get() } + /** + * Capture a thread dump of the forked process via {@code jstack}. jstack's output is captured by + * the smoketest JVM directly, bypassing the tested-process output-capture thread that has been + * observed to be starved at timeout (which makes the SIGQUIT-via-stderr approach unreliable for + * exactly the failures we want to diagnose). + * + *

No raw {@code kill -3} fallback: PID reuse on shared CI hosts could cause us to signal an + * unrelated process if the child has exited since the surrounding liveness check. + */ + private String dumpThreadStacks() { + try { + if (testedProcess == null) { + return "(no tested process)" + } + if (OperatingSystem.isWindows()) { + return "(thread dump not supported on Windows)" + } + long pid = getTestedProcessPid() + if (pid <= 0) { + return "(could not determine pid)" + } + // Re-check liveness immediately before invoking jstack. The earlier check that gates this + // method runs ~1 statement away, but if the child has exited and been reaped since then, + // the OS may have reused the PID — jstack-ing the wrong process would attach misleading + // diagnostics to the test failure. + if (!testedProcess.isAlive()) { + return "(process exited between liveness check and dump; skipping to avoid PID reuse)" + } + String jstackOut = runJstack(pid) + if (jstackOut == null) { + return "(jstack not available or failed)" + } + return filterThreadDump(jstackOut) + } catch (Throwable t) { + // Never let a diagnostic failure mask the original AssertionError. + return "(thread dump failed: ${t.getClass().simpleName}: ${t.message})" + } + } + + // Approximate budget for the inline dump in error.message. Datadog CI Visibility caps + // error.message at ~5000 chars; this leaves a few hundred for the "Timed out waiting..." + // prefix and the elision marker. + private static final int INLINE_DUMP_CAP = 4700 + + /** + * Reduce a jstack thread dump to the threads most likely to explain a hang: the main thread, + * dd-trace agent threads (dd-*, datadog-*), OkHttp threads, and anything BLOCKED. Drops known + * JVM boilerplate (compiler/GC/reference handler/etc). Truncates to {@link #INLINE_DUMP_CAP} + * with an elision marker. + */ + private String filterThreadDump(String fullDump) { + int firstBlockIdx = fullDump.indexOf('\n"') + if (firstBlockIdx < 0) { + // No recognizable thread blocks — return the original, truncated if needed + return fullDump.length() > INLINE_DUMP_CAP + ? fullDump.substring(0, INLINE_DUMP_CAP) + "\n(truncated)" + : fullDump + } + String header = fullDump.substring(0, firstBlockIdx + 1) + String rest = fullDump.substring(firstBlockIdx + 1) + + List blocks = [] + int i = 0 + while (i < rest.length()) { + int next = rest.indexOf('\n"', i) + if (next < 0) { + blocks.add(rest.substring(i)) + break + } + blocks.add(rest.substring(i, next + 1)) + i = next + 1 + } + + List highPriority = [] + List lowPriority = [] + int boilerplateDropped = 0 + for (String block : blocks) { + def m = block =~ /^"([^"]+)"/ + String name = m.find() ? m.group(1) : '' + if (isBoilerplateThread(name)) { + boilerplateDropped++ + } else if (isHighPriorityThread(name, block)) { + highPriority.add(block) + } else { + lowPriority.add(block) + } + } + + StringBuilder out = new StringBuilder(header) + int elided = 0 + for (String block : highPriority + lowPriority) { + if (out.length() + block.length() + 120 > INLINE_DUMP_CAP) { + elided++ + continue + } + out.append(block) + } + if (boilerplateDropped > 0 || elided > 0) { + out.append("\n(elided ${boilerplateDropped} JVM-boilerplate thread(s)") + if (elided > 0) { + out.append(", elided ${elided} other thread(s) for size") + } + out.append(")") + } + return out.toString() + } + + private static boolean isBoilerplateThread(String name) { + if (name in [ + "Reference Handler", "Finalizer", "Signal Dispatcher", "Common-Cleaner", + "Service Thread", "Monitor Deflation Thread", "Notification Thread", + "Attach Listener", "process reaper", "Sweeper thread", "VM Thread", "VM Periodic Task Thread" + ]) { + return true + } + return name.startsWith("C1 CompilerThread") || + name.startsWith("C2 CompilerThread") || + name.startsWith("GC Thread") || + name.startsWith("G1 ") || + name.startsWith("ParGC ") || + name.startsWith("CMS ") + } + + private static boolean isHighPriorityThread(String name, String block) { + if (name == "main") { + return true + } + if (name.startsWith("dd-") || name.startsWith("datadog-")) { + return true + } + if (name.startsWith("OkHttp") || name.contains("okhttp")) { + return true + } + return block.contains("java.lang.Thread.State: BLOCKED") + } + + private long getTestedProcessPid() { + try { + return (long) testedProcess.getClass().getMethod("pid").invoke(testedProcess) + } catch (Throwable ignored) { + try { + // UNIXProcess's private 'pid' field, for JDK 8 compatibility + def field = testedProcess.getClass().getDeclaredField("pid") + field.setAccessible(true) + return field.getInt(testedProcess) as long + } catch (Throwable ignored2) { + return -1L + } + } + } + + private String runJstack(long pid) { + def candidates = [] + // java.home is always set by the JVM and points to the active JDK/JRE; prefer it over the + // JAVA_HOME env var which is frequently absent in CI runners even when Java is present. + String javaHome = System.getProperty("java.home") + if (javaHome) { + candidates.add(javaHome + "/bin/jstack") + } + String javaHomeEnv = System.getenv("JAVA_HOME") + if (javaHomeEnv && javaHomeEnv != javaHome) { + candidates.add(javaHomeEnv + "/bin/jstack") + } + candidates.add("jstack") + for (String cmd : candidates) { + File tmp = null + try { + tmp = File.createTempFile("jstack", ".txt") + // Redirect output to a file to avoid pipe-buffer deadlock — a full thread dump can + // exceed the OS pipe buffer (typically 64 KB) before waitFor returns. + Process p = new ProcessBuilder(cmd, String.valueOf(pid)) + .redirectErrorStream(true) + .redirectOutput(tmp) + .start() + if (!p.waitFor(5, SECONDS)) { + p.destroyForcibly() + p.waitFor(2, SECONDS) + continue + } + if (p.exitValue() == 0) { + String output = tmp.getText("UTF-8") + if (output) { + return output + } + } + } catch (Throwable ignored) { + // try next candidate + } finally { + if (tmp != null) { + tmp.delete() + } + } + } + return null + } + private String tailProcessLog(int lines) { try { def logFile = new File(logFilePath) diff --git a/dd-trace-core/src/jmh/java/datadog/trace/util/concurrent/RingVsQueueBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/util/concurrent/RingVsQueueBenchmark.java new file mode 100644 index 00000000000..6a1e3d7c802 --- /dev/null +++ b/dd-trace-core/src/jmh/java/datadog/trace/util/concurrent/RingVsQueueBenchmark.java @@ -0,0 +1,191 @@ +package datadog.trace.util.concurrent; + +import static java.util.concurrent.TimeUnit.SECONDS; + +import java.util.function.BiConsumer; +import org.jctools.queues.MpscArrayQueue; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Group; +import org.openjdk.jmh.annotations.GroupThreads; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Head-to-head comparison of {@link MpscRingBuffer} (mutable pre-allocated slots) against the + * conventional approach of a jctools {@link MpscArrayQueue} with a fresh {@code Slot} allocated per + * publish. The latter is the pattern the current CSS code uses for {@code SpanSnapshot} on the + * producer side, so the delta between the two measures the actual allocation/handoff saving of the + * ring-buffer rewrite. + * + *

    + *
  • {@code write_*_8p} — 8 producers, background drainer keeps the structure empty so the + * measurement reflects publish cost, not full-structure drop cost. Pair-compare ring vs queue + * at matched capacity. + *
  • {@code e2e_*_8p} — JMH {@code @Group} pairing 8 producers with 1 consumer for each + * structure. End-to-end ops/s under realistic backpressure. + *
+ * + *

Run with {@code -prof gc} to also see per-op allocation rate — that's where the ring's win is + * loudest, since the queue allocates one {@code Slot} per publish and the ring allocates none. + */ +@State(Scope.Benchmark) +@Warmup(iterations = 2, time = 15, timeUnit = SECONDS) +@Measurement(iterations = 5, time = 15, timeUnit = SECONDS) +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(SECONDS) +@Fork(value = 1) +public class RingVsQueueBenchmark { + + /** + * Shared slot type. {@code MpscRingBuffer} pre-allocates these and the producer mutates in place; + * the queue path allocates a fresh one per publish and offers the reference. Two constructors so + * both styles read naturally. + */ + public static final class Slot { + long value; + + Slot() {} + + Slot(final long value) { + this.value = value; + } + } + + // Static (non-capturing) handlers. Passing ts/bh as context lets the JIT keep these as + // singleton functions and avoid per-call lambda allocation. + private static final BiConsumer RING_FILLER = + (ts, slot) -> { + slot.value = ts.counter; + ts.counter++; + }; + + private static final BiConsumer RING_CONSUMER = + (bh, slot) -> bh.consume(slot.value); + + @Param({"1024"}) + public int capacity; + + /** Write-side benchmark structures. Drained by a background thread so they never fill. */ + MpscRingBuffer ring; + + MpscArrayQueue queue; + + /** E2e benchmark structures. JMH drives both sides via {@code @Group}; no background drainer. */ + MpscRingBuffer e2eRing; + + MpscArrayQueue e2eQueue; + + private volatile boolean stopDrainers; + private Thread ringDrainer; + private Thread queueDrainer; + + @Setup(Level.Trial) + public void setup() { + ring = new MpscRingBuffer<>(Slot::new, capacity); + queue = new MpscArrayQueue<>(capacity); + e2eRing = new MpscRingBuffer<>(Slot::new, capacity); + e2eQueue = new MpscArrayQueue<>(capacity); + + stopDrainers = false; + ringDrainer = + new Thread( + () -> { + while (!stopDrainers) { + if (ring.drain((Slot s) -> {}) == 0) Thread.yield(); + } + }, + "RingVsQueueBenchmark-ringDrainer"); + ringDrainer.setDaemon(true); + ringDrainer.start(); + + queueDrainer = + new Thread( + () -> { + while (!stopDrainers) { + Slot s = queue.poll(); + if (s == null) Thread.yield(); + } + }, + "RingVsQueueBenchmark-queueDrainer"); + queueDrainer.setDaemon(true); + queueDrainer.start(); + } + + @TearDown(Level.Trial) + public void teardown() throws InterruptedException { + stopDrainers = true; + ringDrainer.join(5_000); + queueDrainer.join(5_000); + } + + @State(Scope.Thread) + public static class ThreadState { + long counter; + } + + // ============ Write-side throughput ============ + + @Threads(8) + @Benchmark + public boolean write_ring_8p(final ThreadState ts) { + return ring.tryWrite(ts, RING_FILLER); + } + + /** Mirrors the SpanSnapshot pattern: allocate a fresh instance per publish, offer it. */ + @Threads(8) + @Benchmark + public boolean write_queue_8p(final ThreadState ts) { + return queue.offer(new Slot(ts.counter++)); + } + + // ============ End-to-end producer/consumer ============ + + @Group("e2e_ring_8p") + @GroupThreads(8) + @Benchmark + public boolean e2e_ring_producer(final ThreadState ts) { + return e2eRing.tryWrite(ts, RING_FILLER); + } + + @Group("e2e_ring_8p") + @GroupThreads(1) + @Benchmark + public int e2e_ring_consumer(final Blackhole bh) { + int drained = e2eRing.drain(bh, RING_CONSUMER); + if (drained == 0) Thread.yield(); + return drained; + } + + @Group("e2e_queue_8p") + @GroupThreads(8) + @Benchmark + public boolean e2e_queue_producer(final ThreadState ts) { + return e2eQueue.offer(new Slot(ts.counter++)); + } + + @Group("e2e_queue_8p") + @GroupThreads(1) + @Benchmark + public int e2e_queue_consumer(final Blackhole bh) { + int drained = 0; + Slot slot; + while ((slot = e2eQueue.poll()) != null) { + bh.consume(slot.value); + drained++; + } + if (drained == 0) Thread.yield(); + return drained; + } +} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index 8d6dc6b72d0..a518090f93b 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -8,7 +8,6 @@ import datadog.trace.api.cache.DDCaches; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; import datadog.trace.util.Hashtable; -import datadog.trace.util.LongHashingUtils; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -62,23 +61,19 @@ final class AggregateEntry extends Hashtable.Entry { static final long ERROR_TAG = 0x8000000000000000L; static final long TOP_LEVEL_TAG = 0x4000000000000000L; - // UTF8 caches consolidated from the previous MetricKey + ConflatingMetricsAggregator split. - private static final DDCache RESOURCE_CACHE = - DDCaches.newFixedSizeCache(32); - private static final DDCache SERVICE_CACHE = - DDCaches.newFixedSizeCache(32); - private static final DDCache OPERATION_CACHE = - DDCaches.newFixedSizeCache(64); - private static final DDCache SERVICE_SOURCE_CACHE = - DDCaches.newFixedSizeCache(16); - private static final DDCache TYPE_CACHE = DDCaches.newFixedSizeCache(8); - private static final DDCache SPAN_KIND_CACHE = + // UTF8 caches. Package-private so ConflatingMetricsAggregator's fillSlot can canonicalize + // producer-side via the same caches the entry constructor used to consult. + static final DDCache RESOURCE_CACHE = DDCaches.newFixedSizeCache(32); + static final DDCache SERVICE_CACHE = DDCaches.newFixedSizeCache(32); + static final DDCache OPERATION_CACHE = DDCaches.newFixedSizeCache(64); + static final DDCache SERVICE_SOURCE_CACHE = DDCaches.newFixedSizeCache(16); - private static final DDCache HTTP_METHOD_CACHE = - DDCaches.newFixedSizeCache(8); - private static final DDCache HTTP_ENDPOINT_CACHE = + static final DDCache TYPE_CACHE = DDCaches.newFixedSizeCache(8); + static final DDCache SPAN_KIND_CACHE = DDCaches.newFixedSizeCache(16); + static final DDCache HTTP_METHOD_CACHE = DDCaches.newFixedSizeCache(8); + static final DDCache HTTP_ENDPOINT_CACHE = DDCaches.newFixedSizeCache(32); - private static final DDCache GRPC_STATUS_CODE_CACHE = + static final DDCache GRPC_STATUS_CODE_CACHE = DDCaches.newFixedSizeCache(32); /** @@ -141,23 +136,30 @@ final class AggregateEntry extends Hashtable.Entry { private int topLevelCount; private long duration; - /** Hot-path constructor for the producer/consumer flow. Builds UTF8 fields via the caches. */ - AggregateEntry(SpanSnapshot s, long keyHash) { - super(keyHash); - this.resource = canonicalize(RESOURCE_CACHE, s.resourceName); - this.service = canonicalize(SERVICE_CACHE, s.serviceName); - this.operationName = canonicalize(OPERATION_CACHE, s.operationName); - this.serviceSource = canonicalizeOptional(SERVICE_SOURCE_CACHE, s.serviceNameSource); - this.type = canonicalize(TYPE_CACHE, s.spanType); - this.spanKind = canonicalize(SPAN_KIND_CACHE, s.spanKind); - this.httpMethod = canonicalizeOptional(HTTP_METHOD_CACHE, s.httpMethod); - this.httpEndpoint = canonicalizeOptional(HTTP_ENDPOINT_CACHE, s.httpEndpoint); - this.grpcStatusCode = canonicalizeOptional(GRPC_STATUS_CODE_CACHE, s.grpcStatusCode); + /** Hot-path constructor for the producer/consumer flow. Slot fields are already canonical. */ + AggregateEntry(SpanSnapshot s) { + super(s.keyHash); + // The slot's string fields are already canonical UTF8BytesString instances (canonicalized by + // the producer in fillSlot via the same caches we used to consult here). Just copy refs; + // matches() can do identity comparison instead of contentEquals. + this.resource = s.resourceName; + this.service = s.serviceName; + this.operationName = s.operationName; + this.serviceSource = s.serviceNameSource; + this.type = s.spanType; + this.spanKind = s.spanKind; + this.httpMethod = s.httpMethod; + this.httpEndpoint = s.httpEndpoint; + this.grpcStatusCode = s.grpcStatusCode; this.httpStatusCode = s.httpStatusCode; this.synthetic = s.synthetic; this.traceRoot = s.traceRoot; this.peerTagNames = s.peerTagSchema == null ? null : s.peerTagSchema.names; - this.peerTagValues = s.peerTagValues; + // The slot's peerTagValues is a reusable scratch buffer owned by the ring -- the producer + // overwrites it on every claim of this slot. Snapshot it here by value so this entry's + // identity doesn't drift when the slot is reclaimed. + this.peerTagValues = + s.peerTagSchema == null ? null : Arrays.copyOf(s.peerTagValues, s.peerTagValues.length); this.peerTags = materializePeerTags(this.peerTagNames, this.peerTagValues); } @@ -241,21 +243,29 @@ void clear() { } boolean matches(SpanSnapshot s) { - String[] snapshotNames = s.peerTagSchema == null ? null : s.peerTagSchema.names; + // All string fields are canonical UTF8BytesString references from the per-field DDCaches. + // Same content => same canonical instance => identity comparison suffices. + // peerTagSchema is also shared by reference (producer reads cachedPeerTagSchema), so its + // names array reference is stable too. peerTagValues stays content-compared since values + // vary per publish and aren't canonicalized. return httpStatusCode == s.httpStatusCode && synthetic == s.synthetic && traceRoot == s.traceRoot - && contentEquals(resource, s.resourceName) - && contentEquals(service, s.serviceName) - && contentEquals(operationName, s.operationName) - && contentEquals(serviceSource, s.serviceNameSource) - && contentEquals(type, s.spanType) - && contentEquals(spanKind, s.spanKind) - && Arrays.equals(peerTagNames, snapshotNames) - && Arrays.equals(peerTagValues, s.peerTagValues) - && contentEquals(httpMethod, s.httpMethod) - && contentEquals(httpEndpoint, s.httpEndpoint) - && contentEquals(grpcStatusCode, s.grpcStatusCode); + && resource == s.resourceName + && service == s.serviceName + && operationName == s.operationName + && serviceSource == s.serviceNameSource + && type == s.spanType + && spanKind == s.spanKind + // PeerTagSchema instances aren't guaranteed to be shared by reference across all + // producers (cachedPeerTagSchema can be swapped during reconcile; tests build fresh + // instances), so compare names by content. The same-reference case short-circuits to + // O(1) via Arrays.equals's a==b fast path. + && Arrays.equals(peerTagNames, s.peerTagSchema == null ? null : s.peerTagSchema.names) + && (peerTagNames == null || Arrays.equals(peerTagValues, s.peerTagValues)) + && httpMethod == s.httpMethod + && httpEndpoint == s.httpEndpoint + && grpcStatusCode == s.grpcStatusCode; } /** @@ -267,41 +277,6 @@ boolean matches(long keyHash, SpanSnapshot s) { return this.keyHash == keyHash && matches(s); } - /** - * Computes the 64-bit lookup hash for a {@link SpanSnapshot}. Chained per-field calls -- no - * varargs / Object[] allocation, no autoboxing on primitive overloads. The constructor's - * super({@code hashOf(s)}) call uses the same function so an entry built from a snapshot hashes - * to the same bucket the snapshot itself looks up. - * - *

Hashes are content-stable across {@code String} / {@code UTF8BytesString}: {@link - * UTF8BytesString#hashCode()} returns the underlying {@code String}'s hash. - */ - static long hashOf(SpanSnapshot s) { - long h = 0; - h = LongHashingUtils.addToHash(h, s.resourceName); - h = LongHashingUtils.addToHash(h, s.serviceName); - h = LongHashingUtils.addToHash(h, s.operationName); - h = LongHashingUtils.addToHash(h, s.serviceNameSource); - h = LongHashingUtils.addToHash(h, s.spanType); - h = LongHashingUtils.addToHash(h, s.httpStatusCode); - h = LongHashingUtils.addToHash(h, s.synthetic); - h = LongHashingUtils.addToHash(h, s.traceRoot); - h = LongHashingUtils.addToHash(h, s.spanKind); - // Always mix in both the schema's content hash and the values' content hash, unconditionally - // (no null-skip). Arrays.hashCode is content-based for both String[]s; the default - // Object[].hashCode is identity-based, which would let two snapshots with content-equal but - // distinct PeerTagSchema instances hash to different buckets. Null inputs hash to 0 here, - // distinct from {@code Arrays.hashCode(empty)} = 1 or any non-empty array. - h = - LongHashingUtils.addToHash( - h, s.peerTagSchema == null ? 0 : Arrays.hashCode(s.peerTagSchema.names)); - h = LongHashingUtils.addToHash(h, Arrays.hashCode(s.peerTagValues)); - h = LongHashingUtils.addToHash(h, s.httpMethod); - h = LongHashingUtils.addToHash(h, s.httpEndpoint); - h = LongHashingUtils.addToHash(h, s.grpcStatusCode); - return h; - } - // Accessors for SerializingMetricWriter. UTF8BytesString getResource() { return resource; @@ -366,9 +341,12 @@ List getPeerTags() { // ----- helpers ----- - private static UTF8BytesString canonicalize( + static UTF8BytesString canonicalize( DDCache cache, CharSequence charSeq) { - if (charSeq == null) { + // Treat null and length-zero as the same canonical EMPTY instance so the producer-side + // canonicalization produces identity-equal results for null/"" inputs. matches() now uses + // identity comparison, so the previous content-equals collapse needs to happen here. + if (charSeq == null || charSeq.length() == 0) { return EMPTY; } if (charSeq instanceof UTF8BytesString) { @@ -388,9 +366,11 @@ private static UTF8BytesString canonicalize( * single helper keeps the constructor consistent. */ @Nullable - private static UTF8BytesString canonicalizeOptional( + static UTF8BytesString canonicalizeOptional( DDCache cache, @Nullable CharSequence charSeq) { - if (charSeq == null) { + // Treat null and length-zero the same: both map to null (i.e. "absent"). matches() uses + // identity comparison, so the prior content-equals collapse needs to happen here. + if (charSeq == null || charSeq.length() == 0) { return null; } if (charSeq instanceof UTF8BytesString) { diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java index abadc7e5f17..b23e492ce82 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java @@ -55,7 +55,9 @@ boolean isEmpty() { * caller should drop the data point in that case. */ AggregateEntry findOrInsert(SpanSnapshot snapshot) { - long keyHash = AggregateEntry.hashOf(snapshot); + // keyHash is precomputed by the producer in SpanSnapshot.computeAndSetKeyHash, no + // re-hashing on the aggregator thread. + final long keyHash = snapshot.keyHash; for (AggregateEntry candidate = Support.bucket(buckets, keyHash); candidate != null; candidate = candidate.next()) { @@ -66,7 +68,7 @@ AggregateEntry findOrInsert(SpanSnapshot snapshot) { if (size >= maxAggregates && !evictOneStale()) { return null; } - AggregateEntry entry = new AggregateEntry(snapshot, keyHash); + AggregateEntry entry = new AggregateEntry(snapshot); Support.insertHeadEntry(buckets, keyHash, entry); size++; return entry; diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index d809d452522..d70a1fbaae1 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -5,8 +5,11 @@ import datadog.trace.common.metrics.SignalItem.ClearSignal; import datadog.trace.common.metrics.SignalItem.StopSignal; import datadog.trace.core.monitor.HealthMetrics; +import datadog.trace.util.concurrent.MpscRingBuffer; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.LockSupport; +import java.util.function.BiConsumer; import org.jctools.queues.MessagePassingQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,7 +20,8 @@ final class Aggregator implements Runnable { private static final Logger log = LoggerFactory.getLogger(Aggregator.class); - private final MessagePassingQueue inbox; + private final MpscRingBuffer dataInbox; + private final MessagePassingQueue signalInbox; private final AggregateTable aggregates; private final MetricWriter writer; private final HealthMetrics healthMetrics; @@ -42,9 +46,36 @@ final class Aggregator implements Runnable { justification = "the field is confined to the agent thread running the Aggregator") private boolean dirty; + @SuppressFBWarnings( + value = "AT_STALE_THREAD_WRITE_OF_PRIMITIVE", + justification = "the field is confined to the agent thread running the Aggregator") + private boolean stopped; + + /** + * True iff the aggregator thread is currently parked (or about to park) waiting for new work. + * Producers volatile-read this after publishing; on true they call {@link #unparkIfWaiting()} to + * wake the aggregator without paying the {@code sleepMillis} latency. + * + *

Volatile so the producer side sees the latest value. Set true before park, false on wake. + * The aggregator re-checks the inbox after setting this true (see {@link #run}) to close the race + * window: if a producer published just before the aggregator set the flag, the aggregator would + * have missed the unpark; the re-check catches that work and skips the park. + */ + private volatile boolean waiting; + + private volatile Thread aggregatorThread; + + /** + * Static-singleton snapshot handler. Reads from the slot via {@code this} (passed as the context) + * and avoids per-drain lambda capture. + */ + private static final BiConsumer SNAPSHOT_HANDLER = + Aggregator::handleSnapshot; + Aggregator( MetricWriter writer, - MessagePassingQueue inbox, + MpscRingBuffer dataInbox, + MessagePassingQueue signalInbox, int maxAggregates, long reportingInterval, TimeUnit reportingIntervalTimeUnit, @@ -52,7 +83,8 @@ final class Aggregator implements Runnable { Runnable onReportCycle) { this( writer, - inbox, + dataInbox, + signalInbox, maxAggregates, reportingInterval, reportingIntervalTimeUnit, @@ -63,7 +95,8 @@ final class Aggregator implements Runnable { Aggregator( MetricWriter writer, - MessagePassingQueue inbox, + MpscRingBuffer dataInbox, + MessagePassingQueue signalInbox, int maxAggregates, long reportingInterval, TimeUnit reportingIntervalTimeUnit, @@ -71,7 +104,8 @@ final class Aggregator implements Runnable { HealthMetrics healthMetrics, Runnable onReportCycle) { this.writer = writer; - this.inbox = inbox; + this.dataInbox = dataInbox; + this.signalInbox = signalInbox; this.aggregates = new AggregateTable(maxAggregates); this.reportingIntervalNanos = reportingIntervalTimeUnit.toNanos(reportingInterval); this.sleepMillis = sleepMillis; @@ -81,17 +115,34 @@ final class Aggregator implements Runnable { @Override public void run() { - Thread currentThread = Thread.currentThread(); - Drainer drainer = new Drainer(); - while (!currentThread.isInterrupted() && !drainer.stopped) { + final Thread currentThread = Thread.currentThread(); + aggregatorThread = currentThread; + final long parkNanos = TimeUnit.MILLISECONDS.toNanos(sleepMillis); + while (!currentThread.isInterrupted() && !stopped) { try { - if (!inbox.isEmpty()) { - inbox.drain(drainer); - } else { - Thread.sleep(sleepMillis); + int drainedData = dataInbox.drain(this, SNAPSHOT_HANDLER); + // Signals are processed after the data drain. REPORT/STOP additionally drain any + // snapshots that arrived between the drain above and the signal's enqueue -- the + // signal-channel offer happens-before the poll here, so any publish that completed + // before report()/stop() is guaranteed visible to the inline drain in handleSignal. + SignalItem signal; + while (!stopped && (signal = signalInbox.poll()) != null) { + handleSignal(signal); + } + if (stopped) break; + if (drainedData == 0 && signalInbox.isEmpty()) { + waiting = true; + try { + // Re-check after publishing the waiting flag. A producer that published just before + // we set waiting=true won't have unparked us (they read waiting=false); the re-check + // catches that work via the volatile producerCursor. + if (dataInbox.isEmpty() && signalInbox.isEmpty()) { + LockSupport.parkNanos(this, parkNanos); + } + } finally { + waiting = false; + } } - } catch (InterruptedException e) { - currentThread.interrupt(); } catch (Throwable error) { log.debug("error aggregating metrics", error); } @@ -99,57 +150,68 @@ public void run() { log.debug("metrics aggregator exited"); } - private final class Drainer implements MessagePassingQueue.Consumer { - - boolean stopped = false; - - @Override - public void accept(InboxItem item) { - if (item == ClearSignal.CLEAR) { - // ClearSignal is routed through the inbox (rather than letting the caller mutate - // AggregateTable directly) so the aggregator thread stays the sole writer. AggregateTable - // is not thread-safe; a direct clear() from e.g. the OkHttpSink callback thread would - // race with Drainer.accept on this thread. - // - // We deliberately do NOT call inbox.clear() here. Doing so would erase any queued STOP - // (or REPORT) signals that happen to sit behind CLEAR -- a real concern when a - // downgrade is followed quickly by close(), where the trampled STOP leaves the - // aggregator thread spinning until thread.join times out. features.supportsMetrics() is - // already false by the time CLEAR was offered, so producers have stopped publishing; - // any in-flight snapshots will drain naturally into the just-cleared table, get - // re-aggregated, and flushed on the next report -- where the agent rejects them again, - // triggering another DOWNGRADED -> disable() -> CLEAR cycle. Worst case: one extra - // reporting cycle of wasted work, which we accept for the safety of preserving STOP. - if (!stopped) { - aggregates.clear(); - // Clear dirty too -- without this, the next report() would see dirty=true, run - // expungeStaleAggregates against the (now-empty) table, find isEmpty()=true, and skip - // the flush anyway. Same observable outcome, but resetting here keeps the invariant - // "dirty implies there's data to flush" honest. - dirty = false; - } - ((SignalItem) item).complete(); - } else if (item instanceof SignalItem) { - SignalItem signal = (SignalItem) item; - if (!stopped) { - report(wallClockTime(), signal); - stopped = item instanceof StopSignal; - if (stopped) { - signal.complete(); - } - } else { - signal.ignore(); - } - } else if (item instanceof SpanSnapshot && !stopped) { - SpanSnapshot snapshot = (SpanSnapshot) item; - AggregateEntry entry = aggregates.findOrInsert(snapshot); - if (entry != null) { - entry.recordOneDuration(snapshot.tagAndDuration); - dirty = true; - } else { - // table at cap with no stale entry available to evict - healthMetrics.onStatsAggregateDropped(); - } + /** + * Wake the aggregator if it's currently parked. Called from producer threads after publishing. + * + *

Cost on the producer hot path is one volatile read; only on a {@code true} read does the + * producer pay an {@code unpark} (cheap idempotent atomic). At saturating workloads where the + * aggregator is never parked, the volatile read is the only cost. + */ + void unparkIfWaiting() { + if (waiting) { + final Thread t = aggregatorThread; + if (t != null) { + LockSupport.unpark(t); + } + } + } + + private static void handleSnapshot(Aggregator agg, SpanSnapshot snapshot) { + // Skip-sentinel from the producer's overclaim path: the producer claimed this slot + // speculatively (one slot per span in the trace) but the span turned out to be non-eligible + // or the trace hit an ignored-resource. shouldComputeMetric guarantees duration > 0 for any + // real publish, so tagAndDuration == 0 uniquely identifies a skip. + if (snapshot.tagAndDuration == 0L) { + return; + } + AggregateEntry entry = agg.aggregates.findOrInsert(snapshot); + if (entry != null) { + entry.recordOneDuration(snapshot.tagAndDuration); + agg.dirty = true; + } else { + // table at cap with no stale entry available to evict + agg.healthMetrics.onStatsAggregateDropped(); + } + } + + private void handleSignal(SignalItem signal) { + if (signal == ClearSignal.CLEAR) { + // ClearSignal is routed through the signal channel (rather than letting the caller mutate + // AggregateTable directly) so the aggregator thread stays the sole writer. AggregateTable + // is not thread-safe; a direct clear() from e.g. the OkHttpSink callback thread would race + // with the data-drain on this thread. + // + // We do NOT clear the snapshot data ring here. Any in-flight snapshots will drain + // naturally into the just-cleared table, get re-aggregated, and flushed on the next + // report -- where the agent rejects them again, triggering another DOWNGRADED -> disable() + // -> CLEAR cycle. Worst case: one extra reporting cycle of wasted work. + aggregates.clear(); + // Clear dirty too -- without this, the next report() would see dirty=true, run + // expungeStaleAggregates against the (now-empty) table, find isEmpty()=true, and skip + // the flush anyway. Same observable outcome, but resetting here keeps the invariant + // "dirty implies there's data to flush" honest. + dirty = false; + signal.complete(); + } else { + // STOP or REPORT: catch up on any data that arrived between the loop's main drain and the + // signal being enqueued. Producer's signalInbox.offer happens-after its dataInbox publish, + // so by the time we observe the signal, every snapshot the producer had published is + // visible to drain() here. This is what gives report() bucket-boundary determinism. + dataInbox.drain(this, SNAPSHOT_HANDLER); + report(wallClockTime(), signal); + if (signal instanceof StopSignal) { + stopped = true; + signal.complete(); } } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 895ee434854..dc6ea86bac7 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -28,6 +28,7 @@ import datadog.trace.core.SpanKindFilter; import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.util.AgentTaskScheduler; +import datadog.trace.util.concurrent.MpscRingBuffer; import java.util.Collections; import java.util.List; import java.util.Map; @@ -63,9 +64,27 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve private static final SpanKindFilter INTERNAL_KIND = SpanKindFilter.builder().includeInternal().build(); + /** Capacity of the small MPSC queue carrying control signals (REPORT / STOP / CLEAR). */ + private static final int SIGNAL_INBOX_CAPACITY = 16; + private final Set ignoredResources; private final Thread thread; - private final MessagePassingQueue inbox; + + /** + * Producer/consumer data channel: a {@link MpscRingBuffer} of pre-allocated, recyclable {@link + * SpanSnapshot} slots. {@link #publish(List)} claims a range of slots in one CAS via {@link + * MpscRingBuffer#tryClaimRange} and fills each in place via {@link #fillSlot} -- no per-publish + * allocation. + */ + private final MpscRingBuffer dataInbox; + + /** + * Separate small queue for control signals. Kept off the data ring because the ring only holds + * one element type (the slot), and routing signals through a side channel lets the aggregator + * service them ahead of the data backlog each loop iteration. + */ + private final MessagePassingQueue signalInbox; + private final Sink sink; private final Aggregator aggregator; private final long reportingInterval; @@ -173,14 +192,16 @@ public ConflatingMetricsAggregator( boolean includeEndpointInMetrics) { this.ignoredResources = ignoredResources; this.includeEndpointInMetrics = includeEndpointInMetrics; - this.inbox = Queues.mpscArrayQueue(queueSize); + this.dataInbox = new MpscRingBuffer<>(SpanSnapshot::new, queueSize); + this.signalInbox = Queues.mpscArrayQueue(SIGNAL_INBOX_CAPACITY); this.features = features; this.healthMetrics = healthMetric; this.sink = sink; this.aggregator = new Aggregator( metricWriter, - inbox, + dataInbox, + signalInbox, maxAggregates, reportingInterval, timeUnit, @@ -218,11 +239,11 @@ public boolean report() { boolean published; int attempts = 0; do { - published = inbox.offer(REPORT); + published = signalInbox.offer(REPORT); ++attempts; } while (!published && attempts < 10); if (!published) { - log.debug("Skipped metrics reporting because the queue is full"); + log.debug("Skipped metrics reporting because the signal queue is full"); } return published; } @@ -245,7 +266,7 @@ public Future forceReport() { ReportSignal reportSignal = new ReportSignal(); boolean published = false; while (thread.isAlive() && !published) { - published = inbox.offer(reportSignal); + published = signalInbox.offer(reportSignal); if (!published) { try { Thread.sleep(10); @@ -264,27 +285,81 @@ public Future forceReport() { @Override public boolean publish(List> trace) { + if (!features.supportsMetrics()) { + return false; + } + final int traceSize = trace.size(); + if (traceSize == 0) { + return false; + } + + // Overclaim the worst case: assume every span in the trace is metrics-eligible and claim + // that many ring slots in a single CAS. Non-eligible spans get a "skip" sentinel + // (tagAndDuration = 0L); the aggregator recognises this and bypasses findOrInsert for those + // slots. We expect most spans in a typical trace to be eligible (top-level / measured / + // server-client-producer-consumer kinds), so the slot waste is small in exchange for a + // single-pass producer loop. Indexed iteration (vs. enhanced-for) avoids per-call Iterator + // allocation -- trace is typically a SpanList, but even Collections.singletonList's + // iterator() allocates. + final long startSeq = dataInbox.tryClaimRange(traceSize); + if (startSeq < 0L) { + healthMetrics.onStatsInboxFull(); + return false; + } + boolean forceKeep = false; - int counted = 0; - if (features.supportsMetrics()) { - for (CoreSpan span : trace) { - boolean isTopLevel = span.isTopLevel(); - if (shouldComputeMetric(span, isTopLevel)) { - final CharSequence resourceName = span.getResourceName(); - if (resourceName != null && ignoredResources.contains(resourceName.toString())) { - // skip publishing all children - forceKeep = false; - break; + int filled = 0; + + for (int i = 0; i < traceSize; i++) { + final CoreSpan span = trace.get(i); + final long seq = startSeq + i; + final SpanSnapshot slot = dataInbox.slotAt(seq); + final boolean isTopLevel = span.isTopLevel(); + + if (shouldComputeMetric(span, isTopLevel)) { + final CharSequence resourceName = span.getResourceName(); + if (resourceName != null && ignoredResources.contains(resourceName.toString())) { + // Ignored resource: drop metrics for the whole trace. Mark this slot AND every + // remaining slot in the claimed range as skip-sentinel so the aggregator advances past + // them. + slot.tagAndDuration = 0L; + dataInbox.publish(seq); + for (int j = i + 1; j < traceSize; j++) { + final long skipSeq = startSeq + j; + dataInbox.slotAt(skipSeq).tagAndDuration = 0L; + dataInbox.publish(skipSeq); } - counted++; - forceKeep |= publish(span, isTopLevel); + healthMetrics.onClientStatTraceComputed(0, traceSize, true); + aggregator.unparkIfWaiting(); + return false; + } + try { + fillSlot(span, isTopLevel, slot); + } finally { + // Publish in finally so a throwing fillSlot can't strand the consumer at this + // sequence -- matches the publish-on-throw guarantee in MpscRingBuffer.tryWrite. + dataInbox.publish(seq); } + forceKeep |= span.getError() > 0; + filled++; + } else { + // Non-eligible span: mark slot as skip-sentinel and publish so the aggregator advances. + slot.tagAndDuration = 0L; + dataInbox.publish(seq); } - healthMetrics.onClientStatTraceComputed(counted, trace.size(), !forceKeep); } + + healthMetrics.onClientStatTraceComputed(filled, traceSize, !forceKeep); + // Wake the aggregator if it parked while we were filling. Producer-side cost is one volatile + // read when the aggregator isn't parked. + aggregator.unparkIfWaiting(); return forceKeep; } + // Other site that posts work the aggregator needs to see: ignored-resource short-circuit + // returns above before reaching unparkIfWaiting. Mirror that wake-up there too so the + // aggregator picks up the skip-sentinel slots quickly. + private boolean shouldComputeMetric(CoreSpan span, boolean isTopLevel) { return (span.isMeasured() || isTopLevel || span.isKind(METRICS_ELIGIBLE_KINDS)) && span.getLongRunningVersion() @@ -292,21 +367,16 @@ private boolean shouldComputeMetric(CoreSpan span, boolean isTopLevel) { && span.getDurationNano() > 0; } - private boolean publish(CoreSpan span, boolean isTopLevel) { - // Error decision drives force-keep sampling regardless of whether the snapshot gets queued. - boolean error = span.getError() > 0; - - // Fast-path the inbox-full case before any tag extraction or snapshot allocation. size() is - // approximate on jctools' MPSC queue but that's fine: if we under-estimate, we fall through - // and let inbox.offer be the source of truth (existing behavior); if we over-estimate, we - // drop a snapshot that would have fit -- acceptable, onStatsInboxFull was going to fire - // imminently anyway. - if (inbox.size() >= inbox.capacity()) { - healthMetrics.onStatsInboxFull(); - return error; - } + /** + * Producer-side slot fill. Called between a {@link MpscRingBuffer#tryClaimRange} and the matching + * {@link MpscRingBuffer#publish}: reads from {@code span} and instance state, writes every field + * on {@code slot} (including {@code null} where applicable) so stale values from a prior occupant + * of this slot don't bleed through. + */ + private void fillSlot(CoreSpan span, boolean isTopLevel, SpanSnapshot slot) { + final boolean error = span.getError() > 0; - // Extract HTTP method and endpoint only if the feature is enabled + // Extract HTTP method and endpoint only if the feature is enabled. String httpMethod = null; String httpEndpoint = null; if (includeEndpointInMetrics) { @@ -329,37 +399,63 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { long tagAndDuration = span.getDurationNano() | (error ? ERROR_TAG : 0L) | (isTopLevel ? TOP_LEVEL_TAG : 0L); + // Fill the slot's reusable peerTagValues scratch in place. The slot owns its array across + // publishes; we only allocate when the slot has none or the schema's length changed. When + // no tags fire, leave the scratch in place and clear peerTagSchema -- consumers gate on + // peerTagSchema (see AggregateEntry.matches / hashOf), so stale scratch contents are inert. PeerTagSchema peerTagSchema = peerTagSchemaFor(span); - String[] peerTagValues = - peerTagSchema == null ? null : capturePeerTagValues(span, peerTagSchema); - if (peerTagValues == null) { - // No tags fired -- drop the schema reference so the consumer doesn't bother iterating an - // all-null array. - peerTagSchema = null; + if (peerTagSchema != null) { + final String[] names = peerTagSchema.names; + final int n = names.length; + String[] valuesArr = slot.peerTagValues; + if (valuesArr == null || valuesArr.length != n) { + valuesArr = new String[n]; + slot.peerTagValues = valuesArr; + } + boolean anyHit = false; + for (int i = 0; i < n; i++) { + Object v = span.unsafeGetTag(names[i]); + String tag = v == null ? null : v.toString(); + valuesArr[i] = tag; + anyHit |= (tag != null); + } + if (!anyHit) { + // Schema fired no tags -- treat like the no-peer-tag case. Keep valuesArr alive on the + // slot for the next publish; nullifying peerTagSchema makes downstream gates skip it. + peerTagSchema = null; + } } - SpanSnapshot snapshot = - new SpanSnapshot( - span.getResourceName(), - span.getServiceName(), - span.getOperationName(), - span.getServiceNameSource(), - spanType, - span.getHttpStatusCode(), - isSynthetic(span), - span.getParentId() == 0, - spanKind, - peerTagSchema, - peerTagValues, - httpMethod, - httpEndpoint, - grpcStatusCode, - tagAndDuration); - if (!inbox.offer(snapshot)) { - healthMetrics.onStatsInboxFull(); - } - // force keep keys if there are errors - return error; + // Canonicalize every string field to UTF8BytesString via the per-field DDCaches on + // AggregateEntry. This work used to happen on the aggregator thread; doing it here + // distributes the cost across producer threads and gives the aggregator's matches() identity + // comparisons instead of contentEquals chains. + slot.resourceName = + AggregateEntry.canonicalize(AggregateEntry.RESOURCE_CACHE, span.getResourceName()); + slot.serviceName = + AggregateEntry.canonicalize(AggregateEntry.SERVICE_CACHE, span.getServiceName()); + slot.operationName = + AggregateEntry.canonicalize(AggregateEntry.OPERATION_CACHE, span.getOperationName()); + slot.serviceNameSource = + AggregateEntry.canonicalizeOptional( + AggregateEntry.SERVICE_SOURCE_CACHE, span.getServiceNameSource()); + slot.spanType = AggregateEntry.canonicalize(AggregateEntry.TYPE_CACHE, spanType); + slot.httpStatusCode = span.getHttpStatusCode(); + slot.synthetic = isSynthetic(span); + slot.traceRoot = span.getParentId() == 0; + slot.spanKind = AggregateEntry.canonicalize(AggregateEntry.SPAN_KIND_CACHE, spanKind); + slot.peerTagSchema = peerTagSchema; + slot.httpMethod = + AggregateEntry.canonicalizeOptional(AggregateEntry.HTTP_METHOD_CACHE, httpMethod); + slot.httpEndpoint = + AggregateEntry.canonicalizeOptional(AggregateEntry.HTTP_ENDPOINT_CACHE, httpEndpoint); + slot.grpcStatusCode = + AggregateEntry.canonicalizeOptional(AggregateEntry.GRPC_STATUS_CODE_CACHE, grpcStatusCode); + slot.tagAndDuration = tagAndDuration; + + // Precompute the table-lookup key hash on the producer side. AggregateTable#findOrInsert + // reads this directly instead of running the chained hashOf on the aggregator thread. + SpanSnapshot.computeAndSetKeyHash(slot); } /** @@ -444,28 +540,6 @@ private void reconcilePeerTagSchema() { } } - /** - * Captures the span's peer-tag values into a {@code String[]} parallel to {@code schema.names}. - * Slots remain {@code null} for tags the span didn't set; the array itself is lazily allocated on - * the first hit so spans that fire no peer tags pay zero allocation. Returns {@code null} when - * none of the configured peer tags are set on the span. - */ - private static String[] capturePeerTagValues(CoreSpan span, PeerTagSchema schema) { - String[] names = schema.names; - int n = names.length; - String[] values = null; - for (int i = 0; i < n; i++) { - Object v = span.unsafeGetTag(names[i]); - if (v != null) { - if (values == null) { - values = new String[n]; - } - values[i] = v.toString(); - } - } - return values; - } - private static boolean isSynthetic(CoreSpan span) { return span.getOrigin() != null && SYNTHETICS_ORIGIN.equals(span.getOrigin().toString()); } @@ -474,7 +548,7 @@ public void stop() { if (null != cancellation) { cancellation.cancel(); } - inbox.offer(STOP); + signalInbox.offer(STOP); } @Override @@ -512,17 +586,18 @@ private void disable() { features.discover(); if (!features.supportsMetrics()) { log.debug("Disabling metric reporting because an agent downgrade was detected"); - // Route the clear through the inbox so the aggregator thread is the only writer. + // Route the clear through the signal channel so the aggregator thread is the only writer. // AggregateTable is not thread-safe; mutating it directly from this thread would race - // with Drainer.accept on the aggregator thread. + // with snapshot processing on the aggregator thread. // - // Best-effort single offer rather than the retry-loop pattern in report(). If the inbox is - // full at downgrade time the clear is dropped, but the system self-heals: features.discover() - // already flipped supportsMetrics() false, so producer publish() calls now skip the inbox; - // the aggregator drains existing snapshots and ships them on the next report cycle; the - // sink rejects that payload and fires DOWNGRADED again, which retries disable() against a - // now-empty inbox. Worst case: one extra reporting cycle of stale data. - inbox.offer(CLEAR); + // Best-effort single offer rather than the retry-loop pattern in report(). If the signal + // queue is full at downgrade time the clear is dropped, but the system self-heals: + // features.discover() already flipped supportsMetrics() false, so producer publish() calls + // now skip the data ring; the aggregator drains existing snapshots and ships them on the + // next report cycle; the sink rejects that payload and fires DOWNGRADED again, which + // retries disable() against a now-empty signal queue. Worst case: one extra reporting + // cycle of stale data. + signalInbox.offer(CLEAR); } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java index 4821d1b33a4..d3a3d47d65a 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java @@ -43,6 +43,14 @@ final class PeerTagSchema { final String[] names; + /** + * Precomputed {@code Arrays.hashCode(names)}. The schema is shared across many publishes so + * recomputing it on the aggregator hot path (per-publish call to {@code AggregateEntry.hashOf}) + * was waste -- it showed up as a top aggregator-thread sample. Cached here, computed once at + * construction. + */ + final int namesHash; + /** * The {@code DDAgentFeaturesDiscovery.state()} hash this schema was built from. The aggregator * thread reads and updates this once per reporting cycle when reconciling against the latest @@ -54,6 +62,7 @@ final class PeerTagSchema { private PeerTagSchema(String[] names, String state) { this.names = names; + this.namesHash = java.util.Arrays.hashCode(names); this.state = state; } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java index 152ac42bb55..f10d60167cc 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java @@ -1,76 +1,118 @@ package datadog.trace.common.metrics; +import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import datadog.trace.util.LongHashingUtils; +import java.util.Arrays; +import javax.annotation.Nullable; + /** - * Immutable per-span value posted from the producer to the aggregator thread. Carries the raw - * inputs the aggregator needs to look up or build an {@link AggregateEntry} and update its - * counters. + * Per-span value posted from the producer to the aggregator thread. Carries the canonical inputs + * the aggregator needs to look up or build an {@link AggregateEntry} and update its counters. + * + *

Fields are mutable so this class can serve as a slot in {@link + * datadog.trace.util.concurrent.MpscRingBuffer}: the producer claims a slot, writes its fields, and + * publishes; the aggregator reads them. There is exactly one writer per outstanding sequence (the + * producer that claimed it) and one reader (the aggregator), so no synchronization on fields is + * required. * - *

All cache-canonicalization (service-name, span-kind, peer-tag string interning) happens on the - * aggregator thread; the producer just shuffles references. + *

Producer-side canonicalization and hashing. String fields are canonicalized to {@link + * UTF8BytesString} on the producer side (see {@link AggregateEntry#canonicalize}); same-content + * strings collapse to the same canonical reference through the per-field {@code DDCache} instances + * on {@link AggregateEntry}. The producer also precomputes {@link #keyHash} via the same chain the + * consumer used to use. The aggregator then does identity comparisons in {@link + * AggregateEntry#matches} and a precomputed-hash table lookup, with all the per-field work + * distributed across producer threads where there's CPU headroom. */ -final class SpanSnapshot implements InboxItem { - - final CharSequence resourceName; - final String serviceName; - final CharSequence operationName; - final CharSequence serviceNameSource; - final CharSequence spanType; - final short httpStatusCode; - final boolean synthetic; - final boolean traceRoot; - final String spanKind; +final class SpanSnapshot { + + /** Canonical UTF-8 form of the span's resource name. {@link UTF8BytesString#EMPTY} when null. */ + UTF8BytesString resourceName; + + /** Canonical UTF-8 form of the span's service. {@link UTF8BytesString#EMPTY} when null. */ + UTF8BytesString serviceName; + + /** Canonical UTF-8 form of the span's operation name. {@link UTF8BytesString#EMPTY} when null. */ + UTF8BytesString operationName; + + /** Canonical UTF-8 form, or {@code null} when the span had no service-name source set. */ + @Nullable UTF8BytesString serviceNameSource; + + /** Canonical UTF-8 form of the span's type. {@link UTF8BytesString#EMPTY} when null. */ + UTF8BytesString spanType; + + short httpStatusCode; + boolean synthetic; + boolean traceRoot; + + /** Canonical UTF-8 form of the span kind. {@link UTF8BytesString#EMPTY} when not set. */ + UTF8BytesString spanKind; /** * Schema for {@link #peerTagValues}. {@code null} when the span has no peer tags. The schema * carries the names in parallel-array form; {@code peerTagValues} holds the per-span tag values * at the same indices. */ - final PeerTagSchema peerTagSchema; + @Nullable PeerTagSchema peerTagSchema; /** * Peer tag values captured from the span, parallel to {@code peerTagSchema.names}. A {@code null} - * entry means the span didn't have that peer tag set. {@code null} (the whole array) when {@link - * #peerTagSchema} is {@code null}. + * entry means the span didn't have that peer tag set. These are per-publish strings (typically + * not shared across calls), so they stay as {@link String} -- canonicalizing them is more + * expensive than the aggregator's content-comparison on a 1-3 element array. + * + *

The slot owns this array as a reusable scratch buffer; see {@code + * ConflatingMetricsAggregator#fillSlot}. */ - final String[] peerTagValues; + @Nullable String[] peerTagValues; - final String httpMethod; - final String httpEndpoint; - final String grpcStatusCode; + @Nullable UTF8BytesString httpMethod; + @Nullable UTF8BytesString httpEndpoint; + @Nullable UTF8BytesString grpcStatusCode; /** Duration in nanoseconds, OR-ed with {@code ERROR_TAG} / {@code TOP_LEVEL_TAG} as needed. */ - final long tagAndDuration; - - SpanSnapshot( - CharSequence resourceName, - String serviceName, - CharSequence operationName, - CharSequence serviceNameSource, - CharSequence spanType, - short httpStatusCode, - boolean synthetic, - boolean traceRoot, - String spanKind, - PeerTagSchema peerTagSchema, - String[] peerTagValues, - String httpMethod, - String httpEndpoint, - String grpcStatusCode, - long tagAndDuration) { - this.resourceName = resourceName; - this.serviceName = serviceName; - this.operationName = operationName; - this.serviceNameSource = serviceNameSource; - this.spanType = spanType; - this.httpStatusCode = httpStatusCode; - this.synthetic = synthetic; - this.traceRoot = traceRoot; - this.spanKind = spanKind; - this.peerTagSchema = peerTagSchema; - this.peerTagValues = peerTagValues; - this.httpMethod = httpMethod; - this.httpEndpoint = httpEndpoint; - this.grpcStatusCode = grpcStatusCode; - this.tagAndDuration = tagAndDuration; + long tagAndDuration; + + /** + * Hashtable bucket key, computed by the producer at the end of {@code fillSlot} via {@link + * #computeAndSetKeyHash}. {@code AggregateTable#findOrInsert} uses this directly instead of + * re-running the chained hash on the aggregator thread. + */ + long keyHash; + + /** No-arg constructor used by {@link datadog.trace.util.concurrent.MpscRingBuffer}. */ + SpanSnapshot() {} + + /** + * Computes and stores {@link #keyHash} from the snapshot's current canonical fields. The producer + * calls this at the end of {@code fillSlot}; the aggregator then uses {@code keyHash} directly + * for bucket lookup in {@code AggregateTable}. + * + *

Mixing identical to the prior {@code AggregateEntry.hashOf} so the bucket distribution is + * unchanged: {@code UTF8BytesString.hashCode} returns the underlying {@code String} hash, so + * snapshots built from {@code String} and {@code UTF8BytesString} of the same content fold to the + * same long. + */ + static void computeAndSetKeyHash(SpanSnapshot s) { + long h = 0; + h = LongHashingUtils.addToHash(h, s.resourceName); + h = LongHashingUtils.addToHash(h, s.serviceName); + h = LongHashingUtils.addToHash(h, s.operationName); + h = LongHashingUtils.addToHash(h, s.serviceNameSource); + h = LongHashingUtils.addToHash(h, s.spanType); + h = LongHashingUtils.addToHash(h, s.httpStatusCode); + h = LongHashingUtils.addToHash(h, s.synthetic); + h = LongHashingUtils.addToHash(h, s.traceRoot); + h = LongHashingUtils.addToHash(h, s.spanKind); + // peerTagValues is gated by peerTagSchema -- the slot's array is a reusable scratch buffer + // that may carry stale contents from a prior tag-firing publish when this publish had no + // peer tags. Hash it only when the schema says it's meaningful. + h = LongHashingUtils.addToHash(h, s.peerTagSchema == null ? 0 : s.peerTagSchema.namesHash); + h = + LongHashingUtils.addToHash( + h, s.peerTagSchema == null ? 0 : Arrays.hashCode(s.peerTagValues)); + h = LongHashingUtils.addToHash(h, s.httpMethod); + h = LongHashingUtils.addToHash(h, s.httpEndpoint); + h = LongHashingUtils.addToHash(h, s.grpcStatusCode); + s.keyHash = h; } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/TraceStructureWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/TraceStructureWriter.java index afeaebf2772..f5cb0d2a978 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/TraceStructureWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/TraceStructureWriter.java @@ -3,6 +3,7 @@ import datadog.environment.OperatingSystem; import datadog.trace.api.DDSpanId; import datadog.trace.api.DDTraceId; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.core.DDSpan; import de.thetaphi.forbiddenapis.SuppressForbidden; import java.io.FileOutputStream; @@ -82,7 +83,7 @@ private static String[] parseArgs(String outputFile) { return parseArgs(outputFile, OperatingSystem.isWindows()); } - // package visibility for testing + @VisibleForTesting static String[] parseArgs(String outputFile, boolean windows) { String[] args = ARGS_DELIMITER.split(outputFile); // Check Windows absolute paths (:) as column is used as arg delimiter diff --git a/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/TraceMapperV0_5.java b/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/TraceMapperV0_5.java index b8d1c77afdc..a993fdb3c26 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/TraceMapperV0_5.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/writer/ddagent/TraceMapperV0_5.java @@ -9,6 +9,7 @@ import datadog.communication.serialization.msgpack.MsgPackWriter; import datadog.trace.api.TagMap; import datadog.trace.api.TagMap.EntryReader; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.instrumentation.api.InstrumentationTags; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; import datadog.trace.common.writer.Payload; @@ -125,12 +126,12 @@ public String endpoint() { return "v0.5"; } - // Visible for tests + @VisibleForTesting Map getEncoding() { return encoding; } - // Visible for tests + @VisibleForTesting GrowableBuffer getDictionary() { return dictionary; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java b/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java index 55db6962976..73e3c2063e0 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java @@ -53,6 +53,7 @@ import datadog.trace.api.interceptor.MutableSpan; import datadog.trace.api.interceptor.TraceInterceptor; import datadog.trace.api.internal.TraceSegment; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.metrics.SpanMetricRegistry; import datadog.trace.api.naming.SpanNaming; import datadog.trace.api.remoteconfig.ServiceNameCollector; @@ -1391,7 +1392,7 @@ public String getSpanId(AgentSpan span) { return "0"; } - // @VisibleForTesting + @VisibleForTesting TraceInterceptors getInterceptors() { return interceptors; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java b/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java index d404343bb96..8ffcc77b49c 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java @@ -19,6 +19,7 @@ import datadog.trace.api.debugger.DebuggerConfigBridge; import datadog.trace.api.gateway.Flow; import datadog.trace.api.gateway.RequestContext; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.metrics.SpanMetricRegistry; import datadog.trace.api.metrics.SpanMetrics; import datadog.trace.api.sampling.PrioritySampling; @@ -125,7 +126,7 @@ static DDSpan create( * @param timestampMicro if greater than zero, use this time instead of the current time * @param context the context used for the span */ - // @VisibleForTesting + @VisibleForTesting DDSpan( @Nonnull String instrumentationName, final long timestampMicro, @@ -681,7 +682,7 @@ public long getDurationNano() { return durationNano; } - // @VisibleForTesting + @VisibleForTesting void setDurationNano(long duration) { DURATION_NANO_UPDATER.set(this, duration); } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java b/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java index dbbd57b0c48..6002fce5415 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java @@ -3,6 +3,7 @@ import datadog.communication.ddagent.DDAgentFeaturesDiscovery; import datadog.communication.ddagent.SharedCommunicationObjects; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.core.monitor.HealthMetrics; import java.util.ArrayList; import java.util.List; @@ -140,12 +141,12 @@ private void flushStats() { expired = 0; } - // @VisibleForTesting + @VisibleForTesting int trackedCount() { return traceArray.size(); } - // @VisibleForTesting + @VisibleForTesting int getDropped() { return dropped; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java b/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java index 5a0d1e992a4..11f8566cc48 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java @@ -2,6 +2,7 @@ import datadog.metrics.api.Recording; import datadog.trace.api.DDTraceId; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.time.TimeSource; import datadog.trace.bootstrap.instrumentation.api.AgentScope; import datadog.trace.core.CoreTracer.ConfigSnapshot; @@ -139,7 +140,7 @@ public PendingTrace create(@Nonnull DDTraceId traceId, ConfigSnapshot traceConfi private static final AtomicLongFieldUpdater LAST_REFERENCED = AtomicLongFieldUpdater.newUpdater(PendingTrace.class, "lastReferenced"); - // @VisibleForTesting + @VisibleForTesting PendingTrace( @Nonnull CoreTracer tracer, @Nonnull DDTraceId traceId, @@ -224,37 +225,37 @@ boolean compareAndSetLongRunningState(int expected, int newState) { return LONG_RUNNING_STATE.compareAndSet(this, expected, newState); } - // @VisibleForTesting + @VisibleForTesting int getLongRunningTrackedState() { return longRunningTrackedState; } - // @VisibleForTesting + @VisibleForTesting void setLongRunningTrackedState(int state) { LONG_RUNNING_STATE.set(this, state); } - // @VisibleForTesting + @VisibleForTesting int getPendingReferenceCount() { return pendingReferenceCount; } - // @VisibleForTesting + @VisibleForTesting PendingTraceBuffer getPendingTraceBuffer() { return pendingTraceBuffer; } - // @VisibleForTesting + @VisibleForTesting DDTraceId getTraceId() { return traceId; } - // @VisibleForTesting + @VisibleForTesting boolean isRootSpanWritten() { return rootSpanWritten; } - // @VisibleForTesting + @VisibleForTesting int getIsEnqueued() { return isEnqueued; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/PendingTraceBuffer.java b/dd-trace-core/src/main/java/datadog/trace/core/PendingTraceBuffer.java index 9cd7f73eb2e..c0e8de95a5c 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/PendingTraceBuffer.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/PendingTraceBuffer.java @@ -11,6 +11,7 @@ import datadog.communication.ddagent.SharedCommunicationObjects; import datadog.trace.api.Config; import datadog.trace.api.flare.TracerFlare; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.time.TimeSource; import datadog.trace.common.writer.TraceDumpJsonExporter; import datadog.trace.core.monitor.HealthMetrics; @@ -304,17 +305,17 @@ public DelayingPendingTraceBuffer( : null; } - // @VisibleForTesting + @VisibleForTesting LongRunningTracesTracker getRunningTracesTracker() { return runningTracesTracker; } - // @VisibleForTesting + @VisibleForTesting Thread getWorker() { return worker; } - // @VisibleForTesting + @VisibleForTesting MessagePassingBlockingQueue getQueue() { return queue; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/propagation/W3CHttpCodec.java b/dd-trace-core/src/main/java/datadog/trace/core/propagation/W3CHttpCodec.java index 52037859cb2..f8372f0d76c 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/propagation/W3CHttpCodec.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/propagation/W3CHttpCodec.java @@ -16,6 +16,7 @@ import datadog.trace.api.DDTraceId; import datadog.trace.api.TraceConfig; import datadog.trace.api.TracePropagationStyle; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.internal.util.LongStringUtils; import datadog.trace.api.propagation.W3CTraceParent; import datadog.trace.api.sampling.PrioritySampling; @@ -40,10 +41,9 @@ class W3CHttpCodec { private static final int TRACE_PARENT_FLAGS_SAMPLED = 1; private static final int TRACE_PARENT_LENGTH = TRACE_PARENT_FLAGS_START + 2; - // Package-protected for testing - static final String TRACE_PARENT_KEY = "traceparent"; - static final String TRACE_STATE_KEY = "tracestate"; - static final String OT_BAGGAGE_PREFIX = "ot-baggage-"; + @VisibleForTesting static final String TRACE_PARENT_KEY = "traceparent"; + @VisibleForTesting static final String TRACE_STATE_KEY = "tracestate"; + @VisibleForTesting static final String OT_BAGGAGE_PREFIX = "ot-baggage-"; static final String E2E_START_KEY = OT_BAGGAGE_PREFIX + DDTags.TRACE_START_TIME; private W3CHttpCodec() { diff --git a/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/HttpEndpointPostProcessor.java b/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/HttpEndpointPostProcessor.java index 340a5d82649..c2e0dd72761 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/HttpEndpointPostProcessor.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/HttpEndpointPostProcessor.java @@ -6,6 +6,7 @@ import datadog.trace.api.TagMap; import datadog.trace.api.endpoint.EndpointResolver; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.instrumentation.api.AppendableSpanLinks; import datadog.trace.core.DDSpanContext; import org.slf4j.Logger; @@ -45,10 +46,9 @@ public HttpEndpointPostProcessor() { /** * Creates a new HttpEndpointPostProcessor with the given endpoint resolver. * - *

Visible for testing. - * * @param endpointResolver the resolver to use for endpoint inference */ + @VisibleForTesting HttpEndpointPostProcessor(EndpointResolver endpointResolver) { this.endpointResolver = endpointResolver; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/PeerServiceCalculator.java b/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/PeerServiceCalculator.java index ed1fee8b2d6..198e2c78f1c 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/PeerServiceCalculator.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/PeerServiceCalculator.java @@ -3,6 +3,7 @@ import datadog.trace.api.Config; import datadog.trace.api.DDTags; import datadog.trace.api.TagMap; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.naming.NamingSchema; import datadog.trace.api.naming.SpanNaming; import datadog.trace.bootstrap.instrumentation.api.AppendableSpanLinks; @@ -22,7 +23,7 @@ public PeerServiceCalculator() { this(SpanNaming.instance().namingSchema().peerService(), Config.get().getPeerServiceMapping()); } - // Visible for testing + @VisibleForTesting PeerServiceCalculator( @Nonnull final NamingSchema.ForPeerService peerServiceNaming, @Nonnull final Map peerServiceMapping) { diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy index 00bd706b8fb..193d46b18bd 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy @@ -27,7 +27,12 @@ class ConflatingMetricAggregatorTest extends DDSpecification { @Shared long reportingInterval = 1 @Shared - int queueSize = 256 + // The CSS overclaim path claims traceSize slots per publish (not just the eligible count), + // so tests with multi-span traces and tight ring sizes can spuriously hit capacity. 1024 leaves + // plenty of headroom for the largest test traces (~10 spans) across hundreds of iterations. + // ConflatingMetricsAggregatorInboxFullTest deliberately keeps a tiny ring to exercise full-ring + // behaviour. + int queueSize = 1024 def "should ignore traces with no measured spans"() { setup: @@ -1718,7 +1723,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { def waitUntilEmpty(ConflatingMetricsAggregator aggregator) { int i = 0 - while (!aggregator.inbox.isEmpty() && i++ < 100) { + while ((!aggregator.dataInbox.isEmpty() || !aggregator.signalInbox.isEmpty()) && i++ < 100) { Thread.sleep(10) } } diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java index 7fd767533c7..b35dd4cd938 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java @@ -124,7 +124,7 @@ void testUtilsEqualEntriesHaveEqualHashCodes() { } private static SpanSnapshot snapshotWithPeerTags(String[] names, String[] values) { - return new SpanSnapshot( + return AggregateEntryTestUtils.buildSnapshot( "resource", "svc", "op", @@ -144,7 +144,7 @@ private static SpanSnapshot snapshotWithPeerTags(String[] names, String[] values private static AggregateEntry newEntry() { SpanSnapshot snapshot = - new SpanSnapshot( + AggregateEntryTestUtils.buildSnapshot( "resource", "svc", "op", diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTestUtils.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTestUtils.java index ed6fd5a3a7e..acc298fd436 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTestUtils.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTestUtils.java @@ -69,33 +69,76 @@ public static AggregateEntry of( schema = PeerTagSchema.testSchema(names); } SpanSnapshot syntheticSnapshot = - new SpanSnapshot( + buildSnapshot( resource, - service == null ? null : service.toString(), + service, operationName, serviceSource, type, (short) httpStatusCode, synthetic, traceRoot, - spanKind == null ? null : spanKind.toString(), + spanKind, schema, values, - httpMethod == null ? null : httpMethod.toString(), - httpEndpoint == null ? null : httpEndpoint.toString(), - grpcStatusCode == null ? null : grpcStatusCode.toString(), + httpMethod, + httpEndpoint, + grpcStatusCode, 0L); return forSnapshot(syntheticSnapshot); } /** - * Builds an {@link AggregateEntry} from {@code s} by computing its lookup hash via {@link - * AggregateEntry#hashOf(SpanSnapshot)} and calling the package-private constructor directly. - * Production callers route through {@link AggregateTable#findOrInsert} which already has the - * {@code keyHash} on hand; tests rarely do, so this helper hides the second argument. + * Test-only snapshot builder. Production fillSlot canonicalizes each field via the per-field + * DDCaches on {@link AggregateEntry} and precomputes {@link SpanSnapshot#keyHash}; this helper + * does the same so tests exercise the production matches/hash path. + */ + static SpanSnapshot buildSnapshot( + CharSequence resource, + CharSequence service, + CharSequence operationName, + @Nullable CharSequence serviceSource, + CharSequence type, + short httpStatusCode, + boolean synthetic, + boolean traceRoot, + CharSequence spanKind, + @Nullable PeerTagSchema peerTagSchema, + @Nullable String[] peerTagValues, + @Nullable CharSequence httpMethod, + @Nullable CharSequence httpEndpoint, + @Nullable CharSequence grpcStatusCode, + long tagAndDuration) { + SpanSnapshot s = new SpanSnapshot(); + s.resourceName = AggregateEntry.canonicalize(AggregateEntry.RESOURCE_CACHE, resource); + s.serviceName = AggregateEntry.canonicalize(AggregateEntry.SERVICE_CACHE, service); + s.operationName = AggregateEntry.canonicalize(AggregateEntry.OPERATION_CACHE, operationName); + s.serviceNameSource = + AggregateEntry.canonicalizeOptional(AggregateEntry.SERVICE_SOURCE_CACHE, serviceSource); + s.spanType = AggregateEntry.canonicalize(AggregateEntry.TYPE_CACHE, type); + s.httpStatusCode = httpStatusCode; + s.synthetic = synthetic; + s.traceRoot = traceRoot; + s.spanKind = AggregateEntry.canonicalize(AggregateEntry.SPAN_KIND_CACHE, spanKind); + s.peerTagSchema = peerTagSchema; + s.peerTagValues = peerTagValues; + s.httpMethod = + AggregateEntry.canonicalizeOptional(AggregateEntry.HTTP_METHOD_CACHE, httpMethod); + s.httpEndpoint = + AggregateEntry.canonicalizeOptional(AggregateEntry.HTTP_ENDPOINT_CACHE, httpEndpoint); + s.grpcStatusCode = + AggregateEntry.canonicalizeOptional(AggregateEntry.GRPC_STATUS_CODE_CACHE, grpcStatusCode); + s.tagAndDuration = tagAndDuration; + SpanSnapshot.computeAndSetKeyHash(s); + return s; + } + + /** + * Builds an {@link AggregateEntry} from {@code s}. Production callers route through {@link + * AggregateTable#findOrInsert}; tests use this helper for direct construction. */ public static AggregateEntry forSnapshot(SpanSnapshot s) { - return new AggregateEntry(s, AggregateEntry.hashOf(s)); + return new AggregateEntry(s); } /** diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java index 618ead2ab43..e807dfdd338 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java @@ -266,8 +266,73 @@ void nullServiceAndSpanKindDoNotNpeAndCollapseWithEmpty() { assertEquals(0, first.getSpanKind().length(), "null spanKind should canonicalize to EMPTY"); } + @Test + void noPeerTagSnapshotMatchesEntryEvenWithStaleScratchPeerTagValues() { + // Models the ring-buffer slot-reuse pattern: a slot whose previous publish fired peer tags + // ends up reused for a span with no peer tags. The producer clears peerTagSchema but leaves + // peerTagValues populated (the array is the slot's reusable scratch). The consumer must + // gate on peerTagSchema and ignore stale scratch values for both hashOf and matches -- + // otherwise the slot's stale state would steer the snapshot to a different bucket or fail + // the field-by-field comparison, producing a duplicate aggregate. + AggregateTable table = new AggregateTable(8); + SpanSnapshot cleanNoPeerTags = snapshot("svc", "op", "internal"); + AggregateEntry inserted = table.findOrInsert(cleanNoPeerTags); + + SpanSnapshot withStaleScratch = snapshot("svc", "op", "internal"); + // Simulate stale scratch: same logical contents (no peer tags -> schema null), but the + // values array carries leftover data the slot's prior tag-firing publish wrote into it. + withStaleScratch.peerTagSchema = null; + withStaleScratch.peerTagValues = new String[] {"leftover.host", "leftover.service"}; + + AggregateEntry hit = table.findOrInsert(withStaleScratch); + assertSame(inserted, hit, "stale peerTagValues must be ignored when peerTagSchema is null"); + assertEquals(1, table.size()); + } + + @Test + void aggregateEntryCopiesPeerTagValuesSoSlotReuseDoesntCorruptIt() { + // The AggregateEntry must snapshot peerTagValues by value at insert time -- otherwise + // when the producer reuses the slot's array on the next publish, the entry's identity + // would silently change. Verify by mutating the source array after insert and confirming + // the entry still matches its original-content snapshot. + AggregateTable table = new AggregateTable(8); + String[] values = new String[] {"host1", "svc1"}; + SpanSnapshot s1 = snapshotWithPeerTags("svc", "op", "client", values); + AggregateEntry inserted = table.findOrInsert(s1); + + // Producer "reuses" the array: same reference, new contents. + values[0] = "host2"; + values[1] = "svc2"; + + // A FRESH snapshot with the original peer tag values should still match the entry. + SpanSnapshot s2 = snapshotWithPeerTags("svc", "op", "client", new String[] {"host1", "svc1"}); + AggregateEntry hit = table.findOrInsert(s2); + assertSame(inserted, hit, "entry's peer tag values must be a snapshot, not a live ref"); + } + + private static SpanSnapshot snapshotWithPeerTags( + String service, String operation, String spanKind, String[] peerTagValues) { + PeerTagSchema schema = PeerTagSchema.testSchema(new String[] {"peer.host", "peer.service"}); + return AggregateEntryTestUtils.buildSnapshot( + "resource", + service, + operation, + null, + "web", + (short) 200, + false, + true, + spanKind, + schema, + peerTagValues, + null, + null, + null, + 0L); + } + private static SpanSnapshot nullServiceKindSnapshot(String service, String spanKind) { - return new SpanSnapshot( + return AggregateEntryTestUtils.buildSnapshot( "resource", service, "op", @@ -287,7 +352,7 @@ private static SpanSnapshot nullServiceKindSnapshot(String service, String spanK private static SpanSnapshot nullableSnapshot( String resource, String operation, String type, String serviceNameSource) { - return new SpanSnapshot( + return AggregateEntryTestUtils.buildSnapshot( resource, "svc", operation, @@ -343,7 +408,7 @@ SnapshotBuilder peerTags(String... namesAndValues) { } SpanSnapshot build() { - return new SpanSnapshot( + return AggregateEntryTestUtils.buildSnapshot( "resource", service, operation, diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 6abe7896aab..2014db87f13 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -8,7 +8,7 @@ gradle-tooling-api = "8.14.5" spotbugs_annotations = "4.9.8" # DataDog libs and forks -ddprof = "1.42.0" +ddprof = "1.43.0" dogstatsd = "4.4.5" okhttp = "3.12.15" # Datadog fork to support Java 7 diff --git a/internal-api/build.gradle.kts b/internal-api/build.gradle.kts index b78e5b38b14..fc95dd9e1f1 100644 --- a/internal-api/build.gradle.kts +++ b/internal-api/build.gradle.kts @@ -260,6 +260,7 @@ dependencies { // references TraceScope and Continuation from public api api(project(":dd-trace-api")) api(libs.slf4j) + compileOnlyApi(project(":components:annotations")) api(project(":components:context")) api(project(":components:environment")) api(project(":components:json")) diff --git a/internal-api/src/jmh/java/datadog/trace/util/concurrent/MpscRingBufferBenchmark.java b/internal-api/src/jmh/java/datadog/trace/util/concurrent/MpscRingBufferBenchmark.java new file mode 100644 index 00000000000..e9d084f3334 --- /dev/null +++ b/internal-api/src/jmh/java/datadog/trace/util/concurrent/MpscRingBufferBenchmark.java @@ -0,0 +1,146 @@ +package datadog.trace.util.concurrent; + +import static java.util.concurrent.TimeUnit.SECONDS; + +import java.util.function.BiConsumer; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Group; +import org.openjdk.jmh.annotations.GroupThreads; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Throughput benchmarks for {@link MpscRingBuffer}. + * + *

    + *
  • {@code write_1p / write_8p / write_16p} — producer-side throughput with a background + * drainer consuming what's published. Measures the cost of one {@code tryWrite} including CAS + * contention on the producer cursor at the given thread count. + *
  • {@code e2e_8p} — JMH {@code @Group} pairing 8 producers with 1 consumer. Aggregate + * throughput reflects whichever side is the bottleneck under realistic pressure. + *
+ * + *

Run with {@code -p capacity=...} to override the default ring capacity. + */ +@State(Scope.Benchmark) +@Warmup(iterations = 2, time = 15, timeUnit = SECONDS) +@Measurement(iterations = 5, time = 15, timeUnit = SECONDS) +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(SECONDS) +@Fork(value = 1) +public class MpscRingBufferBenchmark { + + /** + * Static filler so the lambda is non-capturing and the JIT can hoist it past the {@code tryWrite} + * call. Context arg comes first, slot last — matches {@code TagMap.forEach} convention. + */ + private static final BiConsumer FILLER = (v, slot) -> slot.value = v; + + /** Mutable slot. Replicates the per-publish allocation a real producer wants to avoid. */ + public static final class Slot { + long value; + } + + @Param({"1024"}) + public int capacity; + + /** + * Shared ring for the {@code write_*} benches. A background drainer keeps space available so + * producer benchmarks measure write throughput rather than full-ring drop throughput. + */ + MpscRingBuffer ring; + + private volatile boolean stopDrainer; + private Thread drainerThread; + + /** + * Separate ring for the {@code e2e_*} group benches. JMH drives both sides directly so we don't + * want our own background drainer for those. + */ + MpscRingBuffer e2eRing; + + @Setup(Level.Trial) + public void setup() { + ring = new MpscRingBuffer<>(Slot::new, capacity); + e2eRing = new MpscRingBuffer<>(Slot::new, capacity); + stopDrainer = false; + drainerThread = + new Thread( + () -> { + while (!stopDrainer) { + if (ring.drain((Slot s) -> {}) == 0) Thread.yield(); + } + }, + "MpscRingBufferBenchmark-drainer"); + drainerThread.setDaemon(true); + drainerThread.start(); + } + + @TearDown(Level.Trial) + public void teardown() throws InterruptedException { + stopDrainer = true; + drainerThread.join(5_000); + } + + @State(Scope.Thread) + public static class ThreadState { + long counter; + } + + // ============ Write throughput with background drainer ============ + + @Threads(1) + @Benchmark + public boolean write_1p(ThreadState ts) { + return ring.tryWrite(ts.counter++, FILLER); + } + + @Threads(8) + @Benchmark + public boolean write_8p(ThreadState ts) { + return ring.tryWrite(ts.counter++, FILLER); + } + + @Threads(16) + @Benchmark + public boolean write_16p(ThreadState ts) { + return ring.tryWrite(ts.counter++, FILLER); + } + + // ============ End-to-end producer/consumer pairing ============ + // + // JMH runs both methods in the same trial: 8 producer threads + 1 consumer thread. Throughput + // is reported as ops/sec aggregated across all 9 threads, but the consumer's drain count + // dwarfs the producer ops since one call processes many slots -- in practice the bottleneck + // is the producer side (CAS contention), and that's what the number reflects. + + private static final BiConsumer CONSUMER = (bh, slot) -> bh.consume(slot.value); + + @Group("e2e_8p") + @GroupThreads(8) + @Benchmark + public boolean e2e_producer(ThreadState ts) { + return e2eRing.tryWrite(ts.counter++, FILLER); + } + + @Group("e2e_8p") + @GroupThreads(1) + @Benchmark + public int e2e_consumer(Blackhole bh) { + int drained = e2eRing.drain(bh, CONSUMER); + if (drained == 0) Thread.yield(); + return drained; + } +} diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index 07f10672273..b90fcb0fd87 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -2184,7 +2184,23 @@ private Config(final ConfigProvider configProvider, final InstrumenterConfig ins configProvider.getBoolean(TRACE_STATS_COMPUTATION_IGNORE_AGENT_VERSION, false); tracerMetricsBufferingEnabled = configProvider.getBoolean(TRACER_METRICS_BUFFERING_ENABLED, false); - tracerMetricsMaxAggregates = configProvider.getInteger(TRACER_METRICS_MAX_AGGREGATES, 2048); + // The MpscRingBuffer pre-allocates one SpanSnapshot per slot at construction, so capacity + // becomes a resident-RSS cost. The old TRACER_METRICS_MAX_PENDING default of 2048 (logical) + // was sized for the previous on-demand batch-allocation model; with the ring it would pin + // ~15 MB upfront for what only needs to absorb sub-second consumer stalls. Cut the default + // accordingly: + // - normal heap: 128 logical * 64 LEGACY_BATCH_SIZE = 8192 slots, ~1 MB upfront, ~0.8 s + // of buffer at 10K spans/s. Plenty of margin for GC pauses. + // - tight heap (<128 MB): 64 logical * 64 = 4096 slots, ~500 KB upfront. Observed + // catastrophic at Xmx64m with the prior 131072-slot default (Full-GC death spiral). + // Customers who explicitly configured TRACER_METRICS_MAX_PENDING keep their value (the + // LEGACY_BATCH_SIZE multiplier still applies to it) -- only the implicit default shrinks. + final boolean tightHeap = Runtime.getRuntime().maxMemory() < 128L * 1024 * 1024; + final int defaultMaxAggregates = tightHeap ? 256 : 2048; + final int defaultMaxPending = tightHeap ? 64 : 128; + + tracerMetricsMaxAggregates = + configProvider.getInteger(TRACER_METRICS_MAX_AGGREGATES, defaultMaxAggregates); /* * TRACER_METRICS_MAX_PENDING historically counted conflating Batch slots (~64 spans per batch * via Batch.MAX_BATCH_SIZE). The inbox now holds 1 SpanSnapshot per metrics-eligible span, so @@ -2199,7 +2215,8 @@ private Config(final ConfigProvider configProvider, final InstrumenterConfig ins * see java.util.ArraysSupport.SOFT_MAX_ARRAY_LENGTH for the same convention. */ long requestedMaxPending = - (long) configProvider.getInteger(TRACER_METRICS_MAX_PENDING, 2048) * LEGACY_BATCH_SIZE; + (long) configProvider.getInteger(TRACER_METRICS_MAX_PENDING, defaultMaxPending) + * LEGACY_BATCH_SIZE; tracerMetricsMaxPending = (int) Math.min(requestedMaxPending, MAX_SAFE_ARRAY_SIZE); reportHostName = diff --git a/internal-api/src/main/java/datadog/trace/api/ProcessTags.java b/internal-api/src/main/java/datadog/trace/api/ProcessTags.java index 932ec19c565..40b23faa45c 100644 --- a/internal-api/src/main/java/datadog/trace/api/ProcessTags.java +++ b/internal-api/src/main/java/datadog/trace/api/ProcessTags.java @@ -3,6 +3,7 @@ import datadog.environment.EnvironmentVariables; import datadog.environment.SystemProperties; import datadog.trace.api.env.CapturedEnvironment; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; import datadog.trace.util.TraceUtils; import java.nio.file.Path; @@ -30,8 +31,7 @@ public class ProcessTags { public static final String ENTRYPOINT_BASEDIR = "entrypoint.basedir"; public static final String ENTRYPOINT_WORKDIR = "entrypoint.workdir"; - // visible for testing - static Function envGetter = EnvironmentVariables::get; + @VisibleForTesting static Function envGetter = EnvironmentVariables::get; private static class Lazy { // the tags are used to compute a hash for dsm hence that map must be sorted. @@ -269,7 +269,7 @@ public static UTF8BytesString getTagsForSerialization() { return Lazy.serializedForm; } - /** Visible for testing. */ + @VisibleForTesting static void empty() { synchronized (Lazy.TAGS) { Lazy.TAGS.clear(); @@ -279,12 +279,12 @@ static void empty() { } } - /** Visible for testing. */ + @VisibleForTesting static void reset() { reset(Config.get()); } - /** Visible for testing. */ + @VisibleForTesting public static void reset(Config config) { synchronized (Lazy.TAGS) { empty(); diff --git a/internal-api/src/main/java/datadog/trace/api/civisibility/config/BazelMode.java b/internal-api/src/main/java/datadog/trace/api/civisibility/config/BazelMode.java index 898dd76129b..56698fe7085 100644 --- a/internal-api/src/main/java/datadog/trace/api/civisibility/config/BazelMode.java +++ b/internal-api/src/main/java/datadog/trace/api/civisibility/config/BazelMode.java @@ -3,6 +3,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.config.inversion.ConfigHelper; import datadog.trace.util.Strings; import java.io.BufferedReader; @@ -51,7 +52,7 @@ public static BazelMode get() { return INSTANCE; } - // visible for testing + @VisibleForTesting BazelMode(Config config) { String manifestRloc = config.getTestOptimizationManifestFile(); if (Strings.isNotBlank(manifestRloc)) { @@ -331,7 +332,7 @@ private static String lookupInRunfilesManifest(String manifestFile, String rloca return null; } - // visible for testing + @VisibleForTesting static void reset() { INSTANCE = null; } diff --git a/internal-api/src/main/java/datadog/trace/api/datastreams/TransactionInfo.java b/internal-api/src/main/java/datadog/trace/api/datastreams/TransactionInfo.java index 9f09186d30d..b4755d58342 100644 --- a/internal-api/src/main/java/datadog/trace/api/datastreams/TransactionInfo.java +++ b/internal-api/src/main/java/datadog/trace/api/datastreams/TransactionInfo.java @@ -1,5 +1,6 @@ package datadog.trace.api.datastreams; +import datadog.trace.api.internal.VisibleForTesting; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.charset.StandardCharsets; @@ -71,7 +72,7 @@ public byte[] getBytes() { return buffer.array(); } - // @VisibleForTesting + @VisibleForTesting static synchronized void resetCache() { CACHE.clear(); CACHE_BYTES = new byte[0]; diff --git a/internal-api/src/main/java/datadog/trace/api/profiling/ProfilerFlareLogger.java b/internal-api/src/main/java/datadog/trace/api/profiling/ProfilerFlareLogger.java index 2672d8e1567..6fc57ae641d 100644 --- a/internal-api/src/main/java/datadog/trace/api/profiling/ProfilerFlareLogger.java +++ b/internal-api/src/main/java/datadog/trace/api/profiling/ProfilerFlareLogger.java @@ -1,6 +1,7 @@ package datadog.trace.api.profiling; import datadog.trace.api.flare.TracerFlare; +import datadog.trace.api.internal.VisibleForTesting; import java.io.IOException; import java.time.Instant; import java.time.ZoneOffset; @@ -23,7 +24,7 @@ private static final class Singleton { private final List flareReportLines = new ArrayList<>(); private int usedReportCapacity = 0; - // @VisibleForTesting + @VisibleForTesting ProfilerFlareLogger() { TracerFlare.addReporter(this); } @@ -76,17 +77,17 @@ public void cleanupAfterFlare() { cleanup(); } - // @VisibleForTesting + @VisibleForTesting int getUsedReportCapacity() { return usedReportCapacity; } - // @VisibleForTesting + @VisibleForTesting int getMaxReportCapacity() { return REPORT_CAPACITY; } - // @VisibleForTesting + @VisibleForTesting int linesSize() { synchronized (flareReportLines) { return flareReportLines.size(); diff --git a/internal-api/src/main/java/datadog/trace/api/remoteconfig/ServiceNameCollector.java b/internal-api/src/main/java/datadog/trace/api/remoteconfig/ServiceNameCollector.java index b7c0a52a63f..664e1f1fe6f 100644 --- a/internal-api/src/main/java/datadog/trace/api/remoteconfig/ServiceNameCollector.java +++ b/internal-api/src/main/java/datadog/trace/api/remoteconfig/ServiceNameCollector.java @@ -3,6 +3,7 @@ import static datadog.trace.api.telemetry.LogCollector.SEND_TELEMETRY; import datadog.trace.api.Config; +import datadog.trace.api.internal.VisibleForTesting; import java.util.ArrayList; import java.util.List; import java.util.Set; @@ -73,7 +74,7 @@ public void clear() { services.clear(); } - // Visible for testing + @VisibleForTesting static void setInstance(ServiceNameCollector instance) { INSTANCE = instance; } diff --git a/internal-api/src/main/java/datadog/trace/util/ProcessSupervisor.java b/internal-api/src/main/java/datadog/trace/util/ProcessSupervisor.java index 686c35ce65b..006d921bec9 100644 --- a/internal-api/src/main/java/datadog/trace/util/ProcessSupervisor.java +++ b/internal-api/src/main/java/datadog/trace/util/ProcessSupervisor.java @@ -7,6 +7,7 @@ import static datadog.trace.util.ProcessSupervisor.Health.INTERRUPTED; import static datadog.trace.util.ProcessSupervisor.Health.READY_TO_START; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.bootstrap.instrumentation.api.AgentTracer; import datadog.trace.context.TraceScope; import java.io.Closeable; @@ -146,7 +147,7 @@ public void close() { } } - // Package reachable for testing + @VisibleForTesting Process getCurrentProcess() { return currentProcess; } diff --git a/internal-api/src/main/java/datadog/trace/util/TempLocationManager.java b/internal-api/src/main/java/datadog/trace/util/TempLocationManager.java index 7574c9c5edc..94c74c9f5f8 100644 --- a/internal-api/src/main/java/datadog/trace/util/TempLocationManager.java +++ b/internal-api/src/main/java/datadog/trace/util/TempLocationManager.java @@ -2,6 +2,7 @@ import datadog.environment.SystemProperties; import datadog.trace.api.config.ProfilingConfig; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.profiling.ProfilerFlareLogger; import datadog.trace.api.time.SystemTimeSource; import datadog.trace.api.time.TimeSource; @@ -307,7 +308,7 @@ private TempLocationManager() { createTempDir(tempDir); } - // @VisibleForTesting + @VisibleForTesting static String getBaseTempDirName() { String userName = SystemProperties.get("user.name"); // unlikely, but fall-back to system env based user name @@ -399,7 +400,7 @@ boolean cleanup(long timeout, TimeUnit unit) { return false; } - // accessible for tests + @VisibleForTesting boolean waitForCleanup(long timeout, TimeUnit unit) { try { return cleanupTask.await(timeout, unit); @@ -416,7 +417,7 @@ boolean waitForCleanup(long timeout, TimeUnit unit) { return false; } - // accessible for tests + @VisibleForTesting void createDirStructure() throws IOException { Files.createDirectories(baseTempDir); } diff --git a/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java new file mode 100644 index 00000000000..4cefa43cac0 --- /dev/null +++ b/internal-api/src/main/java/datadog/trace/util/concurrent/MpscRingBuffer.java @@ -0,0 +1,434 @@ +package datadog.trace.util.concurrent; + +import datadog.trace.api.function.TriConsumer; +import java.util.concurrent.atomic.AtomicLongArray; +import java.util.concurrent.atomic.AtomicLongFieldUpdater; +import java.util.function.BiConsumer; +import java.util.function.Consumer; +import java.util.function.Supplier; + +// Disruptor-style cache-line padding around the cursors. The two cursors live on different cache +// lines so consumer-side writes to consumerCursor don't invalidate the line producers read for +// producerCursor (and vice versa). Each padding class declares 7 longs (56 bytes); combined with +// the cursor's own 8 bytes plus the JVM object header, each cursor + its surrounding pad fills +// a 64-byte cache line. The HotSpot field-layout strategy preserves the declaration order across +// the class hierarchy, so this pattern is reliable on all production JVMs we target. + +abstract class MpscRingBufferPad0 { + long p01, p02, p03, p04, p05, p06, p07; +} + +abstract class MpscRingBufferProducerCursor extends MpscRingBufferPad0 { + /** Next sequence to claim. Producers increment via CAS through {@code PRODUCER_CURSOR}. */ + volatile long producerCursor = -1L; +} + +abstract class MpscRingBufferPad1 extends MpscRingBufferProducerCursor { + long p11, p12, p13, p14, p15, p16, p17; +} + +abstract class MpscRingBufferConsumerCursor extends MpscRingBufferPad1 { + /** Highest sequence consumed. Only the consumer thread writes; producers read volatile. */ + volatile long consumerCursor = -1L; +} + +abstract class MpscRingBufferPad2 extends MpscRingBufferConsumerCursor { + long p21, p22, p23, p24, p25, p26, p27; +} + +/** + * Bounded multi-producer / single-consumer ring buffer of pre-allocated {@code T} instances. + * + *

Each slot is a long-lived {@code T} instance created at construction time via the supplied + * factory and recycled forever. Producers mutate slots in place via callbacks; the consumer reads + * them the same way. No allocation occurs per write/read after construction, which is the entire + * point of this class over a queue of references. + * + *

The {@code BiConsumer} and {@link TriConsumer} variants take their context object(s) + * before the slot, matching the convention of {@code TagMap.forEach} and {@code + * Hashtable.forEach}. That ordering lets callers declare the callback as a {@code static final} + * non-capturing lambda and pass per-call context at the call site without allocating a closure. + * + *

Thread safety contract

+ * + *

The ring buffer is thread-safe for any number of producer threads plus exactly one consumer + * thread. Calling {@code drain} from multiple threads concurrently is not supported and will + * corrupt state. + * + *

For the slot type {@code T}: + * + *

    + *
  • Slot fields can be plain ({@code int}, {@code long}, object references) -- they do + * not need to be {@code volatile} or guarded by synchronization. Happens-before + * between the producer's slot mutation and the consumer's slot read is provided by the ring's + * internal publication-sequence machinery: a release write on the per-slot sequence inside + * {@code tryWrite}, paired with an acquire read inside {@code drain}. + *
  • Don't retain slot references past your handler's return. Once a {@code tryWrite} + * filler returns, the slot becomes visible to the consumer; once a {@code drain} handler + * returns, the slot may be reclaimed by another producer and its fields overwritten. If the + * consumer needs to keep any state from a slot, it must extract by value (or copy references) + * before returning. + *
  • Don't expose slot references outside the ring. Treat {@code T} as ring-buffer-owned; + * sharing a slot reference with code that doesn't follow the same discipline breaks the + * happens-before story. + *
+ * + *

For producer fillers: + * + *

    + *
  • Filler invocations on the same slot are serialized (one producer wins the sequence CAS), so + * the filler can write fields without synchronization. + *
  • If a filler throws, the slot is published anyway (with whatever the filler had + * written so far) and the exception propagates to the caller. This prevents the consumer from + * getting stuck waiting for an unfinished slot; the cost is that the consumer may observe a + * partially-filled or stale-fielded slot. Fillers should be written to either not throw or to + * leave the slot in a state the consumer can recognize and skip. + *
+ * + *

Implementation

+ * + *

Producer cursor is CAS-claimed; visibility of a claimed slot to the consumer is gated by a + * per-slot publication-sequence array. Consumer cursor is updated with a volatile write so + * producers observe space being freed. Cursors are cache-line-padded against each other (see the + * {@code MpscRingBufferPad*} hierarchy at the top of this file) and the publication-sequence array + * is strided so each logical entry occupies a distinct cache line. + */ +public final class MpscRingBuffer extends MpscRingBufferPad2 { + + /** + * Cache line size in {@code long}-units. 64-byte cache lines on every common CPU we ship to (x86, + * ARM); 8 bytes per long. Each logical slot in {@link #publishedSequences} is spread out by this + * stride so adjacent logical sequences don't share a cache line and don't ping-pong between + * producer cores under heavy contention. + */ + private static final int CACHE_LINE_LONGS = 8; + + @SuppressWarnings("rawtypes") // AtomicLongFieldUpdater can't take a parameterized class + private static final AtomicLongFieldUpdater PRODUCER_CURSOR = + AtomicLongFieldUpdater.newUpdater(MpscRingBufferProducerCursor.class, "producerCursor"); + + private final T[] slots; + + /** + * Per-slot publication sequence, strided by {@link #CACHE_LINE_LONGS} to avoid false sharing. + * Producers write the claimed sequence here as the last step of a publish (release write via + * {@link AtomicLongArray#set}); the consumer reads it (acquire read) to determine whether the + * slot at the next position is ready. A slot is considered published for sequence {@code s} iff + * {@code publishedSequences[(s & mask) * CACHE_LINE_LONGS] == s}. + */ + private final AtomicLongArray publishedSequences; + + private final int capacity; + private final int mask; + + @SuppressWarnings("unchecked") + public MpscRingBuffer(final Supplier factory, final int capacityHint) { + if (capacityHint < 1) { + throw new IllegalArgumentException("capacityHint must be >= 1, got " + capacityHint); + } + this.capacity = nextPowerOfTwo(capacityHint); + if (this.capacity < 1) { + throw new IllegalArgumentException("capacity overflow for hint " + capacityHint); + } + this.mask = capacity - 1; + this.slots = (T[]) new Object[capacity]; + this.publishedSequences = new AtomicLongArray(capacity * CACHE_LINE_LONGS); + for (int i = 0; i < capacity; i++) { + slots[i] = factory.get(); + // Initial: sentinel "no sequence published here yet" -- anything < 0 works since + // sequences are 0-based and monotonically increasing. + publishedSequences.set(i * CACHE_LINE_LONGS, Long.MIN_VALUE); + } + } + + public int capacity() { + return capacity; + } + + /** Approximate count of slots holding unread items. May briefly exceed capacity under race. */ + public int size() { + final long p = producerCursor; + final long c = consumerCursor; + final long diff = p - c; + if (diff <= 0) return 0; + if (diff > capacity) return capacity; + return (int) diff; + } + + public boolean isEmpty() { + return producerCursor == consumerCursor; + } + + /** {@code true} if the slot was filled and published; {@code false} if the ring is full. */ + public boolean tryWrite(final Consumer filler) { + final long seq = claim(); + if (seq < 0L) return false; + // publish in finally so a throwing filler doesn't leave the slot un-published -- the + // consumer would otherwise wait at that sequence forever. See class javadoc. + try { + filler.accept(slots[(int) (seq & mask)]); + } finally { + publish(seq); + } + return true; + } + + public boolean tryWrite(final C context, final BiConsumer filler) { + final long seq = claim(); + if (seq < 0L) return false; + try { + filler.accept(context, slots[(int) (seq & mask)]); + } finally { + publish(seq); + } + return true; + } + + public boolean tryWrite( + final C1 context1, + final C2 context2, + final TriConsumer filler) { + final long seq = claim(); + if (seq < 0L) return false; + try { + filler.accept(context1, context2, slots[(int) (seq & mask)]); + } finally { + publish(seq); + } + return true; + } + + /** Drains all currently-available slots. Returns the count processed. */ + public int drain(final Consumer handler) { + long cursor = consumerCursor; + int count = 0; + while (true) { + final long nextSeq = cursor + 1L; + final int idx = (int) (nextSeq & mask); + if (publishedSequences.get(idx * CACHE_LINE_LONGS) != nextSeq) break; + handler.accept(slots[idx]); + cursor = nextSeq; + count++; + } + if (count > 0) consumerCursor = cursor; + return count; + } + + public int drain(final C context, final BiConsumer handler) { + long cursor = consumerCursor; + int count = 0; + while (true) { + final long nextSeq = cursor + 1L; + final int idx = (int) (nextSeq & mask); + if (publishedSequences.get(idx * CACHE_LINE_LONGS) != nextSeq) break; + handler.accept(context, slots[idx]); + cursor = nextSeq; + count++; + } + if (count > 0) consumerCursor = cursor; + return count; + } + + public int drain( + final C1 context1, + final C2 context2, + final TriConsumer handler) { + long cursor = consumerCursor; + int count = 0; + while (true) { + final long nextSeq = cursor + 1L; + final int idx = (int) (nextSeq & mask); + if (publishedSequences.get(idx * CACHE_LINE_LONGS) != nextSeq) break; + handler.accept(context1, context2, slots[idx]); + cursor = nextSeq; + count++; + } + if (count > 0) consumerCursor = cursor; + return count; + } + + /** + * Try to claim a contiguous range of {@code n} sequences in a single CAS. Returns {@code null} if + * the ring doesn't have room for the whole batch -- the caller treats that as "drop all {@code + * n}", which is the natural shape for callers that batch by a higher-level unit (e.g. one CSS + * publish per completed trace). When the caller has a list of N items to write, this amortizes + * producer-cursor contention from O(N) CASes to O(1) per call. + * + *

The returned {@link Batch} must be filled via {@link Batch#fillAndPublish} exactly {@code n} + * times. Under-publishing leaves the ring stuck at the unfilled sequence -- the consumer waits + * there forever. Over-publishing throws {@link IllegalStateException}. + * + * @throws IllegalArgumentException if {@code n < 1} + */ + public Batch tryClaim(final int n) { + if (n < 1) { + throw new IllegalArgumentException("n must be >= 1, got " + n); + } + while (true) { + final long current = producerCursor; + // Stale read of consumerCursor is fine: a false "full" reading just causes a drop, and a + // real one is correctly identified because consumerCursor only advances. + final long consumed = consumerCursor; + final long next = current + n; + if (next - consumed > capacity) { + return null; + } + if (PRODUCER_CURSOR.compareAndSet(this, current, next)) { + // Claimed sequences [current + 1, next] inclusive (== n sequences total). + return new Batch(current + 1L, n); + } + // CAS failure -> another producer claimed; retry. + } + } + + /** CAS-claim the next sequence, or return {@code -1} if the ring is full. */ + private long claim() { + while (true) { + final long current = producerCursor; + // Stale read of consumerCursor is fine: a false "full" reading just causes a drop, and + // a real "full" reading is correctly identified because consumerCursor only advances. + final long consumed = consumerCursor; + if (current - consumed >= capacity) { + return -1L; + } + final long next = current + 1L; + if (PRODUCER_CURSOR.compareAndSet(this, current, next)) { + return next; + } + // CAS failure -> another producer claimed; retry. + } + } + + /** + * Handle returned by {@link MpscRingBuffer#tryClaim}. Holds a contiguous range of pre-claimed + * sequences belonging to the producer thread that called {@code tryClaim}; the caller must fill + * and publish each via {@link #fillAndPublish}. + * + *

Not thread-safe -- the producer thread owns it for the lifetime of the call. Do not + * share across threads. + */ + public final class Batch { + private final long startSeq; + private final int size; + private int published; + + Batch(final long startSeq, final int size) { + this.startSeq = startSeq; + this.size = size; + } + + /** Total slots in this batch (the {@code n} passed to {@code tryClaim}). */ + public int size() { + return size; + } + + /** Slots not yet filled. */ + public int remaining() { + return size - published; + } + + public void fillAndPublish(final Consumer filler) { + final long seq = nextSeq(); + final int idx = (int) (seq & mask); + try { + filler.accept(slots[idx]); + } finally { + publishedSequences.set(idx * CACHE_LINE_LONGS, seq); + } + } + + public void fillAndPublish(final C context, final BiConsumer filler) { + final long seq = nextSeq(); + final int idx = (int) (seq & mask); + try { + filler.accept(context, slots[idx]); + } finally { + publishedSequences.set(idx * CACHE_LINE_LONGS, seq); + } + } + + public void fillAndPublish( + final C1 context1, + final C2 context2, + final TriConsumer filler) { + final long seq = nextSeq(); + final int idx = (int) (seq & mask); + try { + filler.accept(context1, context2, slots[idx]); + } finally { + publishedSequences.set(idx * CACHE_LINE_LONGS, seq); + } + } + + private long nextSeq() { + if (published >= size) { + throw new IllegalStateException( + "Batch over-published: size=" + size + " published=" + published); + } + final long seq = startSeq + published; + published++; + return seq; + } + } + + // ============ Low-level primitives ============ + // + // These three methods (tryClaimRange / slotAt / publish) expose the producer-side machinery + // directly. Callers manage the claimed sequence range themselves -- no per-call handle + // allocated, no callback dispatched. This is intended for hot paths where the higher-level + // tryWrite/tryClaim+Batch APIs would allocate per call (Iterator on the iterable being + // batched, or the Batch object itself) that escape analysis doesn't eliminate. + // + // Misuse hazards: every claimed sequence MUST be published exactly once; sequences must be + // published in [start, start+n) range only; the slot returned by slotAt() must not escape past + // the next publish() of any sequence (the producer "owns" the slot until publish). + + /** + * Try to claim a contiguous range of {@code n} sequences in a single CAS. Returns the + * start sequence on success (a non-negative long) or {@code -1L} if the ring doesn't have + * room for the whole batch. The caller must publish each sequence in {@code [start, start + n)} + * exactly once via {@link #publish(long)}. + * + * @throws IllegalArgumentException if {@code n < 1} + */ + public long tryClaimRange(final int n) { + if (n < 1) { + throw new IllegalArgumentException("n must be >= 1, got " + n); + } + while (true) { + final long current = producerCursor; + final long consumed = consumerCursor; + final long next = current + n; + if (next - consumed > capacity) { + return -1L; + } + if (PRODUCER_CURSOR.compareAndSet(this, current, next)) { + return current + 1L; + } + } + } + + /** + * Slot for sequence {@code seq}. Only safe when {@code seq} is in a range currently owned by the + * caller (claimed via {@link #tryClaimRange} and not yet published). + */ + public T slotAt(final long seq) { + return slots[(int) (seq & mask)]; + } + + /** + * Publish sequence {@code seq}, making the slot at {@code slotAt(seq)} visible to the consumer. + * Release semantics via {@link AtomicLongArray#set}. + */ + public void publish(final long seq) { + publishedSequences.set(((int) (seq & mask)) * CACHE_LINE_LONGS, seq); + } + + private static int nextPowerOfTwo(final int n) { + if (n <= 1) return 1; + // 32 - leadingZeros(n-1) gives the number of bits needed to represent n-1; shifting 1 by that + // gives the smallest power of two >= n. + final int bits = 32 - Integer.numberOfLeadingZeros(n - 1); + return 1 << bits; + } +} diff --git a/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java b/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java new file mode 100644 index 00000000000..6aeb635abd0 --- /dev/null +++ b/internal-api/src/test/java/datadog/trace/util/concurrent/MpscRingBufferTest.java @@ -0,0 +1,498 @@ +package datadog.trace.util.concurrent; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import datadog.trace.api.function.TriConsumer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiConsumer; +import org.junit.jupiter.api.Test; + +class MpscRingBufferTest { + + /** Mutable slot used by the tests; replaces the per-publish allocation a real consumer avoids. */ + static final class Slot { + int value; + String tag; + } + + // ============ Construction ============ + + @Test + void capacityRoundsUpToPowerOfTwo() { + assertEquals(1, new MpscRingBuffer<>(Slot::new, 1).capacity()); + assertEquals(2, new MpscRingBuffer<>(Slot::new, 2).capacity()); + assertEquals(4, new MpscRingBuffer<>(Slot::new, 3).capacity()); + assertEquals(16, new MpscRingBuffer<>(Slot::new, 10).capacity()); + assertEquals(1024, new MpscRingBuffer<>(Slot::new, 1024).capacity()); + assertEquals(2048, new MpscRingBuffer<>(Slot::new, 1025).capacity()); + } + + @Test + void rejectsNonPositiveCapacityHint() { + assertThrows(IllegalArgumentException.class, () -> new MpscRingBuffer<>(Slot::new, 0)); + assertThrows(IllegalArgumentException.class, () -> new MpscRingBuffer<>(Slot::new, -1)); + } + + @Test + void slotsArePreAllocatedFromFactory() { + AtomicInteger calls = new AtomicInteger(); + MpscRingBuffer ring = + new MpscRingBuffer<>( + () -> { + calls.incrementAndGet(); + return new Slot(); + }, + 8); + assertEquals(8, ring.capacity()); + assertEquals(8, calls.get(), "factory must run capacity times during construction"); + } + + // ============ Basic produce / consume ============ + + @Test + void emptyRingIsEmpty() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + assertTrue(ring.isEmpty()); + assertEquals(0, ring.size()); + assertEquals(0, ring.drain(s -> {})); + } + + @Test + void singleWriteThenDrain() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + assertTrue( + ring.tryWrite( + s -> { + s.value = 42; + s.tag = "hello"; + })); + assertEquals(1, ring.size()); + + List seen = new ArrayList<>(); + int drained = ring.drain(s -> seen.add(s.value)); + assertEquals(1, drained); + assertEquals(Arrays.asList(42), seen); + assertTrue(ring.isEmpty()); + } + + @Test + void writesPreserveFIFOOrder() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + for (int i = 0; i < 6; i++) { + final int v = i; + assertTrue(ring.tryWrite(s -> s.value = v)); + } + List seen = new ArrayList<>(); + ring.drain(s -> seen.add(s.value)); + assertEquals(Arrays.asList(0, 1, 2, 3, 4, 5), seen); + } + + // ============ Full / drop behavior ============ + + @Test + void writesBeyondCapacityFail() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + for (int i = 0; i < 4; i++) { + assertTrue(ring.tryWrite(s -> s.value = 0)); + } + assertFalse(ring.tryWrite(s -> s.value = 0), "5th write must fail when capacity == 4"); + assertEquals(4, ring.size()); + } + + @Test + void writesResumeAfterDrain() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + for (int i = 0; i < 4; i++) { + final int v = i; + assertTrue(ring.tryWrite(s -> s.value = v)); + } + assertFalse(ring.tryWrite(s -> {})); + + AtomicInteger sum = new AtomicInteger(); + ring.drain(s -> sum.addAndGet(s.value)); + assertEquals(0 + 1 + 2 + 3, sum.get()); + + // Now should accept another full round. + for (int i = 100; i < 104; i++) { + final int v = i; + assertTrue(ring.tryWrite(s -> s.value = v)); + } + assertFalse(ring.tryWrite(s -> {})); + } + + // ============ Context-passing variants (context first, slot last) ============ + + @Test + void writeAndDrainWithOneContext() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + BiConsumer filler = (ctx, s) -> s.value = ctx; + assertTrue(ring.tryWrite(7, filler)); + assertTrue(ring.tryWrite(8, filler)); + + List seen = new ArrayList<>(); + BiConsumer, Slot> reader = (sink, s) -> sink.add(s.value); + assertEquals(2, ring.drain(seen, reader)); + assertEquals(Arrays.asList(7, 8), seen); + } + + @Test + void writeAndDrainWithTwoContexts() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + TriConsumer filler = + (v, t, s) -> { + s.value = v; + s.tag = t; + }; + assertTrue(ring.tryWrite(1, "a", filler)); + assertTrue(ring.tryWrite(2, "b", filler)); + + List tags = new ArrayList<>(); + List vals = new ArrayList<>(); + TriConsumer, List, Slot> reader = + (ts, vs, s) -> { + ts.add(s.tag); + vs.add(s.value); + }; + assertEquals(2, ring.drain(tags, vals, reader)); + assertEquals(Arrays.asList("a", "b"), tags); + assertEquals(Arrays.asList(1, 2), vals); + } + + // ============ Concurrency ============ + + @Test + void manyProducersSingleConsumerSeesEveryWrittenValue() throws InterruptedException { + final int producers = 8; + final int perProducer = 50_000; + final int total = producers * perProducer; + + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 1024); + + ExecutorService producerPool = Executors.newFixedThreadPool(producers); + AtomicInteger dropped = new AtomicInteger(); + AtomicInteger written = new AtomicInteger(); + CountDownLatch start = new CountDownLatch(1); + + BiConsumer filler = (v, s) -> s.value = v; + + for (int p = 0; p < producers; p++) { + final int base = p * perProducer; + producerPool.submit( + () -> { + try { + start.await(); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + return; + } + for (int i = 0; i < perProducer; i++) { + int v = base + i; + // Spin until the consumer makes room. We're testing correctness, not throughput. + while (!ring.tryWrite(v, filler)) { + dropped.incrementAndGet(); + Thread.yield(); + } + written.incrementAndGet(); + } + }); + } + + Set seen = new HashSet<>(total); + BiConsumer, Slot> reader = (sink, s) -> sink.add(s.value); + + Thread consumer = + new Thread( + () -> { + while (seen.size() < total) { + if (ring.drain(seen, reader) == 0) Thread.yield(); + } + }, + "ring-consumer"); + consumer.start(); + + start.countDown(); + producerPool.shutdown(); + assertTrue(producerPool.awaitTermination(30, TimeUnit.SECONDS), "producers timed out"); + assertTrue(consumer.isAlive() || seen.size() == total); + consumer.join(30_000); + assertFalse(consumer.isAlive(), "consumer timed out"); + + assertEquals(total, written.get(), "every producer call should eventually succeed"); + assertEquals(total, seen.size(), "consumer must see every value exactly once"); + } + + @Test + void sizeReflectsOutstandingWrites() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + assertEquals(0, ring.size()); + ring.tryWrite(s -> {}); + assertEquals(1, ring.size()); + ring.tryWrite(s -> {}); + assertEquals(2, ring.size()); + ring.drain(s -> {}); + assertEquals(0, ring.size()); + } + + @Test + void throwingFillerStillPublishesSoConsumerDoesntHang() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + + // First write succeeds. + assertTrue(ring.tryWrite(s -> s.value = 1)); + + // Second write throws midway through filling. Slot must still be published so the consumer's + // drain can advance past it. + RuntimeException boom = new RuntimeException("boom"); + RuntimeException thrown = + assertThrows( + RuntimeException.class, + () -> + ring.tryWrite( + s -> { + s.value = 2; + throw boom; + })); + assertSame(boom, thrown); + + // Third write proves the ring's cursors are still healthy after the throw. + assertTrue(ring.tryWrite(s -> s.value = 3)); + + // Consumer drains all three: it must not hang on the partially-filled slot. + List seen = new ArrayList<>(); + int drained = ring.drain(s -> seen.add(s.value)); + assertEquals(3, drained, "consumer must advance past the throwing slot"); + assertEquals(Arrays.asList(1, 2, 3), seen, "throwing slot keeps whatever filler had written"); + } + + // ============ Batch claim (tryClaim) ============ + + @Test + void tryClaimReturnsBatchOfRequestedSize() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + MpscRingBuffer.Batch batch = ring.tryClaim(3); + + assertNotNull(batch); + assertEquals(3, batch.size()); + assertEquals(3, batch.remaining()); + } + + @Test + void tryClaimRejectsZeroOrNegative() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + assertThrows(IllegalArgumentException.class, () -> ring.tryClaim(0)); + assertThrows(IllegalArgumentException.class, () -> ring.tryClaim(-1)); + } + + @Test + void tryClaimReturnsNullWhenRingCantFitBatch() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + assertNotNull(ring.tryClaim(3)); + // Only 1 slot left; claiming 2 must fail wholesale. + assertNull(ring.tryClaim(2), "all-or-nothing: partial batches are not allowed"); + // But one more slot does fit. + assertNotNull(ring.tryClaim(1)); + // Now full. + assertNull(ring.tryClaim(1)); + } + + @Test + void tryClaimFillAndPublishDeliversAllToDrain() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + MpscRingBuffer.Batch batch = ring.tryClaim(5); + + for (int i = 0; i < 5; i++) { + final int v = i; + batch.fillAndPublish(s -> s.value = v); + } + assertEquals(0, batch.remaining()); + + List seen = new ArrayList<>(); + int drained = ring.drain(s -> seen.add(s.value)); + assertEquals(5, drained); + assertEquals(Arrays.asList(0, 1, 2, 3, 4), seen, "batch publishes in order"); + } + + @Test + void overPublishingBatchThrows() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + MpscRingBuffer.Batch batch = ring.tryClaim(2); + + batch.fillAndPublish(s -> s.value = 1); + batch.fillAndPublish(s -> s.value = 2); + assertThrows(IllegalStateException.class, () -> batch.fillAndPublish(s -> s.value = 3)); + } + + @Test + void batchSupportsContextFillers() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + MpscRingBuffer.Batch batch = ring.tryClaim(3); + + BiConsumer oneCtx = (v, s) -> s.value = v; + TriConsumer twoCtx = + (v, t, s) -> { + s.value = v; + s.tag = t; + }; + + batch.fillAndPublish(s -> s.value = 1); + batch.fillAndPublish(2, oneCtx); + batch.fillAndPublish(3, "three", twoCtx); + + List seen = new ArrayList<>(); + ring.drain(s -> seen.add(s.value + "/" + s.tag)); + assertEquals(Arrays.asList("1/null", "2/null", "3/three"), seen); + } + + @Test + void batchFillerThrowStillPublishesAndAdvances() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + MpscRingBuffer.Batch batch = ring.tryClaim(3); + + batch.fillAndPublish(s -> s.value = 1); + RuntimeException boom = new RuntimeException("boom"); + assertThrows( + RuntimeException.class, + () -> + batch.fillAndPublish( + s -> { + s.value = 2; + throw boom; + })); + // The throwing slot's sequence has already been consumed; published counter advanced. + assertEquals(1, batch.remaining(), "throwing slot still counts as published"); + batch.fillAndPublish(s -> s.value = 3); + + List seen = new ArrayList<>(); + int drained = ring.drain(s -> seen.add(s.value)); + assertEquals(3, drained); + assertEquals(Arrays.asList(1, 2, 3), seen); + } + + // ============ Low-level primitives (tryClaimRange / slotAt / publish) ============ + + @Test + void tryClaimRangeReturnsStartSequence() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + long start1 = ring.tryClaimRange(3); + long start2 = ring.tryClaimRange(2); + + assertEquals(0L, start1, "first range starts at sequence 0"); + assertEquals(3L, start2, "second range begins immediately after the first"); + } + + @Test + void tryClaimRangeRejectsZeroOrNegative() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + assertThrows(IllegalArgumentException.class, () -> ring.tryClaimRange(0)); + assertThrows(IllegalArgumentException.class, () -> ring.tryClaimRange(-1)); + } + + @Test + void tryClaimRangeReturnsMinusOneWhenFull() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + assertTrue(ring.tryClaimRange(3) >= 0); + assertEquals(-1L, ring.tryClaimRange(2), "all-or-nothing"); + assertTrue(ring.tryClaimRange(1) >= 0); + assertEquals(-1L, ring.tryClaimRange(1)); + } + + @Test + void slotAtAndPublishRoundTrip() { + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 8); + long start = ring.tryClaimRange(3); + assertTrue(start >= 0); + + for (int i = 0; i < 3; i++) { + long seq = start + i; + Slot slot = ring.slotAt(seq); + slot.value = (int) (seq + 100); + ring.publish(seq); + } + + List seen = new ArrayList<>(); + int drained = ring.drain(s -> seen.add(s.value)); + assertEquals(3, drained); + assertEquals(Arrays.asList(100, 101, 102), seen); + } + + @Test + void slotAtReturnsSameInstanceForSameModuloPosition() { + // After publish+drain wraps around, the slot at sequence N and sequence N+capacity are the + // same physical object (this is the whole point of the ring). + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 4); + Slot firstSlot = ring.slotAt(0L); + Slot wrappedSlot = ring.slotAt(4L); // 4 & mask(3) == 0 + assertSame(firstSlot, wrappedSlot); + } + + @Test + void concurrentBatchClaimsAreOrderedAndDontInterleave() throws InterruptedException { + final int producers = 8; + final int batchesPerProducer = 200; + final int batchSize = 16; + final int total = producers * batchesPerProducer * batchSize; + + MpscRingBuffer ring = new MpscRingBuffer<>(Slot::new, 256); + ExecutorService pool = Executors.newFixedThreadPool(producers); + AtomicInteger writes = new AtomicInteger(); + CountDownLatch start = new CountDownLatch(1); + + for (int p = 0; p < producers; p++) { + final int base = p * batchesPerProducer * batchSize; + pool.submit( + () -> { + try { + start.await(); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + return; + } + for (int b = 0; b < batchesPerProducer; b++) { + MpscRingBuffer.Batch batch; + while ((batch = ring.tryClaim(batchSize)) == null) { + Thread.yield(); + } + for (int i = 0; i < batchSize; i++) { + final int v = base + b * batchSize + i; + batch.fillAndPublish(s -> s.value = v); + } + writes.addAndGet(batchSize); + } + }); + } + + Set seen = new HashSet<>(total); + Thread consumer = + new Thread( + () -> { + while (seen.size() < total) { + if (ring.drain((Slot s) -> seen.add(s.value)) == 0) Thread.yield(); + } + }, + "ring-batch-consumer"); + consumer.start(); + + start.countDown(); + pool.shutdown(); + assertTrue(pool.awaitTermination(30, TimeUnit.SECONDS), "producers timed out"); + consumer.join(30_000); + assertFalse(consumer.isAlive(), "consumer timed out"); + assertEquals(total, writes.get()); + assertEquals(total, seen.size(), "consumer must see every value exactly once"); + } +} diff --git a/settings.gradle.kts b/settings.gradle.kts index e6c9469b9c6..dda1f432e6f 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -112,6 +112,7 @@ include( include( ":communication", + ":components:annotations", ":components:context", ":components:environment", ":components:http:http-api", diff --git a/utils/config-utils/build.gradle.kts b/utils/config-utils/build.gradle.kts index 84f494596d8..b1bb146231e 100644 --- a/utils/config-utils/build.gradle.kts +++ b/utils/config-utils/build.gradle.kts @@ -55,6 +55,7 @@ val excludedClassesInstructionCoverage by extra( ) dependencies { + compileOnly(project(":components:annotations")) implementation(project(":components:environment")) implementation(project(":dd-trace-api")) api(project(":utils:filesystem-utils")) diff --git a/utils/config-utils/src/main/java/datadog/trace/api/env/CapturedEnvironment.java b/utils/config-utils/src/main/java/datadog/trace/api/env/CapturedEnvironment.java index 44e55b6fd18..2db02edd88a 100644 --- a/utils/config-utils/src/main/java/datadog/trace/api/env/CapturedEnvironment.java +++ b/utils/config-utils/src/main/java/datadog/trace/api/env/CapturedEnvironment.java @@ -2,6 +2,7 @@ import datadog.environment.JavaVirtualMachine; import datadog.trace.api.config.GeneralConfig; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.config.inversion.ConfigHelper; import java.io.File; import java.util.HashMap; @@ -24,12 +25,7 @@ public ProcessInfo() { mainClass = JavaVirtualMachine.getMainClass(); } - /** - * Visible for testing - * - * @param mainClass - * @param jarFile - */ + @VisibleForTesting ProcessInfo(String mainClass, File jarFile) { this.mainClass = mainClass; this.jarFile = jarFile; @@ -55,17 +51,13 @@ public ProcessInfo getProcessInfo() { return processInfo; } - // Testing purposes + @VisibleForTesting static void useFixedEnv(final Map props) { INSTANCE.properties.clear(); INSTANCE.properties.putAll(props); } - /** - * For testing purposes. - * - * @param processInfo - */ + @VisibleForTesting static void useFixedProcessInfo(final ProcessInfo processInfo) { INSTANCE.processInfo = processInfo; } diff --git a/utils/logging-utils/src/main/java/datadog/logging/IOLogger.java b/utils/logging-utils/src/main/java/datadog/logging/IOLogger.java index 407f7509454..af21f4a0c57 100644 --- a/utils/logging-utils/src/main/java/datadog/logging/IOLogger.java +++ b/utils/logging-utils/src/main/java/datadog/logging/IOLogger.java @@ -2,6 +2,7 @@ import static datadog.trace.api.telemetry.LogCollector.EXCLUDE_TELEMETRY; +import datadog.trace.api.internal.VisibleForTesting; import java.util.concurrent.TimeUnit; import org.slf4j.Logger; @@ -15,7 +16,7 @@ public IOLogger(final Logger log) { this(log, new RatelimitedLogger(log, 5, TimeUnit.MINUTES)); } - // Visible for testing + @VisibleForTesting IOLogger(final Logger log, final RatelimitedLogger ratelimitedLogger) { this.log = log; this.ratelimitedLogger = ratelimitedLogger; diff --git a/utils/logging-utils/src/main/java/datadog/logging/RatelimitedLogger.java b/utils/logging-utils/src/main/java/datadog/logging/RatelimitedLogger.java index 3e5728b6bb7..b88505c1e59 100644 --- a/utils/logging-utils/src/main/java/datadog/logging/RatelimitedLogger.java +++ b/utils/logging-utils/src/main/java/datadog/logging/RatelimitedLogger.java @@ -1,5 +1,6 @@ package datadog.logging; +import datadog.trace.api.internal.VisibleForTesting; import datadog.trace.api.time.SystemTimeSource; import datadog.trace.api.time.TimeSource; import java.util.Locale; @@ -25,7 +26,7 @@ public RatelimitedLogger(final Logger log, final int delay, final TimeUnit timeU this(log, delay, timeUnit, SystemTimeSource.INSTANCE); } - // Visible for testing + @VisibleForTesting RatelimitedLogger( final Logger log, final int delay, final TimeUnit timeUnit, final TimeSource timeSource) { this.log = log;