diff --git a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy index 0e35061d8ee..f409301b9b2 100644 --- a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy +++ b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy @@ -3,7 +3,6 @@ package datadog.smoketest import com.squareup.moshi.Moshi import com.squareup.moshi.Types import datadog.environment.JavaVirtualMachine -import datadog.environment.OperatingSystem import datadog.trace.agent.test.server.http.TestHttpServer.HandlerApi.RequestApi import datadog.trace.api.config.GeneralConfig import datadog.trace.test.util.Flaky @@ -348,258 +347,6 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { return logEvent[key] } - /** - * Like {@link AbstractSmokeTest#waitForTraceCount} but checks process liveness on every poll - * iteration and dumps diagnostic state on failure, so CI failures produce actionable output - * instead of a bare "Condition not satisfied" after a 30s timeout. - */ - int waitForTraceCountAlive(int count) { - try { - defaultPoll.eventually { - if (traceDecodingFailure != null) { - throw traceDecodingFailure - } - // Check the count BEFORE liveness — the process may have exited normally - // after delivering all traces, and we don't want to treat that as a failure. - if (traceCount.get() >= count) { - return - } - if (testedProcess != null && !testedProcess.isAlive()) { - def lastLines = tailProcessLog(20) - // RuntimeException (not AssertionError) so PollingConditions propagates - // immediately instead of retrying for the full timeout. - throw new RuntimeException( - "Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces " + - "(received ${traceCount.get()}, RC polls: ${rcClientMessages.size()}).\n" + - "Last process output:\n${lastLines}") - } - assert traceCount.get() >= count - } - } catch (AssertionError e) { - // The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state - def alive = testedProcess?.isAlive() - def lastLines = tailProcessLog(30) - def threadDump = alive ? dumpThreadStacks() : "(process not alive, skipping thread dump)" - throw new AssertionError( - "Timed out waiting for ${count} traces after ${defaultPoll.timeout}s. " + - "traceCount=${traceCount.get()}, process.alive=${alive}, " + - "RC polls received: ${rcClientMessages.size()}.\n" + - "Last process output:\n${lastLines}\n" + - "Thread dump:\n${threadDump}", e) - } - traceCount.get() - } - - /** - * Capture a thread dump of the forked process via {@code jstack}. jstack's output is captured by - * the smoketest JVM directly, bypassing the tested-process output-capture thread that has been - * observed to be starved at timeout (which makes the SIGQUIT-via-stderr approach unreliable for - * exactly the failures we want to diagnose). - * - *
No raw {@code kill -3} fallback: PID reuse on shared CI hosts could cause us to signal an
- * unrelated process if the child has exited since the surrounding liveness check.
- */
- private String dumpThreadStacks() {
- try {
- if (testedProcess == null) {
- return "(no tested process)"
- }
- if (OperatingSystem.isWindows()) {
- return "(thread dump not supported on Windows)"
- }
- long pid = getTestedProcessPid()
- if (pid <= 0) {
- return "(could not determine pid)"
- }
- // Re-check liveness immediately before invoking jstack. The earlier check that gates this
- // method runs ~1 statement away, but if the child has exited and been reaped since then,
- // the OS may have reused the PID — jstack-ing the wrong process would attach misleading
- // diagnostics to the test failure.
- if (!testedProcess.isAlive()) {
- return "(process exited between liveness check and dump; skipping to avoid PID reuse)"
- }
- String jstackOut = runJstack(pid)
- if (jstackOut == null) {
- return "(jstack not available or failed)"
- }
- return filterThreadDump(jstackOut)
- } catch (Throwable t) {
- // Never let a diagnostic failure mask the original AssertionError.
- return "(thread dump failed: ${t.getClass().simpleName}: ${t.message})"
- }
- }
-
- // Approximate budget for the inline dump in error.message. Datadog CI Visibility caps
- // error.message at ~5000 chars; this leaves a few hundred for the "Timed out waiting..."
- // prefix and the elision marker.
- private static final int INLINE_DUMP_CAP = 4700
-
- /**
- * Reduce a jstack thread dump to the threads most likely to explain a hang: the main thread,
- * dd-trace agent threads (dd-*, datadog-*), OkHttp threads, and anything BLOCKED. Drops known
- * JVM boilerplate (compiler/GC/reference handler/etc). Truncates to {@link #INLINE_DUMP_CAP}
- * with an elision marker.
- */
- private String filterThreadDump(String fullDump) {
- int firstBlockIdx = fullDump.indexOf('\n"')
- if (firstBlockIdx < 0) {
- // No recognizable thread blocks — return the original, truncated if needed
- return fullDump.length() > INLINE_DUMP_CAP
- ? fullDump.substring(0, INLINE_DUMP_CAP) + "\n(truncated)"
- : fullDump
- }
- String header = fullDump.substring(0, firstBlockIdx + 1)
- String rest = fullDump.substring(firstBlockIdx + 1)
-
- List