Skip to content

Commit 1b95bfa

Browse files
bm1549devflow.devflow-routing-intake
andauthored
Remove log-injection diagnostic code now that deadlock is fixed in #11509 (#11548)
Remove log-injection diagnostic code now that deadlock is fixed in #11509 The DDTraceId/DD64bTraceId class-initialization deadlock was the root cause of the traceCount=0 flakiness that prompted these diagnostics. With the deadlock fixed, the diagnostic instrumentation is no longer needed. Removes from #11075 and #11400: - waitForTraceCountAlive() and all helpers (dumpThreadStacks, filterThreadDump, isBoilerplateThread, isHighPriorityThread, getTestedProcessPid, runJstack, tailProcessLog, INLINE_DUMP_CAP) - The OperatingSystem import that was only used by dumpThreadStacks Keeps the actual fixes from #10999, #11375: - BaseApplication TIMEOUT_IN_NANOS 10s→30s (prevents app-side timeout racing the test-side timeout under CI load) - @flaky(isZulu8()) and JavaVirtualMachine.isZulu8() (covers a separate JUL race on Zulu 8 that is not fixed by #11509) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> Merge branch 'master' into brian.marks/remove-log-injection-diagnostics Co-authored-by: devflow.devflow-routing-intake <devflow.devflow-routing-intake@kubernetes.us1.ddbuild.io>
1 parent 951d9a7 commit 1b95bfa

1 file changed

Lines changed: 3 additions & 256 deletions

File tree

dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy

Lines changed: 3 additions & 256 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package datadog.smoketest
33
import com.squareup.moshi.Moshi
44
import com.squareup.moshi.Types
55
import datadog.environment.JavaVirtualMachine
6-
import datadog.environment.OperatingSystem
76
import datadog.trace.agent.test.server.http.TestHttpServer.HandlerApi.RequestApi
87
import datadog.trace.api.config.GeneralConfig
98
import datadog.trace.test.util.Flaky
@@ -348,258 +347,6 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
348347
return logEvent[key]
349348
}
350349

351-
/**
352-
* Like {@link AbstractSmokeTest#waitForTraceCount} but checks process liveness on every poll
353-
* iteration and dumps diagnostic state on failure, so CI failures produce actionable output
354-
* instead of a bare "Condition not satisfied" after a 30s timeout.
355-
*/
356-
int waitForTraceCountAlive(int count) {
357-
try {
358-
defaultPoll.eventually {
359-
if (traceDecodingFailure != null) {
360-
throw traceDecodingFailure
361-
}
362-
// Check the count BEFORE liveness — the process may have exited normally
363-
// after delivering all traces, and we don't want to treat that as a failure.
364-
if (traceCount.get() >= count) {
365-
return
366-
}
367-
if (testedProcess != null && !testedProcess.isAlive()) {
368-
def lastLines = tailProcessLog(20)
369-
// RuntimeException (not AssertionError) so PollingConditions propagates
370-
// immediately instead of retrying for the full timeout.
371-
throw new RuntimeException(
372-
"Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces " +
373-
"(received ${traceCount.get()}, RC polls: ${rcClientMessages.size()}).\n" +
374-
"Last process output:\n${lastLines}")
375-
}
376-
assert traceCount.get() >= count
377-
}
378-
} catch (AssertionError e) {
379-
// The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state
380-
def alive = testedProcess?.isAlive()
381-
def lastLines = tailProcessLog(30)
382-
def threadDump = alive ? dumpThreadStacks() : "(process not alive, skipping thread dump)"
383-
throw new AssertionError(
384-
"Timed out waiting for ${count} traces after ${defaultPoll.timeout}s. " +
385-
"traceCount=${traceCount.get()}, process.alive=${alive}, " +
386-
"RC polls received: ${rcClientMessages.size()}.\n" +
387-
"Last process output:\n${lastLines}\n" +
388-
"Thread dump:\n${threadDump}", e)
389-
}
390-
traceCount.get()
391-
}
392-
393-
/**
394-
* Capture a thread dump of the forked process via {@code jstack}. jstack's output is captured by
395-
* the smoketest JVM directly, bypassing the tested-process output-capture thread that has been
396-
* observed to be starved at timeout (which makes the SIGQUIT-via-stderr approach unreliable for
397-
* exactly the failures we want to diagnose).
398-
*
399-
* <p>No raw {@code kill -3} fallback: PID reuse on shared CI hosts could cause us to signal an
400-
* unrelated process if the child has exited since the surrounding liveness check.
401-
*/
402-
private String dumpThreadStacks() {
403-
try {
404-
if (testedProcess == null) {
405-
return "(no tested process)"
406-
}
407-
if (OperatingSystem.isWindows()) {
408-
return "(thread dump not supported on Windows)"
409-
}
410-
long pid = getTestedProcessPid()
411-
if (pid <= 0) {
412-
return "(could not determine pid)"
413-
}
414-
// Re-check liveness immediately before invoking jstack. The earlier check that gates this
415-
// method runs ~1 statement away, but if the child has exited and been reaped since then,
416-
// the OS may have reused the PID — jstack-ing the wrong process would attach misleading
417-
// diagnostics to the test failure.
418-
if (!testedProcess.isAlive()) {
419-
return "(process exited between liveness check and dump; skipping to avoid PID reuse)"
420-
}
421-
String jstackOut = runJstack(pid)
422-
if (jstackOut == null) {
423-
return "(jstack not available or failed)"
424-
}
425-
return filterThreadDump(jstackOut)
426-
} catch (Throwable t) {
427-
// Never let a diagnostic failure mask the original AssertionError.
428-
return "(thread dump failed: ${t.getClass().simpleName}: ${t.message})"
429-
}
430-
}
431-
432-
// Approximate budget for the inline dump in error.message. Datadog CI Visibility caps
433-
// error.message at ~5000 chars; this leaves a few hundred for the "Timed out waiting..."
434-
// prefix and the elision marker.
435-
private static final int INLINE_DUMP_CAP = 4700
436-
437-
/**
438-
* Reduce a jstack thread dump to the threads most likely to explain a hang: the main thread,
439-
* dd-trace agent threads (dd-*, datadog-*), OkHttp threads, and anything BLOCKED. Drops known
440-
* JVM boilerplate (compiler/GC/reference handler/etc). Truncates to {@link #INLINE_DUMP_CAP}
441-
* with an elision marker.
442-
*/
443-
private String filterThreadDump(String fullDump) {
444-
int firstBlockIdx = fullDump.indexOf('\n"')
445-
if (firstBlockIdx < 0) {
446-
// No recognizable thread blocks — return the original, truncated if needed
447-
return fullDump.length() > INLINE_DUMP_CAP
448-
? fullDump.substring(0, INLINE_DUMP_CAP) + "\n(truncated)"
449-
: fullDump
450-
}
451-
String header = fullDump.substring(0, firstBlockIdx + 1)
452-
String rest = fullDump.substring(firstBlockIdx + 1)
453-
454-
List<String> blocks = []
455-
int i = 0
456-
while (i < rest.length()) {
457-
int next = rest.indexOf('\n"', i)
458-
if (next < 0) {
459-
blocks.add(rest.substring(i))
460-
break
461-
}
462-
blocks.add(rest.substring(i, next + 1))
463-
i = next + 1
464-
}
465-
466-
List<String> highPriority = []
467-
List<String> lowPriority = []
468-
int boilerplateDropped = 0
469-
for (String block : blocks) {
470-
def m = block =~ /^"([^"]+)"/
471-
String name = m.find() ? m.group(1) : ''
472-
if (isBoilerplateThread(name)) {
473-
boilerplateDropped++
474-
} else if (isHighPriorityThread(name, block)) {
475-
highPriority.add(block)
476-
} else {
477-
lowPriority.add(block)
478-
}
479-
}
480-
481-
StringBuilder out = new StringBuilder(header)
482-
int elided = 0
483-
for (String block : highPriority + lowPriority) {
484-
if (out.length() + block.length() + 120 > INLINE_DUMP_CAP) {
485-
elided++
486-
continue
487-
}
488-
out.append(block)
489-
}
490-
if (boilerplateDropped > 0 || elided > 0) {
491-
out.append("\n(elided ${boilerplateDropped} JVM-boilerplate thread(s)")
492-
if (elided > 0) {
493-
out.append(", elided ${elided} other thread(s) for size")
494-
}
495-
out.append(")")
496-
}
497-
return out.toString()
498-
}
499-
500-
private static boolean isBoilerplateThread(String name) {
501-
if (name in [
502-
"Reference Handler", "Finalizer", "Signal Dispatcher", "Common-Cleaner",
503-
"Service Thread", "Monitor Deflation Thread", "Notification Thread",
504-
"Attach Listener", "process reaper", "Sweeper thread", "VM Thread", "VM Periodic Task Thread"
505-
]) {
506-
return true
507-
}
508-
return name.startsWith("C1 CompilerThread") ||
509-
name.startsWith("C2 CompilerThread") ||
510-
name.startsWith("GC Thread") ||
511-
name.startsWith("G1 ") ||
512-
name.startsWith("ParGC ") ||
513-
name.startsWith("CMS ")
514-
}
515-
516-
private static boolean isHighPriorityThread(String name, String block) {
517-
if (name == "main") {
518-
return true
519-
}
520-
if (name.startsWith("dd-") || name.startsWith("datadog-")) {
521-
return true
522-
}
523-
if (name.startsWith("OkHttp") || name.contains("okhttp")) {
524-
return true
525-
}
526-
return block.contains("java.lang.Thread.State: BLOCKED")
527-
}
528-
529-
private long getTestedProcessPid() {
530-
try {
531-
return (long) testedProcess.getClass().getMethod("pid").invoke(testedProcess)
532-
} catch (Throwable ignored) {
533-
try {
534-
// UNIXProcess's private 'pid' field, for JDK 8 compatibility
535-
def field = testedProcess.getClass().getDeclaredField("pid")
536-
field.setAccessible(true)
537-
return field.getInt(testedProcess) as long
538-
} catch (Throwable ignored2) {
539-
return -1L
540-
}
541-
}
542-
}
543-
544-
private String runJstack(long pid) {
545-
def candidates = []
546-
// java.home is always set by the JVM and points to the active JDK/JRE; prefer it over the
547-
// JAVA_HOME env var which is frequently absent in CI runners even when Java is present.
548-
String javaHome = System.getProperty("java.home")
549-
if (javaHome) {
550-
candidates.add(javaHome + "/bin/jstack")
551-
}
552-
String javaHomeEnv = System.getenv("JAVA_HOME")
553-
if (javaHomeEnv && javaHomeEnv != javaHome) {
554-
candidates.add(javaHomeEnv + "/bin/jstack")
555-
}
556-
candidates.add("jstack")
557-
for (String cmd : candidates) {
558-
File tmp = null
559-
try {
560-
tmp = File.createTempFile("jstack", ".txt")
561-
// Redirect output to a file to avoid pipe-buffer deadlock — a full thread dump can
562-
// exceed the OS pipe buffer (typically 64 KB) before waitFor returns.
563-
Process p = new ProcessBuilder(cmd, String.valueOf(pid))
564-
.redirectErrorStream(true)
565-
.redirectOutput(tmp)
566-
.start()
567-
if (!p.waitFor(5, SECONDS)) {
568-
p.destroyForcibly()
569-
p.waitFor(2, SECONDS)
570-
continue
571-
}
572-
if (p.exitValue() == 0) {
573-
String output = tmp.getText("UTF-8")
574-
if (output) {
575-
return output
576-
}
577-
}
578-
} catch (Throwable ignored) {
579-
// try next candidate
580-
} finally {
581-
if (tmp != null) {
582-
tmp.delete()
583-
}
584-
}
585-
}
586-
return null
587-
}
588-
589-
private String tailProcessLog(int lines) {
590-
try {
591-
def logFile = new File(logFilePath)
592-
if (!logFile.exists()) {
593-
return "(log file does not exist: ${logFilePath})"
594-
}
595-
def allLines = logFile.readLines()
596-
def tail = allLines.size() > lines ? allLines[-lines..-1] : allLines
597-
return tail.join("\n")
598-
} catch (Exception e) {
599-
return "(failed to read log: ${e.message})"
600-
}
601-
}
602-
603350
def parseTraceFromStdOut( String line ) {
604351
if (line == null) {
605352
throw new IllegalArgumentException("Line is null")
@@ -618,7 +365,7 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
618365
@Flaky(condition = () -> JavaVirtualMachine.isIbm8() || JavaVirtualMachine.isOracleJDK8() || JavaVirtualMachine.isZulu8())
619366
def "check raw file injection"() {
620367
when:
621-
def count = waitForTraceCountAlive(2)
368+
def count = waitForTraceCount(2)
622369

623370
def newConfig = """
624371
{"lib_config":
@@ -627,12 +374,12 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
627374
""".toString()
628375
setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", newConfig)
629376

630-
count = waitForTraceCountAlive(3)
377+
count = waitForTraceCount(3)
631378

632379
setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", """{"lib_config":{}}""".toString())
633380

634381
// Wait for all 4 traces before waiting for process exit to ensure trace delivery is confirmed
635-
count = waitForTraceCountAlive(4)
382+
count = waitForTraceCount(4)
636383

637384
assert testedProcess.waitFor(TIMEOUT_SECS, SECONDS) : "Process did not exit within ${TIMEOUT_SECS}s"
638385
def exitValue = testedProcess.exitValue()

0 commit comments

Comments
 (0)