@@ -3,7 +3,6 @@ package datadog.smoketest
33import com.squareup.moshi.Moshi
44import com.squareup.moshi.Types
55import datadog.environment.JavaVirtualMachine
6- import datadog.environment.OperatingSystem
76import datadog.trace.agent.test.server.http.TestHttpServer.HandlerApi.RequestApi
87import datadog.trace.api.config.GeneralConfig
98import datadog.trace.test.util.Flaky
@@ -348,258 +347,6 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
348347 return logEvent[key]
349348 }
350349
351- /**
352- * Like {@link AbstractSmokeTest#waitForTraceCount} but checks process liveness on every poll
353- * iteration and dumps diagnostic state on failure, so CI failures produce actionable output
354- * instead of a bare "Condition not satisfied" after a 30s timeout.
355- */
356- int waitForTraceCountAlive (int count ) {
357- try {
358- defaultPoll. eventually {
359- if (traceDecodingFailure != null ) {
360- throw traceDecodingFailure
361- }
362- // Check the count BEFORE liveness — the process may have exited normally
363- // after delivering all traces, and we don't want to treat that as a failure.
364- if (traceCount. get() >= count) {
365- return
366- }
367- if (testedProcess != null && ! testedProcess. isAlive()) {
368- def lastLines = tailProcessLog(20 )
369- // RuntimeException (not AssertionError) so PollingConditions propagates
370- // immediately instead of retrying for the full timeout.
371- throw new RuntimeException (
372- " Process exited with code ${ testedProcess.exitValue()} while waiting for ${ count} traces " +
373- " (received ${ traceCount.get()} , RC polls: ${ rcClientMessages.size()} ).\n " +
374- " Last process output:\n ${ lastLines} " )
375- }
376- assert traceCount. get() >= count
377- }
378- } catch (AssertionError e) {
379- // The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state
380- def alive = testedProcess?. isAlive()
381- def lastLines = tailProcessLog(30 )
382- def threadDump = alive ? dumpThreadStacks() : " (process not alive, skipping thread dump)"
383- throw new AssertionError (
384- " Timed out waiting for ${ count} traces after ${ defaultPoll.timeout} s. " +
385- " traceCount=${ traceCount.get()} , process.alive=${ alive} , " +
386- " RC polls received: ${ rcClientMessages.size()} .\n " +
387- " Last process output:\n ${ lastLines} \n " +
388- " Thread dump:\n ${ threadDump} " , e)
389- }
390- traceCount. get()
391- }
392-
393- /**
394- * Capture a thread dump of the forked process via {@code jstack }. jstack's output is captured by
395- * the smoketest JVM directly, bypassing the tested-process output-capture thread that has been
396- * observed to be starved at timeout (which makes the SIGQUIT-via-stderr approach unreliable for
397- * exactly the failures we want to diagnose).
398- *
399- * <p >No raw {@code kill -3 } fallback: PID reuse on shared CI hosts could cause us to signal an
400- * unrelated process if the child has exited since the surrounding liveness check.
401- */
402- private String dumpThreadStacks () {
403- try {
404- if (testedProcess == null ) {
405- return " (no tested process)"
406- }
407- if (OperatingSystem . isWindows()) {
408- return " (thread dump not supported on Windows)"
409- }
410- long pid = getTestedProcessPid()
411- if (pid <= 0 ) {
412- return " (could not determine pid)"
413- }
414- // Re-check liveness immediately before invoking jstack. The earlier check that gates this
415- // method runs ~1 statement away, but if the child has exited and been reaped since then,
416- // the OS may have reused the PID — jstack-ing the wrong process would attach misleading
417- // diagnostics to the test failure.
418- if (! testedProcess. isAlive()) {
419- return " (process exited between liveness check and dump; skipping to avoid PID reuse)"
420- }
421- String jstackOut = runJstack(pid)
422- if (jstackOut == null ) {
423- return " (jstack not available or failed)"
424- }
425- return filterThreadDump(jstackOut)
426- } catch (Throwable t) {
427- // Never let a diagnostic failure mask the original AssertionError.
428- return " (thread dump failed: ${ t.getClass().simpleName} : ${ t.message} )"
429- }
430- }
431-
432- // Approximate budget for the inline dump in error.message. Datadog CI Visibility caps
433- // error.message at ~5000 chars; this leaves a few hundred for the "Timed out waiting..."
434- // prefix and the elision marker.
435- private static final int INLINE_DUMP_CAP = 4700
436-
437- /**
438- * Reduce a jstack thread dump to the threads most likely to explain a hang: the main thread,
439- * dd-trace agent threads (dd-*, datadog-*), OkHttp threads, and anything BLOCKED. Drops known
440- * JVM boilerplate (compiler/GC/reference handler/etc). Truncates to {@link #INLINE_DUMP_CAP}
441- * with an elision marker.
442- */
443- private String filterThreadDump (String fullDump ) {
444- int firstBlockIdx = fullDump. indexOf(' \n "' )
445- if (firstBlockIdx < 0 ) {
446- // No recognizable thread blocks — return the original, truncated if needed
447- return fullDump. length() > INLINE_DUMP_CAP
448- ? fullDump. substring(0 , INLINE_DUMP_CAP ) + " \n (truncated)"
449- : fullDump
450- }
451- String header = fullDump. substring(0 , firstBlockIdx + 1 )
452- String rest = fullDump. substring(firstBlockIdx + 1 )
453-
454- List<String > blocks = []
455- int i = 0
456- while (i < rest. length()) {
457- int next = rest. indexOf(' \n "' , i)
458- if (next < 0 ) {
459- blocks. add(rest. substring(i))
460- break
461- }
462- blocks. add(rest. substring(i, next + 1 ))
463- i = next + 1
464- }
465-
466- List<String > highPriority = []
467- List<String > lowPriority = []
468- int boilerplateDropped = 0
469- for (String block : blocks) {
470- def m = block =~ / ^"([^"]+)"/
471- String name = m. find() ? m. group(1 ) : ' '
472- if (isBoilerplateThread(name)) {
473- boilerplateDropped++
474- } else if (isHighPriorityThread(name, block)) {
475- highPriority. add(block)
476- } else {
477- lowPriority. add(block)
478- }
479- }
480-
481- StringBuilder out = new StringBuilder (header)
482- int elided = 0
483- for (String block : highPriority + lowPriority) {
484- if (out. length() + block. length() + 120 > INLINE_DUMP_CAP ) {
485- elided++
486- continue
487- }
488- out. append(block)
489- }
490- if (boilerplateDropped > 0 || elided > 0 ) {
491- out. append(" \n (elided ${ boilerplateDropped} JVM-boilerplate thread(s)" )
492- if (elided > 0 ) {
493- out. append(" , elided ${ elided} other thread(s) for size" )
494- }
495- out. append(" )" )
496- }
497- return out. toString()
498- }
499-
500- private static boolean isBoilerplateThread (String name ) {
501- if (name in [
502- " Reference Handler" , " Finalizer" , " Signal Dispatcher" , " Common-Cleaner" ,
503- " Service Thread" , " Monitor Deflation Thread" , " Notification Thread" ,
504- " Attach Listener" , " process reaper" , " Sweeper thread" , " VM Thread" , " VM Periodic Task Thread"
505- ]) {
506- return true
507- }
508- return name. startsWith(" C1 CompilerThread" ) ||
509- name. startsWith(" C2 CompilerThread" ) ||
510- name. startsWith(" GC Thread" ) ||
511- name. startsWith(" G1 " ) ||
512- name. startsWith(" ParGC " ) ||
513- name. startsWith(" CMS " )
514- }
515-
516- private static boolean isHighPriorityThread (String name , String block ) {
517- if (name == " main" ) {
518- return true
519- }
520- if (name. startsWith(" dd-" ) || name. startsWith(" datadog-" )) {
521- return true
522- }
523- if (name. startsWith(" OkHttp" ) || name. contains(" okhttp" )) {
524- return true
525- }
526- return block. contains(" java.lang.Thread.State: BLOCKED" )
527- }
528-
529- private long getTestedProcessPid () {
530- try {
531- return (long ) testedProcess. getClass(). getMethod(" pid" ). invoke(testedProcess)
532- } catch (Throwable ignored) {
533- try {
534- // UNIXProcess's private 'pid' field, for JDK 8 compatibility
535- def field = testedProcess. getClass(). getDeclaredField(" pid" )
536- field. setAccessible(true )
537- return field. getInt(testedProcess) as long
538- } catch (Throwable ignored2) {
539- return -1L
540- }
541- }
542- }
543-
544- private String runJstack (long pid ) {
545- def candidates = []
546- // java.home is always set by the JVM and points to the active JDK/JRE; prefer it over the
547- // JAVA_HOME env var which is frequently absent in CI runners even when Java is present.
548- String javaHome = System . getProperty(" java.home" )
549- if (javaHome) {
550- candidates. add(javaHome + " /bin/jstack" )
551- }
552- String javaHomeEnv = System . getenv(" JAVA_HOME" )
553- if (javaHomeEnv && javaHomeEnv != javaHome) {
554- candidates. add(javaHomeEnv + " /bin/jstack" )
555- }
556- candidates. add(" jstack" )
557- for (String cmd : candidates) {
558- File tmp = null
559- try {
560- tmp = File . createTempFile(" jstack" , " .txt" )
561- // Redirect output to a file to avoid pipe-buffer deadlock — a full thread dump can
562- // exceed the OS pipe buffer (typically 64 KB) before waitFor returns.
563- Process p = new ProcessBuilder (cmd, String . valueOf(pid))
564- .redirectErrorStream(true )
565- .redirectOutput(tmp)
566- .start()
567- if (! p. waitFor(5 , SECONDS )) {
568- p. destroyForcibly()
569- p. waitFor(2 , SECONDS )
570- continue
571- }
572- if (p. exitValue() == 0 ) {
573- String output = tmp. getText(" UTF-8" )
574- if (output) {
575- return output
576- }
577- }
578- } catch (Throwable ignored) {
579- // try next candidate
580- } finally {
581- if (tmp != null ) {
582- tmp. delete()
583- }
584- }
585- }
586- return null
587- }
588-
589- private String tailProcessLog (int lines ) {
590- try {
591- def logFile = new File (logFilePath)
592- if (! logFile. exists()) {
593- return " (log file does not exist: ${ logFilePath} )"
594- }
595- def allLines = logFile. readLines()
596- def tail = allLines. size() > lines ? allLines[- lines.. -1 ] : allLines
597- return tail. join(" \n " )
598- } catch (Exception e) {
599- return " (failed to read log: ${ e.message} )"
600- }
601- }
602-
603350 def parseTraceFromStdOut ( String line ) {
604351 if (line == null ) {
605352 throw new IllegalArgumentException (" Line is null" )
@@ -618,7 +365,7 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
618365 @Flaky (condition = () -> JavaVirtualMachine . isIbm8() || JavaVirtualMachine . isOracleJDK8() || JavaVirtualMachine . isZulu8())
619366 def " check raw file injection" () {
620367 when :
621- def count = waitForTraceCountAlive (2 )
368+ def count = waitForTraceCount (2 )
622369
623370 def newConfig = """
624371 {"lib_config":
@@ -627,12 +374,12 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
627374 """ . toString()
628375 setRemoteConfig(" datadog/2/APM_TRACING/config_overrides/config" , newConfig)
629376
630- count = waitForTraceCountAlive (3 )
377+ count = waitForTraceCount (3 )
631378
632379 setRemoteConfig(" datadog/2/APM_TRACING/config_overrides/config" , """ {"lib_config":{}}""" . toString())
633380
634381 // Wait for all 4 traces before waiting for process exit to ensure trace delivery is confirmed
635- count = waitForTraceCountAlive (4 )
382+ count = waitForTraceCount (4 )
636383
637384 assert testedProcess. waitFor(TIMEOUT_SECS , SECONDS ) : " Process did not exit within ${ TIMEOUT_SECS} s"
638385 def exitValue = testedProcess. exitValue()
0 commit comments