From 39384aa42da6403799eeb00a8b04f2c478281676 Mon Sep 17 00:00:00 2001 From: Rujun Chen Date: Fri, 20 Mar 2026 10:26:35 +0800 Subject: [PATCH 1/6] Add forkedProcessTimeoutInSeconds to surefire plugin to prevent CI pipeline hangs --- sdk/parents/azure-client-sdk-parent/pom.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/parents/azure-client-sdk-parent/pom.xml b/sdk/parents/azure-client-sdk-parent/pom.xml index fb194a4906c6..42e1a0658f46 100644 --- a/sdk/parents/azure-client-sdk-parent/pom.xml +++ b/sdk/parents/azure-client-sdk-parent/pom.xml @@ -894,6 +894,7 @@ debug 1 + 600 false ${defaultSurefireArgLine} From 36374a3d82f94f4cd6686d84728b7538ecf79959 Mon Sep 17 00:00:00 2001 From: Rujun Chen Date: Fri, 20 Mar 2026 10:33:52 +0800 Subject: [PATCH 2/6] Add an empty line to trigger java - spring - ci --- sdk/spring/pom.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/spring/pom.xml b/sdk/spring/pom.xml index d85cf0f56bc8..be007e1921d0 100644 --- a/sdk/spring/pom.xml +++ b/sdk/spring/pom.xml @@ -136,6 +136,7 @@ azure-spring-data-cosmos + monitor From 26c996bdac064156851e36c34bebfb0521a33f34 Mon Sep 17 00:00:00 2001 From: Rujun Chen Date: Fri, 20 Mar 2026 13:02:43 +0800 Subject: [PATCH 3/6] Output more log to test --- eng/pipelines/scripts/Get-Test-Logs.ps1 | 28 +++++-- .../scripts/Monitor-Java-Processes.ps1 | 82 +++++++++++++++++++ sdk/parents/azure-client-sdk-parent/pom.xml | 3 +- sdk/spring/ci.yml | 10 +++ 4 files changed, 116 insertions(+), 7 deletions(-) create mode 100644 eng/pipelines/scripts/Monitor-Java-Processes.ps1 diff --git a/eng/pipelines/scripts/Get-Test-Logs.ps1 b/eng/pipelines/scripts/Get-Test-Logs.ps1 index d840a71ca2ae..d4afe231a76a 100644 --- a/eng/pipelines/scripts/Get-Test-Logs.ps1 +++ b/eng/pipelines/scripts/Get-Test-Logs.ps1 @@ -1,13 +1,19 @@ <# .SYNOPSIS -Captures any test.log files in the build directory and moves them to a staging directory for artifact publishing. 
+Captures any test.log files, JVM crash logs, surefire dumpstream files, and jstack dumps in the build directory +and moves them to a staging directory for artifact publishing. .DESCRIPTION -This script is used to capture any test.log files in the build directory and move them to a staging directory for -artifact publishing. It also sets a pipeline variable to indicate whether any test.log files were found. +This script is used to capture diagnostic files from the build directory and move them to a staging directory for +artifact publishing. It also sets a pipeline variable to indicate whether any diagnostic files were found. +Collected files include: + - *test.log (test logs) + - hs_err_pid*.log (JVM crash reports) + - *.dumpstream (Surefire forked JVM crash/corruption reports) + - jstack-dumps.log (periodic jstack thread dumps from the Java process monitor) .PARAMETER StagingDirectory -The directory where the test.log files will be moved to. +The directory where the diagnostic files will be moved to. .PARAMETER TestLogsArtifactName The name of the artifact to be created. @@ -22,11 +28,21 @@ param( ) $testLogs = Get-ChildItem -Path . -Recurse -Filter *test.log -File -Depth 4 +$jvmCrashLogs = Get-ChildItem -Path . -Recurse -Filter hs_err_pid*.log -File -Depth 6 +$dumpstreamFiles = Get-ChildItem -Path . 
-Recurse -Filter *.dumpstream -File -Depth 6 +$jstackDumps = Get-ChildItem -Path "$StagingDirectory/troubleshooting" -Filter jstack-dumps.log -File -ErrorAction SilentlyContinue -if ($testLogs.Count -gt 0) { +$allFiles = @() +if ($testLogs) { $allFiles += $testLogs } +if ($jvmCrashLogs) { $allFiles += $jvmCrashLogs } +if ($dumpstreamFiles) { $allFiles += $dumpstreamFiles } +if ($jstackDumps) { $allFiles += $jstackDumps } + +if ($allFiles.Count -gt 0) { if (-not (Test-Path "$StagingDirectory/troubleshooting")) { New-Item -ItemType Directory -Path "$StagingDirectory/troubleshooting" | Out-Null } Write-Host "##vso[task.setvariable variable=HAS_TROUBLESHOOTING]true" - Compress-Archive -Path $testLogs -DestinationPath "$StagingDirectory/troubleshooting/$TestLogsArtifactName.zip" + Write-Host "Found $($testLogs.Count) test log(s), $($jvmCrashLogs.Count) JVM crash log(s), $($dumpstreamFiles.Count) dumpstream file(s), $($jstackDumps.Count) jstack dump(s)" + Compress-Archive -Path $allFiles -DestinationPath "$StagingDirectory/troubleshooting/$TestLogsArtifactName.zip" } diff --git a/eng/pipelines/scripts/Monitor-Java-Processes.ps1 b/eng/pipelines/scripts/Monitor-Java-Processes.ps1 new file mode 100644 index 000000000000..b97aa1e0c02f --- /dev/null +++ b/eng/pipelines/scripts/Monitor-Java-Processes.ps1 @@ -0,0 +1,82 @@ +<# +.SYNOPSIS +Monitors Java processes by taking periodic jstack thread dumps. + +.DESCRIPTION +This script runs in the background, periodically capturing jstack thread dumps of all running Java processes. +It writes the output to a log file in the troubleshooting directory. This is useful for diagnosing CI pipeline +hangs caused by deadlocked or stuck Java processes. + +.PARAMETER StagingDirectory +The directory where jstack dump files will be written. + +.PARAMETER IntervalSeconds +The interval in seconds between jstack captures. Default is 180 (3 minutes). + +.PARAMETER DurationMinutes +The maximum duration in minutes to run the monitor. 
Default is 55 minutes. +#> + +param( + [Parameter(Mandatory = $true)] + [string]$StagingDirectory, + + [Parameter(Mandatory = $false)] + [int]$IntervalSeconds = 180, + + [Parameter(Mandatory = $false)] + [int]$DurationMinutes = 55 +) + +$troubleshootingDir = "$StagingDirectory/troubleshooting" +if (-not (Test-Path $troubleshootingDir)) { + New-Item -ItemType Directory -Path $troubleshootingDir | Out-Null +} + +$outputFile = "$troubleshootingDir/jstack-dumps.log" +$endTime = (Get-Date).AddMinutes($DurationMinutes) + +Write-Host "Starting Java process monitor. Writing jstack dumps to $outputFile every $IntervalSeconds seconds for up to $DurationMinutes minutes." + +while ((Get-Date) -lt $endTime) { + Start-Sleep -Seconds $IntervalSeconds + + $timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss" + Add-Content -Path $outputFile -Value "`n========== jstack dump at $timestamp ==========" + + # List all Java processes + $javaHome = $env:JAVA_HOME + $jpsPath = if ($javaHome) { "$javaHome/bin/jps" } else { "jps" } + $jstackPath = if ($javaHome) { "$javaHome/bin/jstack" } else { "jstack" } + + try { + $jpsOutput = & $jpsPath -l 2>&1 + Add-Content -Path $outputFile -Value "`n--- Java processes (jps -l) ---" + Add-Content -Path $outputFile -Value $jpsOutput + + # Get PIDs of Java processes (excluding jps itself) + $pids = $jpsOutput | ForEach-Object { + if ($_ -match '^\d+' -and $_ -notmatch 'jps') { + ($_ -split '\s+')[0] + } + } | Where-Object { $_ } + + foreach ($pid in $pids) { + Add-Content -Path $outputFile -Value "`n--- jstack for PID $pid ---" + try { + $stackTrace = & $jstackPath $pid 2>&1 + Add-Content -Path $outputFile -Value $stackTrace + } catch { + Add-Content -Path $outputFile -Value "Failed to get jstack for PID $pid : $_" + } + } + } catch { + Add-Content -Path $outputFile -Value "Error running jps: $_" + } +} + +Write-Host "Java process monitor finished after $DurationMinutes minutes." 
+# Mark that we have troubleshooting artifacts +if (Test-Path $outputFile) { + Write-Host "##vso[task.setvariable variable=HAS_TROUBLESHOOTING]true" +} diff --git a/sdk/parents/azure-client-sdk-parent/pom.xml b/sdk/parents/azure-client-sdk-parent/pom.xml index 42e1a0658f46..c80d6d63c1b9 100644 --- a/sdk/parents/azure-client-sdk-parent/pom.xml +++ b/sdk/parents/azure-client-sdk-parent/pom.xml @@ -894,7 +894,7 @@ debug 1 - 600 + 1800 false ${defaultSurefireArgLine} @@ -945,6 +945,7 @@ debug 1 + 1800 false ${defaultFailsafeArgLine} diff --git a/sdk/spring/ci.yml b/sdk/spring/ci.yml index b91fedb2a87a..ee8225b535fd 100644 --- a/sdk/spring/ci.yml +++ b/sdk/spring/ci.yml @@ -254,6 +254,16 @@ extends: template: ../../eng/pipelines/templates/stages/archetype-sdk-client.yml parameters: ServiceDirectory: spring + PreBuildSteps: + - pwsh: | + $scriptPath = "$(Build.SourcesDirectory)/eng/pipelines/scripts/Monitor-Java-Processes.ps1" + $stagingDir = "$(System.DefaultWorkingDirectory)" + Write-Host "Starting Java process monitor in background..." + Start-Process -NoNewWindow -FilePath "pwsh" -ArgumentList "-File", $scriptPath, "-StagingDirectory", $stagingDir, "-IntervalSeconds", "180", "-DurationMinutes", "55" + Write-Host "Java process monitor started." 
+ displayName: 'Start Java process monitor (background)' + continueOnError: true + condition: always() Artifacts: - name: azure-spring-data-cosmos groupId: com.azure From 5c0786649a537d88a9fdb60c6b60f8e5c1acf02d Mon Sep 17 00:00:00 2001 From: Rujun Chen Date: Fri, 20 Mar 2026 14:43:28 +0800 Subject: [PATCH 4/6] Improve the test script --- .../scripts/Monitor-Java-Processes.ps1 | 69 ++++++++++++------- sdk/spring/ci.yml | 13 ++-- 2 files changed, 53 insertions(+), 29 deletions(-) diff --git a/eng/pipelines/scripts/Monitor-Java-Processes.ps1 b/eng/pipelines/scripts/Monitor-Java-Processes.ps1 index b97aa1e0c02f..283db7b9f678 100644 --- a/eng/pipelines/scripts/Monitor-Java-Processes.ps1 +++ b/eng/pipelines/scripts/Monitor-Java-Processes.ps1 @@ -3,7 +3,8 @@ Monitors Java processes by taking periodic jstack thread dumps. .DESCRIPTION -This script runs in the background, periodically capturing jstack thread dumps of all running Java processes. +This script runs in the background, periodically capturing thread dumps of all running Java processes. +It uses both 'ps' (to reliably find Java processes on Linux) and 'jstack' (for thread dumps). It writes the output to a log file in the troubleshooting directory. This is useful for diagnosing CI pipeline hangs caused by deadlocked or stuck Java processes. @@ -11,7 +12,7 @@ hangs caused by deadlocked or stuck Java processes. The directory where jstack dump files will be written. .PARAMETER IntervalSeconds -The interval in seconds between jstack captures. Default is 180 (3 minutes). +The interval in seconds between captures. Default is 120 (2 minutes). .PARAMETER DurationMinutes The maximum duration in minutes to run the monitor. Default is 55 minutes. 
@@ -22,7 +23,7 @@ param( [string]$StagingDirectory, [Parameter(Mandatory = $false)] - [int]$IntervalSeconds = 180, + [int]$IntervalSeconds = 120, [Parameter(Mandatory = $false)] [int]$DurationMinutes = 55 @@ -36,15 +37,33 @@ if (-not (Test-Path $troubleshootingDir)) { $outputFile = "$troubleshootingDir/jstack-dumps.log" $endTime = (Get-Date).AddMinutes($DurationMinutes) -Write-Host "Starting Java process monitor. Writing jstack dumps to $outputFile every $IntervalSeconds seconds for up to $DurationMinutes minutes." +Add-Content -Path $outputFile -Value "Monitor started at $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')" +Add-Content -Path $outputFile -Value "JAVA_HOME=$($env:JAVA_HOME)" while ((Get-Date) -lt $endTime) { Start-Sleep -Seconds $IntervalSeconds $timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss" - Add-Content -Path $outputFile -Value "`n========== jstack dump at $timestamp ==========" + Add-Content -Path $outputFile -Value "`n========== Snapshot at $timestamp ==========" - # List all Java processes + # Use 'ps' to find Java processes (more reliable than jps on CI agents) + try { + if ($IsLinux -or $IsMacOS) { + $psOutput = bash -c "ps aux | grep '[j]ava'" 2>&1 + } else { + $psOutput = Get-Process -Name java -ErrorAction SilentlyContinue | Format-Table Id, CPU, WorkingSet64, CommandLine -AutoSize | Out-String + } + Add-Content -Path $outputFile -Value "`n--- Java processes (ps) ---" + if ($psOutput) { + Add-Content -Path $outputFile -Value $psOutput + } else { + Add-Content -Path $outputFile -Value "(no Java processes found)" + } + } catch { + Add-Content -Path $outputFile -Value "Error listing processes: $_" + } + + # Also try jps for comparison $javaHome = $env:JAVA_HOME $jpsPath = if ($javaHome) { "$javaHome/bin/jps" } else { "jps" } $jstackPath = if ($javaHome) { "$javaHome/bin/jstack" } else { "jstack" } @@ -53,29 +72,33 @@ while ((Get-Date) -lt $endTime) { $jpsOutput = & $jpsPath -l 2>&1 Add-Content -Path $outputFile -Value "`n--- Java processes (jps 
-l) ---" Add-Content -Path $outputFile -Value $jpsOutput + } catch { + Add-Content -Path $outputFile -Value "Error running jps: $_" + } - # Get PIDs of Java processes (excluding jps itself) - $pids = $jpsOutput | ForEach-Object { - if ($_ -match '^\d+' -and $_ -notmatch 'jps') { - ($_ -split '\s+')[0] - } - } | Where-Object { $_ } - - foreach ($pid in $pids) { - Add-Content -Path $outputFile -Value "`n--- jstack for PID $pid ---" - try { - $stackTrace = & $jstackPath $pid 2>&1 - Add-Content -Path $outputFile -Value $stackTrace - } catch { - Add-Content -Path $outputFile -Value "Failed to get jstack for PID $pid : $_" + # Extract PIDs from ps output and take jstack dumps. The awk field is written as `$1 so PowerShell does not expand it inside the double-quoted string; the loop variable is $javaPid because $PID is a read-only automatic variable (variable names are case-insensitive); only purely numeric lines are kept so stderr text merged in by 2>&1 is never passed to jstack. + if ($IsLinux -or $IsMacOS) { + try { + $javaPids = bash -c "ps -eo pid,comm | grep '[j]ava' | awk '{print `$1}'" 2>&1 + if ($javaPids) { + foreach ($javaPid in ($javaPids -split "`n" | Where-Object { $_ -match '^\s*\d+\s*$' })) { + $javaPid = $javaPid.Trim() + Add-Content -Path $outputFile -Value "`n--- jstack for PID $javaPid ---" + try { + $stackTrace = & $jstackPath $javaPid 2>&1 + Add-Content -Path $outputFile -Value $stackTrace + } catch { + Add-Content -Path $outputFile -Value "Failed to get jstack for PID $javaPid : $_" + } + } } + } catch { + Add-Content -Path $outputFile -Value "Error extracting PIDs: $_" } - } catch { - Add-Content -Path $outputFile -Value "Error running jps: $_" - } } -Write-Host "Java process monitor finished after $DurationMinutes minutes." 
+Add-Content -Path $outputFile -Value "`nMonitor finished at $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')" # Mark that we have troubleshooting artifacts if (Test-Path $outputFile) { Write-Host "##vso[task.setvariable variable=HAS_TROUBLESHOOTING]true" diff --git a/sdk/spring/ci.yml b/sdk/spring/ci.yml index ee8225b535fd..b76556b50240 100644 --- a/sdk/spring/ci.yml +++ b/sdk/spring/ci.yml @@ -255,12 +255,13 @@ extends: parameters: ServiceDirectory: spring PreBuildSteps: - - pwsh: | - $scriptPath = "$(Build.SourcesDirectory)/eng/pipelines/scripts/Monitor-Java-Processes.ps1" - $stagingDir = "$(System.DefaultWorkingDirectory)" - Write-Host "Starting Java process monitor in background..." - Start-Process -NoNewWindow -FilePath "pwsh" -ArgumentList "-File", $scriptPath, "-StagingDirectory", $stagingDir, "-IntervalSeconds", "180", "-DurationMinutes", "55" - Write-Host "Java process monitor started." + - bash: | + nohup pwsh -File "$(Build.SourcesDirectory)/eng/pipelines/scripts/Monitor-Java-Processes.ps1" \ + -StagingDirectory "$(System.DefaultWorkingDirectory)" \ + -IntervalSeconds 180 \ + -DurationMinutes 55 \ + > /dev/null 2>&1 & + echo "Java process monitor started in background (PID: $!)" displayName: 'Start Java process monitor (background)' continueOnError: true condition: always() From 438224f7dd4f48b0518c33af341eae174efe6f89 Mon Sep 17 00:00:00 2001 From: Rujun Chen Date: Fri, 20 Mar 2026 15:02:27 +0800 Subject: [PATCH 5/6] Delete an empty line to trigger java - spring - ci again --- sdk/spring/pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/spring/pom.xml b/sdk/spring/pom.xml index be007e1921d0..d85cf0f56bc8 100644 --- a/sdk/spring/pom.xml +++ b/sdk/spring/pom.xml @@ -136,7 +136,6 @@ azure-spring-data-cosmos - monitor From 9ff0200adf727fe68c37c80930efdf20ad3bda6f Mon Sep 17 00:00:00 2001 From: Rujun Chen Date: Fri, 20 Mar 2026 15:24:51 +0800 Subject: [PATCH 6/6] Add empty to trigger pipeline again --- sdk/spring/pom.xml | 1 + 1 file changed, 1 
insertion(+) diff --git a/sdk/spring/pom.xml b/sdk/spring/pom.xml index d85cf0f56bc8..be007e1921d0 100644 --- a/sdk/spring/pom.xml +++ b/sdk/spring/pom.xml @@ -136,6 +136,7 @@ azure-spring-data-cosmos + monitor