From 70d242482f2474d1c551a7940d99ad1547939f81 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Wed, 4 Mar 2026 12:45:54 +0100 Subject: [PATCH] e2e: test concurrent SIGTERM flush across 25 pods Creates 25 pods (50 S3 buffers with 2 outputs), kills fluent-bit with grace-period=15, and verifies all markers appear in S3. Sequential flush would need ~50-100s, so completion within 15s proves concurrency. Validates each pod marker individually rather than counting grep lines, avoiding false positives from duplicate log reads. Co-Authored-By: Claude Opus 4.6 --- test.sh | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/test.sh b/test.sh index 4ec8205..9a5203c 100755 --- a/test.sh +++ b/test.sh @@ -384,6 +384,67 @@ else ERRORS=$((ERRORS + 1)) fi +# 7i. Concurrent SIGTERM flush — verify buffers from 20+ pods all flush within grace period. +# Each pod×container×output creates a separate S3 buffer, so 25 pods × 2 outputs = 50 buffers. +# Sequential flush would need ~50-100s; within 15s grace period proves concurrency. +echo "==> Testing concurrent SIGTERM flush (25 pods)..." +CONCURRENT_MARKER="concurrent-$(date +%s)" +FLUSH_POD_COUNT=25 + +for i in $(seq 1 $FLUSH_POD_COUNT); do + $KUBECTL run "flush-test-$i" --restart=Never \ + --image=busybox:1.37 --labels=test=concurrent-flush \ + --command -- sh -c "echo '${CONCURRENT_MARKER}-${i}'; sleep 3600" & +done +wait + +echo " Waiting for $FLUSH_POD_COUNT pods to start..." +$KUBECTL wait pod -l test=concurrent-flush --for=condition=Ready --timeout=60s + +# Refresh_Interval=5 means up to 5s for fluent-bit to discover new log files, +# then Read_from_Head reads them immediately. Two refresh cycles for safety. +echo " Waiting for fluent-bit to discover and tail $FLUSH_POD_COUNT log files..." +sleep 12 + +FB_POD=$($KUBECTL get pod -l app=fluent-bit -o jsonpath='{.items[0].metadata.name}') +echo " Killing fluent-bit pod $FB_POD with grace-period=15..." +$KUBECTL delete pod "$FB_POD" --grace-period=15 +wait_for_rollout daemonset fluent-bit 60s + +echo " Previous fluent-bit shutdown logs:" +$KUBECTL logs -l app=fluent-bit --previous --tail=15 2>/dev/null || echo " (no previous logs available)" + +FLUSH_TIMEOUT=60 +FLUSH_INTERVAL=5 +flush_elapsed=0 +FOUND_COUNT=0 + +while [ "$flush_elapsed" -lt "$FLUSH_TIMEOUT" ]; do + FLUSH_OUTPUT=$(./y-logcli --context=dev query '{namespace="default"}' -f parquet -o raw 2>&1) || true + # Count distinct pod markers (not lines — duplicates from re-reads would inflate grep -c) + FOUND_COUNT=0 + for i in $(seq 1 $FLUSH_POD_COUNT); do + if grep -q "${CONCURRENT_MARKER}-${i}\b" <<< "$FLUSH_OUTPUT"; then + FOUND_COUNT=$((FOUND_COUNT + 1)) + fi + done + if [ "$FOUND_COUNT" -ge "$FLUSH_POD_COUNT" ]; then + break + fi + flush_elapsed=$((flush_elapsed + FLUSH_INTERVAL)) + echo " Found $FOUND_COUNT/$FLUSH_POD_COUNT markers, retrying... (${flush_elapsed}/${FLUSH_TIMEOUT}s)" + sleep "$FLUSH_INTERVAL" +done + +if [ "$FOUND_COUNT" -ge "$FLUSH_POD_COUNT" ]; then + echo " PASS: Concurrent flush — all $FLUSH_POD_COUNT pod markers found in S3 (50 buffers flushed within 15s)" +else + echo " FAIL: Concurrent flush — only $FOUND_COUNT/$FLUSH_POD_COUNT pod markers found (buffers lost on SIGTERM)" >&2 + ERRORS=$((ERRORS + 1)) +fi + +$KUBECTL delete pod -l test=concurrent-flush --grace-period=0 --force 2>/dev/null || true + # --- 8. Result --- echo ""