From d3f0b400a7b331dbf589de0c34f01108f7ac8abb Mon Sep 17 00:00:00 2001
From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com>
Date: Mon, 20 Apr 2026 22:06:18 +0530
Subject: [PATCH 1/2] Testing workflow for llama3.1-8b (MLC2 self-hosted runner)

---
 .../workflows/test-llama3_1-8b-gpu-mlc2.yml | 397 ++++++++++++++++++
 1 file changed, 397 insertions(+)
 create mode 100644 .github/workflows/test-llama3_1-8b-gpu-mlc2.yml

diff --git a/.github/workflows/test-llama3_1-8b-gpu-mlc2.yml b/.github/workflows/test-llama3_1-8b-gpu-mlc2.yml
new file mode 100644
index 00000000..98e32a03
--- /dev/null
+++ b/.github/workflows/test-llama3_1-8b-gpu-mlc2.yml
@@ -0,0 +1,397 @@
+name: Test llama3-8b GPU run on MLC2 server
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths:
+      - 'src/inference_endpoints/**'
+      - 'examples/05_Llama3.1-8B_Example/**'
+      - 'pyproject.toml'
+
+concurrency:
+  group: endpts-gpu-benchmark-testing-pipeline
+  cancel-in-progress: false
+
+jobs:
+  setup_vllm_server:
+    runs-on: [ self-hosted, mlc2, endpoints ]
+    environment: sef-hosted-runner-benchmark-approval
+    outputs:
+      base_sha: ${{ steps.capture_shas.outputs.base_sha }}
+      head_sha: ${{ steps.capture_shas.outputs.head_sha }}
+    steps:
+      - name: Capture PR commit SHAs
+        id: capture_shas
+        run: |
+          BASE_SHA="${{ github.event.pull_request.base.sha }}"
+          HEAD_SHA="${{ github.event.pull_request.head.sha }}"
+          BASE_SHA_SHORT="${BASE_SHA:0:7}"
+          HEAD_SHA_SHORT="${HEAD_SHA:0:7}"
+          echo "base_sha=$BASE_SHA_SHORT" >> $GITHUB_OUTPUT
+          echo "head_sha=$HEAD_SHA_SHORT" >> $GITHUB_OUTPUT
+          echo "Base SHA : $BASE_SHA_SHORT"
+          echo "Head SHA : $HEAD_SHA_SHORT"
+
+      - name: Wait for free GPU
+        run: |
+          REQUIRED_GPUS=1
+          MAX_WAIT=7200
+          INTERVAL=60
+          ELAPSED=0
+          LAST_LOG=0
+          LOG_INTERVAL=300
+
+          while true; do
+            FREE_GPU_IDS=$(nvidia-smi --query-gpu=index,memory.used \
+              --format=csv,noheader,nounits | \
+              awk -F',' '$2 < 100 {print $1}' | tr -d ' ')
+
+            FREE_GPU_COUNT=$(echo "$FREE_GPU_IDS" | grep -c '[0-9]' 2>/dev/null) || FREE_GPU_COUNT=0
+
+            if [ "$FREE_GPU_COUNT" -ge "$REQUIRED_GPUS" ]; then
+              SELECTED_GPUS=$(echo "$FREE_GPU_IDS" | head -n "$REQUIRED_GPUS" | tr '\n' ',' | sed 's/,$//')
+              echo "Free GPUs found: $FREE_GPU_IDS"
+              echo "Selected GPUs : $SELECTED_GPUS"
+              echo "SELECTED_GPUS=$SELECTED_GPUS" >> $GITHUB_ENV
+              PODMAN_GPU_FLAGS=""
+              for GPU_ID in $(echo "$SELECTED_GPUS" | tr ',' ' '); do
+                PODMAN_GPU_FLAGS="$PODMAN_GPU_FLAGS --device nvidia.com/gpu=$GPU_ID"
+              done
+              PODMAN_GPU_FLAGS="$PODMAN_GPU_FLAGS -e NVIDIA_VISIBLE_DEVICES=$SELECTED_GPUS"
+              echo "PODMAN_GPU_FLAGS=$PODMAN_GPU_FLAGS" >> $GITHUB_ENV
+              break
+            fi
+
+            if [ "$ELAPSED" -ge "$MAX_WAIT" ]; then
+              echo "Timeout! Could not get $REQUIRED_GPUS free GPUs after $MAX_WAIT seconds."
+              exit 1
+            fi
+
+            if [ "$((ELAPSED - LAST_LOG))" -ge "$LOG_INTERVAL" ]; then
+              echo "Still waiting for GPUs... ($((ELAPSED / 60)) minutes elapsed, $FREE_GPU_COUNT free)"
+              LAST_LOG=$ELAPSED
+            fi
+
+            sleep $INTERVAL
+            ELAPSED=$((ELAPSED + INTERVAL))
+          done
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Setup vLLM server
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          # Pre-cleanup: in case a previous run exited or crashed
+          docker rm -f vllm_server_llama3_endpts 2>/dev/null || true
+
+          MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
+          HF_HOME=/data/common/anandhu/endptshfdown
+          echo "Extra GPU flags for PODMAN: $PODMAN_GPU_FLAGS"
+          docker run -dt --name vllm_server_llama3_endpts \
+            -v ${HF_HOME}:/root/.cache/huggingface \
+            $PODMAN_GPU_FLAGS \
+            --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+            --network host \
+            --ipc=host \
+            vllm/vllm-openai:latest \
+            --model ${MODEL_NAME} \
+            --gpu_memory_utilization 0.80 \
+            --tensor-parallel-size 1 \
+            --trust-request-chat-template \
+            --port 9000
+
+          echo "Waiting for vLLM server to be ready..."
+          MAX_WAIT=1200
+          ELAPSED=0
+          INTERVAL=10
+
+          until [ "$(curl -s -o /dev/null -w '%{http_code}' http://localhost:9000/health)" = "200" ]; do
+            if [ "$ELAPSED" -ge "$MAX_WAIT" ]; then
+              echo "Timeout! vLLM server did not start in time."
+              docker logs vllm_server_llama3_endpts
+              exit 1
+            fi
+            echo "Still waiting... (${ELAPSED}s elapsed)"
+            sleep $INTERVAL
+            ELAPSED=$((ELAPSED + INTERVAL))
+          done
+
+          echo "vLLM server is ready!"
+          docker logs vllm_server_llama3_endpts
+
+  run_benchmarks:
+    needs: setup_vllm_server
+    runs-on: [ self-hosted, mlc2, endpoints ]
+    strategy:
+      max-parallel: 1
+      matrix:
+        target_concurrencies: [1, 4, 16]
+    steps:
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      # MAIN BRANCH
+      - name: Checkout main branch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.base.ref }}
+
+      - name: Install dependencies (main)
+        run: |
+          python3 -m venv gh_action_endpts
+          source gh_action_endpts/bin/activate
+          pip install .
+          pip install nltk evaluate rouge_score
+
+      - name: Warmup run — main (no upload)
+        run: |
+          source gh_action_endpts/bin/activate
+          CFG=$(mktemp /tmp/bench_cfg_llama3_1-8b.XXXXXX.yaml)
+          cp examples/05_Llama3.1-8B_Example/online_llama3_8b_cnn.yaml "$CFG"
+          sed -i 's/samples: .*/samples: 50/g' "$CFG"
+          sed -i 's/localhost:8000/localhost:9000/g' "$CFG"
+          sed -i "s/target_concurrency: .*/target_concurrency: ${{ matrix.target_concurrencies }}/g" "$CFG"
+          sed -i 's/n_samples_to_issue: .*/n_samples_to_issue: 50/g' "$CFG"
+          inference-endpoint benchmark from-config -c "$CFG" --timeout 6000
+          rm -f "$CFG"
+          echo "Warmup complete."
+
+      - name: Benchmark — main branch
+        run: |
+          source gh_action_endpts/bin/activate
+          CFG=$(mktemp /tmp/bench_cfg_llama3_1-8b.XXXXXX.yaml)
+          cp examples/05_Llama3.1-8B_Example/online_llama3_8b_cnn.yaml "$CFG"
+          sed -i 's/samples: .*/samples: 2000/g' "$CFG"
+          sed -i 's/localhost:8000/localhost:9000/g' "$CFG"
+          sed -i "s/target_concurrency: .*/target_concurrency: ${{ matrix.target_concurrencies }}/g" "$CFG"
+          sed -i 's/n_samples_to_issue: .*/n_samples_to_issue: 2000/g' "$CFG"
+          inference-endpoint benchmark from-config -c "$CFG" --timeout 600000
+          rm -f "$CFG"
+          echo "===== result_summary.json (main) ====="
+          rm logs/llama3_8b_cnn_online/events.jsonl
+          cat logs/llama3_8b_cnn_online/result_summary.json
+
+      - name: Upload main branch results as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          # Artifact name encodes concurrency + SHA pair for easy retrieval
+          name: llama-3.1-8b_vllm_perf_concurrency${{ matrix.target_concurrencies }}-${{ needs.setup_vllm_server.outputs.base_sha }}-${{ needs.setup_vllm_server.outputs.head_sha }}-Main
+          path: logs/llama3_8b_cnn_online/
+          retention-days: 30
+          if-no-files-found: error
+
+      # PR BRANCH
+      - name: Checkout PR head branch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+
+      - name: Install dependencies (PR)
+        run: |
+          python3 -m venv gh_action_endpts
+          source gh_action_endpts/bin/activate
+          pip install .
+          pip install nltk evaluate rouge_score
+
+      - name: Warmup run — PR branch (no upload)
+        run: |
+          source gh_action_endpts/bin/activate
+          CFG=$(mktemp /tmp/bench_cfg_llama3_1-8b.XXXXXX.yaml)
+          cp examples/05_Llama3.1-8B_Example/online_llama3_8b_cnn.yaml "$CFG"
+          sed -i 's/samples: .*/samples: 50/g' "$CFG"
+          sed -i 's/localhost:8000/localhost:9000/g' "$CFG"
+          sed -i "s/target_concurrency: .*/target_concurrency: ${{ matrix.target_concurrencies }}/g" "$CFG"
+          sed -i 's/n_samples_to_issue: .*/n_samples_to_issue: 50/g' "$CFG"
+          echo "Running warmup on PR branch (50 samples, results discarded)..."
+          inference-endpoint benchmark from-config -c "$CFG" --timeout 6000
+          rm -f "$CFG"
+          echo "Warmup complete."
+
+      - name: Benchmark — PR branch
+        run: |
+          source gh_action_endpts/bin/activate
+          CFG=$(mktemp /tmp/bench_cfg_llama3_1-8b.XXXXXX.yaml)
+          cp examples/05_Llama3.1-8B_Example/online_llama3_8b_cnn.yaml "$CFG"
+          sed -i 's/samples: .*/samples: 2000/g' "$CFG"
+          sed -i 's/localhost:8000/localhost:9000/g' "$CFG"
+          sed -i "s/target_concurrency: .*/target_concurrency: ${{ matrix.target_concurrencies }}/g" "$CFG"
+          sed -i 's/n_samples_to_issue: .*/n_samples_to_issue: 2000/g' "$CFG"
+          echo "Running benchmark on PR branch (2000 samples)..."
+          inference-endpoint benchmark from-config -c "$CFG" --timeout 600000
+          rm -f "$CFG"
+          echo "===== result_summary.json (PR) ====="
+          rm logs/llama3_8b_cnn_online/events.jsonl
+          cat logs/llama3_8b_cnn_online/result_summary.json
+
+      - name: Upload PR branch results as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: llama-3.1-8b_vllm_perf_concurrency${{ matrix.target_concurrencies }}-${{ needs.setup_vllm_server.outputs.base_sha }}-${{ needs.setup_vllm_server.outputs.head_sha }}-PR
+          path: logs/llama3_8b_cnn_online/
+          retention-days: 30
+          if-no-files-found: error
+
+  post_pr_comment:
+    needs: [ setup_vllm_server, run_benchmarks ]
+    runs-on: [ self-hosted, mlc2 ]
+    steps:
+      # Download all 6 artifacts (main + PR) x (concurrency 1, 4, 16)
+      # Each lands in its own subdirectory named after the artifact
+      - name: Download all benchmark artifacts
+        uses: actions/download-artifact@v4
+        with:
+          # No `name` specified → downloads ALL artifacts from this run
+          path: downloaded-artifacts/
+
+      - name: List downloaded artifacts (debug)
+        run: |
+          echo "=== Downloaded artifact structure ==="
+          find downloaded-artifacts/ -type f | sort
+
+      - name: Generate regression report and post PR comment
+        env:
+          BASE_SHA: ${{ needs.setup_vllm_server.outputs.base_sha }}
+          HEAD_SHA: ${{ needs.setup_vllm_server.outputs.head_sha }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          REPO: ${{ github.repository }}
+          RUN_ID: ${{ github.run_id }}
+          RUN_NUMBER: ${{ github.run_number }}
+        run: |
+          python3 << 'EOF'
+          import json, os
+          from pathlib import Path
+
+          base_sha = os.environ["BASE_SHA"]
+          head_sha = os.environ["HEAD_SHA"]
+          pr_number = os.environ["PR_NUMBER"]
+          repo = os.environ["REPO"]
+          run_id = os.environ["RUN_ID"]
+          run_number = os.environ["RUN_NUMBER"]
+          THRESHOLD = 0.02  # 2%
+          concurrencies = [1, 4, 16]
+
+          # Artifacts are downloaded into:
+          #   downloaded-artifacts/llama-3.1-8b_vllm_perf_concurrency{N}-{base_sha}-{head_sha}-{branch_tag}/
+          ARTIFACT_ROOT = Path("downloaded-artifacts")
+
+          def load_result(branch_tag, concurrency):
+              artifact_dir = ARTIFACT_ROOT / f"llama-3.1-8b_vllm_perf_concurrency{concurrency}-{base_sha}-{head_sha}-{branch_tag}"
+              result_file = artifact_dir / "result_summary.json"
+              if not result_file.exists():
+                  print(f"WARNING: result file not found: {result_file}")
+                  return None
+              with open(result_file) as f:
+                  return json.load(f)
+
+          def ms(v):
+              return round(v / 1_000_000, 2) if v is not None else None
+
+          def delta(main_v, pr_v, lower_is_better=False):
+              if main_v is None or pr_v is None or main_v == 0:
+                  return "N/A", "⚪"
+              pct = (pr_v - main_v) / main_v
+              label = f"{'+' if pct >= 0 else ''}{round(pct*100, 2)}%"
+              bad = pct > THRESHOLD if lower_is_better else pct < -THRESHOLD
+              return label, ("⚠️" if bad else "✅")
+
+          lines = []
+          overall_bad = False
+
+          lines += [
+              "## 🔬 Performance Benchmark Report",
+              f"**PR #{pr_number}** | Run [#{run_number}](https://github.com/{repo}/actions/runs/{run_id})",
+              f"**Base (main):** `{base_sha}` → **PR head:** `{head_sha}`",
+              f"> Regression threshold: **{int(THRESHOLD*100)}%** | Samples: **2000** | Warmup: **50 samples** per branch",
+              "",
+          ]
+
+          for c in concurrencies:
+              m = load_result("Main", c)
+              p = load_result("PR", c)
+              lines.append(f"### Concurrency {c}")
+
+              if not m or not p:
+                  missing = "main" if not m else "PR"
+                  lines += [f"> ⚠️ Results missing for **{missing}** at concurrency {c}.", ""]
+                  continue
+
+              lines += [
+                  "",
+                  "| Metric | Main | PR | Delta | Status |",
+                  "|--------|------|----|-------|--------|",
+              ]
+
+              rows = [
+                  ("QPS", round(m.get("qps", 0), 3), round(p.get("qps", 0), 3), False),
+                  ("TTFT median (ms)", ms(m.get("ttft", {}).get("median")), ms(p.get("ttft", {}).get("median")), True),
+                  ("TTFT p90 (ms)", ms(m.get("ttft", {}).get("percentiles", {}).get("90")), ms(p.get("ttft", {}).get("percentiles", {}).get("90")), True),
+                  ("TTFT p99 (ms)", ms(m.get("ttft", {}).get("percentiles", {}).get("99")), ms(p.get("ttft", {}).get("percentiles", {}).get("99")), True),
+                  ("Latency median (ms)", ms(m.get("latency", {}).get("median")), ms(p.get("latency", {}).get("median")), True),
+                  ("Latency p90 (ms)", ms(m.get("latency", {}).get("percentiles", {}).get("90")), ms(p.get("latency", {}).get("percentiles", {}).get("90")), True),
+                  ("Latency p99 (ms)", ms(m.get("latency", {}).get("percentiles", {}).get("99")), ms(p.get("latency", {}).get("percentiles", {}).get("99")), True),
+              ]
+
+              for label, mv, pv, lib in rows:
+                  d, e = delta(mv, pv, lib)
+                  if e == "⚠️": overall_bad = True
+                  lines.append(f"| {label} | {mv} | {pv} | {d} | {e} |")
+
+              lines.append("")
+
+          lines += ["---"]
+          if overall_bad:
+              lines += [
+                  "### ⚠️ Performance regression detected",
+                  f"One or more metrics degraded by more than {int(THRESHOLD*100)}% vs main. Please review before merging.",
+              ]
+          else:
+              lines += [
+                  "### ✅ No performance regression detected",
+                  f"All metrics are within the {int(THRESHOLD*100)}% threshold compared to main.",
+              ]
+
+          body = "\n".join(lines)
+          with open("/tmp/pr_comment.json", "w") as f:
+              json.dump({"body": body}, f)
+          print(body)
+          EOF
+
+      - name: Post comment to PR
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          REPO: ${{ github.repository }}
+        run: |
+          curl -s -X POST \
+            -H "Authorization: Bearer $GH_TOKEN" \
+            -H "Content-Type: application/json" \
+            -d @/tmp/pr_comment.json \
+            "https://api.github.com/repos/${REPO}/issues/${PR_NUMBER}/comments"
+
+  teardown_server:
+    needs: [ run_benchmarks, post_pr_comment ]
+    runs-on: [ self-hosted, mlc2 ]
+    if: always()
+    steps:
+      - name: Stop vLLM server
+        run: |
+          docker stop vllm_server_llama3_endpts 2>/dev/null || true
+          docker rm -f vllm_server_llama3_endpts 2>/dev/null || true
+          echo "vLLM server stopped and removed."
+
+      - name: Clean virtual env
+        run: |
+          rm -rf ${{ github.workspace }}/gh_action_endpts || true
+
+      - name: Clean workspace
+        run: |
+          rm -rf ${{ github.workspace }}
+          mkdir -p ${{ github.workspace }}
+          rm -rf /tmp/benchmark_main_* /tmp/benchmark_pr_* /tmp/pr_comment.json || true

From 3d7df308baf0f8de6379b76181d14ab0508d9664 Mon Sep 17 00:00:00 2001
From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com>
Date: Tue, 21 Apr 2026 09:45:22 +0530
Subject: [PATCH 2/2] Security fix: Run the workflow from the base branch's (main) code

---
 .github/workflows/test-llama3_1-8b-gpu-mlc2.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test-llama3_1-8b-gpu-mlc2.yml b/.github/workflows/test-llama3_1-8b-gpu-mlc2.yml
index 98e32a03..eb759c53 100644
--- a/.github/workflows/test-llama3_1-8b-gpu-mlc2.yml
+++ b/.github/workflows/test-llama3_1-8b-gpu-mlc2.yml
@@ -1,7 +1,7 @@
 name: Test llama3-8b GPU run on MLC2 server
 
 on:
-  pull_request:
+  pull_request_target:
     types: [opened, synchronize, reopened]
     paths:
       - 'src/inference_endpoints/**'
       - 'examples/05_Llama3.1-8B_Example/**'
       - 'pyproject.toml'
@@ -174,7 +174,7 @@
           inference-endpoint benchmark from-config -c "$CFG" --timeout 600000
           rm -f "$CFG"
           echo "===== result_summary.json (main) ====="
-          rm logs/llama3_8b_cnn_online/events.jsonl
+          rm -f logs/llama3_8b_cnn_online/events.jsonl
           cat logs/llama3_8b_cnn_online/result_summary.json
 
       - name: Upload main branch results as artifact
@@ -226,7 +226,7 @@
           inference-endpoint benchmark from-config -c "$CFG" --timeout 600000
           rm -f "$CFG"
           echo "===== result_summary.json (PR) ====="
-          rm logs/llama3_8b_cnn_online/events.jsonl
+          rm -f logs/llama3_8b_cnn_online/events.jsonl
           cat logs/llama3_8b_cnn_online/result_summary.json
 
       - name: Upload PR branch results as artifact