SemiAnalysisAI · Oseltamivir · May 2, 2026 · May 2, 2026 · May 2, 2026 · May 2, 2026
@@ -1635,13 +1635,9 @@ dsv4-fp8-mi355x-vllm:
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 1 }
 
-# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
-# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
-# that OOM once warmup/prefill batches multiple requests; keep CONC=1 until
-# the AITER sparse-attention kernel / multi-request path lands upstream.
-# --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is
-# the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom);
-# the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA.
+# DeepSeek-V4 on ATOM using the updated atom0.1.2.post image. The launcher
+# overlays ROCm/ATOM#650 only for DSv4 model registration/skeleton support,
+# then overlays ROCm/aiter#2998 for sparse/indexer kernels.
 dsv4-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -1655,8 +1651,8 @@ dsv4-fp4-mi355x-atom:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml
@@ -161,8 +161,8 @@ jobs:
             - If jobs cannot be run, say exactly what you could not run and why
             - **Important** Modify perf-changelog.yaml for any config changes affecting performance
 
-            ## Profiling (SGLang only)
-            When asked to profile a config, dispatch the `profile.yml` workflow. **Only SGLang configs can be profiled** — the profiler uses SGLang's `/start_profile` and `/stop_profile` HTTP endpoints. Reject profiling requests for vLLM, TRT, or other frameworks.
+            ## Profiling
+            When asked to profile a config, dispatch the `profile.yml` workflow. SGLang, vLLM, and ATOM single-node configs can be profiled through their `/start_profile` and `/stop_profile` HTTP endpoints when the server is launched with the matching torch profiler directory environment variable set (`SGLANG_TORCH_PROFILER_DIR`, `VLLM_TORCH_PROFILER_DIR`, or `ATOM_TORCH_PROFILER_DIR`; `profile.yml` sets all three). Reject profiling requests for TRT, disaggregated/multi-node configs, or other frameworks.
 
             **Syntax:**
             ```
@@ -172,9 +172,10 @@ jobs:
                 workflow_id="profile.yml",
                 ref="main",
                 inputs={
-                    "config-key": "<config-key-ending-in-sglang>",
+                    "config-key": "<config-key>",
                     "config-file": "<.github/configs/nvidia-master.yaml or amd-master.yaml>",
-                    "conc": "<concurrency>"
+                    "conc": "<concurrency>",
+                    "seq-len": "<1k1k or 8k1k>"
                 }
             )
             ```
@@ -184,19 +185,16 @@ jobs:
             - Model: "deepseek" / "dsr1" → model-prefix `dsr1`; "gptoss" → `gptoss`; "qwen" → `qwen3.5`
             - Precision: "fp4" / "fp8" / "bf16"
             - Runner/hardware: "b200", "h200", "h100", "mi300x", "mi325x", "mi355x", etc.
-            - Framework: must be "sglang" (reject if not)
+            - Framework: must be "sglang", "vllm", or "atom" (reject TRT and disaggregated/multi-node)
             - Concurrency: "conc=N" → `"conc": "N"`. Default to `"64"` if not specified.
+            - Sequence length: default to `"1k1k"` unless the user asks for `"8k1k"`.
 
-            Construct the config-key as: `{model-prefix}-{precision}-{runner}-sglang`
+            Construct the config-key as: `{model-prefix}-{precision}-{runner}-{framework}`. For DeepSeek-V4 requests ("dsv4"), use model-prefix `dsv4` (e.g. `dsv4-fp4-mi355x-atom`).
             Choose config-file: NVIDIA runners (b200, h200, h100, gb200, gb300) → `nvidia-master.yaml`; AMD runners (mi300x, mi325x, mi355x) → `amd-master.yaml`
 
-            **Available SGLang config keys:**
-            NVIDIA: `dsr1-fp4-b200-sglang`, `dsr1-fp8-b200-sglang`, `dsr1-fp8-h200-sglang`, `qwen3.5-bf16-b200-sglang`
-            AMD: `dsr1-fp4-mi355x-sglang`, `dsr1-fp8-mi300x-sglang`, `dsr1-fp8-mi325x-sglang`, `dsr1-fp8-mi355x-sglang`, `qwen3.5-bf16-mi355x-sglang`, `qwen3.5-fp8-mi355x-sglang`
-
             **Examples:**
-            - "profile sglang b200 deepseek fp4 conc=4" → `config-key: dsr1-fp4-b200-sglang`, `config-file: .github/configs/nvidia-master.yaml`, `conc: 4`
-            - "profile sglang mi355x dsr1 fp8" → `config-key: dsr1-fp8-mi355x-sglang`, `config-file: .github/configs/amd-master.yaml`, `conc: 64`
+            - "profile sglang b200 deepseek fp4 conc=4" → `config-key: dsr1-fp4-b200-sglang`, `config-file: .github/configs/nvidia-master.yaml`, `conc: 4`, `seq-len: 1k1k`
+            - "profile atom mi355x dsv4 fp4 conc=4 8k1k" → `config-key: dsv4-fp4-mi355x-atom`, `config-file: .github/configs/amd-master.yaml`, `conc: 4`, `seq-len: 8k1k`
 
             **After dispatch:**
             Monitor with `mcp__github__get_workflow_run`. The profile workflow takes ~15-30 minutes. When complete, the **Perfetto relay link** is in the workflow run's step summary. Retrieve it with:

diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
@@ -17,6 +17,14 @@ on:
         required: false
         type: string
         default: '64'
+      seq-len:
+        description: "Sequence length config to profile"
+        required: false
+        type: choice
+        options:
+          - 1k1k
+          - 8k1k
+        default: 1k1k
       moe-debug:
         description: "Enable MoE debug patch and log (MOE_DEBUG_LOG)"
         required: false
@@ -54,7 +62,7 @@ jobs:
         name: Generate matrix via script
         run: |
           pip install pydantic
-          CLI_ARGS="test-config --config-files ${{ inputs.config-file }} --config-keys ${{ inputs.config-key }} --conc ${{ inputs.conc }}"
+          CLI_ARGS="test-config --config-files ${{ inputs.config-file }} --config-keys ${{ inputs.config-key }} --conc ${{ inputs.conc }} --seq-lens ${{ inputs.seq-len }}"
           CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py $CLI_ARGS)
           echo "raw=$CONFIG_JSON" >> $GITHUB_OUTPUT
 
@@ -148,13 +156,16 @@ jobs:
           ref: ${{ inputs.ref || github.sha }}
           clean: false
 
-      - name: Launch + Profile (single-node sglang/vllm)
+      - name: Launch + Profile (single-node)
         id: run
         env:
           RUNNER_NAME: ${{ runner.name }}
           PROFILE: '1'
           SGLANG_TORCH_PROFILER_DIR: /workspace/
           VLLM_TORCH_PROFILER_DIR: /workspace/
+          ATOM_TORCH_PROFILER_DIR: /workspace/atom_profiles
+          PROFILE_NUM_STEPS: '1'
+          PROFILE_OUTPUT_LEN: '1'
           VLLM_RPC_TIMEOUT: '1800000'
         shell: bash
         run: |
@@ -173,6 +184,11 @@ jobs:
 
           trace_path="profile_${res_name}.trace.json.gz"
           if [ -f "$trace_path" ]; then
+            if [ ! -s "$trace_path" ]; then
+              echo "Profile trace is empty: $trace_path" >&2
+              exit 1
+            fi
+            gzip -t "$trace_path"
             echo "trace=$trace_path" >> "$GITHUB_OUTPUT"
             if [ "${FRAMEWORK}" = "sglang" ]; then
               # Try to locate corresponding TP-0 traces produced by SGLang profiler
@@ -193,32 +209,47 @@ jobs:
             fi
           else
             echo "Profile trace not found: $trace_path" >&2
+            exit 1
           fi
 
       - name: Process result (json -> agg)
+        continue-on-error: true
         env:
           RUNNER_TYPE: ${{ matrix.config.runner }}
         run: |
           python3 utils/process_result.py
 
+      - name: Upload profile diagnostics
+        if: ${{ always() && env.RESULT_FILENAME != '' }}
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: profile_diagnostics_${{ env.RESULT_FILENAME }}
+          path: |
+            ${{ env.RESULT_FILENAME }}.json
+            agg_${{ env.RESULT_FILENAME }}.json
+            server.log
+            gpu_metrics.csv
+            atom_profiles/**/*.trace.json.gz
+          if-no-files-found: ignore
+
       - name: Upload profile as artifact
-        if: ${{ steps.run.outputs.trace != '' }}
+        if: ${{ always() && steps.run.outputs.trace != '' }}
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: profile_${{ env.RESULT_FILENAME }}
           path: profile_${{ env.RESULT_FILENAME }}.trace.json.gz
           if-no-files-found: ignore
 
       - name: Upload TP-0-DECODE trace as artifact
-        if: ${{ steps.run.outputs.tp0_decode != '' }}
+        if: ${{ always() && steps.run.outputs.tp0_decode != '' }}
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: profile_${{ env.RESULT_FILENAME }}_TP0_DECODE
           path: ${{ steps.run.outputs.tp0_decode }}
           if-no-files-found: ignore
 
       - name: Upload TP-0-EXTEND trace as artifact
-        if: ${{ steps.run.outputs.tp0_extend != '' }}
+        if: ${{ always() && steps.run.outputs.tp0_extend != '' }}
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: profile_${{ env.RESULT_FILENAME }}_TP0_EXTEND