Arm backend: Add adaptive pooling node visitors (#20220) #1302

Workflow file for this run

.github/workflows/cuda-perf.yml at d0a8dd6

	# CUDA performance benchmark workflow.
	# Runs on pushes to main/release branches and ciflow/cuda-perf tags, on pull
	# requests that touch the benchmark scripts or this file, and on manual
	# dispatch (where the model list, quantization list and run count can be
	# overridden).
	name: cuda-perf

	on:
	  push:
	    branches:
	      - main
	      - release/*
	    tags:
	      - ciflow/cuda-perf/*
	  pull_request:
	    paths:
	      - .github/workflows/cuda-perf.yml
	      - .ci/scripts/cuda_benchmark.py
	      - .ci/scripts/cuda_perf_prompts/**
	      - .ci/scripts/export_model_artifact.sh
	      - .ci/scripts/test_model_e2e.sh
	  workflow_dispatch:
	    inputs:
	      models:
	        description: Models to be benchmarked (comma-separated HuggingFace model IDs)
	        required: false
	        type: string
	      quantizations:
	        description: Quantization types (comma-separated)
	        required: false
	        type: string
	      num_runs:
	        description: Number of benchmark runs per model
	        required: false
	        type: string
	        # Quoted so the default stays a string, matching `type: string`.
	        default: "50"

	# One in-flight run per workflow + (PR number, or commit SHA when there is no
	# PR). The trailing workflow_dispatch flag puts manual runs in their own
	# group so they do not cancel, or get cancelled by, event-triggered runs.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
	  cancel-in-progress: true

	# Default token scope for all jobs; jobs that need more declare it locally.
	permissions:
	  contents: read

	jobs:
	# Calls the repo-local reusable workflow that reports changed files.
	# Downstream jobs read `needs.changed-files.outputs.changed-files` to decide
	# whether any perf-relevant path was touched.
	changed-files:
	  name: Get changed files
	  uses: ./.github/workflows/_get-changed-files.yml
	  with:
	    # NOTE(review): presumably makes the diff available for push events as
	    # well as pull requests - confirm against _get-changed-files.yml.
	    include-push-diff: true

	# Calls the repo-local reusable workflow that decides whether this commit
	# gets a full CI run. Downstream jobs read
	# `needs.run-decision.outputs.is-full-run` and run when it equals 'true',
	# regardless of which paths changed.
	run-decision:
	  name: CI run decision
	  uses: ./.github/workflows/_ci-run-decision.yml

	# Builds the model x quantization benchmark matrix and publishes it as the
	# `benchmark_configs` job output (a JSON object of the form
	# {"include":[{model, quant, model_safe, num_runs}, ...]}) for the
	# export-models and benchmark-cuda matrices.
	set-parameters:
	  needs: [changed-files, run-decision]
	  # Path-filtered: mirrors the workflow-level pull_request `paths:`
	  # filter so push commits that don't touch perf-relevant paths skip
	  # this whole workflow on non-sampled commits. Sampling preserves
	  # perf time-series at every 4th commit (vs every commit pre-PR).
	  if: |
	    contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-perf.yml') ||
	    contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_benchmark.py') ||
	    contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_perf_prompts') ||
	    contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
	    contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
	    needs.run-decision.outputs.is-full-run == 'true'
	  runs-on: ubuntu-22.04
	  outputs:
	    benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
	  steps:
	    - uses: actions/checkout@v3
	      with:
	        submodules: 'false'
	    - uses: actions/setup-python@v4
	      with:
	        python-version: '3.10'
	    - name: Set parameters
	      id: set-parameters
	      shell: bash
	      env:
	        ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it,nvidia/parakeet-tdt,SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4'
	        ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only'
	        NUM_RUNS: ${{ inputs.num_runs || '50' }}
	        # workflow_dispatch inputs are handed to the script through the
	        # environment instead of being expanded into the script text, so a
	        # crafted value cannot inject shell commands.
	        INPUT_MODELS: ${{ inputs.models }}
	        INPUT_QUANTIZATIONS: ${{ inputs.quantizations }}
	      run: |
	        set -eux

	        MODELS="${INPUT_MODELS:-}"
	        QUANTIZATIONS="${INPUT_QUANTIZATIONS:-}"

	        # Use all models/quantizations unless overridden by workflow_dispatch
	        if [ -z "$MODELS" ]; then
	          MODELS="$ALL_MODELS"
	        fi
	        if [ -z "$QUANTIZATIONS" ]; then
	          QUANTIZATIONS="$ALL_QUANTIZATIONS"
	        fi

	        # Split models and quantizations into arrays
	        IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
	        IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS"

	        # Generate benchmark configs (skip invalid model/quant combinations)
	        CONFIGS='{"include":['
	        FIRST=true
	        for MODEL in "${MODEL_ARRAY[@]}"; do
	          for QUANT in "${QUANT_ARRAY[@]}"; do
	            # Qwen3.5 MoE only supports quantized-int4-tile-packed.
	            # Substring (glob) match: the model ID carries an org prefix and
	            # a variant suffix, so an exact comparison would never match.
	            if [[ "$MODEL" == *"Qwen3.5-35B-A3B"* ]] && [ "$QUANT" != "quantized-int4-tile-packed" ]; then
	              continue
	            fi
	            if [ "$FIRST" = true ]; then
	              FIRST=false
	            else
	              CONFIGS+=','
	            fi
	            # Sanitize model name for use in artifact paths
	            MODEL_SAFE=$(echo "$MODEL" | sed 's/\//_/g')
	            CONFIGS+="{\"model\":\"$MODEL\",\"quant\":\"$QUANT\",\"model_safe\":\"$MODEL_SAFE\",\"num_runs\":\"$NUM_RUNS\"}"
	          done
	        done
	        CONFIGS+=']}'

	        echo "benchmark_configs=$CONFIGS" >> "$GITHUB_OUTPUT"
	        echo "Generated benchmark configs:"
	        # json.tool doubles as a validity check: malformed JSON fails the step.
	        echo "$CONFIGS" | python -m json.tool

	# Exports one model artifact per matrix cell (model x quantization) on a GPU
	# runner and uploads it as `model-<model_safe>-<quant>` for benchmark-cuda.
	export-models:
	  name: export-models
	  needs: set-parameters
	  uses: pytorch/test-infra/.github/workflows/linux_job_v3.yml@main
	  permissions:
	    id-token: write
	    contents: read
	  secrets: inherit
	  strategy:
	    matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
	    # One failing export must not cancel the other matrix cells.
	    fail-fast: false
	  with:
	    timeout: 90
	    secrets-env: EXECUTORCH_HF_TOKEN
	    # The Qwen3.5 MoE model is routed to the A100 runner; everything else
	    # uses the A10G runner.
	    runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'mt-l-x86iavx512-11-125-a100' || 'mt-l-x86aavx2-29-113-a10g' }}
	    gpu-arch-type: cuda
	    gpu-arch-version: "13.0"
	    use-custom-docker-registry: false
	    submodules: recursive
	    upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
	    # Build the PR head commit on pull requests, the pushed commit otherwise.
	    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	    script: |
	      set -eux
	      echo "::group::Setup ExecuTorch"
	      # OSDC runners can't reach the public PyPI CDN that download.pytorch.org's
	      # transitive deps resolve to. Pre-install torch's pure-python deps from the
	      # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
	      # torch wheel is the only candidate.
	      export PIP_EXTRA_INDEX_URL=
	      # fsspec is pinned to satisfy datasets' fsspec[http]<=2025.3.0 so the later
	      # examples install doesn't try to downgrade it from the public CDN.
	      pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 "fsspec[http]<=2025.3.0" numpy pillow
	      # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
	      export USE_MKL=OFF
	      ./install_executorch.sh
	      echo "::endgroup::"

	      echo "::group::Setup Huggingface"
	      pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1"
	      # Turn off xtrace while the token is on a command line: the value is a
	      # stripped copy of the secret, so log masking is not guaranteed to
	      # match it.
	      set +x
	      HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')"
	      hf auth login --token "$HF_AUTH_TOKEN"
	      set -x
	      OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
	      pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
	      echo "::endgroup::"

	      echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}"
	      OUTPUT_DIR="model_artifacts"
	      mkdir -p "$OUTPUT_DIR"

	      bash .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" "$OUTPUT_DIR"

	      # Move artifacts to RUNNER_ARTIFACT_DIR for upload
	      mv "$OUTPUT_DIR"/* "${RUNNER_ARTIFACT_DIR}/"
	      ls -lah "${RUNNER_ARTIFACT_DIR}"
	      echo "::endgroup::"

	# Downloads the exported artifact for each matrix cell, builds the matching
	# runner, benchmarks it with .ci/scripts/cuda_benchmark.py and uploads only
	# the resulting JSON files as `results-<model_safe>-<quant>`.
	benchmark-cuda:
	  name: benchmark-cuda
	  needs:
	    - changed-files
	    - run-decision
	    - set-parameters
	    - export-models
	  # Inherit the gate from set-parameters/export-models (they cascade-
	  # skip when the gate evaluates false). `always()` keeps benchmark-
	  # cuda running even when some export-models matrix cells fail -
	  # but only if the gate itself is open. Without the explicit gate
	  # here, `always()` would fire benchmark-cuda even when set-
	  # parameters was gated out.
	  # set-parameters must also have succeeded: the matrix below is parsed
	  # from its output, and fromJson on an empty output is an error.
	  if: |
	    always() &&
	    needs.set-parameters.result == 'success' &&
	    (
	      contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-perf.yml') ||
	      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_benchmark.py') ||
	      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_perf_prompts') ||
	      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
	      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
	      needs.run-decision.outputs.is-full-run == 'true'
	    )
	  uses: pytorch/test-infra/.github/workflows/linux_job_v3.yml@main
	  permissions:
	    id-token: write
	    contents: read
	  strategy:
	    matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
	    fail-fast: false
	  with:
	    timeout: 90
	    # Same runner routing as export-models: A100 for Qwen3.5 MoE, A10G otherwise.
	    runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'mt-l-x86iavx512-11-125-a100' || 'mt-l-x86aavx2-29-113-a10g' }}
	    gpu-arch-type: cuda
	    gpu-arch-version: "13.0"
	    use-custom-docker-registry: false
	    submodules: recursive
	    download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
	    upload-artifact: results-${{ matrix.model_safe }}-${{ matrix.quant }}
	    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	    script: |
	      set -eux
	      echo "::group::Setup environment"
	      # OSDC runners can't reach the public PyPI CDN that download.pytorch.org's
	      # transitive deps resolve to. Pre-install torch's pure-python deps from the
	      # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
	      # torch wheel is the only candidate.
	      export PIP_EXTRA_INDEX_URL=
	      # fsspec is pinned to satisfy datasets' fsspec[http]<=2025.3.0 so the later
	      # examples install doesn't try to downgrade it from the public CDN.
	      pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 "fsspec[http]<=2025.3.0" numpy pillow
	      ./install_requirements.sh
	      pip list
	      echo "::endgroup::"

	      echo "::group::Prepare model artifacts"
	      mkdir -p model_artifacts
	      # These two files are mandatory: a missing one fails the job here.
	      cp "${RUNNER_ARTIFACT_DIR}/model.pte" model_artifacts/model.pte
	      cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" model_artifacts/aoti_cuda_blob.ptd

	      # Copy additional model-specific files (preprocessors, audio samples,
	      # tokenizer files) if they exist
	      for file in \
	        voxtral_preprocessor.pte whisper_preprocessor.pte \
	        tekken.json poem.wav output.wav tokenizer.model test_audio.wav \
	        tokenizer.json tokenizer_config.json special_tokens_map.json; do
	        if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then
	          cp "${RUNNER_ARTIFACT_DIR}/$file" model_artifacts/
	        fi
	      done

	      ls -lah model_artifacts/
	      echo "::endgroup::"

	      echo "::group::Build runner"
	      bash .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" model_artifacts
	      echo "::endgroup::"

	      echo "::group::Running benchmark for ${{ matrix.model }} (${{ matrix.quant }}) with ${{ matrix.num_runs }} runs"
	      # `:-` keeps `set -u` from aborting when LD_LIBRARY_PATH is not set.
	      export LD_LIBRARY_PATH=/opt/conda/lib:${LD_LIBRARY_PATH:-}

	      # Get GPU name using nvidia-smi
	      GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
	      echo "Detected GPU: $GPU_NAME"

	      # Get CUDA driver version
	      CUDA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1)
	      echo "CUDA Driver Version: $CUDA_DRIVER_VERSION"

	      # Create results directory (separate from model artifacts)
	      RESULTS_DIR="benchmark_results"
	      mkdir -p "$RESULTS_DIR"

	      # Determine model name and runner command based on model
	      case "${{ matrix.model }}" in
	        mistralai/Voxtral-Mini-3B-2507)
	          RUNNER="cmake-out/examples/models/voxtral/voxtral_runner"
	          PREPROCESSOR="model_artifacts/voxtral_preprocessor.pte"
	          TOKENIZER="model_artifacts/tekken.json"
	          AUDIO="model_artifacts/poem.wav"
	          RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0"
	          MODEL_NAME="voxtral_${{ matrix.quant }}"
	          ;;
	        openai/whisper-*)
	          RUNNER="cmake-out/examples/models/whisper/whisper_runner"
	          PREPROCESSOR="model_artifacts/whisper_preprocessor.pte"
	          AUDIO="model_artifacts/output.wav"
	          RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0"
	          MODEL_NAME=$(echo "${{ matrix.model }}" | sed 's/openai\///')_${{ matrix.quant }}
	          ;;
	        google/gemma-3-4b-it)
	          RUNNER="cmake-out/examples/models/gemma3/gemma3_e2e_runner"
	          IMAGE="docs/source/_static/img/et-logo.png"
	          RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0"
	          MODEL_NAME="gemma3_${{ matrix.quant }}"
	          ;;
	        nvidia/parakeet-tdt)
	          RUNNER="cmake-out/examples/models/parakeet/parakeet_runner"
	          AUDIO="model_artifacts/test_audio.wav"
	          TOKENIZER="model_artifacts/tokenizer.model"
	          RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --audio_path $AUDIO --tokenizer_path $TOKENIZER"
	          MODEL_NAME="parakeet_${{ matrix.quant }}"
	          ;;
	        SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
	          RUNNER="cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
	          TOKENIZER="model_artifacts/tokenizer.json"
	          # Use a checked-in long prompt (>1000 tokens) for benchmarking. A
	          # static, meaningful prompt avoids the degenerate / repetitive
	          # outputs that can result from synthetic prompts built by
	          # repeating the same sentence.
	          PROMPT_FILE=".ci/scripts/cuda_perf_prompts/qwen3_5_moe_long_prompt.txt"
	          RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --prompt_file $PROMPT_FILE --max_new_tokens 512 --temperature 0"
	          MODEL_NAME="qwen3_5_moe_${{ matrix.quant }}"
	          ;;
	        *)
	          echo "Error: Unsupported model '${{ matrix.model }}'"
	          exit 1
	          ;;
	      esac

	      # Run benchmark using cuda_benchmark.py
	      python .ci/scripts/cuda_benchmark.py \
	        --runner_command "$RUNNER_CMD" \
	        --model_name "$MODEL_NAME" \
	        --num_runs "${{ matrix.num_runs }}" \
	        --output_json "$RESULTS_DIR/benchmark_results.json" \
	        --output_v3 "$RESULTS_DIR/benchmark_results_v3.json" \
	        --model "${{ matrix.model }}" \
	        --quantization "${{ matrix.quant }}" \
	        --git_sha "${{ github.sha }}" \
	        --workflow_run_id "${{ github.run_id }}" \
	        --workflow_run_url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
	        --gpu_name "$GPU_NAME" \
	        --cuda_driver_version "$CUDA_DRIVER_VERSION"

	      # Save additional metadata
	      cat > "$RESULTS_DIR/metadata.json" <<EOF
	      {
	        "model": "${{ matrix.model }}",
	        "quantization": "${{ matrix.quant }}",
	        "num_runs": ${{ matrix.num_runs }},
	        "runner": "$RUNNER",
	        "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
	        "git_sha": "${{ github.sha }}",
	        "workflow_run_id": "${{ github.run_id }}",
	        "workflow_run_url": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
	      }
	      EOF

	      # Only copy benchmark results to RUNNER_ARTIFACT_DIR for upload (not the entire model)
	      # First, clean up the downloaded model artifacts from RUNNER_ARTIFACT_DIR
	      rm -rf "${RUNNER_ARTIFACT_DIR}"/*

	      # Then copy only the benchmark result JSON files
	      cp "$RESULTS_DIR"/*.json "${RUNNER_ARTIFACT_DIR}/"
	      echo "Benchmark results prepared for upload:"
	      ls -lah "${RUNNER_ARTIFACT_DIR}"
	      echo "::endgroup::"

	# Collects every `results-*` artifact produced by benchmark-cuda, prints a
	# summary, syncs the JSON/log files to S3 and uploads the v3-schema results
	# to the benchmark dashboard.
	upload-benchmark-results:
	  needs:
	    - changed-files
	    - run-decision
	    - benchmark-cuda
	  # Same gate as benchmark-cuda - skip the upload when the gate
	  # closed (no benchmarks ran). `always()` lets results from the matrix
	  # cells that succeeded be uploaded even when other cells failed.
	  if: |
	    always() &&
	    (
	      contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-perf.yml') ||
	      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_benchmark.py') ||
	      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_perf_prompts') ||
	      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
	      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
	      needs.run-decision.outputs.is-full-run == 'true'
	    )
	  runs-on: ubuntu-22.04
	  environment: upload-benchmark-results
	  permissions:
	    # id-token: write is required for the OIDC role assumption below.
	    id-token: write
	    contents: read
	  steps:
	    - uses: actions/checkout@v3
	      with:
	        submodules: false

	    - name: Setup Python
	      uses: actions/setup-python@v4
	      with:
	        python-version: '3.10'

	    - name: Download all benchmark results
	      uses: actions/download-artifact@v4
	      with:
	        pattern: results-*
	        path: all_results/

	    - name: Process and display results
	      shell: bash
	      run: |
	        set -eux
	        echo "::group::Benchmark Results Summary"

	        for RESULT_DIR in all_results/results-*/; do
	          # Drop the glob's trailing slash so joined paths have no `//`.
	          RESULT_DIR="${RESULT_DIR%/}"
	          if [ -f "$RESULT_DIR/benchmark_results.json" ]; then
	            echo ""
	            echo "================================"
	            echo "Results from: $(basename "$RESULT_DIR")"
	            echo "================================"

	            # Display benchmark results (mean performance)
	            python -m json.tool "$RESULT_DIR/benchmark_results.json"

	            # Display metadata
	            if [ -f "$RESULT_DIR/metadata.json" ]; then
	              echo ""
	              echo "--- Metadata ---"
	              python -m json.tool "$RESULT_DIR/metadata.json"
	            fi
	            echo ""
	          fi
	        done

	        echo "::endgroup::"

	    - name: Authenticate with AWS
	      uses: aws-actions/configure-aws-credentials@v4
	      with:
	        role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
	        role-duration-seconds: 18000
	        aws-region: us-east-1

	    - name: Upload to S3
	      shell: bash
	      env:
	        S3_BUCKET: gha-artifacts
	        # Keyed by run id and attempt so re-runs do not overwrite each other.
	        S3_PREFIX: executorch-cuda-perf/${{ github.run_id }}/${{ github.run_attempt }}
	      run: |
	        set -eux
	        pip install awscli

	        echo "Uploading benchmark results to S3..."
	        # Exclude everything, then re-include only JSON and log files.
	        aws s3 sync all_results/ "s3://${S3_BUCKET}/${S3_PREFIX}/" \
	          --exclude "*" \
	          --include "*.json" \
	          --include "*.log"

	        echo "Results uploaded to: s3://${S3_BUCKET}/${S3_PREFIX}/"

	    - name: Prepare v3 results for dashboard upload
	      shell: bash
	      run: |
	        set -eux
	        echo "::group::Prepare v3 results"

	        mkdir -p benchmark-results/v3

	        # Collect all v3 results into a single directory
	        for RESULT_DIR in all_results/results-*/; do
	          RESULT_DIR="${RESULT_DIR%/}"
	          if [ -f "$RESULT_DIR/benchmark_results_v3.json" ]; then
	            # Generate unique filename based on directory name
	            FILENAME=$(basename "$RESULT_DIR")
	            cp "$RESULT_DIR/benchmark_results_v3.json" "benchmark-results/v3/${FILENAME}.json"
	            echo "✓ Copied $FILENAME v3 results"
	          fi
	        done

	        echo "V3 results prepared:"
	        ls -lah benchmark-results/v3/
	        echo "::endgroup::"

	    - name: Upload benchmark results to dashboard
	      uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
	      with:
	        benchmark-results-dir: benchmark-results/v3
	        dry-run: false
	        schema-version: v3
	        github-token: ${{ secrets.GITHUB_TOKEN }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Arm backend: Add adaptive pooling node visitors (#20220) #1302

Workflow file

Arm backend: Add adaptive pooling node visitors (#20220) #1302

Uh oh!

Workflow file for this run