diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 0ece60559d8..249ca72981d 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -26,6 +26,7 @@ fastapi matplotlib librosa torch==2.12 +cuda-bindings>=13.1.0 # Required for CUDA graph annotations tutorial torchvision torchdata networkx diff --git a/_static/img/cuda_graph_trace_after.png b/_static/img/cuda_graph_trace_after.png new file mode 100644 index 00000000000..78c5ec613e1 Binary files /dev/null and b/_static/img/cuda_graph_trace_after.png differ diff --git a/_static/img/cuda_graph_trace_before.png b/_static/img/cuda_graph_trace_before.png new file mode 100644 index 00000000000..9a2e51e4167 Binary files /dev/null and b/_static/img/cuda_graph_trace_before.png differ diff --git a/advanced_source/cuda_graph_annotations_tutorial.py b/advanced_source/cuda_graph_annotations_tutorial.py new file mode 100644 index 00000000000..e2701aefba7 --- /dev/null +++ b/advanced_source/cuda_graph_annotations_tutorial.py @@ -0,0 +1,536 @@ +# -*- coding: utf-8 -*- +""" +.. _cuda-graph-annotations-tutorial: + +CUDA Graph Kernel Annotations and Profiling +============================================ + +**Author**: PyTorch Team + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to capture CUDA graphs with kernel annotations + * How to profile annotated graphs + * How to post-process traces with semantic kernel lanes + * How to visualize graph execution with custom stream assignments + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.12+ + * CUDA-capable GPU + * Driver/CUDA-compat >= 13.1 for annotation support + * cuda-bindings >= 13.1.0 + +CUDA graphs are a powerful optimization technique that can significantly reduce +kernel launch overhead by capturing and replaying sequences of CUDA operations. 
+However, when profiling CUDA graphs, all kernels appear on the same stream, +making it difficult to understand the logical structure of your computation. + +This tutorial demonstrates how to use **kernel annotations** to add semantic +labels to kernels within CUDA graphs. These annotations can be merged back into +profiler traces to create custom visualization lanes, making it easier to +understand and debug complex graph executions. +""" + +############################################################################### +# Overview +# -------- +# +# CUDA graph kernel annotations allow you to add semantic labels to kernels +# during graph capture. These labels help you understand what each kernel does +# when profiling, making it easy to identify which parts of your model (e.g., +# attention, MLP, normalization) are executing at any given time. +# +# Without annotations, profiler traces show all kernels on a single stream with +# auto-generated names, making it difficult to understand the logical structure +# of your computation. With annotations, you can: +# +# 1. **Label kernel groups** with meaningful names during capture +# 2. **Assign custom stream IDs** for visual organization +# 3. **Merge labels into profiler traces** for semantic visualization +# +# The result is a profiler trace where kernels are labeled and organized by +# their function, making it much easier to identify performance bottlenecks +# and understand execution flow. +# +# **Before annotations:** All kernels appear on a single stream with +# auto-generated names, making it difficult to understand which operations +# belong to which logical component of your model. +# +# .. 
image:: /_static/img/cuda_graph_trace_before.png +# :width: 100% +# :alt: CUDA graph trace before annotations showing all kernels on one stream +# +# **After annotations:** Kernels are organized into semantic lanes (streams 61 +# and 62) with meaningful labels like "attention" and "mlp", making it easy to +# identify different components and understand the execution structure. +# +# .. image:: /_static/img/cuda_graph_trace_after.png +# :width: 100% +# :alt: CUDA graph trace after annotations showing kernels organized by function +# +# Requirements +# ------------ +# +# For this tutorial, you'll need: +# +# - PyTorch 2.12+ +# - A CUDA-capable GPU +# - Driver/CUDA-compat >= 13.1 for annotation support +# - The ``cuda-bindings`` package >= 13.1.0 (``pip install "cuda-bindings>=13.1.0"``) +# +# The cuda-bindings package provides the Python bindings for CUDA runtime APIs. +# Version 13.1.0+ is required for the ``cudaGraphNodeGetToolsId`` API that +# enables kernel annotations. If you have an older version, the tutorial will +# run but annotations will be disabled with a warning message explaining how +# to upgrade. +# +# On older drivers or cuda-bindings versions, the capture and profiling will +# still work, but ``mark_kernels`` will be a no-op and no semantic lanes will +# appear in the final trace. + +import copy +import math +import pickle +import sys +from collections import Counter +from pathlib import Path + +import torch +from torch.profiler import profile, ProfilerActivity +from torch.cuda._graph_annotations import ( + get_kernel_annotations, + mark_kernels, + _is_tools_id_unavailable, +) +from torch.cuda._annotate_cuda_graph_trace import ( + annotate_trace, + load_trace, + save_trace, + _fix_overlapping_timestamps, + _move_overlapping_to_stream, +) + +############################################################################### +# Building a Model +# ---------------- +# +# Let's create a simple transformer block as our example model. 
# We'll annotate
# different parts of the computation (QKV projection, attention, output
# projection, MLP) to see them as separate lanes in the profiler.

def build_transformer_block():
    """Build a toy transformer block and return its forward closure.

    The input activation and all weights are drawn on the CUDA device from a
    fixed seed and captured by the returned zero-argument callable.
    """
    torch.manual_seed(0)
    device = "cuda"

    # Model dimensions
    batch_size, seq_len, dim, num_heads = 4, 256, 1024, 8
    head_dim = dim // num_heads
    hidden_dim = 4 * dim

    def scaled_randn(rows, cols):
        """Draw a (rows, cols) weight and scale it by 1/sqrt(rows)."""
        return torch.randn(rows, cols, device=device) / math.sqrt(rows)

    # Entries are drawn in this exact order so the seeded values are stable.
    params = {
        "x": torch.randn(batch_size, seq_len, dim, device=device),
        "Wqkv": scaled_randn(dim, 3 * dim),
        "Wo": scaled_randn(dim, dim),
        "W1": scaled_randn(dim, hidden_dim),
        "W2": scaled_randn(hidden_dim, dim),
    }

    def forward():
        """Run the block once, tagging each stage with ``mark_kernels``."""
        heads_shape = (batch_size, seq_len, num_heads, head_dim)

        # QKV projection
        with mark_kernels({"name": "qkv_proj"}):
            qkv = params["x"] @ params["Wqkv"]

        # Split into q/k/v and view each as (batch, heads, seq, head_dim)
        q, k, v = (
            chunk.view(*heads_shape).transpose(1, 2)
            for chunk in qkv.split(dim, dim=-1)
        )

        # Attention, placed on virtual stream 62
        with mark_kernels({"name": "attention", "stream": 62}):
            scores = (q @ k.transpose(-1, -2)) / math.sqrt(head_dim)
            attn = torch.softmax(scores, dim=-1)
            ctx = (attn @ v).transpose(1, 2).reshape(batch_size, seq_len, dim)

        # Output projection
        with mark_kernels({"name": "out_proj"}):
            o = ctx @ params["Wo"]

        # MLP, placed on virtual stream 61
        with mark_kernels({"name": "mlp", "stream": 61}):
            hidden = torch.nn.functional.gelu(o @ params["W1"])
            return hidden @ params["W2"]

    return forward

###############################################################################
# The ``mark_kernels`` Context Manager
# -------------------------------------
#
# The key API is ``mark_kernels()``, which takes a dictionary
# with:
#
# - ``name``: A string label for this kernel group (becomes the lane name)
# - ``stream`` (optional): A virtual stream ID for visualization
#
# Any CUDA kernels launched within the context will be tagged with these
# annotations. Later, when we post-process the profiler trace, these tags
# will be used to organize kernels into custom lanes.

###############################################################################
# Capturing a CUDA Graph with Annotations
# ----------------------------------------
#
# To capture a graph with annotations enabled, we pass
# ``enable_annotations=True`` to ``torch.cuda.graph()``. This automatically
# handles the annotation lifecycle: enabling, resolving, and remapping.

def capture_graph_with_annotations(model_fn):
    """Warm up ``model_fn``, then record it into an annotated CUDA graph.

    Returns a ``(graph, output)`` tuple: the captured ``torch.cuda.CUDAGraph``
    and the value ``model_fn`` returned during capture.
    """
    main_stream = torch.cuda.current_stream()
    side_stream = torch.cuda.Stream()

    # Run the warm-up iterations on the side stream, ordered after any work
    # already queued on the current stream.
    side_stream.wait_stream(main_stream)
    with torch.cuda.stream(side_stream):
        warmup_iters = 3
        for _ in range(warmup_iters):
            model_fn()
    main_stream.wait_stream(side_stream)

    # Record a single call of the model with annotations switched on
    captured = torch.cuda.CUDAGraph()
    with torch.cuda.graph(captured, enable_annotations=True):
        result = model_fn()

    print(f"Captured graph with {len(get_kernel_annotations())} annotated nodes")

    return captured, result

###############################################################################
# Profiling the Graph
# -------------------
#
# After capturing the graph, we replay it a few times to warm up, then profile
# subsequent replays. The profiler will record kernel execution times, which
# we'll later merge with our annotations.

def profile_graph(graph, output_dir):
    """Replay ``graph`` under the profiler and export the raw Chrome trace.

    Args:
        graph: Captured graph object exposing ``replay()``.
        output_dir: Directory for the trace file; created if missing.

    Returns:
        Path to ``trace_raw.json.gz`` inside ``output_dir``.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # Warm-up replays (not profiled)
    for _ in range(3):
        graph.replay()
    torch.cuda.synchronize()

    # Profile several replays; synchronize before the profiler context exits
    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
    with profile(activities=activities) as prof:
        for _ in range(5):
            graph.replay()
        torch.cuda.synchronize()

    # Export the raw trace
    trace_path = output_dir / "trace_raw.json.gz"
    prof.export_chrome_trace(str(trace_path))
    print(f"Saved raw trace to {trace_path}")

    return trace_path

###############################################################################
# Saving Annotation Metadata
# ---------------------------
#
# We need to save the annotation metadata in a pickle file that the
# post-processing tool can discover. The file should be named
# ``kernel_annotations_rank0_fwd_bwd.pkl`` and placed where the trace tool
# can find it.

def save_annotations(output_dir):
    """Pickle the current kernel annotations into ``output_dir``.

    Args:
        output_dir: Directory for the pickle file; created if missing.

    Returns:
        Path to the written ``kernel_annotations_rank0_fwd_bwd.pkl``.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    annotations_path = output_dir / "kernel_annotations_rank0_fwd_bwd.pkl"

    # Snapshot into a plain dict before pickling
    annotations = dict(get_kernel_annotations())
    with open(annotations_path, "wb") as f:
        pickle.dump(annotations, f)

    print(f"Saved {len(annotations)} annotations to {annotations_path}")
    return annotations_path

###############################################################################
# Post-Processing: Merging Annotations into Traces
# -------------------------------------------------
#
# The final step is to merge the annotations back into the trace. This involves:
#
# 1. Loading the raw trace and annotations
# 2. Calling ``annotate_trace()`` to apply the annotations
# 3. Running cleanup passes to handle overlapping kernels
# 4. Saving the annotated trace
#
# The result is a trace where kernels are organized by your semantic labels.

def post_process_trace(raw_trace_path, annotations_path, output_dir):
    """Merge annotations into the trace and apply cleanup.

    Args:
        raw_trace_path: Path of the raw trace to load with ``load_trace``.
        annotations_path: Path of the pickled annotations file.
        output_dir: Directory for the annotated trace; created if missing.

    Returns:
        ``(annotated_path, raw_trace, annotated_trace)``: the path of the
        saved ``trace_annotated.json.gz``, the trace as loaded, and the
        post-processed deep copy of it.
    """
    output_dir = Path(output_dir)
    # Create the directory here too, matching profile_graph and
    # save_annotations, so this step does not depend on an earlier one
    # having made it.
    output_dir.mkdir(exist_ok=True, parents=True)

    # Load raw trace and annotations
    raw_trace = load_trace(raw_trace_path)
    # NOTE: unpickling can execute arbitrary code. Only load annotation files
    # that you produced yourself (e.g. with save_annotations).
    with open(annotations_path, "rb") as f:
        annotations = pickle.load(f)

    # Post-process a deep copy so the loaded trace can be returned unchanged
    # alongside the annotated one.
    annotated_trace = copy.deepcopy(raw_trace)

    # Apply annotations; the return value is the count reported below
    num_annotated = annotate_trace(annotated_trace, annotations)
    print(f"Annotated {num_annotated} kernels in the trace")

    # Cleanup passes: move overlapping kernels and fix timestamps
    _move_overlapping_to_stream(annotated_trace)
    _fix_overlapping_timestamps(annotated_trace)

    # Save the annotated trace
    annotated_path = output_dir / "trace_annotated.json.gz"
    save_trace(annotated_trace, annotated_path)
    print(f"Saved annotated trace to {annotated_path}")

    return annotated_path, raw_trace, annotated_trace

###############################################################################
# Comparing Before and After
# ---------------------------
#
# To see the impact of annotations, let's count how kernels are distributed
# across thread IDs (which represent visualization lanes in the trace).

def _lane_sort_key(item):
    """Order ``(tid, count)`` pairs: integer tids numerically, then others by text."""
    tid = item[0]
    if isinstance(tid, int):
        return (0, tid, "")
    return (1, 0, str(tid))


def compare_traces(raw_trace, annotated_trace):
    """Print the number of kernel events per lane before and after annotation.

    Args:
        raw_trace: Trace dict with a ``"traceEvents"`` list of event dicts.
        annotated_trace: Post-processed trace dict of the same layout.

    Only events whose ``"cat"`` is ``"kernel"`` are counted; they are grouped
    by their ``"tid"`` value. Nothing is returned; the summary is printed.
    """
    def count_lanes(trace):
        """Count kernels per lane (tid)."""
        counter = Counter(
            event["tid"]
            for event in trace["traceEvents"]
            if event.get("cat") == "kernel"
        )
        # A trace may carry tids as ints or as strings; a plain sort of a
        # mixed set raises TypeError, so order with an explicit key.
        return dict(sorted(counter.items(), key=_lane_sort_key))

    raw_lanes = count_lanes(raw_trace)
    annotated_lanes = count_lanes(annotated_trace)

    print("\n" + "="*60)
    print("BEFORE annotation - kernels per lane (tid -> count):")
    for tid, count in raw_lanes.items():
        print(f" Stream {tid}: {count} kernels")

    print("\nAFTER annotation - kernels per lane (tid -> count):")
    for tid, count in annotated_lanes.items():
        print(f" Stream {tid}: {count} kernels")
    print("="*60)

###############################################################################
# Putting It All Together
# ------------------------
#
# Now let's run the complete workflow: build a model, capture it with
# annotations, profile it, and post-process the trace.

def main():
    """End-to-end CUDA graph annotation and profiling demo.

    Raises:
        SystemExit: If no CUDA device is available.
    """
    if not torch.cuda.is_available():
        raise SystemExit("CUDA required for this tutorial")

    # Check if annotation support is available
    # PyTorch will log a warning if cuda-bindings version is too old
    supported = not _is_tools_id_unavailable()
    print(f"Annotation support available: {supported}")
    if not supported:
        print("NOTE: Annotation API not available.")
        print("This could be due to:")
        print(" - Driver/CUDA-compat < 13.1")
        print(" - Outdated cuda-bindings (check PyTorch warnings above)")
        print("Annotations will not be recorded, but the demo will still run.")
        print("Any lane changes you see are from cleanup passes, not annotations.\n")

    output_dir = Path("traces")

    # Build the model
    print("\n1. Building transformer block model...")
    model_fn = build_transformer_block()

    # Capture graph with annotations (the captured output is not used further)
    print("\n2. Capturing CUDA graph with annotations...")
    graph, _output = capture_graph_with_annotations(model_fn)

    # Save annotations
    print("\n3. Saving annotation metadata...")
    annotations_path = save_annotations(output_dir)

    # Profile the graph
    print("\n4. Profiling graph replays...")
    raw_trace_path = profile_graph(graph, output_dir)

    # Post-process the trace
    print("\n5. Post-processing: merging annotations into trace...")
    annotated_path, raw_trace, annotated_trace = post_process_trace(
        raw_trace_path, annotations_path, output_dir
    )

    # Compare before and after
    print("\n6. Comparing traces...")
    compare_traces(raw_trace, annotated_trace)

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Raw trace: {raw_trace_path}")
    print(f"Annotated trace: {annotated_path}")
    print(f"Annotations: {annotations_path}")
    print("\nOpen the annotated trace in chrome://tracing to visualize")
    print("the semantic kernel lanes.")
    print("="*60)

# Example output:
# if __name__ == "__main__":
#     main()
#
# Annotation support available: True
#
# 1. Building transformer block model...
#
# 2. Capturing CUDA graph with annotations...
# Captured graph with 13 annotated nodes
#
# 3. Saving annotation metadata...
# Saved 13 annotations to traces/kernel_annotations_rank0_fwd_bwd.pkl
#
# 4. Profiling graph replays...
# Saved raw trace to traces/trace_raw.json.gz
#
# 5. Post-processing: merging annotations into trace...
# Annotated 65 kernels in the trace
# Saved annotated trace to traces/trace_annotated.json.gz
#
# 6. Comparing traces...
+# +# ============================================================ +# BEFORE annotation - kernels per lane (tid -> count): +# Stream 7: 65 kernels +# +# AFTER annotation - kernels per lane (tid -> count): +# Stream 7: 10 kernels +# Stream 61: 15 kernels +# Stream 62: 40 kernels +# ============================================================ +# +# ============================================================ +# SUMMARY +# ============================================================ +# Raw trace: traces/trace_raw.json.gz +# Annotated trace: traces/trace_annotated.json.gz +# Annotations: traces/kernel_annotations_rank0_fwd_bwd.pkl +# +# Open the annotated trace in chrome://tracing to visualize +# the semantic kernel lanes. +# ============================================================ + +############################################################################### +# Visualizing Results +# ------------------- +# +# To view the annotated trace: +# +# 1. Open Chrome/Chromium browser +# 2. Navigate to ``chrome://tracing`` +# 3. Click "Load" and select the ``trace_annotated.json.gz`` file +# 4. You should see kernels organized into custom lanes like "qkv_proj", +# "attention", "out_proj", and "mlp" +# +# The custom stream IDs (61, 62) specified in ``mark_kernels`` appear as +# separate lanes, making it easy to see which operations run concurrently +# or sequentially. + +############################################################################### +# Understanding the Cleanup Passes +# --------------------------------- +# +# The post-processing applies two cleanup functions: +# +# 1. ``_move_overlapping_to_stream()``: If kernels on the same lane overlap +# in time, move one to a different lane. This prevents visual overlap in +# the trace viewer. +# +# 2. ``_fix_overlapping_timestamps()``: Adjust timestamps slightly if +# overlapping kernels would cause confusion. This is a last resort to +# ensure the trace renders correctly. 
+# +# These passes ensure that the trace is both accurate and readable, even +# when the original execution has complex concurrency patterns. + +############################################################################### +# Performance Considerations +# --------------------------- +# +# Kernel annotations add minimal overhead: +# +# - Annotation marking happens during graph capture (one-time cost) +# - Graph replay performance is identical to unannotated graphs +# - Post-processing is offline and doesn't affect runtime +# +# The main cost is the profiling itself, which you would do anyway when +# optimizing performance. Annotations simply make the profiler output more +# useful by adding semantic structure. + +############################################################################### +# Troubleshooting +# --------------- +# +# **No annotations in the trace?** +# +# - Check that your driver/CUDA-compat version is >= 13.1 +# - Verify that ``enable_annotations=True`` was passed to ``torch.cuda.graph()`` +# - Ensure ``cuda-bindings`` >= 13.1.0 is installed (``pip install "cuda-bindings>=13.1.0"``) +# +# **Kernels still overlapping in the trace?** +# +# - The cleanup passes should handle this automatically +# - If issues persist, try assigning explicit stream IDs in ``mark_kernels`` +# +# **Annotations not showing up in specific kernels?** +# +# - Some operations may not launch kernels (e.g., tensor views) +# - Only kernels launched within the ``mark_kernels`` context are annotated +# - Verify the operation actually produces CUDA kernels using ``torch.profiler`` + +############################################################################### +# Conclusion +# ---------- +# +# CUDA graph kernel annotations provide a powerful way to add semantic +# structure to your profiling traces. By marking logical components of your +# model during graph capture and merging these annotations in post-processing, +# you can create visualizations that make it much easier to understand and +# optimize complex CUDA graph executions. 
+# +# Key takeaways: +# +# - Use ``mark_kernels()`` to label regions during graph capture +# - Enable annotations with ``enable_annotations=True`` +# - Post-process traces with ``annotate_trace()`` and cleanup passes +# - View results in chrome://tracing for intuitive visualization +# +# This technique is especially valuable for large models with many components, +# distributed training setups, or any scenario where understanding the +# execution structure is critical for performance optimization. diff --git a/deep-dive.rst b/deep-dive.rst index 89a18d48e29..77df5d45c89 100644 --- a/deep-dive.rst +++ b/deep-dive.rst @@ -37,6 +37,13 @@ and speed. :image: _static/img/thumbnails/cropped/pytorch-logo.png :tags: Profiling +.. customcarditem:: + :header: CUDA Graph Kernel Annotations and Profiling + :card_description: Learn how to annotate CUDA graph kernels for semantic profiling traces with custom visualization lanes. + :image: _static/img/thumbnails/cropped/profiler.png + :link: advanced/cuda_graph_annotations_tutorial.html + :tags: Profiling,CUDA,Model-Optimization + .. customcarditem:: :header: Parametrizations Tutorial :card_description: Learn how to use torch.nn.utils.parametrize to put constraints on your parameters (e.g. make them orthogonal, symmetric positive definite, low-rank...) @@ -140,6 +147,7 @@ and speed. :hidden: beginner/profiler + advanced/cuda_graph_annotations_tutorial beginner/vt_tutorial intermediate/parametrizations intermediate/pruning_tutorial diff --git a/index.rst b/index.rst index e4bff75217f..8cbf2954acb 100644 --- a/index.rst +++ b/index.rst @@ -515,6 +515,13 @@ Welcome to PyTorch Tutorials :link: beginner/profiler.html :tags: Model-Optimization,Best-Practice,Profiling +.. customcarditem:: + :header: CUDA Graph Kernel Annotations and Profiling + :card_description: Learn how to annotate CUDA graph kernels for semantic profiling traces with custom visualization lanes. 
+ :image: _static/img/thumbnails/cropped/profiler.png + :link: advanced/cuda_graph_annotations_tutorial.html + :tags: Model-Optimization,Best-Practice,Profiling,CUDA + .. customcarditem:: :header: Hyperparameter Tuning Tutorial :card_description: Learn how to use Ray Tune to find the best performing set of hyperparameters for your model.