SandAI-org · wtr0504 · Apr 11, 2026 · Apr 13, 2026 · Apr 28, 2026 · Apr 29, 2026
diff --git a/.github/codestyle/copyright.hook b/.github/codestyle/copyright.hook
@@ -43,7 +43,7 @@ def _get_comment_mark(path):
     if lang_type.search(path) is not None:
         return "#"
 
-    lang_type=re.compile(r"\.(h|c|hpp|cc|cpp|cu|go|cuh|proto)$")
+    lang_type=re.compile(r"\.(h|c|hpp|hxx|cc|cpp|cxx|cu|go|cuh|proto)$")
     if lang_type.search(path) is not None:
         return "//"
 

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ repos:
         name: copyright_checker
         entry: python3 ./.github/codestyle/copyright.hook
         language: system
-        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$
+        files: \.(c|cc|cxx|cpp|cu|cuh|h|hpp|hxx|proto|py|sh)$
 -   repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.4.0
     hooks:

diff --git a/Dockerfile b/Dockerfile
@@ -3,6 +3,25 @@ FROM nvcr.io/nvidia/pytorch:25.10-py3
 
 ARG FLASH_ATTENTION_COMMIT_ID="b613d9e2c8475945baff3fd68f2030af1b890acf"
 
+# CUTLASS — source is always cloned (the magi_compiler EVT-fusion path
+# JIT-includes its headers and our /usr/local/cutlass tree is the readable
+# reference checkout). The CMake project is configured (`cmake ..`, no build
+# step) only for supported targets; every other arch gets headers only.
+#
+# Supported NVCC arch strings (CUTLASS_NVCC_ARCHS):
+#   90a  — Hopper (H100, compute_cap 9.x, WGMMA/TMA)
+#   120a — consumer Blackwell (RTX 50 series, compute_cap 12.x)
+#
+# Override behaviour with build args:
+#   --build-arg CUTLASS_BUILD=yes|no|auto
+#     yes  — force cmake configure (requires CUTLASS_NVCC_ARCHS or a GPU)
+#     no   — skip cmake even if a supported GPU is present
+#     auto — (default) configure iff CUTLASS_NVCC_ARCHS is set or nvidia-smi reports 9.x/12.x
+#   --build-arg CUTLASS_NVCC_ARCHS=90a|120a
+ARG CUTLASS_COMMIT_ID="f74fea9ce35868d3ae9f8d1dce1969d7250d3f90"
+ARG CUTLASS_BUILD="auto"
+ARG CUTLASS_NVCC_ARCHS=""
+
 ENV PIP_NO_CACHE_DIR=1 \
     PIP_DISABLE_PIP_VERSION_CHECK=1 \
     PYTHONDONTWRITEBYTECODE=1
@@ -18,6 +37,7 @@ RUN --mount=type=secret,id=http_proxy,required=false \
     ca-certificates \
     git \
     build-essential \
+    cmake \
     ninja-build && \
     rm -rf /var/lib/apt/lists/* && \
     apt-get clean
@@ -42,6 +62,65 @@ RUN --mount=type=secret,id=http_proxy,required=false \
     cp /tmp/flash-attention/hopper/flash_attn_interface.py ${python_path}/flash_attn_3/ && \
     rm -rf /tmp/flash-attention
 
+
+RUN --mount=type=secret,id=http_proxy,required=false \
+    --mount=type=secret,id=https_proxy,required=false \
+    export http_proxy="$(cat /run/secrets/http_proxy 2>/dev/null || true)" && \
+    export https_proxy="$(cat /run/secrets/https_proxy 2>/dev/null || true)" && \
+    mkdir -p /usr/local/cutlass && \
+    cd /usr/local/cutlass && \
+    git init -q && \
+    git remote add origin https://github.com/NVIDIA/cutlass.git && \
+    git fetch origin ${CUTLASS_COMMIT_ID} --depth 1 && \
+    git checkout ${CUTLASS_COMMIT_ID} && \
+    (git submodule update --init --recursive --depth 1 --jobs 8 || \
+     git submodule update --init --recursive --depth 1 --jobs 1)
+
+
+RUN set -eu; \
+    _cutlass_arch_from_gpu() { \
+        if ! command -v nvidia-smi >/dev/null 2>&1; then return 1; fi; \
+        cap="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n1 | tr -d ' ')"; \
+        case "${cap}" in \
+            9.*) echo "90a" ;; \
+            12.*) echo "120a" ;; \
+            *) return 1 ;; \
+        esac; \
+    }; \
+    if [ -n "${CUTLASS_NVCC_ARCHS}" ]; then \
+        NVCC_ARCHS="${CUTLASS_NVCC_ARCHS}"; \
+        echo "[CUTLASS] Using CUTLASS_NVCC_ARCHS=${NVCC_ARCHS} (build-arg override)."; \
+    elif arch="$(_cutlass_arch_from_gpu)"; then \
+        NVCC_ARCHS="${arch}"; \
+        echo "[CUTLASS] nvidia-smi → CUTLASS_NVCC_ARCHS=${NVCC_ARCHS}."; \
+    else \
+        NVCC_ARCHS=""; \
+    fi; \
+    case "${CUTLASS_BUILD}" in \
+        no) echo "[CUTLASS] CUTLASS_BUILD=no — skipping cmake configure."; exit 0 ;; \
+        yes) \
+            if [ -z "${NVCC_ARCHS}" ]; then \
+                echo "[CUTLASS] CUTLASS_BUILD=yes but no arch: set CUTLASS_NVCC_ARCHS=90a|120a or build on a 9.x/12.x GPU."; \
+                exit 1; \
+            fi; \
+            DO_BUILD=1 ;; \
+        auto) \
+            if [ -z "${NVCC_ARCHS}" ]; then \
+                echo "[CUTLASS] No sm_90/sm_120 GPU and no CUTLASS_NVCC_ARCHS — skipping cmake (headers still available)."; \
+                exit 0; \
+            fi; \
+            DO_BUILD=1 ;; \
+        *) echo "[CUTLASS] Unknown CUTLASS_BUILD=${CUTLASS_BUILD}"; exit 1 ;; \
+    esac; \
+    case "${NVCC_ARCHS}" in \
+        90a|120a) ;; \
+        *) echo "[CUTLASS] Unsupported CUTLASS_NVCC_ARCHS=${NVCC_ARCHS} (expected 90a or 120a)."; exit 1 ;; \
+    esac; \
+    [ -n "${DO_BUILD:-}" ] && cd /usr/local/cutlass && \
+    export CUDACXX="${CUDA_INSTALL_PATH:-${CUDA_HOME:-/usr/local/cuda}}/bin/nvcc" && \
+    mkdir -p build && cd build && \
+    cmake .. -DCUTLASS_NVCC_ARCHS="${NVCC_ARCHS}"
+
 RUN --mount=type=secret,id=http_proxy,required=false \
     --mount=type=secret,id=https_proxy,required=false \
     export http_proxy="$(cat /run/secrets/http_proxy 2>/dev/null || true)" && \

diff --git a/README.md b/README.md
@@ -106,6 +106,18 @@ pip install -r requirements.txt
 # Step 4 — Install MagiCompiler (pick one)
 pip install .   # End users (recommended)
 # pip install -e . --no-build-isolation --config-settings editable_mode=compat  # Developer / editable
+
+# Step 5 (optional) — Install CUTLASS for matmul epilogue fusion
+# Required for the CUTLASS-based matmul + epilogue fusion pass (sm_90 / sm_120).
+# Without CUTLASS the compiler still works but skips this optimization.
+git clone --depth 1 https://github.com/NVIDIA/cutlass.git /usr/local/cutlass
+# Or specify a custom path:
+#   git clone --depth 1 https://github.com/NVIDIA/cutlass.git /your/path
+#   export MAGI_CUTLASS_ROOT=/your/path
+export CUDACXX=${CUDA_INSTALL_PATH:-${CUDA_HOME:-/usr/local/cuda}}/bin/nvcc
+mkdir -p /usr/local/cutlass/build && cd /usr/local/cutlass/build
+cmake .. -DCUTLASS_NVCC_ARCHS=90a # compiles for NVIDIA Hopper GPU architecture
+# cmake .. -DCUTLASS_NVCC_ARCHS=120a # compiles for NVIDIA consumer Blackwell (RTX 50 series)
 ```
 
 ---

diff --git a/magi_compiler/config.py b/magi_compiler/config.py
@@ -64,6 +64,18 @@ class PassConfig(BaseModel):
     # TODO: Add sequence parallelism pass and async TP pass.
     # TODO: Add Ulysses overlap pass.
     enable_sage_attn: bool = Field(False, description="Whether to replace flash attention with sage attention.")
+    enable_mm_epilogue_fusion: bool = Field(
+        False,
+        description=(
+            "Whether to enable the matmul + elementwise epilogue fusion pass. "
+            "On RTX 5090 (sm_120) this lowers fused chains to a CUTLASS Sm80EVT "
+            "kernel via the fusion.MatmulEvtEpilogueFusionPass; on H100 "
+            "(sm_90) the swiglu sub-path additionally uses the native Sm90 "
+            "TMA + WGMMA DualGemm. The pass is a no-op on older architectures "
+            "regardless of this flag, but the flag still controls whether it "
+            "is registered at all."
+        ),
+    )
 
     @property
     def hash(self) -> str:
@@ -141,6 +153,14 @@ class OffloadConfig(BaseModel):
     bandwidth_safety_factor: float = Field(0.9, description="The safety factor for the H2D bandwidth.")
 
 
+def _find_cutlass_root() -> str:
+    """Resolve the CUTLASS checkout from $MAGI_CUTLASS_ROOT (falling back to
+    /usr/local/cutlass); give back an empty string if it is not a directory."""
+    candidate = os.environ.get("MAGI_CUTLASS_ROOT", "/usr/local/cutlass")
+    found = os.path.isdir(candidate)
+    return candidate if found else ""
+
+
 class CompileConfig(BaseSettings):
     """Top-level configuration consumed by ``magi_compile`` and the MagiCompiler backend.
 
@@ -172,6 +192,10 @@ class CompileConfig(BaseSettings):
         default=os.path.expanduser("~/.cache/magi_compiler"),
         description="Root directory for persisting compiled artifacts and debug dumps.",
     )
+    cutlass_root: str = Field(
+        default_factory=_find_cutlass_root,
+        description="Path to the CUTLASS source tree. Default: $MAGI_CUTLASS_ROOT or /usr/local/cutlass.",
+    )
 
     # ---- Compilation mode ----
     aot: bool = Field(
@@ -234,6 +258,10 @@ class CompileConfig(BaseSettings):
         ),
     )
 
+    @property
+    def has_cutlass(self) -> bool:
+        return bool(self.cutlass_root)  # an empty cutlass_root means no CUTLASS tree was found
+
     @property
     def hash(self) -> str:
         return compute_hash(self.model_dump(mode="json"))

diff --git a/magi_compiler/passes/full_graph/full_graph_pass_mgr.py b/magi_compiler/passes/full_graph/full_graph_pass_mgr.py
@@ -16,6 +16,7 @@
 
 from ...magi_depyf.timeline import observe_lifecycle
 from .remove_item import RemoveItemPass
+from .remove_useless_ops import EliminateIdentityViewCastPass
 from .replace_sage_atten import ReplaceSageAttentionPass
 
 
@@ -30,6 +31,7 @@ def __init__(self, pass_config):
         if self.pass_config.enable_sage_attn:
             self.passes.append(ReplaceSageAttentionPass())
         self.passes.append(RemoveItemPass())
+        self.passes.append(EliminateIdentityViewCastPass())
 
     @observe_lifecycle("full_graph_manager")
     def __call__(self, gm: torch.fx.GraphModule):

diff --git a/magi_compiler/passes/full_graph/remove_useless_ops.py b/magi_compiler/passes/full_graph/remove_useless_ops.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2026 SandAI. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch._inductor.fx_passes.pre_grad
+
+from ...magi_depyf.timeline import emit_pass_lifecycle
+from ..pass_base import MagiInductorPass
+
+
+class EliminateIdentityViewCastPass(MagiInductorPass):
+    """
+    Remove ``call_method`` view/cast nodes that leave their input unchanged.
+
+    A node is dropped when its ``example_value`` metadata has the same shape,
+    dtype, stride and (when known) device as its first argument's; all uses of
+    the node are then rewired to that argument. Nodes without metadata are kept.
+    """
+
+    TARGET_METHODS = {
+        "view",
+        "reshape",
+        "to",
+        "type",
+        "contiguous",
+        "flatten",
+        "permute",
+        "transpose",
+        "t",
+        "unsqueeze",
+        "squeeze",
+        "expand",
+        "repeat",
+        "bfloat16",
+        "float",
+        "half",
+        "int",
+        "long",
+        "short",
+        "double",
+        "bool",
+        "byte",
+    }
+
+    @staticmethod
+    def _get_tensor_info(node: torch.fx.Node):
+        """Return ``(shape, dtype, stride)`` read from ``node.meta["example_value"]``.
+
+        A non-empty list/tuple is described by its first element; when no tensor
+        is found the result is ``(None, None, None)``.
+        """
+        val = node.meta.get("example_value")
+        if isinstance(val, (list, tuple)) and len(val) > 0:
+            val = val[0]
+        if isinstance(val, torch.Tensor):
+            return val.shape, val.dtype, val.stride()
+        return None, None, None
+
+    @staticmethod
+    def _get_device(node: torch.fx.Node):
+        """Return the device of a tensor ``example_value``, or None when there is none."""
+        val = node.meta.get("example_value")
+        return val.device if isinstance(val, torch.Tensor) else None
+
+    def is_applicable(self, graph: torch.fx.Graph, shape: int | None = None) -> bool:
+        """Return True when the graph contains at least one candidate method call."""
+        return any(n.op == "call_method" and n.target in self.TARGET_METHODS for n in graph.nodes)
+
+    @emit_pass_lifecycle
+    def __call__(self, graph: torch.fx.Graph):
+        """Erase every target method call whose output metadata equals its input's."""
+        nodes_to_remove = []
+
+        for node in graph.nodes:
+            if node.op != "call_method" or node.target not in self.TARGET_METHODS:
+                continue
+            # Need at least one argument (the input tensor)
+            if not node.args or not isinstance(node.args[0], torch.fx.Node):
+                continue
+            # ``copy=True`` asks for a fresh tensor even when nothing is converted,
+            # so eliminating the call would make the result alias its input.
+            if node.kwargs.get("copy"):
+                continue
+
+            input_node = node.args[0]
+            node_shape, node_dtype, node_stride = self._get_tensor_info(node)
+            input_shape, input_dtype, input_stride = self._get_tensor_info(input_node)
+            # Missing metadata on either side: identity cannot be proven.
+            if node_shape is None or input_shape is None:
+                continue
+            # Equal strides rule out layout changes such as ``contiguous``.
+            if node_stride != input_stride:
+                continue
+            if node_shape != input_shape or node_dtype != input_dtype:
+                continue
+            # A same-shape ``.to(device)`` still moves data, so devices must agree.
+            node_device = self._get_device(node)
+            input_device = self._get_device(input_node)
+            if node_device is not None and input_device is not None and node_device != input_device:
+                continue
+            node.replace_all_uses_with(input_node)
+            nodes_to_remove.append(node)
+
+        for node in nodes_to_remove:
+            graph.erase_node(node)
diff --git a/magi_compiler/passes/piecewise_graph/fusion/__init__.py b/magi_compiler/passes/piecewise_graph/fusion/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2026 SandAI. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/magi_compiler/passes/piecewise_graph/fusion/common/__init__.py b/magi_compiler/passes/piecewise_graph/fusion/common/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2026 SandAI. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright (c) 2026 SandAI. All Rights Reserved.