From bb053fbe689fb32b7004833a7c56ccd7c473eaaa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 11 Nov 2025 14:45:20 +0000
Subject: [PATCH 1/3] Allow dashes in kernel names

This allows us to upload kernels with dashes in their names without
setting the repo ID in `general.hub`. The Python extension name is
derived from the kernel name by replacing dashes with underscores.
---
 build2cmake/src/config/v2.rs               | 7 +++++++
 build2cmake/src/main.rs                    | 4 +++-
 build2cmake/src/torch/cpu.rs               | 4 ++--
 build2cmake/src/torch/cuda.rs              | 4 ++--
 build2cmake/src/torch/metal.rs             | 2 +-
 build2cmake/src/torch/universal.rs         | 4 ++--
 build2cmake/src/torch/xpu.rs               | 4 ++--
 examples/silu-and-mul-universal/build.toml | 2 +-
 lib/build.nix                              | 4 ++--
 lib/torch-extension/arch.nix               | 4 +++-
 lib/torch-extension/no-arch.nix            | 8 ++++++--
 11 files changed, 31 insertions(+), 16 deletions(-)
diff --git a/build2cmake/src/config/v2.rs b/build2cmake/src/config/v2.rs
index ecbdd9ec..07fe2a1a 100644
--- a/build2cmake/src/config/v2.rs
+++ b/build2cmake/src/config/v2.rs
@@ -56,6 +56,13 @@ pub struct General {
     pub hub: Option<Hub>,
 }
 
+impl General {
+    /// Name of the kernel as a Python extension: `name` with every `-` replaced by `_`.
+    pub fn python_name(&self) -> String {
+        self.name.replace("-", "_")
+    }
+}
+
 #[derive(Debug, Deserialize, Serialize)]
 #[serde(deny_unknown_fields, rename_all = "kebab-case")]
 pub struct Hub {
diff --git a/build2cmake/src/main.rs b/build2cmake/src/main.rs
index 528b37ec..ca419389 100644
--- a/build2cmake/src/main.rs
+++ b/build2cmake/src/main.rs
@@ -344,7 +344,9 @@ fn clean(
     // Clean up empty directories
     let dirs_to_check = [
         target_dir.join("cmake"),
-        target_dir.join("torch-ext").join(&build.general.name),
+        target_dir
+            .join("torch-ext")
+            .join(build.general.python_name()),
         target_dir.join("torch-ext"),
     ];
 
diff --git a/build2cmake/src/torch/cpu.rs b/build2cmake/src/torch/cpu.rs
index bad4ea38..a2ef77d6 100644
--- a/build2cmake/src/torch/cpu.rs
+++ b/build2cmake/src/torch/cpu.rs
@@ -26,7 +26,7 @@ pub fn write_torch_ext_cpu(
 
     let mut file_set = FileSet::default();
 
-    let ops_name = kernel_ops_identifier(&target_dir, &build.general.name, ops_id);
+    let ops_name = kernel_ops_identifier(&target_dir, &build.general.python_name(), ops_id);
 
     write_cmake(
         env,
@@ -45,7 +45,7 @@ pub fn write_torch_ext_cpu(
         &mut file_set,
     )?;
 
-    write_ops_py(env, &build.general.name, &ops_name, &mut file_set)?;
+    write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?;
 
     write_pyproject_toml(env, &mut file_set)?;
 
diff --git a/build2cmake/src/torch/cuda.rs b/build2cmake/src/torch/cuda.rs
index 267fe576..0a189104 100644
--- a/build2cmake/src/torch/cuda.rs
+++ b/build2cmake/src/torch/cuda.rs
@@ -38,7 +38,7 @@ pub fn write_torch_ext_cuda(
 
     let mut file_set = FileSet::default();
 
-    let ops_name = kernel_ops_identifier(&target_dir, &build.general.name, ops_id);
+    let ops_name = kernel_ops_identifier(&target_dir, &build.general.python_name(), ops_id);
 
     write_cmake(
         env,
@@ -58,7 +58,7 @@ pub fn write_torch_ext_cuda(
         &mut file_set,
     )?;
 
-    write_ops_py(env, &build.general.name, &ops_name, &mut file_set)?;
+    write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?;
 
     write_pyproject_toml(env, &mut file_set)?;
 
diff --git a/build2cmake/src/torch/metal.rs b/build2cmake/src/torch/metal.rs
index 4b1edcf2..0d6198b8 100644
--- a/build2cmake/src/torch/metal.rs
+++ b/build2cmake/src/torch/metal.rs
@@ -47,7 +47,7 @@ pub fn write_torch_ext_metal(
         &mut file_set,
     )?;
 
-    write_ops_py(env, &build.general.name, &ops_name, &mut file_set)?;
+    write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?;
 
     write_pyproject_toml(env, &mut file_set)?;
 
diff --git a/build2cmake/src/torch/universal.rs b/build2cmake/src/torch/universal.rs
index c92257ad..b52525a5 100644
--- a/build2cmake/src/torch/universal.rs
+++ b/build2cmake/src/torch/universal.rs
@@ -17,9 +17,9 @@ pub fn write_torch_ext_universal(
 ) -> Result<FileSet> {
     let mut file_set = FileSet::default();
 
-    let ops_name = kernel_ops_identifier(&target_dir, &build.general.name, ops_id);
+    let ops_name = kernel_ops_identifier(&target_dir, &build.general.python_name(), ops_id);
 
-    write_ops_py(env, &build.general.name, &ops_name, &mut file_set)?;
+    write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?;
     write_pyproject_toml(
         env,
         build.torch.as_ref(),
diff --git a/build2cmake/src/torch/xpu.rs b/build2cmake/src/torch/xpu.rs
index 41bd4518..a515180e 100644
--- a/build2cmake/src/torch/xpu.rs
+++ b/build2cmake/src/torch/xpu.rs
@@ -26,7 +26,7 @@ pub fn write_torch_ext_xpu(
 
     let mut file_set = FileSet::default();
 
-    let ops_name = kernel_ops_identifier(&target_dir, &build.general.name, ops_id);
+    let ops_name = kernel_ops_identifier(&target_dir, &build.general.python_name(), ops_id);
 
     write_cmake(
         env,
@@ -45,7 +45,7 @@ pub fn write_torch_ext_xpu(
         &mut file_set,
     )?;
 
-    write_ops_py(env, &build.general.name, &ops_name, &mut file_set)?;
+    write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?;
 
     write_pyproject_toml(env, &mut file_set)?;
 
diff --git a/examples/silu-and-mul-universal/build.toml b/examples/silu-and-mul-universal/build.toml
index c7515935..826e880e 100644
--- a/examples/silu-and-mul-universal/build.toml
+++ b/examples/silu-and-mul-universal/build.toml
@@ -1,3 +1,3 @@
 [general]
-name = "silu_and_mul_universal"
+name = "silu-and-mul-universal"
 universal = true
diff --git a/lib/build.nix b/lib/build.nix
index d6f77aa1..ae2e6dbe 100644
--- a/lib/build.nix
+++ b/lib/build.nix
@@ -145,7 +145,7 @@ rec {
           rev
           doGetKernelCheck
           ;
-        extensionName = buildToml.general.name;
+        kernelName = buildToml.general.name;
       }
     else
       extension.mkExtension {
@@ -159,7 +159,7 @@ rec {
           rev
           ;
 
-        extensionName = buildToml.general.name;
+        kernelName = buildToml.general.name;
         doAbiCheck = true;
       };
 
diff --git a/lib/torch-extension/arch.nix b/lib/torch-extension/arch.nix
index 0a4b4e02..507f5915 100644
--- a/lib/torch-extension/arch.nix
+++ b/lib/torch-extension/arch.nix
@@ -42,7 +42,7 @@
   # Whether to run get-kernel-check.
   doGetKernelCheck ? true,
 
-  extensionName,
+  kernelName,
 
   # Extra dependencies (such as CUTLASS).
   extraDeps ? [ ],
@@ -65,6 +65,8 @@ assert (buildConfig ? xpuVersion) -> xpuSupport;
 assert (buildConfig.metal or false) -> stdenv.hostPlatform.isDarwin;
 
 let
+  extensionName = builtins.replaceStrings [ "-" ] [ "_" ] kernelName;
+
   # On Darwin, we need the host's xcrun for `xcrun metal` to compile Metal shaders.
   # It's not supported by the nixpkgs shim.
   xcrunHost = writeScriptBin "xcrunHost" ''
diff --git a/lib/torch-extension/no-arch.nix b/lib/torch-extension/no-arch.nix
index 783a5ac4..fc57a7d3 100644
--- a/lib/torch-extension/no-arch.nix
+++ b/lib/torch-extension/no-arch.nix
@@ -13,7 +13,7 @@
   # Whether to run get-kernel-check.
   doGetKernelCheck ? true,
 
-  extensionName,
+  kernelName,
 
   # Revision to bake into the ops name.
   rev,
@@ -21,8 +21,12 @@
   src,
 }:
 
+let
+  extensionName = builtins.replaceStrings [ "-" ] [ "_" ] kernelName;
+in
+
 stdenv.mkDerivation (prevAttrs: {
-  name = "${extensionName}-torch-ext";
+  name = "${kernelName}-torch-ext";
 
   inherit extensionName src;
 

From ae7065f82b9b0b33f16ab2039cbf46310c5c7383 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 11 Nov 2025 15:06:20 +0000
Subject: [PATCH 2/3] CI: remove macos-13 from the matrix

This was used to build x86_64 macOS kernel-abi-check packages. Building
these does not make much sense anyway, since we only support ARM64 macOS
in kernel-builder.
---
 .github/workflows/kernel_abi_python_release.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/kernel_abi_python_release.yaml b/.github/workflows/kernel_abi_python_release.yaml
index d0514178..37b7ae34 100644
--- a/.github/workflows/kernel_abi_python_release.yaml
+++ b/.github/workflows/kernel_abi_python_release.yaml
@@ -141,8 +141,6 @@ jobs:
     strategy:
       matrix:
         platform:
-          - runner: macos-13
-            target: x86_64
           - runner: macos-14
             target: aarch64
     steps:

From 0aabdf2b2bc7e65f2d26f5e998d1eef15223bd38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 11 Nov 2025 15:37:14 +0000
Subject: [PATCH 3/3] Update some of the example kernel names

---
 examples/cutlass-gemm/build.toml               |  2 +-
 examples/relu-backprop-compile/build.toml      |  2 +-
 .../relu-backprop-compile/tests/test_relu.py   | 18 +++++++++---------
 .../__init__.py                                |  0
 examples/relu-compiler-flags/build.toml        |  2 +-
 .../{relu => relu_compiler_flags}/__init__.py  |  0
 examples/relu-specific-torch/build.toml        |  2 +-
 .../relu-specific-torch/tests/test_relu.py     |  4 ++--
 .../{relu => relu_specific_torch}/__init__.py  |  0
 9 files changed, 15 insertions(+), 15 deletions(-)
 rename examples/relu-backprop-compile/torch-ext/{relu => relu_backprop_compile}/__init__.py (100%)
 rename examples/relu-compiler-flags/torch-ext/{relu => relu_compiler_flags}/__init__.py (100%)
 rename examples/relu-specific-torch/torch-ext/{relu => relu_specific_torch}/__init__.py (100%)

diff --git a/examples/cutlass-gemm/build.toml b/examples/cutlass-gemm/build.toml
index 09199fe0..dc5d10c6 100644
--- a/examples/cutlass-gemm/build.toml
+++ b/examples/cutlass-gemm/build.toml
@@ -1,5 +1,5 @@
 [general]
-name = "cutlass_gemm"
+name = "cutlass-gemm"
 universal = false
 
 [torch]
diff --git a/examples/relu-backprop-compile/build.toml b/examples/relu-backprop-compile/build.toml
index c9bfab3c..130b6474 100644
--- a/examples/relu-backprop-compile/build.toml
+++ b/examples/relu-backprop-compile/build.toml
@@ -1,5 +1,5 @@
 [general]
-name = "relu"
+name = "relu-backprop-compile"
 universal = false
 
 [torch]
diff --git a/examples/relu-backprop-compile/tests/test_relu.py b/examples/relu-backprop-compile/tests/test_relu.py
index 450c10e2..104be7d8 100644
--- a/examples/relu-backprop-compile/tests/test_relu.py
+++ b/examples/relu-backprop-compile/tests/test_relu.py
@@ -5,7 +5,7 @@
 import torch.nn.functional as F
 from torch.library import opcheck
 
-import relu
+import relu_backprop_compile
 
 
 def get_device():
@@ -30,21 +30,21 @@ def test_relu_forward(dtype):
     device = get_device()
     x = torch.randn(1024, 1024, dtype=dtype, device=device)
     expected = F.relu(x)
-    actual = relu.relu(x)
+    actual = relu_backprop_compile.relu(x)
     torch.testing.assert_close(expected, actual)
 
 
 def test_relu_gradient_numerical():
     device = get_device()
     x = torch.randn(32, 32, dtype=torch.float64, device=device, requires_grad=True)
-    assert torch.autograd.gradcheck(relu.relu, x)
+    assert torch.autograd.gradcheck(relu_backprop_compile.relu, x)
 
 
 @pytest.mark.parametrize("dtype", DTYPES)
 def test_relu_gradient_large_tensor(dtype):
     device = get_device()
     x = torch.randn(1024, 1024, dtype=dtype, device=device, requires_grad=True)
-    y = relu.relu(x)
+    y = relu_backprop_compile.relu(x)
     loss = y.sum()
     loss.backward()
 
@@ -69,7 +69,7 @@ def test_relu_gradient_comparison(dtype):
     )
 
     x_kernel = x_data.clone().requires_grad_(True)
-    y_kernel = relu.relu(x_kernel)
+    y_kernel = relu_backprop_compile.relu(x_kernel)
     loss_custom = y_kernel.sum()
     loss_custom.backward()
 
@@ -86,7 +86,7 @@ def test_relu_gradient_comparison(dtype):
 def test_relu_backward_chain(dtype):
     device = get_device()
     x = torch.randn(64, 128, dtype=dtype, device=device, requires_grad=True)
-    y = relu.relu(x)
+    y = relu_backprop_compile.relu(x)
     z = y * 2.0
     loss = z.sum()
     loss.backward()
@@ -115,7 +115,7 @@ def test_relu_backward_chain(dtype):
 def test_relu_fwd_opcheck(shape, dtype):
     device = get_device()
     x = torch.randn(shape, dtype=dtype, device=device, requires_grad=True)
-    opcheck(relu.ops.relu_fwd, (x,))
+    opcheck(relu_backprop_compile.ops.relu_fwd, (x,))
 
 
 @pytest.mark.parametrize("dtype", DTYPES)
@@ -128,7 +128,7 @@ def __init__(self):
             self.linear = torch.nn.Linear(1024, 1024)
 
         def forward(self, x):
-            return relu.relu(self.linear(x))
+            return relu_backprop_compile.relu(self.linear(x))
 
     model = SimpleModel().to(device).to(dtype)
     compiled_model = torch.compile(model, fullgraph=True)
@@ -168,7 +168,7 @@ def __init__(self):
             self.linear = torch.nn.Linear(16, 16)
 
         def forward(self, x):
-            return relu.relu(self.linear(x))
+            return relu_backprop_compile.relu(self.linear(x))
 
     model = SimpleModel().to(device).to(dtype)
     compiled_model = torch.compile(model, fullgraph=True)
diff --git a/examples/relu-backprop-compile/torch-ext/relu/__init__.py b/examples/relu-backprop-compile/torch-ext/relu_backprop_compile/__init__.py
similarity index 100%
rename from examples/relu-backprop-compile/torch-ext/relu/__init__.py
rename to examples/relu-backprop-compile/torch-ext/relu_backprop_compile/__init__.py
diff --git a/examples/relu-compiler-flags/build.toml b/examples/relu-compiler-flags/build.toml
index e99d7ae3..d595e09e 100644
--- a/examples/relu-compiler-flags/build.toml
+++ b/examples/relu-compiler-flags/build.toml
@@ -1,5 +1,5 @@
 [general]
-name = "relu"
+name = "relu-compiler-flags"
 universal = false
 
 [torch]
diff --git a/examples/relu-compiler-flags/torch-ext/relu/__init__.py b/examples/relu-compiler-flags/torch-ext/relu_compiler_flags/__init__.py
similarity index 100%
rename from examples/relu-compiler-flags/torch-ext/relu/__init__.py
rename to examples/relu-compiler-flags/torch-ext/relu_compiler_flags/__init__.py
diff --git a/examples/relu-specific-torch/build.toml b/examples/relu-specific-torch/build.toml
index c9bfab3c..3db0e7e7 100644
--- a/examples/relu-specific-torch/build.toml
+++ b/examples/relu-specific-torch/build.toml
@@ -1,5 +1,5 @@
 [general]
-name = "relu"
+name = "relu-specific-torch"
 universal = false
 
 [torch]
diff --git a/examples/relu-specific-torch/tests/test_relu.py b/examples/relu-specific-torch/tests/test_relu.py
index 98b292b9..4ef804d4 100644
--- a/examples/relu-specific-torch/tests/test_relu.py
+++ b/examples/relu-specific-torch/tests/test_relu.py
@@ -3,7 +3,7 @@
 import torch
 import torch.nn.functional as F
 
-import relu
+import relu_specific_torch
 
 
 def test_relu():
@@ -12,4 +12,4 @@ def test_relu():
     else:
         device = torch.device("cuda")
     x = torch.randn(1024, 1024, dtype=torch.float32, device=device)
-    torch.testing.assert_allclose(F.relu(x), relu.relu(x))
+    torch.testing.assert_allclose(F.relu(x), relu_specific_torch.relu(x))
diff --git a/examples/relu-specific-torch/torch-ext/relu/__init__.py b/examples/relu-specific-torch/torch-ext/relu_specific_torch/__init__.py
similarity index 100%
rename from examples/relu-specific-torch/torch-ext/relu/__init__.py
rename to examples/relu-specific-torch/torch-ext/relu_specific_torch/__init__.py