Skip to content
This repository was archived by the owner on Jan 27, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .github/workflows/kernel_abi_python_release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,6 @@ jobs:
strategy:
matrix:
platform:
- runner: macos-13
target: x86_64
- runner: macos-14
target: aarch64
steps:
Expand Down
7 changes: 7 additions & 0 deletions build2cmake/src/config/v2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ pub struct General {
pub hub: Option<Hub>,
}

impl General {
    /// Name of the kernel as a Python extension.
    ///
    /// Python module names cannot contain hyphens, so every `-` in the
    /// kebab-case kernel name is mapped to `_` to obtain a valid
    /// importable module/package name.
    pub fn python_name(&self) -> String {
        // Char pattern is the idiomatic (and slightly faster) form for a
        // single-character replacement (clippy::single_char_pattern).
        self.name.replace('-', '_')
    }
}

#[derive(Debug, Deserialize, Serialize)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct Hub {
Expand Down
4 changes: 3 additions & 1 deletion build2cmake/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,9 @@ fn clean(
// Clean up empty directories
let dirs_to_check = [
target_dir.join("cmake"),
target_dir.join("torch-ext").join(&build.general.name),
target_dir
.join("torch-ext")
.join(build.general.python_name()),
target_dir.join("torch-ext"),
];

Expand Down
4 changes: 2 additions & 2 deletions build2cmake/src/torch/cpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ pub fn write_torch_ext_cpu(

let mut file_set = FileSet::default();

let ops_name = kernel_ops_identifier(&target_dir, &build.general.name, ops_id);
let ops_name = kernel_ops_identifier(&target_dir, &build.general.python_name(), ops_id);

write_cmake(
env,
Expand All @@ -45,7 +45,7 @@ pub fn write_torch_ext_cpu(
&mut file_set,
)?;

write_ops_py(env, &build.general.name, &ops_name, &mut file_set)?;
write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?;

write_pyproject_toml(env, &mut file_set)?;

Expand Down
4 changes: 2 additions & 2 deletions build2cmake/src/torch/cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ pub fn write_torch_ext_cuda(

let mut file_set = FileSet::default();

let ops_name = kernel_ops_identifier(&target_dir, &build.general.name, ops_id);
let ops_name = kernel_ops_identifier(&target_dir, &build.general.python_name(), ops_id);

write_cmake(
env,
Expand All @@ -58,7 +58,7 @@ pub fn write_torch_ext_cuda(
&mut file_set,
)?;

write_ops_py(env, &build.general.name, &ops_name, &mut file_set)?;
write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?;

write_pyproject_toml(env, &mut file_set)?;

Expand Down
2 changes: 1 addition & 1 deletion build2cmake/src/torch/metal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ pub fn write_torch_ext_metal(
&mut file_set,
)?;

write_ops_py(env, &build.general.name, &ops_name, &mut file_set)?;
write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?;

write_pyproject_toml(env, &mut file_set)?;

Expand Down
4 changes: 2 additions & 2 deletions build2cmake/src/torch/universal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ pub fn write_torch_ext_universal(
) -> Result<FileSet> {
let mut file_set = FileSet::default();

let ops_name = kernel_ops_identifier(&target_dir, &build.general.name, ops_id);
let ops_name = kernel_ops_identifier(&target_dir, &build.general.python_name(), ops_id);

write_ops_py(env, &build.general.name, &ops_name, &mut file_set)?;
write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?;
write_pyproject_toml(
env,
build.torch.as_ref(),
Expand Down
4 changes: 2 additions & 2 deletions build2cmake/src/torch/xpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ pub fn write_torch_ext_xpu(

let mut file_set = FileSet::default();

let ops_name = kernel_ops_identifier(&target_dir, &build.general.name, ops_id);
let ops_name = kernel_ops_identifier(&target_dir, &build.general.python_name(), ops_id);

write_cmake(
env,
Expand All @@ -45,7 +45,7 @@ pub fn write_torch_ext_xpu(
&mut file_set,
)?;

write_ops_py(env, &build.general.name, &ops_name, &mut file_set)?;
write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?;

write_pyproject_toml(env, &mut file_set)?;

Expand Down
2 changes: 1 addition & 1 deletion examples/cutlass-gemm/build.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[general]
name = "cutlass_gemm"
name = "cutlass-gemm"
universal = false

[torch]
Expand Down
2 changes: 1 addition & 1 deletion examples/relu-backprop-compile/build.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[general]
name = "relu"
name = "relu-backprop-compile"
universal = false

[torch]
Expand Down
18 changes: 9 additions & 9 deletions examples/relu-backprop-compile/tests/test_relu.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torch.nn.functional as F
from torch.library import opcheck

import relu
import relu_backprop_compile


def get_device():
Expand All @@ -30,21 +30,21 @@ def test_relu_forward(dtype):
device = get_device()
x = torch.randn(1024, 1024, dtype=dtype, device=device)
expected = F.relu(x)
actual = relu.relu(x)
actual = relu_backprop_compile.relu(x)
torch.testing.assert_close(expected, actual)


def test_relu_gradient_numerical():
device = get_device()
x = torch.randn(32, 32, dtype=torch.float64, device=device, requires_grad=True)
assert torch.autograd.gradcheck(relu.relu, x)
assert torch.autograd.gradcheck(relu_backprop_compile.relu, x)


@pytest.mark.parametrize("dtype", DTYPES)
def test_relu_gradient_large_tensor(dtype):
device = get_device()
x = torch.randn(1024, 1024, dtype=dtype, device=device, requires_grad=True)
y = relu.relu(x)
y = relu_backprop_compile.relu(x)
loss = y.sum()
loss.backward()

Expand All @@ -69,7 +69,7 @@ def test_relu_gradient_comparison(dtype):
)

x_kernel = x_data.clone().requires_grad_(True)
y_kernel = relu.relu(x_kernel)
y_kernel = relu_backprop_compile.relu(x_kernel)
loss_custom = y_kernel.sum()
loss_custom.backward()

Expand All @@ -86,7 +86,7 @@ def test_relu_gradient_comparison(dtype):
def test_relu_backward_chain(dtype):
device = get_device()
x = torch.randn(64, 128, dtype=dtype, device=device, requires_grad=True)
y = relu.relu(x)
y = relu_backprop_compile.relu(x)
z = y * 2.0
loss = z.sum()
loss.backward()
Expand Down Expand Up @@ -115,7 +115,7 @@ def test_relu_backward_chain(dtype):
def test_relu_fwd_opcheck(shape, dtype):
device = get_device()
x = torch.randn(shape, dtype=dtype, device=device, requires_grad=True)
opcheck(relu.ops.relu_fwd, (x,))
opcheck(relu_backprop_compile.ops.relu_fwd, (x,))


@pytest.mark.parametrize("dtype", DTYPES)
Expand All @@ -128,7 +128,7 @@ def __init__(self):
self.linear = torch.nn.Linear(1024, 1024)

def forward(self, x):
return relu.relu(self.linear(x))
return relu_backprop_compile.relu(self.linear(x))

model = SimpleModel().to(device).to(dtype)
compiled_model = torch.compile(model, fullgraph=True)
Expand Down Expand Up @@ -168,7 +168,7 @@ def __init__(self):
self.linear = torch.nn.Linear(16, 16)

def forward(self, x):
return relu.relu(self.linear(x))
return relu_backprop_compile.relu(self.linear(x))

model = SimpleModel().to(device).to(dtype)
compiled_model = torch.compile(model, fullgraph=True)
Expand Down
2 changes: 1 addition & 1 deletion examples/relu-compiler-flags/build.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[general]
name = "relu"
name = "relu-compiler-flags"
universal = false

[torch]
Expand Down
2 changes: 1 addition & 1 deletion examples/relu-specific-torch/build.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[general]
name = "relu"
name = "relu-specific-torch"
universal = false

[torch]
Expand Down
4 changes: 2 additions & 2 deletions examples/relu-specific-torch/tests/test_relu.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import torch
import torch.nn.functional as F

import relu
import relu_specific_torch


def test_relu():
Expand All @@ -12,4 +12,4 @@ def test_relu():
else:
device = torch.device("cuda")
x = torch.randn(1024, 1024, dtype=torch.float32, device=device)
torch.testing.assert_allclose(F.relu(x), relu.relu(x))
torch.testing.assert_allclose(F.relu(x), relu_specific_torch.relu(x))
2 changes: 1 addition & 1 deletion examples/silu-and-mul-universal/build.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[general]
name = "silu_and_mul_universal"
name = "silu-and-mul-universal"
universal = true
4 changes: 2 additions & 2 deletions lib/build.nix
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ rec {
rev
doGetKernelCheck
;
extensionName = buildToml.general.name;
kernelName = buildToml.general.name;
}
else
extension.mkExtension {
Expand All @@ -159,7 +159,7 @@ rec {
rev
;

extensionName = buildToml.general.name;
kernelName = buildToml.general.name;
doAbiCheck = true;
};

Expand Down
4 changes: 3 additions & 1 deletion lib/torch-extension/arch.nix
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
# Whether to run get-kernel-check.
doGetKernelCheck ? true,

extensionName,
kernelName,

# Extra dependencies (such as CUTLASS).
extraDeps ? [ ],
Expand All @@ -65,6 +65,8 @@ assert (buildConfig ? xpuVersion) -> xpuSupport;
assert (buildConfig.metal or false) -> stdenv.hostPlatform.isDarwin;

let
extensionName = builtins.replaceStrings [ "-" ] [ "_" ] kernelName;

# On Darwin, we need the host's xcrun for `xcrun metal` to compile Metal shaders.
# It's not supported by the nixpkgs shim.
xcrunHost = writeScriptBin "xcrunHost" ''
Expand Down
8 changes: 6 additions & 2 deletions lib/torch-extension/no-arch.nix
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,20 @@
# Whether to run get-kernel-check.
doGetKernelCheck ? true,

extensionName,
kernelName,

# Revision to bake into the ops name.
rev,

src,
}:

let
extensionName = builtins.replaceStrings [ "-" ] [ "_" ] kernelName;
in

stdenv.mkDerivation (prevAttrs: {
name = "${extensionName}-torch-ext";
name = "${kernelName}-torch-ext";

inherit extensionName src;

Expand Down
Loading