From da379e84df067b0be591247202861d88180c55d5 Mon Sep 17 00:00:00 2001
From: Renato Golin
Date: Thu, 14 May 2026 12:55:45 +0100
Subject: [PATCH 1/4] Add first 40 kernels to KB test

Only the first two are actually benchmarked, as they're the only ones
on which we achieve any reasonable performance. The others lower to
loops and use smaller sizes to avoid bloating test times.

The schedule selection also changed to allow for smaller deltas when
selecting the whole pipeline for multiple kernels. Generic
sub-schedules have been moved one directory up; sub-directories with
special differences can be created as needed, reusing the generic
ones.

To minimize CI impact, there's a new "CI mode" (test_kernel_bench
--ci), where only the first 5 tests are run, without benchmarking or
bf16 support. This is just a smoke test. Further testing /
benchmarking should call the script directly with the appropriate
flags (per architecture).

Notes:
* The element-wise kernels fail to lower, and some matmul ones fail in
  the same way; see the comments on their entries in the tests table.
* Higher-dimensional matmuls don't tile the same way, so they use the
  loops lowering for now.
* The skinny matmul fails to use the optimized pipeline because the
  skinny dimension (1) does not tile, so it also uses the loops
  lowering.

There's a new --test option to pick a particular test. For example, to
benchmark the BF16 version of level1/40_LayerNorm.py, call:
```
$ test_kernel_bench --test=level1/40_LayerNorm.py --benchmark --bf16
```

assisted-by: GitHub Copilot
---
 .../schedules/x86_64/{matmul => }/lower.yaml  |   0
 .../schedules/x86_64/matmul/bf16.yaml         |   8 +-
 .../schedules/x86_64/matmul/f32.yaml          |   6 +-
 .../schedules/x86_64/matmul/vectorize.yaml    |  11 -
 .../x86_64/{matmul => }/pack_and_tile.yaml    |   0
 .../schedules/x86_64/vectorize.yaml           |   7 +
 .../KernelBench/test_kernel_bench.py          | 361 +++++++++++++++++-
 7 files changed, 362 insertions(+), 31 deletions(-)
 rename examples/end-to-end/KernelBench/schedules/x86_64/{matmul => }/lower.yaml (100%)
 delete mode 100644 examples/end-to-end/KernelBench/schedules/x86_64/matmul/vectorize.yaml
 rename examples/end-to-end/KernelBench/schedules/x86_64/{matmul => }/pack_and_tile.yaml (100%)
 create mode 100644 examples/end-to-end/KernelBench/schedules/x86_64/vectorize.yaml

diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/lower.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/lower.yaml
similarity index 100%
rename from examples/end-to-end/KernelBench/schedules/x86_64/matmul/lower.yaml
rename to examples/end-to-end/KernelBench/schedules/x86_64/lower.yaml
diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/bf16.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/bf16.yaml
index c81d950..19bcf76 100644
--- a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/bf16.yaml
+++ b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/bf16.yaml
@@ -1,10 +1,8 @@
 # This is an optimizing pipeline for kernel_bench matmuls on bf16 types.
 # This is basically a copy of the fp32 pipeline, with ONE CHANGE:
 # - register_tiling.py -> reg_unroll_k=2 (instead of 1)
-# Tested on x86_64 with AVX512 reaching good performance for simple KB kernels.
-# It may not apply to other workloads / extensions / architectures, so use with caution.
Pipeline: - - include: pack_and_tile.yaml + - include: ../pack_and_tile.yaml ## CPU specific register tiling (depends on uArch & data type) - schedule: "x86/register_tiling.py[gen=matmul_register_tiling]{target=linalg.contract reg_tile_batch=1 reg_tile_m=8 reg_tile_n=32 reg_tile_k=2}" @@ -12,6 +10,6 @@ Pipeline: - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.fill tile_sizes=[1,1,1]}" - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.generic tile_sizes=[1,8]}" - - include: vectorize.yaml + - include: ../vectorize.yaml - - include: lower.yaml + - include: ../lower.yaml diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/f32.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/f32.yaml index 742f91c..6043543 100644 --- a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/f32.yaml +++ b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/f32.yaml @@ -2,7 +2,7 @@ # Tested on x86_64 with AVX512 reaching good performance for simple KB kernels. # It may not apply to other workloads / extensions / architectures, so use with caution. Pipeline: - - include: pack_and_tile.yaml + - include: ../pack_and_tile.yaml ## CPU specific register tiling (depends on uArch & data type) - schedule: "x86/register_tiling.py[gen=matmul_register_tiling]{target=linalg.contract reg_tile_batch=1 reg_tile_m=8 reg_tile_n=32 reg_tile_k=2}" @@ -10,6 +10,6 @@ Pipeline: - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.fill tile_sizes=[1,1,1]}" - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.generic tile_sizes=[1,8]}" - - include: vectorize.yaml + - include: ../vectorize.yaml - - include: lower.yaml + - include: ../lower.yaml diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/vectorize.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/vectorize.yaml deleted file mode 100644 index 527920c..0000000 --- a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/vectorize.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# This is an optimizing pipeline for kernel_bench matmuls on bf16 types. -# This is basically a copy of the fp32 pipeline, with ONE CHANGE: -# - register_tiling.py -> reg_unroll_k=2 (instead of 1) -# Tested on x86_64 with AVX512 reaching good performance for simple KB kernels. -# It may not apply to other workloads / extensions / architectures, so use with caution. -Pipeline: - ## Tensor vectorization (for the left-over element wise) - - schedule: "vectorization.py[gen=vectorize_linalg]" - - schedule: "hoisting.py[gen=hoist_loops]" - - schedule: "vectorization.py[gen=simplify_vector_ops]" - - schedule: "vectorization.py[gen=x86_vectorization]" diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/pack_and_tile.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/pack_and_tile.yaml similarity index 100% rename from examples/end-to-end/KernelBench/schedules/x86_64/matmul/pack_and_tile.yaml rename to examples/end-to-end/KernelBench/schedules/x86_64/pack_and_tile.yaml diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/vectorize.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/vectorize.yaml new file mode 100644 index 0000000..57abb67 --- /dev/null +++ b/examples/end-to-end/KernelBench/schedules/x86_64/vectorize.yaml @@ -0,0 +1,7 @@ +# Tensor level vectorization for matmul like kernels on any type. 
+Pipeline: + ## Tensor vectorization (for the left-over element wise) + - schedule: "vectorization.py[gen=vectorize_linalg]" + - schedule: "hoisting.py[gen=hoist_loops]" + - schedule: "vectorization.py[gen=simplify_vector_ops]" + - schedule: "vectorization.py[gen=x86_vectorization]" diff --git a/examples/end-to-end/KernelBench/test_kernel_bench.py b/examples/end-to-end/KernelBench/test_kernel_bench.py index 202a468..69bfd8b 100755 --- a/examples/end-to-end/KernelBench/test_kernel_bench.py +++ b/examples/end-to-end/KernelBench/test_kernel_bench.py @@ -1,4 +1,4 @@ -# RUN: python %s | FileCheck %s +# RUN: python %s --ci | FileCheck %s # REQUIRES: torch # REQUIRES: kernel_bench @@ -16,17 +16,15 @@ kb_path = project_root / "third_party" / "KernelBench" / "KernelBench" -def get_pipeline_file(kernel_name: str, dtype: str) -> Path: +def get_pipeline_file(name: str, dtype: str) -> Path: """ Returns the appropriate pipeline file for a given kernel. """ arch = platform.machine() - if arch != "x86_64": - return kb_default_pipeline - # Level 1 matmuls should use the same pipelines - if kernel_name.startswith("level1") and "matrix_multiplication" in kernel_name: - pipeline = script_path / f"schedules/{arch}/matmul/{dtype}.yaml" + # If the pipeline file exists for the given name and dtype + if name: + pipeline = script_path / f"schedules/{arch}/{name}/{dtype}.yaml" if pipeline.exists(): return pipeline @@ -42,6 +40,7 @@ def get_pipeline_file(kernel_name: str, dtype: str) -> Path: "output_shape": "1024x1024", "dtypes": ["f32", "bf16"], "gflops": (1024 * 1024 * 1024 * 2) / 1e9, + "pipeline": "matmul", }, { "kernel": "level1/2_Standard_matrix_multiplication_.py", @@ -50,6 +49,304 @@ def get_pipeline_file(kernel_name: str, dtype: str) -> Path: "output_shape": "512x512", "dtypes": ["f32", "bf16"], "gflops": (512 * 1024 * 512 * 2) / 1e9, + "pipeline": "matmul", + }, + { + "kernel": "level1/3_Batched_matrix_multiplication.py", + "input_shapes": ["4x64x32", "4x32x64"], + "initializations": ["rnd", "rnd"], + "output_shape": "4x64x64", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/4_Matrix_vector_multiplication_.py", + "input_shapes": ["1024x1024", "1024x1"], + "initializations": ["rnd", "rnd"], + "output_shape": "1024x1", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/5_Matrix_scalar_multiplication.py", + "input_shapes": ["1024x1024", "1"], + "initializations": ["rnd", "rnd"], + "output_shape": "1024x1024", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/6_Matmul_with_large_K_dimension_.py", + "input_shapes": ["256x524288", "524288x256"], + "initializations": ["rnd", "rnd"], + "output_shape": "256x256", + "dtypes": ["f32", "bf16"], + "gflops": (256 * 524288 * 256 * 2) / 1e9, + "pipeline": "matmul", + }, + { + "kernel": "level1/7_Matmul_with_small_K_dimension_.py", + "input_shapes": ["32768x64", "64x32768"], + "initializations": ["rnd", "rnd"], + "output_shape": "32768x32768", + "dtypes": ["f32", "bf16"], + "gflops": (32768 * 64 * 32768 * 2) / 1e9, + "pipeline": "matmul", + }, + { + "kernel": "level1/8_Matmul_with_irregular_shapes_.py", + "input_shapes": ["8205x2949", "2949x5921"], + "initializations": ["rnd", "rnd"], + "output_shape": "8205x5921", + "dtypes": ["f32", "bf16"], + "gflops": (8205 * 2949 * 5921 * 2) / 1e9, + "pipeline": "matmul", + }, + # too many tiles provided, expected at most 3 found 4 + { + "kernel": "level1/9_Tall_skinny_matrix_multiplication_.py", + "input_shapes": ["1024x32", "32x1024"], + "initializations": ["rnd", "rnd"], + "output_shape": "1024x1024", + 
"dtypes": ["f32", "bf16"], + # "gflops": (1024 * 32 * 1024 * 2) / 1e9, + # "pipeline": "matmul", + }, + { + "kernel": "level1/10_3D_tensor_matrix_multiplication.py", + "input_shapes": ["16x1024x2048", "2048x768"], + "initializations": ["rnd", "rnd"], + "output_shape": "16x1024x768", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/11_4D_tensor_matrix_multiplication.py", + "input_shapes": ["8x256x512x256", "256x768"], + "initializations": ["rnd", "rnd"], + "output_shape": "8x256x512x768", + "dtypes": ["f32", "bf16"], + }, + # level1/12_Matmul_with_diagonal_matrices_.py + # torch_mlir.compiler_utils.TorchMlirCompilerError: + # Lowering TorchFX IR -> Torch Backend IR failed with the following diagnostics: + # python exception: Failure while executing pass pipeline + { + "kernel": "level1/12_Matmul_with_diagonal_matrices_.py", + "input_shapes": ["4096", "4096x4096"], + "initializations": ["rnd", "rnd"], + "output_shape": "4096x4096", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/13_Matmul_for_symmetric_matrices.py", + "input_shapes": ["4096x4096", "4096x4096"], + "initializations": ["rnd", "rnd"], + "output_shape": "4096x4096", + "dtypes": ["f32", "bf16"], + "gflops": (4096 * 4096 * 4096 * 2) / 1e9, + "pipeline": "matmul", + }, + # level1/14_Matmul_for_upper_triangular_matrices.py + # LLVM ERROR: operation destroyed but still has uses + { + "kernel": "level1/14_Matmul_for_upper_triangular_matrices.py", + "input_shapes": ["4096x4096", "4096x4096"], + "initializations": ["rnd", "rnd"], + "output_shape": "4096x4096", + "dtypes": ["f32", "bf16"], + "gflops": (4096 * 4096 * 4096 * 2) / 1e9, + "pipeline": "matmul", + }, + # level1/15_Matmul_for_lower_triangular_matrices.py + # LLVM ERROR: operation destroyed but still has uses + { + "kernel": "level1/15_Matmul_for_lower_triangular_matrices.py", + "input_shapes": ["4096x4096", "4096x4096"], + "initializations": ["rnd", "rnd"], + "output_shape": "4096x4096", + "dtypes": ["f32", "bf16"], + "gflops": (4096 * 4096 * 4096 * 2) / 1e9, + "pipeline": "matmul", + }, + { + "kernel": "level1/16_Matmul_with_transposed_A.py", + "input_shapes": ["8192x2048", "8192x4096"], + "initializations": ["rnd", "rnd"], + "output_shape": "2048x4096", + "dtypes": ["f32", "bf16"], + "gflops": (2048 * 8192 * 4096 * 2) / 1e9, + "pipeline": "matmul", + }, + { + "kernel": "level1/17_Matmul_with_transposed_B.py", + "input_shapes": ["2048x8192", "4096x8192"], + "initializations": ["rnd", "rnd"], + "output_shape": "2048x4096", + "dtypes": ["f32", "bf16"], + "gflops": (2048 * 8192 * 4096 * 2) / 1e9, + "pipeline": "matmul", + }, + { + "kernel": "level1/18_Matmul_with_transposed_both.py", + "input_shapes": ["8192x2048", "4096x8192"], + "initializations": ["rnd", "rnd"], + "output_shape": "2048x4096", + "dtypes": ["f32", "bf16"], + "gflops": (2048 * 8192 * 4096 * 2) / 1e9, + "pipeline": "matmul", + }, + # All Element-wise kernels below fail with the same error: + # LLVM ERROR: operation destroyed but still has uses + { + "kernel": "level1/19_ReLU.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/20_LeakyReLU.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/21_Sigmoid.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/22_Tanh.py", + 
"input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/23_Softmax.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/24_LogSoftmax.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/25_Swish.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/26_GELU_.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/27_SELU_.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/28_HardSigmoid.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/29_Softplus.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/30_Softsign.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/31_ELU.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/32_HardTanh.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/33_BatchNorm.py", + "input_shapes": ["64x64x512x512"], + "initializations": ["rnd"], + "output_shape": "64x64x512x512", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/34_InstanceNorm.py", + "input_shapes": ["112x64x512x512"], + "initializations": ["rnd"], + "output_shape": "112x64x512x512", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/35_GroupNorm_.py", + "input_shapes": ["112x64x512x512"], + "initializations": ["rnd"], + "output_shape": "112x64x512x512", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/36_RMSNorm_.py", + "input_shapes": ["112x64x512x512"], + "initializations": ["rnd"], + "output_shape": "112x64x512x512", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/37_FrobeniusNorm_.py", + "input_shapes": ["112x64x512x512"], + "initializations": ["rnd"], + "output_shape": "112x64x512x512", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/38_L1Norm_.py", + "input_shapes": ["32768x65535"], + "initializations": ["rnd"], + "output_shape": "32768x65535", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/39_L2Norm_.py", + "input_shapes": ["32768x65535"], + "initializations": ["rnd"], + "output_shape": "32768x65535", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/40_LayerNorm.py", + "input_shapes": ["16x64x256x256"], + "initializations": ["rnd"], + "output_shape": "16x64x256x256", + "dtypes": ["f32", "bf16"], }, ] @@ -58,11 +355,21 @@ def get_tests(args: argparse.Namespace) -> list[dict]: """ Returns the list of tests to be executed. 
""" + if args.ci: + print( + "Running in CI mode: fewer tests, no bf16, no benchmarking for faster feedback" + ) + args.bf16 = False # Disable bf16 tests in CI for faster feedback + args.benchmark = False # Disable benchmarking in CI for faster feedback + test_list = [] for test in tests: for dtype in test["dtypes"]: if not args.bf16 and dtype == "bf16": continue + # If a specific test is specified, only include that test + if args.test and test["kernel"] != args.test: + continue test_list.append( { "kernel": test["kernel"], @@ -76,9 +383,12 @@ def get_tests(args: argparse.Namespace) -> list[dict]: "gflops": test["gflops"] if "gflops" in test and args.benchmark else None, - "pipeline": str(get_pipeline_file(test["kernel"], dtype)), + "pipeline": str(get_pipeline_file(test.get("pipeline", ""), dtype)), } ) + # CI mode runs fewer tests for faster feedback + if args.ci and len(test_list) >= 5: + return test_list return test_list @@ -105,9 +415,28 @@ def get_flops_per_second(stdout: str, gflops: float) -> float: action=argparse.BooleanOptionalAction, help="Enable bf16 precision kernels.", ) + Parser.add_argument( + "--ci", + action=argparse.BooleanOptionalAction, + help="Enable CI mode (faster run, fewer kernels).", + ) + Parser.add_argument( + "--test", + type=str, + help="Specify a particular test to run.", + ) args = Parser.parse_args() + tests = get_tests(args) + if len(tests) == 0: + if args.test: + print( + f"No tests found matching '{args.test}'. Please check your arguments." + ) + else: + print("No tests to run. Please check your arguments.") + exit(0) - for test in get_tests(args): + for test in tests: kb_kernel = kb_path / test["kernel"] command_line = [ str(kb_program), @@ -147,10 +476,18 @@ def get_flops_per_second(stdout: str, gflops: float) -> float: # CHECK: 0.3745{{.*}} 0.9507{{.*}} 0.7319{{.*}} ... 0.2973{{.*}} 0.9243{{.*}} 0.9710{{.*}} # CHECK: 0.7201{{.*}} 0.9926{{.*}} 0.1208{{.*}} ... 0.1742{{.*}} 0.3485{{.*}} 0.6436{{.*}} -# CHECK-NOT: Execution failed - # CHECK: 2_Standard_matrix_multiplication_.mlir # CHECK: 249.78{{.*}} 260.13{{.*}} 249.36{{.*}} ... 261.10{{.*}} 260.49{{.*}} 257.09{{.*}} # CHECK: 243.56{{.*}} 250.91{{.*}} 252.38{{.*}} ... 260.40{{.*}} 261.56{{.*}} 256.24{{.*}} -# CHECK-NOT: Execution failed +# CHECK: 3_Batched_matrix_multiplication.mlir +# CHECK: 5.2403{{.*}} 7.7905{{.*}} 6.0769{{.*}} ... 7.8579{{.*}} 6.8890{{.*}} 6.6193{{.*}} +# CHECK: 9.0407{{.*}} 6.3299{{.*}} 5.2003{{.*}} ... 6.2594{{.*}} 6.2980{{.*}} 5.9807{{.*}} + +# CHECK: 4_Matrix_vector_multiplication_.mlir +# CHECK: 264.86{{.*}} +# CHECK: 265.12{{.*}} + +# CHECK: 5_Matrix_scalar_multiplication.mlir +# CHECK: 0.1750{{.*}} 0.4442{{.*}} 0.3420{{.*}} ... 0.1389{{.*}} 0.4319{{.*}} 0.4538{{.*}} +# CHECK: 0.3365{{.*}} 0.4638{{.*}} 0.0564{{.*}} ... 
0.0814{{.*}} 0.1628{{.*}} 0.3007{{.*}}

From 98172129c1a185e7b09d13b5f9e144bcb926632a Mon Sep 17 00:00:00 2001
From: Renato Golin
Date: Thu, 14 May 2026 17:10:42 +0100
Subject: [PATCH 2/4] Improve the KB test script's debugging workflow

Changes:
* Allow partial kernel name matches in --test
* Allow passing --print-mlir-after-all through to kernel_bench
* Allow bypassing output capture for debugging purposes
---
 .../KernelBench/test_kernel_bench.py          | 39 +++++++++++++------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/examples/end-to-end/KernelBench/test_kernel_bench.py b/examples/end-to-end/KernelBench/test_kernel_bench.py
index 69bfd8b..910c611 100755
--- a/examples/end-to-end/KernelBench/test_kernel_bench.py
+++ b/examples/end-to-end/KernelBench/test_kernel_bench.py
@@ -368,7 +368,7 @@ def get_tests(args: argparse.Namespace) -> list[dict]:
             if not args.bf16 and dtype == "bf16":
                 continue
             # If a specific test is specified, only include that test
-            if args.test and test["kernel"] != args.test:
+            if args.test and not test["kernel"].startswith(args.test):
                 continue
             test_list.append(
                 {
@@ -425,6 +425,11 @@ def get_flops_per_second(stdout: str, gflops: float) -> float:
         type=str,
         help="Specify a particular test to run.",
     )
+    Parser.add_argument(
+        "--print-mlir-after-all",
+        action=argparse.BooleanOptionalAction,
+        help="Whether to print the MLIR module after all stages. Default is False.",
+    )
     args = Parser.parse_args()
     tests = get_tests(args)
     if len(tests) == 0:
@@ -450,25 +455,37 @@ def get_flops_per_second(stdout: str, gflops: float) -> float:
             "--print-tensor=1",
             "--seed=42",
         ]
-        benchmark = test.get("gflops") is not None
+        benchmark = args.benchmark and test.get("gflops") is not None
         if benchmark:
             command_line += ["--benchmark"]
+        if args.print_mlir_after_all:
+            command_line += ["--print-mlir-after-all"]
         print(f"Running command: {' '.join(command_line)}")
+
+        # While debugging kernels, it's useful to see the output as it comes.
+        # Note: GFLOPS can't be shown if the output is not captured.
+        capture_output = True
+        if args.print_mlir_after_all and not args.ci:
+            capture_output = False
+
         result = subprocess.run(
             command_line,
-            capture_output=True,
+            capture_output=capture_output,
             text=True,
         )
-        print("STDOUT:")
-        print(result.stdout)
-        if benchmark:
-            flops_per_second = get_flops_per_second(result.stdout, test["gflops"])
-            if flops_per_second > 0:
-                print(f"Performance: {flops_per_second:.2f} GFLOPS")
+        # If output is captured, print it out, including benchmark results if applicable.
+        if capture_output:
+            print("STDOUT:")
+            print(result.stdout)
+            if benchmark:
+                flops_per_second = get_flops_per_second(result.stdout, test["gflops"])
+                if flops_per_second > 0:
+                    print(f"Performance: {flops_per_second:.2f} GFLOPS")
+
+            print("STDERR:")
+            print(result.stderr)
-        print("STDERR:")
-        print(result.stderr)
         print(f"Return code: {result.returncode}")
         assert result.returncode == 0, "Execution failed"

From 3f90c3df1dbe3c276081b8a664150d7672df13d8 Mon Sep 17 00:00:00 2001
From: Renato Golin
Date: Fri, 15 May 2026 11:19:37 +0100
Subject: [PATCH 3/4] Move the test list to a separate file

The list will grow a lot, and we may want to split it by level. Also
add a warning field instead of a comment on the tests that don't work.
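For example, the level1/9 entry now carries its failure note in a
warning field (entry copied verbatim from the new tests.yaml):

```
- kernel: level1/9_Tall_skinny_matrix_multiplication_.py
  input_shapes: [1024x32, 32x1024]
  initializations: [rnd, rnd]
  output_shape: 1024x1024
  dtypes: [f32, bf16]
  warning: "Optimized pipeline error: too many tiles provided, expected at most 3 found 4"
```

The script prints the warning before running the kernel, so known
failures stay visible in the logs instead of being buried in comments.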
--- .../KernelBench/test_kernel_bench.py | 329 +----------------- examples/end-to-end/KernelBench/tests.yaml | 289 +++++++++++++++ 2 files changed, 299 insertions(+), 319 deletions(-) create mode 100644 examples/end-to-end/KernelBench/tests.yaml diff --git a/examples/end-to-end/KernelBench/test_kernel_bench.py b/examples/end-to-end/KernelBench/test_kernel_bench.py index 910c611..954047b 100755 --- a/examples/end-to-end/KernelBench/test_kernel_bench.py +++ b/examples/end-to-end/KernelBench/test_kernel_bench.py @@ -9,11 +9,14 @@ import platform from pathlib import Path +import yaml + script_path = Path(__file__).parent project_root = script_path.parent.parent.parent kb_program = project_root / "tools" / "kernel_bench" kb_default_pipeline = kb_program.parent / "kernel_bench.yaml" kb_path = project_root / "third_party" / "KernelBench" / "KernelBench" +yaml_path = script_path / "tests.yaml" def get_pipeline_file(name: str, dtype: str) -> Path: @@ -32,325 +35,6 @@ def get_pipeline_file(name: str, dtype: str) -> Path: return kb_default_pipeline -tests = [ - { - "kernel": "level1/1_Square_matrix_multiplication_.py", - "input_shapes": ["1024x1024", "1024x1024"], - "initializations": ["rnd", "id"], - "output_shape": "1024x1024", - "dtypes": ["f32", "bf16"], - "gflops": (1024 * 1024 * 1024 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/2_Standard_matrix_multiplication_.py", - "input_shapes": ["512x1024", "1024x512"], - "initializations": ["rnd", "rnd"], - "output_shape": "512x512", - "dtypes": ["f32", "bf16"], - "gflops": (512 * 1024 * 512 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/3_Batched_matrix_multiplication.py", - "input_shapes": ["4x64x32", "4x32x64"], - "initializations": ["rnd", "rnd"], - "output_shape": "4x64x64", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/4_Matrix_vector_multiplication_.py", - "input_shapes": ["1024x1024", "1024x1"], - "initializations": ["rnd", "rnd"], - "output_shape": "1024x1", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/5_Matrix_scalar_multiplication.py", - "input_shapes": ["1024x1024", "1"], - "initializations": ["rnd", "rnd"], - "output_shape": "1024x1024", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/6_Matmul_with_large_K_dimension_.py", - "input_shapes": ["256x524288", "524288x256"], - "initializations": ["rnd", "rnd"], - "output_shape": "256x256", - "dtypes": ["f32", "bf16"], - "gflops": (256 * 524288 * 256 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/7_Matmul_with_small_K_dimension_.py", - "input_shapes": ["32768x64", "64x32768"], - "initializations": ["rnd", "rnd"], - "output_shape": "32768x32768", - "dtypes": ["f32", "bf16"], - "gflops": (32768 * 64 * 32768 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/8_Matmul_with_irregular_shapes_.py", - "input_shapes": ["8205x2949", "2949x5921"], - "initializations": ["rnd", "rnd"], - "output_shape": "8205x5921", - "dtypes": ["f32", "bf16"], - "gflops": (8205 * 2949 * 5921 * 2) / 1e9, - "pipeline": "matmul", - }, - # too many tiles provided, expected at most 3 found 4 - { - "kernel": "level1/9_Tall_skinny_matrix_multiplication_.py", - "input_shapes": ["1024x32", "32x1024"], - "initializations": ["rnd", "rnd"], - "output_shape": "1024x1024", - "dtypes": ["f32", "bf16"], - # "gflops": (1024 * 32 * 1024 * 2) / 1e9, - # "pipeline": "matmul", - }, - { - "kernel": "level1/10_3D_tensor_matrix_multiplication.py", - "input_shapes": ["16x1024x2048", "2048x768"], - "initializations": ["rnd", "rnd"], - 
"output_shape": "16x1024x768", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/11_4D_tensor_matrix_multiplication.py", - "input_shapes": ["8x256x512x256", "256x768"], - "initializations": ["rnd", "rnd"], - "output_shape": "8x256x512x768", - "dtypes": ["f32", "bf16"], - }, - # level1/12_Matmul_with_diagonal_matrices_.py - # torch_mlir.compiler_utils.TorchMlirCompilerError: - # Lowering TorchFX IR -> Torch Backend IR failed with the following diagnostics: - # python exception: Failure while executing pass pipeline - { - "kernel": "level1/12_Matmul_with_diagonal_matrices_.py", - "input_shapes": ["4096", "4096x4096"], - "initializations": ["rnd", "rnd"], - "output_shape": "4096x4096", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/13_Matmul_for_symmetric_matrices.py", - "input_shapes": ["4096x4096", "4096x4096"], - "initializations": ["rnd", "rnd"], - "output_shape": "4096x4096", - "dtypes": ["f32", "bf16"], - "gflops": (4096 * 4096 * 4096 * 2) / 1e9, - "pipeline": "matmul", - }, - # level1/14_Matmul_for_upper_triangular_matrices.py - # LLVM ERROR: operation destroyed but still has uses - { - "kernel": "level1/14_Matmul_for_upper_triangular_matrices.py", - "input_shapes": ["4096x4096", "4096x4096"], - "initializations": ["rnd", "rnd"], - "output_shape": "4096x4096", - "dtypes": ["f32", "bf16"], - "gflops": (4096 * 4096 * 4096 * 2) / 1e9, - "pipeline": "matmul", - }, - # level1/15_Matmul_for_lower_triangular_matrices.py - # LLVM ERROR: operation destroyed but still has uses - { - "kernel": "level1/15_Matmul_for_lower_triangular_matrices.py", - "input_shapes": ["4096x4096", "4096x4096"], - "initializations": ["rnd", "rnd"], - "output_shape": "4096x4096", - "dtypes": ["f32", "bf16"], - "gflops": (4096 * 4096 * 4096 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/16_Matmul_with_transposed_A.py", - "input_shapes": ["8192x2048", "8192x4096"], - "initializations": ["rnd", "rnd"], - "output_shape": "2048x4096", - "dtypes": ["f32", "bf16"], - "gflops": (2048 * 8192 * 4096 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/17_Matmul_with_transposed_B.py", - "input_shapes": ["2048x8192", "4096x8192"], - "initializations": ["rnd", "rnd"], - "output_shape": "2048x4096", - "dtypes": ["f32", "bf16"], - "gflops": (2048 * 8192 * 4096 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/18_Matmul_with_transposed_both.py", - "input_shapes": ["8192x2048", "4096x8192"], - "initializations": ["rnd", "rnd"], - "output_shape": "2048x4096", - "dtypes": ["f32", "bf16"], - "gflops": (2048 * 8192 * 4096 * 2) / 1e9, - "pipeline": "matmul", - }, - # All Element-wise kernels below fail with the same error: - # LLVM ERROR: operation destroyed but still has uses - { - "kernel": "level1/19_ReLU.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/20_LeakyReLU.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/21_Sigmoid.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/22_Tanh.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/23_Softmax.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": 
"4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/24_LogSoftmax.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/25_Swish.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/26_GELU_.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/27_SELU_.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/28_HardSigmoid.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/29_Softplus.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/30_Softsign.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/31_ELU.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/32_HardTanh.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/33_BatchNorm.py", - "input_shapes": ["64x64x512x512"], - "initializations": ["rnd"], - "output_shape": "64x64x512x512", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/34_InstanceNorm.py", - "input_shapes": ["112x64x512x512"], - "initializations": ["rnd"], - "output_shape": "112x64x512x512", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/35_GroupNorm_.py", - "input_shapes": ["112x64x512x512"], - "initializations": ["rnd"], - "output_shape": "112x64x512x512", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/36_RMSNorm_.py", - "input_shapes": ["112x64x512x512"], - "initializations": ["rnd"], - "output_shape": "112x64x512x512", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/37_FrobeniusNorm_.py", - "input_shapes": ["112x64x512x512"], - "initializations": ["rnd"], - "output_shape": "112x64x512x512", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/38_L1Norm_.py", - "input_shapes": ["32768x65535"], - "initializations": ["rnd"], - "output_shape": "32768x65535", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/39_L2Norm_.py", - "input_shapes": ["32768x65535"], - "initializations": ["rnd"], - "output_shape": "32768x65535", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/40_LayerNorm.py", - "input_shapes": ["16x64x256x256"], - "initializations": ["rnd"], - "output_shape": "16x64x256x256", - "dtypes": ["f32", "bf16"], - }, -] - - def get_tests(args: argparse.Namespace) -> list[dict]: """ Returns the list of tests to be executed. 
@@ -362,6 +46,10 @@ def get_tests(args: argparse.Namespace) -> list[dict]: args.bf16 = False # Disable bf16 tests in CI for faster feedback args.benchmark = False # Disable benchmarking in CI for faster feedback + tests = [] + with open(yaml_path) as f: + tests = yaml.safe_load(f) + test_list = [] for test in tests: for dtype in test["dtypes"]: @@ -384,6 +72,7 @@ def get_tests(args: argparse.Namespace) -> list[dict]: if "gflops" in test and args.benchmark else None, "pipeline": str(get_pipeline_file(test.get("pipeline", ""), dtype)), + "warning": test.get("warning", None), } ) # CI mode runs fewer tests for faster feedback @@ -460,6 +149,8 @@ def get_flops_per_second(stdout: str, gflops: float) -> float: command_line += ["--benchmark"] if args.print_mlir_after_all: command_line += ["--print-mlir-after-all"] + if test.get("warning"): + print(f"WARNING: {test['warning']}") print(f"Running command: {' '.join(command_line)}") # While debugging kernels, it's useful to see the output as it comes. diff --git a/examples/end-to-end/KernelBench/tests.yaml b/examples/end-to-end/KernelBench/tests.yaml new file mode 100644 index 0000000..c178f4c --- /dev/null +++ b/examples/end-to-end/KernelBench/tests.yaml @@ -0,0 +1,289 @@ +- kernel: level1/1_Square_matrix_multiplication_.py + input_shapes: [1024x1024, 1024x1024] + initializations: [rnd, id] + output_shape: 1024x1024 + dtypes: [f32, bf16] + gflops: 2.147483648 # (1024 * 1024 * 1024 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/2_Standard_matrix_multiplication_.py + input_shapes: [512x1024, 1024x512] + initializations: [rnd, rnd] + output_shape: 512x512 + dtypes: [f32, bf16] + gflops: 0.536870912 # (512 * 1024 * 512 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/3_Batched_matrix_multiplication.py + input_shapes: [4x64x32, 4x32x64] + initializations: [rnd, rnd] + output_shape: 4x64x64 + dtypes: [f32, bf16] + +- kernel: level1/4_Matrix_vector_multiplication_.py + input_shapes: [1024x1024, 1024x1] + initializations: [rnd, rnd] + output_shape: 1024x1 + dtypes: [f32, bf16] + +- kernel: level1/5_Matrix_scalar_multiplication.py + input_shapes: [1024x1024, "1"] + initializations: [rnd, rnd] + output_shape: 1024x1024 + dtypes: [f32, bf16] + +- kernel: level1/6_Matmul_with_large_K_dimension_.py + input_shapes: [256x524288, 524288x256] + initializations: [rnd, rnd] + output_shape: 256x256 + dtypes: [f32, bf16] + gflops: 68.719476736 # (256 * 524288 * 256 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/7_Matmul_with_small_K_dimension_.py + input_shapes: [32768x64, 64x32768] + initializations: [rnd, rnd] + output_shape: 32768x32768 + dtypes: [f32, bf16] + gflops: 137.438953472 # (32768 * 64 * 32768 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/8_Matmul_with_irregular_shapes_.py + input_shapes: [8205x2949, 2949x5921] + initializations: [rnd, rnd] + output_shape: 8205x5921 + dtypes: [f32, bf16] + gflops: 286.535485890 # (8205 * 2949 * 5921 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/9_Tall_skinny_matrix_multiplication_.py + input_shapes: [1024x32, 32x1024] + initializations: [rnd, rnd] + output_shape: 1024x1024 + dtypes: [f32, bf16] + # gflops: 0.067108864 # (1024 * 32 * 1024 * 2) / 1e9 + # pipeline: matmul + warning: "Optimized pipeline error: too many tiles provided, expected at most 3 found 4" + +- kernel: level1/10_3D_tensor_matrix_multiplication.py + input_shapes: [16x1024x2048, 2048x768] + initializations: [rnd, rnd] + output_shape: 16x1024x768 + dtypes: [f32, bf16] + +- kernel: level1/11_4D_tensor_matrix_multiplication.py + input_shapes: 
[8x256x512x256, 256x768] + initializations: [rnd, rnd] + output_shape: 8x256x512x768 + dtypes: [f32, bf16] + +- kernel: level1/12_Matmul_with_diagonal_matrices_.py + input_shapes: ["4096", 4096x4096] + initializations: [rnd, rnd] + output_shape: 4096x4096 + dtypes: [f32, bf16] + warning: "torch_mlir.compiler_utils.TorchMlirCompilerError: Lowering TorchFX IR -> Torch Backend IR failed with the following diagnostics: python exception: Failure while executing pass pipeline" + +- kernel: level1/13_Matmul_for_symmetric_matrices.py + input_shapes: [4096x4096, 4096x4096] + initializations: [rnd, rnd] + output_shape: 4096x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (4096 * 4096 * 4096 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/14_Matmul_for_upper_triangular_matrices.py + input_shapes: [4096x4096, 4096x4096] + initializations: [rnd, rnd] + output_shape: 4096x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (4096 * 4096 * 4096 * 2) / 1e9 + pipeline: matmul + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/15_Matmul_for_lower_triangular_matrices.py + input_shapes: [4096x4096, 4096x4096] + initializations: [rnd, rnd] + output_shape: 4096x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (4096 * 4096 * 4096 * 2) / 1e9 + pipeline: matmul + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/16_Matmul_with_transposed_A.py + input_shapes: [8192x2048, 8192x4096] + initializations: [rnd, rnd] + output_shape: 2048x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (2048 * 8192 * 4096 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/17_Matmul_with_transposed_B.py + input_shapes: [2048x8192, 4096x8192] + initializations: [rnd, rnd] + output_shape: 2048x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (2048 * 8192 * 4096 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/18_Matmul_with_transposed_both.py + input_shapes: [8192x2048, 4096x8192] + initializations: [rnd, rnd] + output_shape: 2048x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (2048 * 8192 * 4096 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/19_ReLU.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/20_LeakyReLU.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/21_Sigmoid.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/22_Tanh.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/23_Softmax.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/24_LogSoftmax.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/25_Swish.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/26_GELU_.py + 
input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/27_SELU_.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/28_HardSigmoid.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/29_Softplus.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/30_Softsign.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/31_ELU.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/32_HardTanh.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/33_BatchNorm.py + input_shapes: [64x64x512x512] + initializations: [rnd] + output_shape: 64x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/34_InstanceNorm.py + input_shapes: [112x64x512x512] + initializations: [rnd] + output_shape: 112x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/35_GroupNorm_.py + input_shapes: [112x64x512x512] + initializations: [rnd] + output_shape: 112x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/36_RMSNorm_.py + input_shapes: [112x64x512x512] + initializations: [rnd] + output_shape: 112x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/37_FrobeniusNorm_.py + input_shapes: [112x64x512x512] + initializations: [rnd] + output_shape: 112x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/38_L1Norm_.py + input_shapes: [32768x65535] + initializations: [rnd] + output_shape: 32768x65535 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/39_L2Norm_.py + input_shapes: [32768x65535] + initializations: [rnd] + output_shape: 32768x65535 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/40_LayerNorm.py + input_shapes: [16x64x256x256] + initializations: [rnd] + output_shape: 16x64x256x256 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" From 4e664b4eeb3321748cd72ad300f04e469fc12636 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Fri, 15 May 2026 11:22:25 +0100 Subject: [PATCH 4/4] Break long line --- examples/end-to-end/KernelBench/tests.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/end-to-end/KernelBench/tests.yaml b/examples/end-to-end/KernelBench/tests.yaml index c178f4c..8f4f47d 100644 --- a/examples/end-to-end/KernelBench/tests.yaml +++ 
b/examples/end-to-end/KernelBench/tests.yaml
@@ -82,7 +82,9 @@
   initializations: [rnd, rnd]
   output_shape: 4096x4096
   dtypes: [f32, bf16]
-  warning: "torch_mlir.compiler_utils.TorchMlirCompilerError: Lowering TorchFX IR -> Torch Backend IR failed with the following diagnostics: python exception: Failure while executing pass pipeline"
+  warning: 'ERROR: torch_mlir.compiler_utils.TorchMlirCompilerError:
+    Lowering TorchFX IR -> Torch Backend IR failed with the following diagnostics:
+    python exception: Failure while executing pass pipeline'
 
 - kernel: level1/13_Matmul_for_symmetric_matrices.py
   input_shapes: [4096x4096, 4096x4096]