From da379e84df067b0be591247202861d88180c55d5 Mon Sep 17 00:00:00 2001
From: Renato Golin
Date: Thu, 14 May 2026 12:55:45 +0100
Subject: [PATCH 1/4] Add first 40 kernels to KB test

Only the first two are actually benchmarked, as they're the only ones
on which we achieve any reasonable performance. The others lower to
loops and use smaller sizes to avoid bloating test times.

The schedule selection also changed to allow for smaller deltas when
selecting the whole pipeline for multiple kernels. Generic
sub-schedules have been moved one directory up; sub-directories with
special differences can be created as needed, reusing the generic
ones.

To minimize CI impact, there's a new "CI mode" (test_kernel_bench
--ci), where only the first 5 tests are run, without benchmarking or
bf16 support. This is just a smoke test. Further testing /
benchmarking should call the script directly with the appropriate
flags (per architecture).

Notes:
* The element-wise kernels fail to lower, and some matmul ones fail in
  the same way; see the comments on their entries in the tests table.
* Higher-dimensional matmuls don't tile the same way, so they use the
  loops lowering for now.
* The skinny matmul fails to use the optimized pipeline because the
  skinny dimension (1) does not tile, so it also uses the loops
  lowering.

There's a new --test option to pick a particular test. For example, to
benchmark the BF16 version of level1/40_LayerNorm.py, call:
```
$ test_kernel_bench --test=level1/40_LayerNorm.py --benchmark --bf16
```

assisted-by: GitHub Copilot
---
 .../schedules/x86_64/{matmul => }/lower.yaml  |   0
 .../schedules/x86_64/matmul/bf16.yaml         |   8 +-
 .../schedules/x86_64/matmul/f32.yaml          |   6 +-
 .../schedules/x86_64/matmul/vectorize.yaml    |  11 -
 .../x86_64/{matmul => }/pack_and_tile.yaml    |   0
 .../schedules/x86_64/vectorize.yaml           |   7 +
 .../KernelBench/test_kernel_bench.py          | 361 +++++++++++++++++-
 7 files changed, 362 insertions(+), 31 deletions(-)
 rename examples/end-to-end/KernelBench/schedules/x86_64/{matmul => }/lower.yaml (100%)
 delete mode 100644 examples/end-to-end/KernelBench/schedules/x86_64/matmul/vectorize.yaml
 rename examples/end-to-end/KernelBench/schedules/x86_64/{matmul => }/pack_and_tile.yaml (100%)
 create mode 100644 examples/end-to-end/KernelBench/schedules/x86_64/vectorize.yaml

diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/lower.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/lower.yaml
similarity index 100%
rename from examples/end-to-end/KernelBench/schedules/x86_64/matmul/lower.yaml
rename to examples/end-to-end/KernelBench/schedules/x86_64/lower.yaml
diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/bf16.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/bf16.yaml
index c81d950..19bcf76 100644
--- a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/bf16.yaml
+++ b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/bf16.yaml
@@ -1,10 +1,8 @@
 # This is an optimizing pipeline for kernel_bench matmuls on bf16 types.
 # This is basically a copy of the fp32 pipeline, with ONE CHANGE:
 # - register_tiling.py -> reg_unroll_k=2 (instead of 1)
-# Tested on x86_64 with AVX512 reaching good performance for simple KB kernels.
-# It may not apply to other workloads / extensions / architectures, so use with caution.
Pipeline: - - include: pack_and_tile.yaml + - include: ../pack_and_tile.yaml ## CPU specific register tiling (depends on uArch & data type) - schedule: "x86/register_tiling.py[gen=matmul_register_tiling]{target=linalg.contract reg_tile_batch=1 reg_tile_m=8 reg_tile_n=32 reg_tile_k=2}" @@ -12,6 +10,6 @@ Pipeline: - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.fill tile_sizes=[1,1,1]}" - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.generic tile_sizes=[1,8]}" - - include: vectorize.yaml + - include: ../vectorize.yaml - - include: lower.yaml + - include: ../lower.yaml diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/f32.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/f32.yaml index 742f91c..6043543 100644 --- a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/f32.yaml +++ b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/f32.yaml @@ -2,7 +2,7 @@ # Tested on x86_64 with AVX512 reaching good performance for simple KB kernels. # It may not apply to other workloads / extensions / architectures, so use with caution. Pipeline: - - include: pack_and_tile.yaml + - include: ../pack_and_tile.yaml ## CPU specific register tiling (depends on uArch & data type) - schedule: "x86/register_tiling.py[gen=matmul_register_tiling]{target=linalg.contract reg_tile_batch=1 reg_tile_m=8 reg_tile_n=32 reg_tile_k=2}" @@ -10,6 +10,6 @@ Pipeline: - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.fill tile_sizes=[1,1,1]}" - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.generic tile_sizes=[1,8]}" - - include: vectorize.yaml + - include: ../vectorize.yaml - - include: lower.yaml + - include: ../lower.yaml diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/vectorize.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/vectorize.yaml deleted file mode 100644 index 527920c..0000000 --- a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/vectorize.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# This is an optimizing pipeline for kernel_bench matmuls on bf16 types. -# This is basically a copy of the fp32 pipeline, with ONE CHANGE: -# - register_tiling.py -> reg_unroll_k=2 (instead of 1) -# Tested on x86_64 with AVX512 reaching good performance for simple KB kernels. -# It may not apply to other workloads / extensions / architectures, so use with caution. -Pipeline: - ## Tensor vectorization (for the left-over element wise) - - schedule: "vectorization.py[gen=vectorize_linalg]" - - schedule: "hoisting.py[gen=hoist_loops]" - - schedule: "vectorization.py[gen=simplify_vector_ops]" - - schedule: "vectorization.py[gen=x86_vectorization]" diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/pack_and_tile.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/pack_and_tile.yaml similarity index 100% rename from examples/end-to-end/KernelBench/schedules/x86_64/matmul/pack_and_tile.yaml rename to examples/end-to-end/KernelBench/schedules/x86_64/pack_and_tile.yaml diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/vectorize.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/vectorize.yaml new file mode 100644 index 0000000..57abb67 --- /dev/null +++ b/examples/end-to-end/KernelBench/schedules/x86_64/vectorize.yaml @@ -0,0 +1,7 @@ +# Tensor level vectorization for matmul like kernels on any type. 
+Pipeline: + ## Tensor vectorization (for the left-over element wise) + - schedule: "vectorization.py[gen=vectorize_linalg]" + - schedule: "hoisting.py[gen=hoist_loops]" + - schedule: "vectorization.py[gen=simplify_vector_ops]" + - schedule: "vectorization.py[gen=x86_vectorization]" diff --git a/examples/end-to-end/KernelBench/test_kernel_bench.py b/examples/end-to-end/KernelBench/test_kernel_bench.py index 202a468..69bfd8b 100755 --- a/examples/end-to-end/KernelBench/test_kernel_bench.py +++ b/examples/end-to-end/KernelBench/test_kernel_bench.py @@ -1,4 +1,4 @@ -# RUN: python %s | FileCheck %s +# RUN: python %s --ci | FileCheck %s # REQUIRES: torch # REQUIRES: kernel_bench @@ -16,17 +16,15 @@ kb_path = project_root / "third_party" / "KernelBench" / "KernelBench" -def get_pipeline_file(kernel_name: str, dtype: str) -> Path: +def get_pipeline_file(name: str, dtype: str) -> Path: """ Returns the appropriate pipeline file for a given kernel. """ arch = platform.machine() - if arch != "x86_64": - return kb_default_pipeline - # Level 1 matmuls should use the same pipelines - if kernel_name.startswith("level1") and "matrix_multiplication" in kernel_name: - pipeline = script_path / f"schedules/{arch}/matmul/{dtype}.yaml" + # If the pipeline file exists for the given name and dtype + if name: + pipeline = script_path / f"schedules/{arch}/{name}/{dtype}.yaml" if pipeline.exists(): return pipeline @@ -42,6 +40,7 @@ def get_pipeline_file(kernel_name: str, dtype: str) -> Path: "output_shape": "1024x1024", "dtypes": ["f32", "bf16"], "gflops": (1024 * 1024 * 1024 * 2) / 1e9, + "pipeline": "matmul", }, { "kernel": "level1/2_Standard_matrix_multiplication_.py", @@ -50,6 +49,304 @@ def get_pipeline_file(kernel_name: str, dtype: str) -> Path: "output_shape": "512x512", "dtypes": ["f32", "bf16"], "gflops": (512 * 1024 * 512 * 2) / 1e9, + "pipeline": "matmul", + }, + { + "kernel": "level1/3_Batched_matrix_multiplication.py", + "input_shapes": ["4x64x32", "4x32x64"], + "initializations": ["rnd", "rnd"], + "output_shape": "4x64x64", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/4_Matrix_vector_multiplication_.py", + "input_shapes": ["1024x1024", "1024x1"], + "initializations": ["rnd", "rnd"], + "output_shape": "1024x1", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/5_Matrix_scalar_multiplication.py", + "input_shapes": ["1024x1024", "1"], + "initializations": ["rnd", "rnd"], + "output_shape": "1024x1024", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/6_Matmul_with_large_K_dimension_.py", + "input_shapes": ["256x524288", "524288x256"], + "initializations": ["rnd", "rnd"], + "output_shape": "256x256", + "dtypes": ["f32", "bf16"], + "gflops": (256 * 524288 * 256 * 2) / 1e9, + "pipeline": "matmul", + }, + { + "kernel": "level1/7_Matmul_with_small_K_dimension_.py", + "input_shapes": ["32768x64", "64x32768"], + "initializations": ["rnd", "rnd"], + "output_shape": "32768x32768", + "dtypes": ["f32", "bf16"], + "gflops": (32768 * 64 * 32768 * 2) / 1e9, + "pipeline": "matmul", + }, + { + "kernel": "level1/8_Matmul_with_irregular_shapes_.py", + "input_shapes": ["8205x2949", "2949x5921"], + "initializations": ["rnd", "rnd"], + "output_shape": "8205x5921", + "dtypes": ["f32", "bf16"], + "gflops": (8205 * 2949 * 5921 * 2) / 1e9, + "pipeline": "matmul", + }, + # too many tiles provided, expected at most 3 found 4 + { + "kernel": "level1/9_Tall_skinny_matrix_multiplication_.py", + "input_shapes": ["1024x32", "32x1024"], + "initializations": ["rnd", "rnd"], + "output_shape": "1024x1024", + 
"dtypes": ["f32", "bf16"], + # "gflops": (1024 * 32 * 1024 * 2) / 1e9, + # "pipeline": "matmul", + }, + { + "kernel": "level1/10_3D_tensor_matrix_multiplication.py", + "input_shapes": ["16x1024x2048", "2048x768"], + "initializations": ["rnd", "rnd"], + "output_shape": "16x1024x768", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/11_4D_tensor_matrix_multiplication.py", + "input_shapes": ["8x256x512x256", "256x768"], + "initializations": ["rnd", "rnd"], + "output_shape": "8x256x512x768", + "dtypes": ["f32", "bf16"], + }, + # level1/12_Matmul_with_diagonal_matrices_.py + # torch_mlir.compiler_utils.TorchMlirCompilerError: + # Lowering TorchFX IR -> Torch Backend IR failed with the following diagnostics: + # python exception: Failure while executing pass pipeline + { + "kernel": "level1/12_Matmul_with_diagonal_matrices_.py", + "input_shapes": ["4096", "4096x4096"], + "initializations": ["rnd", "rnd"], + "output_shape": "4096x4096", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/13_Matmul_for_symmetric_matrices.py", + "input_shapes": ["4096x4096", "4096x4096"], + "initializations": ["rnd", "rnd"], + "output_shape": "4096x4096", + "dtypes": ["f32", "bf16"], + "gflops": (4096 * 4096 * 4096 * 2) / 1e9, + "pipeline": "matmul", + }, + # level1/14_Matmul_for_upper_triangular_matrices.py + # LLVM ERROR: operation destroyed but still has uses + { + "kernel": "level1/14_Matmul_for_upper_triangular_matrices.py", + "input_shapes": ["4096x4096", "4096x4096"], + "initializations": ["rnd", "rnd"], + "output_shape": "4096x4096", + "dtypes": ["f32", "bf16"], + "gflops": (4096 * 4096 * 4096 * 2) / 1e9, + "pipeline": "matmul", + }, + # level1/15_Matmul_for_lower_triangular_matrices.py + # LLVM ERROR: operation destroyed but still has uses + { + "kernel": "level1/15_Matmul_for_lower_triangular_matrices.py", + "input_shapes": ["4096x4096", "4096x4096"], + "initializations": ["rnd", "rnd"], + "output_shape": "4096x4096", + "dtypes": ["f32", "bf16"], + "gflops": (4096 * 4096 * 4096 * 2) / 1e9, + "pipeline": "matmul", + }, + { + "kernel": "level1/16_Matmul_with_transposed_A.py", + "input_shapes": ["8192x2048", "8192x4096"], + "initializations": ["rnd", "rnd"], + "output_shape": "2048x4096", + "dtypes": ["f32", "bf16"], + "gflops": (2048 * 8192 * 4096 * 2) / 1e9, + "pipeline": "matmul", + }, + { + "kernel": "level1/17_Matmul_with_transposed_B.py", + "input_shapes": ["2048x8192", "4096x8192"], + "initializations": ["rnd", "rnd"], + "output_shape": "2048x4096", + "dtypes": ["f32", "bf16"], + "gflops": (2048 * 8192 * 4096 * 2) / 1e9, + "pipeline": "matmul", + }, + { + "kernel": "level1/18_Matmul_with_transposed_both.py", + "input_shapes": ["8192x2048", "4096x8192"], + "initializations": ["rnd", "rnd"], + "output_shape": "2048x4096", + "dtypes": ["f32", "bf16"], + "gflops": (2048 * 8192 * 4096 * 2) / 1e9, + "pipeline": "matmul", + }, + # All Element-wise kernels below fail with the same error: + # LLVM ERROR: operation destroyed but still has uses + { + "kernel": "level1/19_ReLU.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/20_LeakyReLU.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/21_Sigmoid.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/22_Tanh.py", + 
"input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/23_Softmax.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/24_LogSoftmax.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/25_Swish.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/26_GELU_.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/27_SELU_.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/28_HardSigmoid.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/29_Softplus.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/30_Softsign.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/31_ELU.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/32_HardTanh.py", + "input_shapes": ["4096x393216"], + "initializations": ["rnd"], + "output_shape": "4096x393216", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/33_BatchNorm.py", + "input_shapes": ["64x64x512x512"], + "initializations": ["rnd"], + "output_shape": "64x64x512x512", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/34_InstanceNorm.py", + "input_shapes": ["112x64x512x512"], + "initializations": ["rnd"], + "output_shape": "112x64x512x512", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/35_GroupNorm_.py", + "input_shapes": ["112x64x512x512"], + "initializations": ["rnd"], + "output_shape": "112x64x512x512", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/36_RMSNorm_.py", + "input_shapes": ["112x64x512x512"], + "initializations": ["rnd"], + "output_shape": "112x64x512x512", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/37_FrobeniusNorm_.py", + "input_shapes": ["112x64x512x512"], + "initializations": ["rnd"], + "output_shape": "112x64x512x512", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/38_L1Norm_.py", + "input_shapes": ["32768x65535"], + "initializations": ["rnd"], + "output_shape": "32768x65535", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/39_L2Norm_.py", + "input_shapes": ["32768x65535"], + "initializations": ["rnd"], + "output_shape": "32768x65535", + "dtypes": ["f32", "bf16"], + }, + { + "kernel": "level1/40_LayerNorm.py", + "input_shapes": ["16x64x256x256"], + "initializations": ["rnd"], + "output_shape": "16x64x256x256", + "dtypes": ["f32", "bf16"], }, ] @@ -58,11 +355,21 @@ def get_tests(args: argparse.Namespace) -> list[dict]: """ Returns the list of tests to be executed. 
""" + if args.ci: + print( + "Running in CI mode: fewer tests, no bf16, no benchmarking for faster feedback" + ) + args.bf16 = False # Disable bf16 tests in CI for faster feedback + args.benchmark = False # Disable benchmarking in CI for faster feedback + test_list = [] for test in tests: for dtype in test["dtypes"]: if not args.bf16 and dtype == "bf16": continue + # If a specific test is specified, only include that test + if args.test and test["kernel"] != args.test: + continue test_list.append( { "kernel": test["kernel"], @@ -76,9 +383,12 @@ def get_tests(args: argparse.Namespace) -> list[dict]: "gflops": test["gflops"] if "gflops" in test and args.benchmark else None, - "pipeline": str(get_pipeline_file(test["kernel"], dtype)), + "pipeline": str(get_pipeline_file(test.get("pipeline", ""), dtype)), } ) + # CI mode runs fewer tests for faster feedback + if args.ci and len(test_list) >= 5: + return test_list return test_list @@ -105,9 +415,28 @@ def get_flops_per_second(stdout: str, gflops: float) -> float: action=argparse.BooleanOptionalAction, help="Enable bf16 precision kernels.", ) + Parser.add_argument( + "--ci", + action=argparse.BooleanOptionalAction, + help="Enable CI mode (faster run, fewer kernels).", + ) + Parser.add_argument( + "--test", + type=str, + help="Specify a particular test to run.", + ) args = Parser.parse_args() + tests = get_tests(args) + if len(tests) == 0: + if args.test: + print( + f"No tests found matching '{args.test}'. Please check your arguments." + ) + else: + print("No tests to run. Please check your arguments.") + exit(0) - for test in get_tests(args): + for test in tests: kb_kernel = kb_path / test["kernel"] command_line = [ str(kb_program), @@ -147,10 +476,18 @@ def get_flops_per_second(stdout: str, gflops: float) -> float: # CHECK: 0.3745{{.*}} 0.9507{{.*}} 0.7319{{.*}} ... 0.2973{{.*}} 0.9243{{.*}} 0.9710{{.*}} # CHECK: 0.7201{{.*}} 0.9926{{.*}} 0.1208{{.*}} ... 0.1742{{.*}} 0.3485{{.*}} 0.6436{{.*}} -# CHECK-NOT: Execution failed - # CHECK: 2_Standard_matrix_multiplication_.mlir # CHECK: 249.78{{.*}} 260.13{{.*}} 249.36{{.*}} ... 261.10{{.*}} 260.49{{.*}} 257.09{{.*}} # CHECK: 243.56{{.*}} 250.91{{.*}} 252.38{{.*}} ... 260.40{{.*}} 261.56{{.*}} 256.24{{.*}} -# CHECK-NOT: Execution failed +# CHECK: 3_Batched_matrix_multiplication.mlir +# CHECK: 5.2403{{.*}} 7.7905{{.*}} 6.0769{{.*}} ... 7.8579{{.*}} 6.8890{{.*}} 6.6193{{.*}} +# CHECK: 9.0407{{.*}} 6.3299{{.*}} 5.2003{{.*}} ... 6.2594{{.*}} 6.2980{{.*}} 5.9807{{.*}} + +# CHECK: 4_Matrix_vector_multiplication_.mlir +# CHECK: 264.86{{.*}} +# CHECK: 265.12{{.*}} + +# CHECK: 5_Matrix_scalar_multiplication.mlir +# CHECK: 0.1750{{.*}} 0.4442{{.*}} 0.3420{{.*}} ... 0.1389{{.*}} 0.4319{{.*}} 0.4538{{.*}} +# CHECK: 0.3365{{.*}} 0.4638{{.*}} 0.0564{{.*}} ... 
0.0814{{.*}} 0.1628{{.*}} 0.3007{{.*}}

From 98172129c1a185e7b09d13b5f9e144bcb926632a Mon Sep 17 00:00:00 2001
From: Renato Golin
Date: Thu, 14 May 2026 17:10:42 +0100
Subject: [PATCH 2/4] Improve the KB test script's debugging workflow

Changes:
* Allow partial kernel name matches in --test
* Allow passing --print-mlir-after-all through to kernel_bench
* Allow bypassing output capture for debugging purposes
---
 .../KernelBench/test_kernel_bench.py          | 39 +++++++++++++------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/examples/end-to-end/KernelBench/test_kernel_bench.py b/examples/end-to-end/KernelBench/test_kernel_bench.py
index 69bfd8b..910c611 100755
--- a/examples/end-to-end/KernelBench/test_kernel_bench.py
+++ b/examples/end-to-end/KernelBench/test_kernel_bench.py
@@ -368,7 +368,7 @@ def get_tests(args: argparse.Namespace) -> list[dict]:
             if not args.bf16 and dtype == "bf16":
                 continue
             # If a specific test is specified, only include that test
-            if args.test and test["kernel"] != args.test:
+            if args.test and not test["kernel"].startswith(args.test):
                 continue
             test_list.append(
                 {
@@ -425,6 +425,11 @@ def get_flops_per_second(stdout: str, gflops: float) -> float:
         type=str,
         help="Specify a particular test to run.",
     )
+    Parser.add_argument(
+        "--print-mlir-after-all",
+        action=argparse.BooleanOptionalAction,
+        help="Whether to print the MLIR module after all stages. Default is False.",
+    )
     args = Parser.parse_args()
     tests = get_tests(args)
     if len(tests) == 0:
@@ -450,25 +455,37 @@ def get_flops_per_second(stdout: str, gflops: float) -> float:
             "--print-tensor=1",
             "--seed=42",
         ]
-        benchmark = test.get("gflops") is not None
+        benchmark = args.benchmark and test.get("gflops") is not None
         if benchmark:
             command_line += ["--benchmark"]
+        if args.print_mlir_after_all:
+            command_line += ["--print-mlir-after-all"]
         print(f"Running command: {' '.join(command_line)}")
+
+        # While debugging kernels, it's useful to see the output as it comes.
+        # Note: GFLOPS can't be shown if the output is not captured.
+        capture_output = True
+        if args.print_mlir_after_all and not args.ci:
+            capture_output = False
+
         result = subprocess.run(
             command_line,
-            capture_output=True,
+            capture_output=capture_output,
             text=True,
         )
-        print("STDOUT:")
-        print(result.stdout)
-        if benchmark:
-            flops_per_second = get_flops_per_second(result.stdout, test["gflops"])
-            if flops_per_second > 0:
-                print(f"Performance: {flops_per_second:.2f} GFLOPS")
+        # If output is captured, print it out, including benchmark results if applicable.
+        if capture_output:
+            print("STDOUT:")
+            print(result.stdout)
+            if benchmark:
+                flops_per_second = get_flops_per_second(result.stdout, test["gflops"])
+                if flops_per_second > 0:
+                    print(f"Performance: {flops_per_second:.2f} GFLOPS")
+
+            print("STDERR:")
+            print(result.stderr)
-        print("STDERR:")
-        print(result.stderr)
         print(f"Return code: {result.returncode}")
         assert result.returncode == 0, "Execution failed"

From 3f90c3df1dbe3c276081b8a664150d7672df13d8 Mon Sep 17 00:00:00 2001
From: Renato Golin
Date: Fri, 15 May 2026 11:19:37 +0100
Subject: [PATCH 3/4] Move the test list to a separate file

The list will grow a lot, and we may want to split it by level. Also
add a warning field instead of a comment on the tests that don't work.
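For example, the level1/9 entry now carries its failure note in a
warning field (entry copied verbatim from the new tests.yaml):

```
- kernel: level1/9_Tall_skinny_matrix_multiplication_.py
  input_shapes: [1024x32, 32x1024]
  initializations: [rnd, rnd]
  output_shape: 1024x1024
  dtypes: [f32, bf16]
  warning: "Optimized pipeline error: too many tiles provided, expected at most 3 found 4"
```

The script prints the warning before running the kernel, so known
failures stay visible in the logs instead of being buried in comments.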
--- .../KernelBench/test_kernel_bench.py | 329 +----------------- examples/end-to-end/KernelBench/tests.yaml | 289 +++++++++++++++ 2 files changed, 299 insertions(+), 319 deletions(-) create mode 100644 examples/end-to-end/KernelBench/tests.yaml diff --git a/examples/end-to-end/KernelBench/test_kernel_bench.py b/examples/end-to-end/KernelBench/test_kernel_bench.py index 910c611..954047b 100755 --- a/examples/end-to-end/KernelBench/test_kernel_bench.py +++ b/examples/end-to-end/KernelBench/test_kernel_bench.py @@ -9,11 +9,14 @@ import platform from pathlib import Path +import yaml + script_path = Path(__file__).parent project_root = script_path.parent.parent.parent kb_program = project_root / "tools" / "kernel_bench" kb_default_pipeline = kb_program.parent / "kernel_bench.yaml" kb_path = project_root / "third_party" / "KernelBench" / "KernelBench" +yaml_path = script_path / "tests.yaml" def get_pipeline_file(name: str, dtype: str) -> Path: @@ -32,325 +35,6 @@ def get_pipeline_file(name: str, dtype: str) -> Path: return kb_default_pipeline -tests = [ - { - "kernel": "level1/1_Square_matrix_multiplication_.py", - "input_shapes": ["1024x1024", "1024x1024"], - "initializations": ["rnd", "id"], - "output_shape": "1024x1024", - "dtypes": ["f32", "bf16"], - "gflops": (1024 * 1024 * 1024 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/2_Standard_matrix_multiplication_.py", - "input_shapes": ["512x1024", "1024x512"], - "initializations": ["rnd", "rnd"], - "output_shape": "512x512", - "dtypes": ["f32", "bf16"], - "gflops": (512 * 1024 * 512 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/3_Batched_matrix_multiplication.py", - "input_shapes": ["4x64x32", "4x32x64"], - "initializations": ["rnd", "rnd"], - "output_shape": "4x64x64", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/4_Matrix_vector_multiplication_.py", - "input_shapes": ["1024x1024", "1024x1"], - "initializations": ["rnd", "rnd"], - "output_shape": "1024x1", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/5_Matrix_scalar_multiplication.py", - "input_shapes": ["1024x1024", "1"], - "initializations": ["rnd", "rnd"], - "output_shape": "1024x1024", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/6_Matmul_with_large_K_dimension_.py", - "input_shapes": ["256x524288", "524288x256"], - "initializations": ["rnd", "rnd"], - "output_shape": "256x256", - "dtypes": ["f32", "bf16"], - "gflops": (256 * 524288 * 256 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/7_Matmul_with_small_K_dimension_.py", - "input_shapes": ["32768x64", "64x32768"], - "initializations": ["rnd", "rnd"], - "output_shape": "32768x32768", - "dtypes": ["f32", "bf16"], - "gflops": (32768 * 64 * 32768 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/8_Matmul_with_irregular_shapes_.py", - "input_shapes": ["8205x2949", "2949x5921"], - "initializations": ["rnd", "rnd"], - "output_shape": "8205x5921", - "dtypes": ["f32", "bf16"], - "gflops": (8205 * 2949 * 5921 * 2) / 1e9, - "pipeline": "matmul", - }, - # too many tiles provided, expected at most 3 found 4 - { - "kernel": "level1/9_Tall_skinny_matrix_multiplication_.py", - "input_shapes": ["1024x32", "32x1024"], - "initializations": ["rnd", "rnd"], - "output_shape": "1024x1024", - "dtypes": ["f32", "bf16"], - # "gflops": (1024 * 32 * 1024 * 2) / 1e9, - # "pipeline": "matmul", - }, - { - "kernel": "level1/10_3D_tensor_matrix_multiplication.py", - "input_shapes": ["16x1024x2048", "2048x768"], - "initializations": ["rnd", "rnd"], - 
"output_shape": "16x1024x768", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/11_4D_tensor_matrix_multiplication.py", - "input_shapes": ["8x256x512x256", "256x768"], - "initializations": ["rnd", "rnd"], - "output_shape": "8x256x512x768", - "dtypes": ["f32", "bf16"], - }, - # level1/12_Matmul_with_diagonal_matrices_.py - # torch_mlir.compiler_utils.TorchMlirCompilerError: - # Lowering TorchFX IR -> Torch Backend IR failed with the following diagnostics: - # python exception: Failure while executing pass pipeline - { - "kernel": "level1/12_Matmul_with_diagonal_matrices_.py", - "input_shapes": ["4096", "4096x4096"], - "initializations": ["rnd", "rnd"], - "output_shape": "4096x4096", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/13_Matmul_for_symmetric_matrices.py", - "input_shapes": ["4096x4096", "4096x4096"], - "initializations": ["rnd", "rnd"], - "output_shape": "4096x4096", - "dtypes": ["f32", "bf16"], - "gflops": (4096 * 4096 * 4096 * 2) / 1e9, - "pipeline": "matmul", - }, - # level1/14_Matmul_for_upper_triangular_matrices.py - # LLVM ERROR: operation destroyed but still has uses - { - "kernel": "level1/14_Matmul_for_upper_triangular_matrices.py", - "input_shapes": ["4096x4096", "4096x4096"], - "initializations": ["rnd", "rnd"], - "output_shape": "4096x4096", - "dtypes": ["f32", "bf16"], - "gflops": (4096 * 4096 * 4096 * 2) / 1e9, - "pipeline": "matmul", - }, - # level1/15_Matmul_for_lower_triangular_matrices.py - # LLVM ERROR: operation destroyed but still has uses - { - "kernel": "level1/15_Matmul_for_lower_triangular_matrices.py", - "input_shapes": ["4096x4096", "4096x4096"], - "initializations": ["rnd", "rnd"], - "output_shape": "4096x4096", - "dtypes": ["f32", "bf16"], - "gflops": (4096 * 4096 * 4096 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/16_Matmul_with_transposed_A.py", - "input_shapes": ["8192x2048", "8192x4096"], - "initializations": ["rnd", "rnd"], - "output_shape": "2048x4096", - "dtypes": ["f32", "bf16"], - "gflops": (2048 * 8192 * 4096 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/17_Matmul_with_transposed_B.py", - "input_shapes": ["2048x8192", "4096x8192"], - "initializations": ["rnd", "rnd"], - "output_shape": "2048x4096", - "dtypes": ["f32", "bf16"], - "gflops": (2048 * 8192 * 4096 * 2) / 1e9, - "pipeline": "matmul", - }, - { - "kernel": "level1/18_Matmul_with_transposed_both.py", - "input_shapes": ["8192x2048", "4096x8192"], - "initializations": ["rnd", "rnd"], - "output_shape": "2048x4096", - "dtypes": ["f32", "bf16"], - "gflops": (2048 * 8192 * 4096 * 2) / 1e9, - "pipeline": "matmul", - }, - # All Element-wise kernels below fail with the same error: - # LLVM ERROR: operation destroyed but still has uses - { - "kernel": "level1/19_ReLU.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/20_LeakyReLU.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/21_Sigmoid.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/22_Tanh.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/23_Softmax.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": 
"4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/24_LogSoftmax.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/25_Swish.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/26_GELU_.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/27_SELU_.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/28_HardSigmoid.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/29_Softplus.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/30_Softsign.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/31_ELU.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/32_HardTanh.py", - "input_shapes": ["4096x393216"], - "initializations": ["rnd"], - "output_shape": "4096x393216", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/33_BatchNorm.py", - "input_shapes": ["64x64x512x512"], - "initializations": ["rnd"], - "output_shape": "64x64x512x512", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/34_InstanceNorm.py", - "input_shapes": ["112x64x512x512"], - "initializations": ["rnd"], - "output_shape": "112x64x512x512", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/35_GroupNorm_.py", - "input_shapes": ["112x64x512x512"], - "initializations": ["rnd"], - "output_shape": "112x64x512x512", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/36_RMSNorm_.py", - "input_shapes": ["112x64x512x512"], - "initializations": ["rnd"], - "output_shape": "112x64x512x512", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/37_FrobeniusNorm_.py", - "input_shapes": ["112x64x512x512"], - "initializations": ["rnd"], - "output_shape": "112x64x512x512", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/38_L1Norm_.py", - "input_shapes": ["32768x65535"], - "initializations": ["rnd"], - "output_shape": "32768x65535", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/39_L2Norm_.py", - "input_shapes": ["32768x65535"], - "initializations": ["rnd"], - "output_shape": "32768x65535", - "dtypes": ["f32", "bf16"], - }, - { - "kernel": "level1/40_LayerNorm.py", - "input_shapes": ["16x64x256x256"], - "initializations": ["rnd"], - "output_shape": "16x64x256x256", - "dtypes": ["f32", "bf16"], - }, -] - - def get_tests(args: argparse.Namespace) -> list[dict]: """ Returns the list of tests to be executed. 
@@ -362,6 +46,10 @@ def get_tests(args: argparse.Namespace) -> list[dict]: args.bf16 = False # Disable bf16 tests in CI for faster feedback args.benchmark = False # Disable benchmarking in CI for faster feedback + tests = [] + with open(yaml_path) as f: + tests = yaml.safe_load(f) + test_list = [] for test in tests: for dtype in test["dtypes"]: @@ -384,6 +72,7 @@ def get_tests(args: argparse.Namespace) -> list[dict]: if "gflops" in test and args.benchmark else None, "pipeline": str(get_pipeline_file(test.get("pipeline", ""), dtype)), + "warning": test.get("warning", None), } ) # CI mode runs fewer tests for faster feedback @@ -460,6 +149,8 @@ def get_flops_per_second(stdout: str, gflops: float) -> float: command_line += ["--benchmark"] if args.print_mlir_after_all: command_line += ["--print-mlir-after-all"] + if test.get("warning"): + print(f"WARNING: {test['warning']}") print(f"Running command: {' '.join(command_line)}") # While debugging kernels, it's useful to see the output as it comes. diff --git a/examples/end-to-end/KernelBench/tests.yaml b/examples/end-to-end/KernelBench/tests.yaml new file mode 100644 index 0000000..c178f4c --- /dev/null +++ b/examples/end-to-end/KernelBench/tests.yaml @@ -0,0 +1,289 @@ +- kernel: level1/1_Square_matrix_multiplication_.py + input_shapes: [1024x1024, 1024x1024] + initializations: [rnd, id] + output_shape: 1024x1024 + dtypes: [f32, bf16] + gflops: 2.147483648 # (1024 * 1024 * 1024 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/2_Standard_matrix_multiplication_.py + input_shapes: [512x1024, 1024x512] + initializations: [rnd, rnd] + output_shape: 512x512 + dtypes: [f32, bf16] + gflops: 0.536870912 # (512 * 1024 * 512 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/3_Batched_matrix_multiplication.py + input_shapes: [4x64x32, 4x32x64] + initializations: [rnd, rnd] + output_shape: 4x64x64 + dtypes: [f32, bf16] + +- kernel: level1/4_Matrix_vector_multiplication_.py + input_shapes: [1024x1024, 1024x1] + initializations: [rnd, rnd] + output_shape: 1024x1 + dtypes: [f32, bf16] + +- kernel: level1/5_Matrix_scalar_multiplication.py + input_shapes: [1024x1024, "1"] + initializations: [rnd, rnd] + output_shape: 1024x1024 + dtypes: [f32, bf16] + +- kernel: level1/6_Matmul_with_large_K_dimension_.py + input_shapes: [256x524288, 524288x256] + initializations: [rnd, rnd] + output_shape: 256x256 + dtypes: [f32, bf16] + gflops: 68.719476736 # (256 * 524288 * 256 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/7_Matmul_with_small_K_dimension_.py + input_shapes: [32768x64, 64x32768] + initializations: [rnd, rnd] + output_shape: 32768x32768 + dtypes: [f32, bf16] + gflops: 137.438953472 # (32768 * 64 * 32768 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/8_Matmul_with_irregular_shapes_.py + input_shapes: [8205x2949, 2949x5921] + initializations: [rnd, rnd] + output_shape: 8205x5921 + dtypes: [f32, bf16] + gflops: 286.535485890 # (8205 * 2949 * 5921 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/9_Tall_skinny_matrix_multiplication_.py + input_shapes: [1024x32, 32x1024] + initializations: [rnd, rnd] + output_shape: 1024x1024 + dtypes: [f32, bf16] + # gflops: 0.067108864 # (1024 * 32 * 1024 * 2) / 1e9 + # pipeline: matmul + warning: "Optimized pipeline error: too many tiles provided, expected at most 3 found 4" + +- kernel: level1/10_3D_tensor_matrix_multiplication.py + input_shapes: [16x1024x2048, 2048x768] + initializations: [rnd, rnd] + output_shape: 16x1024x768 + dtypes: [f32, bf16] + +- kernel: level1/11_4D_tensor_matrix_multiplication.py + input_shapes: 
[8x256x512x256, 256x768] + initializations: [rnd, rnd] + output_shape: 8x256x512x768 + dtypes: [f32, bf16] + +- kernel: level1/12_Matmul_with_diagonal_matrices_.py + input_shapes: ["4096", 4096x4096] + initializations: [rnd, rnd] + output_shape: 4096x4096 + dtypes: [f32, bf16] + warning: "torch_mlir.compiler_utils.TorchMlirCompilerError: Lowering TorchFX IR -> Torch Backend IR failed with the following diagnostics: python exception: Failure while executing pass pipeline" + +- kernel: level1/13_Matmul_for_symmetric_matrices.py + input_shapes: [4096x4096, 4096x4096] + initializations: [rnd, rnd] + output_shape: 4096x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (4096 * 4096 * 4096 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/14_Matmul_for_upper_triangular_matrices.py + input_shapes: [4096x4096, 4096x4096] + initializations: [rnd, rnd] + output_shape: 4096x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (4096 * 4096 * 4096 * 2) / 1e9 + pipeline: matmul + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/15_Matmul_for_lower_triangular_matrices.py + input_shapes: [4096x4096, 4096x4096] + initializations: [rnd, rnd] + output_shape: 4096x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (4096 * 4096 * 4096 * 2) / 1e9 + pipeline: matmul + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/16_Matmul_with_transposed_A.py + input_shapes: [8192x2048, 8192x4096] + initializations: [rnd, rnd] + output_shape: 2048x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (2048 * 8192 * 4096 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/17_Matmul_with_transposed_B.py + input_shapes: [2048x8192, 4096x8192] + initializations: [rnd, rnd] + output_shape: 2048x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (2048 * 8192 * 4096 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/18_Matmul_with_transposed_both.py + input_shapes: [8192x2048, 4096x8192] + initializations: [rnd, rnd] + output_shape: 2048x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (2048 * 8192 * 4096 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/19_ReLU.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/20_LeakyReLU.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/21_Sigmoid.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/22_Tanh.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/23_Softmax.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/24_LogSoftmax.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/25_Swish.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/26_GELU_.py + 
input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/27_SELU_.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/28_HardSigmoid.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/29_Softplus.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/30_Softsign.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/31_ELU.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/32_HardTanh.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/33_BatchNorm.py + input_shapes: [64x64x512x512] + initializations: [rnd] + output_shape: 64x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/34_InstanceNorm.py + input_shapes: [112x64x512x512] + initializations: [rnd] + output_shape: 112x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/35_GroupNorm_.py + input_shapes: [112x64x512x512] + initializations: [rnd] + output_shape: 112x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/36_RMSNorm_.py + input_shapes: [112x64x512x512] + initializations: [rnd] + output_shape: 112x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/37_FrobeniusNorm_.py + input_shapes: [112x64x512x512] + initializations: [rnd] + output_shape: 112x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/38_L1Norm_.py + input_shapes: [32768x65535] + initializations: [rnd] + output_shape: 32768x65535 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/39_L2Norm_.py + input_shapes: [32768x65535] + initializations: [rnd] + output_shape: 32768x65535 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/40_LayerNorm.py + input_shapes: [16x64x256x256] + initializations: [rnd] + output_shape: 16x64x256x256 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" From 4e664b4eeb3321748cd72ad300f04e469fc12636 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Fri, 15 May 2026 11:22:25 +0100 Subject: [PATCH 4/4] Break long line --- examples/end-to-end/KernelBench/tests.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/end-to-end/KernelBench/tests.yaml b/examples/end-to-end/KernelBench/tests.yaml index c178f4c..8f4f47d 100644 --- a/examples/end-to-end/KernelBench/tests.yaml +++ 
b/examples/end-to-end/KernelBench/tests.yaml
@@ -82,7 +82,9 @@
   initializations: [rnd, rnd]
   output_shape: 4096x4096
   dtypes: [f32, bf16]
-  warning: "torch_mlir.compiler_utils.TorchMlirCompilerError: Lowering TorchFX IR -> Torch Backend IR failed with the following diagnostics: python exception: Failure while executing pass pipeline"
+  warning: 'ERROR: torch_mlir.compiler_utils.TorchMlirCompilerError:
+    Lowering TorchFX IR -> Torch Backend IR failed with the following diagnostics:
+    python exception: Failure while executing pass pipeline'
 
 - kernel: level1/13_Matmul_for_symmetric_matrices.py
   input_shapes: [4096x4096, 4096x4096]