diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/lower.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/lower.yaml similarity index 100% rename from examples/end-to-end/KernelBench/schedules/x86_64/matmul/lower.yaml rename to examples/end-to-end/KernelBench/schedules/x86_64/lower.yaml diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/bf16.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/bf16.yaml index c81d950..19bcf76 100644 --- a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/bf16.yaml +++ b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/bf16.yaml @@ -1,10 +1,8 @@ # This is an optimizing pipeline for kernel_bench matmuls on bf16 types. # This is basically a copy of the fp32 pipeline, with ONE CHANGE: # - register_tiling.py -> reg_unroll_k=2 (instead of 1) -# Tested on x86_64 with AVX512 reaching good performance for simple KB kernels. -# It may not apply to other workloads / extensions / architectures, so use with caution. Pipeline: - - include: pack_and_tile.yaml + - include: ../pack_and_tile.yaml ## CPU specific register tiling (depends on uArch & data type) - schedule: "x86/register_tiling.py[gen=matmul_register_tiling]{target=linalg.contract reg_tile_batch=1 reg_tile_m=8 reg_tile_n=32 reg_tile_k=2}" @@ -12,6 +10,6 @@ Pipeline: - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.fill tile_sizes=[1,1,1]}" - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.generic tile_sizes=[1,8]}" - - include: vectorize.yaml + - include: ../vectorize.yaml - - include: lower.yaml + - include: ../lower.yaml diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/f32.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/f32.yaml index 742f91c..6043543 100644 --- a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/f32.yaml +++ b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/f32.yaml @@ -2,7 +2,7 @@ # Tested on x86_64 with AVX512 reaching good performance for simple KB kernels. # It may not apply to other workloads / extensions / architectures, so use with caution. Pipeline: - - include: pack_and_tile.yaml + - include: ../pack_and_tile.yaml ## CPU specific register tiling (depends on uArch & data type) - schedule: "x86/register_tiling.py[gen=matmul_register_tiling]{target=linalg.contract reg_tile_batch=1 reg_tile_m=8 reg_tile_n=32 reg_tile_k=2}" @@ -10,6 +10,6 @@ Pipeline: - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.fill tile_sizes=[1,1,1]}" - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.generic tile_sizes=[1,8]}" - - include: vectorize.yaml + - include: ../vectorize.yaml - - include: lower.yaml + - include: ../lower.yaml diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/vectorize.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/matmul/vectorize.yaml deleted file mode 100644 index 527920c..0000000 --- a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/vectorize.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# This is an optimizing pipeline for kernel_bench matmuls on bf16 types. -# This is basically a copy of the fp32 pipeline, with ONE CHANGE: -# - register_tiling.py -> reg_unroll_k=2 (instead of 1) -# Tested on x86_64 with AVX512 reaching good performance for simple KB kernels. -# It may not apply to other workloads / extensions / architectures, so use with caution. 
-Pipeline: - ## Tensor vectorization (for the left-over element wise) - - schedule: "vectorization.py[gen=vectorize_linalg]" - - schedule: "hoisting.py[gen=hoist_loops]" - - schedule: "vectorization.py[gen=simplify_vector_ops]" - - schedule: "vectorization.py[gen=x86_vectorization]" diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/matmul/pack_and_tile.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/pack_and_tile.yaml similarity index 100% rename from examples/end-to-end/KernelBench/schedules/x86_64/matmul/pack_and_tile.yaml rename to examples/end-to-end/KernelBench/schedules/x86_64/pack_and_tile.yaml diff --git a/examples/end-to-end/KernelBench/schedules/x86_64/vectorize.yaml b/examples/end-to-end/KernelBench/schedules/x86_64/vectorize.yaml new file mode 100644 index 0000000..57abb67 --- /dev/null +++ b/examples/end-to-end/KernelBench/schedules/x86_64/vectorize.yaml @@ -0,0 +1,7 @@ +# Tensor level vectorization for matmul like kernels on any type. +Pipeline: + ## Tensor vectorization (for the left-over element wise) + - schedule: "vectorization.py[gen=vectorize_linalg]" + - schedule: "hoisting.py[gen=hoist_loops]" + - schedule: "vectorization.py[gen=simplify_vector_ops]" + - schedule: "vectorization.py[gen=x86_vectorization]" diff --git a/examples/end-to-end/KernelBench/test_kernel_bench.py b/examples/end-to-end/KernelBench/test_kernel_bench.py index 202a468..954047b 100755 --- a/examples/end-to-end/KernelBench/test_kernel_bench.py +++ b/examples/end-to-end/KernelBench/test_kernel_bench.py @@ -1,4 +1,4 @@ -# RUN: python %s | FileCheck %s +# RUN: python %s --ci | FileCheck %s # REQUIRES: torch # REQUIRES: kernel_bench @@ -9,24 +9,25 @@ import platform from pathlib import Path +import yaml + script_path = Path(__file__).parent project_root = script_path.parent.parent.parent kb_program = project_root / "tools" / "kernel_bench" kb_default_pipeline = kb_program.parent / "kernel_bench.yaml" kb_path = project_root / "third_party" / "KernelBench" / "KernelBench" +yaml_path = script_path / "tests.yaml" -def get_pipeline_file(kernel_name: str, dtype: str) -> Path: +def get_pipeline_file(name: str, dtype: str) -> Path: """ Returns the appropriate pipeline file for a given kernel. """ arch = platform.machine() - if arch != "x86_64": - return kb_default_pipeline - # Level 1 matmuls should use the same pipelines - if kernel_name.startswith("level1") and "matrix_multiplication" in kernel_name: - pipeline = script_path / f"schedules/{arch}/matmul/{dtype}.yaml" + # If the pipeline file exists for the given name and dtype + if name: + pipeline = script_path / f"schedules/{arch}/{name}/{dtype}.yaml" if pipeline.exists(): return pipeline @@ -34,35 +35,29 @@ def get_pipeline_file(kernel_name: str, dtype: str) -> Path: return kb_default_pipeline -tests = [ - { - "kernel": "level1/1_Square_matrix_multiplication_.py", - "input_shapes": ["1024x1024", "1024x1024"], - "initializations": ["rnd", "id"], - "output_shape": "1024x1024", - "dtypes": ["f32", "bf16"], - "gflops": (1024 * 1024 * 1024 * 2) / 1e9, - }, - { - "kernel": "level1/2_Standard_matrix_multiplication_.py", - "input_shapes": ["512x1024", "1024x512"], - "initializations": ["rnd", "rnd"], - "output_shape": "512x512", - "dtypes": ["f32", "bf16"], - "gflops": (512 * 1024 * 512 * 2) / 1e9, - }, -] - - def get_tests(args: argparse.Namespace) -> list[dict]: """ Returns the list of tests to be executed. 
""" + if args.ci: + print( + "Running in CI mode: fewer tests, no bf16, no benchmarking for faster feedback" + ) + args.bf16 = False # Disable bf16 tests in CI for faster feedback + args.benchmark = False # Disable benchmarking in CI for faster feedback + + tests = [] + with open(yaml_path) as f: + tests = yaml.safe_load(f) + test_list = [] for test in tests: for dtype in test["dtypes"]: if not args.bf16 and dtype == "bf16": continue + # If a specific test is specified, only include that test + if args.test and not test["kernel"].startswith(args.test): + continue test_list.append( { "kernel": test["kernel"], @@ -76,9 +71,13 @@ def get_tests(args: argparse.Namespace) -> list[dict]: "gflops": test["gflops"] if "gflops" in test and args.benchmark else None, - "pipeline": str(get_pipeline_file(test["kernel"], dtype)), + "pipeline": str(get_pipeline_file(test.get("pipeline", ""), dtype)), + "warning": test.get("warning", None), } ) + # CI mode runs fewer tests for faster feedback + if args.ci and len(test_list) >= 5: + return test_list return test_list @@ -105,9 +104,33 @@ def get_flops_per_second(stdout: str, gflops: float) -> float: action=argparse.BooleanOptionalAction, help="Enable bf16 precision kernels.", ) + Parser.add_argument( + "--ci", + action=argparse.BooleanOptionalAction, + help="Enable CI mode (faster run, fewer kernels).", + ) + Parser.add_argument( + "--test", + type=str, + help="Specify a particular test to run.", + ) + Parser.add_argument( + "--print-mlir-after-all", + action=argparse.BooleanOptionalAction, + help="Whether to print the MLIR module after all stages. Default is False.", + ) args = Parser.parse_args() + tests = get_tests(args) + if len(tests) == 0: + if args.test: + print( + f"No tests found matching '{args.test}'. Please check your arguments." + ) + else: + print("No tests to run. Please check your arguments.") + exit(0) - for test in get_tests(args): + for test in tests: kb_kernel = kb_path / test["kernel"] command_line = [ str(kb_program), @@ -121,25 +144,39 @@ def get_flops_per_second(stdout: str, gflops: float) -> float: "--print-tensor=1", "--seed=42", ] - benchmark = test.get("gflops") is not None + benchmark = args.benchmark and test.get("gflops") is not None if benchmark: command_line += ["--benchmark"] + if args.print_mlir_after_all: + command_line += ["--print-mlir-after-all"] + if test.get("warning"): + print(f"WARNING: {test['warning']}") print(f"Running command: {' '.join(command_line)}") + + # While debugging kernels, it's useful to see the output as it comes. + # Note: GFLOPS can't be shown if the output is not captured. + capture_output = True + if args.print_mlir_after_all and not args.ci: + capture_output = False + result = subprocess.run( command_line, - capture_output=True, + capture_output=capture_output, text=True, ) - print("STDOUT:") - print(result.stdout) - if benchmark: - flops_per_second = get_flops_per_second(result.stdout, test["gflops"]) - if flops_per_second > 0: - print(f"Performance: {flops_per_second:.2f} GFLOPS") + # If output is captured, print it out, including benchmark results if applicable. 
+ if capture_output: + print("STDOUT:") + print(result.stdout) + if benchmark: + flops_per_second = get_flops_per_second(result.stdout, test["gflops"]) + if flops_per_second > 0: + print(f"Performance: {flops_per_second:.2f} GFLOPS") + + print("STDERR:") + print(result.stderr) - print("STDERR:") - print(result.stderr) print(f"Return code: {result.returncode}") assert result.returncode == 0, "Execution failed" @@ -147,10 +184,18 @@ def get_flops_per_second(stdout: str, gflops: float) -> float: # CHECK: 0.3745{{.*}} 0.9507{{.*}} 0.7319{{.*}} ... 0.2973{{.*}} 0.9243{{.*}} 0.9710{{.*}} # CHECK: 0.7201{{.*}} 0.9926{{.*}} 0.1208{{.*}} ... 0.1742{{.*}} 0.3485{{.*}} 0.6436{{.*}} -# CHECK-NOT: Execution failed - # CHECK: 2_Standard_matrix_multiplication_.mlir # CHECK: 249.78{{.*}} 260.13{{.*}} 249.36{{.*}} ... 261.10{{.*}} 260.49{{.*}} 257.09{{.*}} # CHECK: 243.56{{.*}} 250.91{{.*}} 252.38{{.*}} ... 260.40{{.*}} 261.56{{.*}} 256.24{{.*}} -# CHECK-NOT: Execution failed +# CHECK: 3_Batched_matrix_multiplication.mlir +# CHECK: 5.2403{{.*}} 7.7905{{.*}} 6.0769{{.*}} ... 7.8579{{.*}} 6.8890{{.*}} 6.6193{{.*}} +# CHECK: 9.0407{{.*}} 6.3299{{.*}} 5.2003{{.*}} ... 6.2594{{.*}} 6.2980{{.*}} 5.9807{{.*}} + +# CHECK: 4_Matrix_vector_multiplication_.mlir +# CHECK: 264.86{{.*}} +# CHECK: 265.12{{.*}} + +# CHECK: 5_Matrix_scalar_multiplication.mlir +# CHECK: 0.1750{{.*}} 0.4442{{.*}} 0.3420{{.*}} ... 0.1389{{.*}} 0.4319{{.*}} 0.4538{{.*}} +# CHECK: 0.3365{{.*}} 0.4638{{.*}} 0.0564{{.*}} ... 0.0814{{.*}} 0.1628{{.*}} 0.3007{{.*}} diff --git a/examples/end-to-end/KernelBench/tests.yaml b/examples/end-to-end/KernelBench/tests.yaml new file mode 100644 index 0000000..8f4f47d --- /dev/null +++ b/examples/end-to-end/KernelBench/tests.yaml @@ -0,0 +1,291 @@ +- kernel: level1/1_Square_matrix_multiplication_.py + input_shapes: [1024x1024, 1024x1024] + initializations: [rnd, id] + output_shape: 1024x1024 + dtypes: [f32, bf16] + gflops: 2.147483648 # (1024 * 1024 * 1024 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/2_Standard_matrix_multiplication_.py + input_shapes: [512x1024, 1024x512] + initializations: [rnd, rnd] + output_shape: 512x512 + dtypes: [f32, bf16] + gflops: 0.536870912 # (512 * 1024 * 512 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/3_Batched_matrix_multiplication.py + input_shapes: [4x64x32, 4x32x64] + initializations: [rnd, rnd] + output_shape: 4x64x64 + dtypes: [f32, bf16] + +- kernel: level1/4_Matrix_vector_multiplication_.py + input_shapes: [1024x1024, 1024x1] + initializations: [rnd, rnd] + output_shape: 1024x1 + dtypes: [f32, bf16] + +- kernel: level1/5_Matrix_scalar_multiplication.py + input_shapes: [1024x1024, "1"] + initializations: [rnd, rnd] + output_shape: 1024x1024 + dtypes: [f32, bf16] + +- kernel: level1/6_Matmul_with_large_K_dimension_.py + input_shapes: [256x524288, 524288x256] + initializations: [rnd, rnd] + output_shape: 256x256 + dtypes: [f32, bf16] + gflops: 68.719476736 # (256 * 524288 * 256 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/7_Matmul_with_small_K_dimension_.py + input_shapes: [32768x64, 64x32768] + initializations: [rnd, rnd] + output_shape: 32768x32768 + dtypes: [f32, bf16] + gflops: 137.438953472 # (32768 * 64 * 32768 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/8_Matmul_with_irregular_shapes_.py + input_shapes: [8205x2949, 2949x5921] + initializations: [rnd, rnd] + output_shape: 8205x5921 + dtypes: [f32, bf16] + gflops: 286.535485890 # (8205 * 2949 * 5921 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/9_Tall_skinny_matrix_multiplication_.py + 
input_shapes: [1024x32, 32x1024] + initializations: [rnd, rnd] + output_shape: 1024x1024 + dtypes: [f32, bf16] + # gflops: 0.067108864 # (1024 * 32 * 1024 * 2) / 1e9 + # pipeline: matmul + warning: "Optimized pipeline error: too many tiles provided, expected at most 3 found 4" + +- kernel: level1/10_3D_tensor_matrix_multiplication.py + input_shapes: [16x1024x2048, 2048x768] + initializations: [rnd, rnd] + output_shape: 16x1024x768 + dtypes: [f32, bf16] + +- kernel: level1/11_4D_tensor_matrix_multiplication.py + input_shapes: [8x256x512x256, 256x768] + initializations: [rnd, rnd] + output_shape: 8x256x512x768 + dtypes: [f32, bf16] + +- kernel: level1/12_Matmul_with_diagonal_matrices_.py + input_shapes: ["4096", 4096x4096] + initializations: [rnd, rnd] + output_shape: 4096x4096 + dtypes: [f32, bf16] + warning: '''ERROR: torch_mlir.compiler_utils.TorchMlirCompilerError: + Lowering TorchFX IR -> Torch Backend IR failed with the following diagnostics: + python exception: Failure while executing pass pipeline''' + +- kernel: level1/13_Matmul_for_symmetric_matrices.py + input_shapes: [4096x4096, 4096x4096] + initializations: [rnd, rnd] + output_shape: 4096x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (4096 * 4096 * 4096 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/14_Matmul_for_upper_triangular_matrices.py + input_shapes: [4096x4096, 4096x4096] + initializations: [rnd, rnd] + output_shape: 4096x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (4096 * 4096 * 4096 * 2) / 1e9 + pipeline: matmul + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/15_Matmul_for_lower_triangular_matrices.py + input_shapes: [4096x4096, 4096x4096] + initializations: [rnd, rnd] + output_shape: 4096x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (4096 * 4096 * 4096 * 2) / 1e9 + pipeline: matmul + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/16_Matmul_with_transposed_A.py + input_shapes: [8192x2048, 8192x4096] + initializations: [rnd, rnd] + output_shape: 2048x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (2048 * 8192 * 4096 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/17_Matmul_with_transposed_B.py + input_shapes: [2048x8192, 4096x8192] + initializations: [rnd, rnd] + output_shape: 2048x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (2048 * 8192 * 4096 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/18_Matmul_with_transposed_both.py + input_shapes: [8192x2048, 4096x8192] + initializations: [rnd, rnd] + output_shape: 2048x4096 + dtypes: [f32, bf16] + gflops: 137.438953472 # (2048 * 8192 * 4096 * 2) / 1e9 + pipeline: matmul + +- kernel: level1/19_ReLU.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/20_LeakyReLU.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/21_Sigmoid.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/22_Tanh.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/23_Softmax.py + input_shapes: [4096x393216] + initializations: [rnd] + 
output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/24_LogSoftmax.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/25_Swish.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/26_GELU_.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/27_SELU_.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/28_HardSigmoid.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/29_Softplus.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/30_Softsign.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/31_ELU.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/32_HardTanh.py + input_shapes: [4096x393216] + initializations: [rnd] + output_shape: 4096x393216 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/33_BatchNorm.py + input_shapes: [64x64x512x512] + initializations: [rnd] + output_shape: 64x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/34_InstanceNorm.py + input_shapes: [112x64x512x512] + initializations: [rnd] + output_shape: 112x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/35_GroupNorm_.py + input_shapes: [112x64x512x512] + initializations: [rnd] + output_shape: 112x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/36_RMSNorm_.py + input_shapes: [112x64x512x512] + initializations: [rnd] + output_shape: 112x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/37_FrobeniusNorm_.py + input_shapes: [112x64x512x512] + initializations: [rnd] + output_shape: 112x64x512x512 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/38_L1Norm_.py + input_shapes: [32768x65535] + initializations: [rnd] + output_shape: 32768x65535 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/39_L2Norm_.py + input_shapes: [32768x65535] + initializations: [rnd] + output_shape: 32768x65535 + dtypes: [f32, bf16] + warning: "LLVM ERROR: operation destroyed but still has uses" + +- kernel: level1/40_LayerNorm.py + input_shapes: [16x64x256x256] + initializations: [rnd] + output_shape: 16x64x256x256 + dtypes: [f32, bf16] + 
warning: "LLVM ERROR: operation destroyed but still has uses"