From eecee073a311b1ad4aad4bd24b97fdfc2caf3d03 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 13 Mar 2026 23:32:23 +0000 Subject: [PATCH 01/38] save work --- examples/xegpu/softmax.py | 336 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 336 insertions(+) create mode 100644 examples/xegpu/softmax.py diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py new file mode 100644 index 00000000..bb90f812 --- /dev/null +++ b/examples/xegpu/softmax.py @@ -0,0 +1,336 @@ +# RUN: %PYTHON %s --dump-kernel=xegpu-wg | FileCheck %s +# CHECK: module attributes {gpu.container_module} { + +""" +XeGPU softmax benchmark. +""" + +import argparse +import ctypes +from typing import Optional +from functools import cached_property + +import numpy as np +from mlir import ir +from mlir.execution_engine import ExecutionEngine +from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func + +from lighthouse.workload import benchmark +from lighthouse.utils.memref import to_ctype as memref_to_ctype +from lighthouse.utils.numpy import numpy_to_ctype +from lighthouse.utils.mlir import func_cif +from lighthouse.ingress.mlir_gen import get_mlir_elem_type +from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor + +from xegpu_workload import XeGPUWorkload + + +def softmax_complexity(M: int, N: int, nbytes: int): + """ + Complexity of softmax operation. + + For each row: + - O(N) to find max + - O(N) to compute exp(x - max) and sum + - O(N) to normalize + Total: 3*N operations per row, but with transcendental (exp) operations + """ + # Approximation: 5 FLOPs per element (max, sub, exp, sum, div) + # exp is expensive but we count it as ~1 FLOP for simplicity + flop_count = M * N * 5 + memory_reads = M * N * nbytes # read input + memory_writes = M * N * nbytes # write output + return flop_count, memory_reads, memory_writes + + +class XeGPUSoftmax(XeGPUWorkload): + """ + Softmax workload on XeGPU. + + Computes softmax along the last dimension (rows): + output[i, j] = exp(input[i, j] - max_i) / sum_i(exp(input[i, j] - max_i)) + + where max_i and sum_i are computed over row i. 
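+
+ For reference, the same numerically stable computation as a NumPy
+ sketch (illustration only; the host-side check in _reference_solution
+ below does exactly this):
+
+ m = x.max(axis=1, keepdims=True)
+ e = np.exp(x - m)
+ y = e / e.sum(axis=1, keepdims=True)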
+ """ + + def __init__( + self, + M: int, + N: int, + dtype: str = "f32", + ): + super().__init__() + self.M = M + self.N = N + self.shape = (M, N) + assert dtype == "f32", "Only f32 type is supported for softmax" + self.dtype_str = dtype + type_str_to_numpy = { + "f16": np.float16, + "f32": np.float32, + } + self.dtype = type_str_to_numpy[dtype] + + @cached_property + def _initial_host_arrays(self) -> tuple[np.ndarray]: + """Generate initial values on host with numpy.""" + np.random.seed(42) + # Use values in range [-0.5, 0.5] to avoid numerical issues + input_arr = np.random.uniform(-0.5, 0.5, self.shape).astype(self.dtype) + return (input_arr,) + + @cached_property + def _reference_solution(self) -> np.ndarray: + """Compute reference solution on host with numpy.""" + (input_arr,) = self._initial_host_arrays + # Use float32 for computation + x = input_arr.astype(np.float32) + # Compute softmax along axis 1 (each row independently) + # Numerically stable version: subtract max before exp + max_vals = np.max(x, axis=1, keepdims=True) + exp_vals = np.exp(x - max_vals) + sum_vals = np.sum(exp_vals, axis=1, keepdims=True) + output = exp_vals / sum_vals + return output.astype(self.dtype) + + def _get_input_arrays( + self, execution_engine: ExecutionEngine + ) -> list[ctypes.Structure]: + # Allocate device memory for input and output + input_gpu = self._allocate_array( + "input", self.shape, self.dtype_str, execution_engine + ) + output_gpu = self._allocate_array( + "output", self.shape, self.dtype_str, execution_engine + ) + + # Copy input to device + (input_host,) = self._initial_host_arrays + copy_fn = f"gpu_copy_2d_{self.dtype_str}" + execution_engine.invoke( + copy_fn, numpy_to_ctype(input_host), memref_to_ctype(input_gpu) + ) + + # Return memrefs: [output, input] + return [output_gpu, input_gpu] + + def check_correctness( + self, execution_engine: ExecutionEngine, verbose: int = 0 + ) -> bool: + # Copy result from device to host + output_gpu = self.gpu_memrefs[("output", self.dtype_str)] + output_host = np.zeros(self.shape, dtype=self.dtype) + execution_engine.invoke( + f"gpu_copy_2d_{self.dtype_str}", + memref_to_ctype(output_gpu), + numpy_to_ctype(output_host), + ) + + output_ref = self._reference_solution + output_computed = output_host.astype(np.float32) + + if verbose > 1: + print("Reference solution (first 5 rows):") + print(output_ref[:5]) + print("Computed solution (first 5 rows):") + print(output_computed[:5]) + + # Check row sums are close to 1.0 + row_sums = np.sum(output_computed, axis=1) + sums_ok = np.allclose(row_sums, 1.0, rtol=1e-5, atol=1e-6) + + # Check values match reference + values_ok = np.allclose(output_computed, output_ref, rtol=1e-4, atol=1e-6) + + success = sums_ok and values_ok + + if verbose: + if success: + print("PASSED") + else: + print("FAILED!") + if not sums_ok: + print(f" Row sums check failed. Min: {row_sums.min():.6f}, Max: {row_sums.max():.6f}") + if not values_ok: + max_diff = np.abs(output_computed - output_ref).max() + print(f" Values mismatch. 
Max abs diff: {max_diff:.6e}") + return success + + def get_complexity(self) -> tuple[int, int, int]: + nbytes = np.dtype(self.dtype).itemsize + return softmax_complexity(self.M, self.N, nbytes) + + def payload_module(self) -> ir.Module: + """Generate MLIR module for softmax payload.""" + mod = ir.Module.create() + dtype = get_mlir_elem_type(self.dtype_str) + memref_t = ir.MemRefType.get(self.shape, dtype) + + with ir.InsertionPoint(mod.body): + # Function signature: payload(output, input) + @func_cif(memref_t, memref_t, name=self.payload_function_name) + def payload(output, input_arg): + # Convert memrefs to tensors + output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) + input_tensor = emit_buf_to_tensor(input_arg, restrict=True) + + # Create intermediate buffer for softmax (used internally by linalg.softmax) + # This stores the sum of exp values + M, N = self.shape + softmax_buf_type = ir.MemRefType.get((M,N), dtype) + softmax_buf = gpu.alloc(softmax_buf_type, None, [], [], []) + softmax_buf_tensor = emit_buf_to_tensor(softmax_buf, restrict=True, writable=True) + + # Compute softmax along dimension 1 (rows) + # linalg.softmax performs: exp(x - max(x)) / sum(exp(x - max(x))) + result = linalg.softmax( + (input_tensor.type,), input_tensor, softmax_buf_tensor, dimension=1 + ) + + # Materialize result back to output memref + bufferization.materialize_in_destination( + None, result, output, restrict=True, writable=True + ) + + # Cleanup + gpu.dealloc(None, [], softmax_buf) + + # Emit utility functions for GPU memory management + emit_gpu_util_funcs(dtype, rank=2) + + return mod + + def schedule_module( + self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None + ) -> ir.Module: + """ + Generate transform schedule for softmax. + + For now, returns an empty schedule. In the future, this would contain + tiling, vectorization, and XeGPU-specific lowering transformations. 
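+
+ A rough sketch of the planned pipeline, in the order later revisions
+ are expected to apply it (stage names tentative):
+
+ tile last linalg.generic (scf.forall) -> fuse producers into the
+ loop -> vectorize -> one-shot bufferize -> forall to scf.parallel
+ -> gpu.launch mapping -> outline kernel -> vector to XeGPU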
+ """ + # TODO: Implement proper transform schedule + # For now, create a minimal schedule that just applies bufferization + mod = ir.Module.create() + with ir.InsertionPoint(mod.body): + from mlir.dialects import transform + + # Create a simple transform sequence + @func_cif(name="__transform_main") + def transform_main(): + # Empty transform - just identity + # In a full implementation, this would tile, vectorize, + # and lower to XeGPU operations + pass + + return mod + + def shared_libs(self) -> list[str]: + return ["libmlir_levelzero_runtime.so"] + + +def parse_cli(): + parser = argparse.ArgumentParser( + description="Softmax using MLIR XeGPU", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--sizes", + type=int, + nargs=2, + default=[1024, 512], + help="M,N matrix sizes (MxN)", + ) + parser.add_argument( + "--wg-tile", + type=int, + nargs=2, + default=[64, 32], + help="Workgroup tile size M,N.", + ) + parser.add_argument( + "--nruns", + type=int, + default=1000, + help="Number of runs to average the execution time.", + ) + parser.add_argument( + "--nwarmup", + type=int, + default=20, + help="Number of warm-up iterations before benchmarking.", + ) + parser.add_argument( + "--check-result", + action="store_true", + help="Check the result of the softmax computation.", + ) + parser.add_argument( + "--dump-kernel", + type=str, + choices=[ + "initial", + "tiled", + "vectorized", + "bufferized", + "xegpu-initial", + "xegpu-wg", + "final", + ], + help="Dump kernel IR at different stages of lowering and exit without " + "executing the kernel.", + ) + parser.add_argument( + "--dump-schedule", + action="store_true", + help="Dump transform schedule.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = parse_cli() + + params = { + "wg_m": args.wg_tile[0], + "wg_n": args.wg_tile[1], + } + + M, N = args.sizes + dtype = "f32" + + with ir.Context(), ir.Location.unknown(): + wload = XeGPUSoftmax(M=M, N=N, dtype=dtype) + + if args.dump_kernel or args.dump_schedule: + wload.lower_payload( + dump_payload=args.dump_kernel, + dump_schedule=args.dump_schedule, + schedule_parameters=params, + ) + else: + times = benchmark( + wload, + nruns=args.nruns, + nwarmup=args.nwarmup, + schedule_parameters=params, + check_correctness=args.check_result, + verbose=1, + ) + times *= 1e6 # convert to microseconds + elapsed = np.mean(times) + flop_count = wload.get_complexity()[0] + gflops = flop_count / (elapsed * 1e-6) / 1e9 + + def list2str(a): + return ",".join(map(str, a)) + + parts = [ + f"sizes={list2str(args.sizes)}", + f"dt={dtype}", + f"wg-tile={list2str(args.wg_tile)}", + f"time(us): {elapsed:.2f}", + f"GFLOPS: {gflops:.2f}", + ] + print(" ".join(parts)) From f991027da04fd09f164f10b771ea470b9b577519 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 16 Mar 2026 22:46:40 +0000 Subject: [PATCH 02/38] save work --- examples/xegpu/softmax.py | 93 ++++++++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 17 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index bb90f812..ab61f1e5 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -13,9 +13,9 @@ import numpy as np from mlir import ir from mlir.execution_engine import ExecutionEngine -from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func +from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func, math -from lighthouse.workload import benchmark +from lighthouse.workload import 
benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype from lighthouse.utils.numpy import numpy_to_ctype from lighthouse.utils.mlir import func_cif @@ -174,35 +174,94 @@ def payload(output, input_arg): output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) input_tensor = emit_buf_to_tensor(input_arg, restrict=True) - # Create intermediate buffer for softmax (used internally by linalg.softmax) - # This stores the sum of exp values M, N = self.shape - softmax_buf_type = ir.MemRefType.get((M,N), dtype) - softmax_buf = gpu.alloc(softmax_buf_type, None, [], [], []) - softmax_buf_tensor = emit_buf_to_tensor(softmax_buf, restrict=True, writable=True) - # Compute softmax along dimension 1 (rows) - # linalg.softmax performs: exp(x - max(x)) / sum(exp(x - max(x))) - result = linalg.softmax( - (input_tensor.type,), input_tensor, softmax_buf_tensor, dimension=1 + # Define affine maps for indexing + # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) + # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) + d0 = ir.AffineDimExpr.get(0) + d1 = ir.AffineDimExpr.get(1) + map_2d = ir.AffineMap.get(2, 0, [d0, d1]) + map_1d = ir.AffineMap.get(2, 0, [d0]) + + # Step 1: Find max - linalg.generic reduction + neg_inf = arith.constant(dtype, float('-inf')) + max_init = tensor.empty((M,), dtype) + max_filled = linalg.fill(neg_inf, outs=[max_init]) + + @linalg.generic( + [input_tensor], # inputs + [max_filled], # outputs + [map_2d, map_1d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + ) + def row_max(in_val, acc): + return arith.maximumf(in_val, acc) + + # Step 2: Subtract max (broadcast) - linalg.generic elementwise + output_init = tensor.empty((M, N), dtype) + + @linalg.generic( + [input_tensor, row_max], # inputs + [output_init], # outputs + [map_2d, map_1d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def shifted(in_val, max_val, out): + return arith.subf(in_val, max_val) + + # Step 3: Compute exp - linalg.generic elementwise + @linalg.generic( + [shifted], # inputs + [output_init], # outputs + [map_2d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def exp_vals(in_val, out): + return math.exp(in_val) + + # Step 4: Sum exp values - linalg.generic reduction + # Create collapsed tensor for sum init + # sum_init_2d = tensor.empty((M, 1), dtype) + sum_init = tensor.empty((M,), dtype) + # sum_init = tensor.CollapseShapeOp(sum_init_2d, [[0, 1]]) + + + zero = arith.constant(dtype, 0.0) + sum_filled = linalg.fill(zero, outs=[sum_init]) + + @linalg.generic( + [exp_vals], # inputs + [sum_filled], # outputs + [map_2d, map_1d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types ) + def row_sum(in_val, acc): + return arith.addf(in_val, acc) + + # Step 5: Divide by sum (broadcast) - linalg.generic elementwise + @linalg.generic( + [exp_vals, row_sum], # inputs + [output_init], # outputs + [map_2d, map_1d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def result(exp_val, sum_val, out): + return arith.divf(exp_val, sum_val) # Materialize result back to output memref bufferization.materialize_in_destination( None, result, output, restrict=True, writable=True ) - - # Cleanup - gpu.dealloc(None, [], softmax_buf) # Emit utility functions for GPU memory 
management emit_gpu_util_funcs(dtype, rank=2) return mod - def schedule_module( + def schedule_modules( self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None - ) -> ir.Module: + ) -> list[ir.Module]: """ Generate transform schedule for softmax. @@ -223,7 +282,7 @@ def transform_main(): # and lower to XeGPU operations pass - return mod + return [get_bench_wrapper_schedule(self), mod] def shared_libs(self) -> list[str]: return ["libmlir_levelzero_runtime.so"] From 22415bb54f34727337b66392f4585f09110d2b6a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 16 Mar 2026 22:48:34 +0000 Subject: [PATCH 03/38] save work --- examples/xegpu/softmax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index ab61f1e5..dff8bd99 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -224,7 +224,7 @@ def exp_vals(in_val, out): # Create collapsed tensor for sum init # sum_init_2d = tensor.empty((M, 1), dtype) sum_init = tensor.empty((M,), dtype) - # sum_init = tensor.CollapseShapeOp(sum_init_2d, [[0, 1]]) + # tensor.CollapseShapeOp(sum_init, sum_init_2d, [[0, 1]]) zero = arith.constant(dtype, 0.0) From cb9ead174134465610ef797dbb33fbee0196e1cd Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 17 Mar 2026 22:05:47 +0000 Subject: [PATCH 04/38] save work --- examples/xegpu/softmax.py | 64 +++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index dff8bd99..97abd49a 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -269,20 +269,66 @@ def schedule_modules( tiling, vectorization, and XeGPU-specific lowering transformations. 
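+ As a first step, the schedule below tiles the last linalg.generic
+ (the divide) with transform.structured.tile_using_forall; fusion,
+ vectorization, and bufferization are still to come.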
""" # TODO: Implement proper transform schedule - # For now, create a minimal schedule that just applies bufferization + # For now, create a minimal schedule that prints the last linalg operation mod = ir.Module.create() + mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() + with ir.InsertionPoint(mod.body): from mlir.dialects import transform + from mlir.dialects.transform import structured + + # Create a transform sequence with proper signature + named_sequence = transform.named_sequence( + "__transform_main", + [transform.AnyOpType.get()], # input: module + [], # no outputs + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}] + ) - # Create a simple transform sequence - @func_cif(name="__transform_main") - def transform_main(): - # Empty transform - just identity - # In a full implementation, this would tile, vectorize, - # and lower to XeGPU operations - pass + with ir.InsertionPoint(named_sequence.body): + # Get the input module (bodyTarget) + payload_mod = named_sequence.bodyTarget + + # Match all linalg.generic operations + # We have 5 generic ops in softmax: max, sub, exp, sum, div + generic_ops = structured.structured_match( + transform.AnyOpType.get(), + payload_mod, + ops=["linalg.generic"] + ) + + # Split the handle into individual operation handles + # For softmax, we have 5 operations + anytype = transform.AnyOpType.get() + split_ops = transform.split_handle( + (anytype, anytype, anytype, anytype, anytype), # 5 result types + generic_ops + ) + + # The last operation (index 4) is the division + last_op = split_ops[-1] + + # Print the last operation before tiling + # transform.print_(target=last_op, name="last_linalg_generic_before_tiling") + + # Tile the last operation using tile_using_forall + # Tile sizes: [64, 64] for the two parallel dimensions (M, N) + tiled_op, for_op = structured.structured_tile_using_forall( + anytype, anytype, + last_op, + num_threads=[], + tile_sizes=[], + static_tile_sizes=[64, 64], + ) + + # Print the tiled operation + # transform.print_(target=tiled_op, name="tiled_linalg_generic") + # transform.print_(target=for_op, name="forall_op") + + # Required: yield to end the transform sequence + transform.yield_() - return [get_bench_wrapper_schedule(self), mod] + return [mod] def shared_libs(self) -> list[str]: return ["libmlir_levelzero_runtime.so"] From ac39be33c3b90e6eb77c16237611fd33d4490695 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 18 Mar 2026 22:41:32 +0000 Subject: [PATCH 05/38] save work --- examples/xegpu/softmax.py | 86 +++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 8 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 97abd49a..6d6f87eb 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -14,6 +14,8 @@ from mlir import ir from mlir.execution_engine import ExecutionEngine from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func, math +from mlir.dialects import transform +from mlir.dialects.transform import structured, loop from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype @@ -21,9 +23,25 @@ from lighthouse.utils.mlir import func_cif from lighthouse.ingress.mlir_gen import get_mlir_elem_type from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor +from lighthouse.pipeline.helper import ( + apply_registered_pass, + canonicalize, + match, +) +from mlir.dialects.transform import 
bufferization as transform_bufferization +from mlir.dialects.bufferization import LayoutMapOption from xegpu_workload import XeGPUWorkload +def match_and_split(*args, nhandles=1, **kwargs): + """Helper function that splits matched handles.""" + matched = match(*args, **kwargs) + anytype = transform.AnyOpType.get() + matched_ops = transform.split_handle((anytype,) * nhandles, matched) + if nhandles == 1: + matched_ops = [matched_ops] + return matched_ops + def softmax_complexity(M: int, N: int, nbytes: int): """ @@ -274,8 +292,7 @@ def schedule_modules( mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() with ir.InsertionPoint(mod.body): - from mlir.dialects import transform - from mlir.dialects.transform import structured + # Create a transform sequence with proper signature named_sequence = transform.named_sequence( @@ -305,8 +322,11 @@ def schedule_modules( generic_ops ) - # The last operation (index 4) is the division - last_op = split_ops[-1] + # Reverse split_ops to have operations in reverse order + split_ops = list(reversed(split_ops)) + + # The first operation (after reversal) is the division - this is the consumer + last_op = split_ops[0] # Print the last operation before tiling # transform.print_(target=last_op, name="last_linalg_generic_before_tiling") @@ -318,12 +338,62 @@ def schedule_modules( last_op, num_threads=[], tile_sizes=[], - static_tile_sizes=[64, 64], + static_tile_sizes=(64,), ) - # Print the tiled operation - # transform.print_(target=tiled_op, name="tiled_linalg_generic") - # transform.print_(target=for_op, name="forall_op") + # Fuse the producer operations into the forall loop + # Iterate through remaining operations (already in reverse order) + current_forall = for_op + for producer_op in split_ops[1:]: + fused_op, current_forall = structured.structured_fuse_into_containing_op( + anytype, anytype, + producer_op, + current_forall + ) + + func = transform.get_parent_op( + anytype, + current_forall, + op_name="func.func", + deduplicate=True, + ) + transform.apply_cse(func) + canonicalize(func) + func = apply_registered_pass(func, "eliminate-empty-tensors") + func = structured.VectorizeChildrenAndApplyPatternsOp( + func, + fold_type_extensions_into_contract=True, + ).result + identity_layout = LayoutMapOption.IdentityLayoutMap + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + payload_mod = transform_bufferization.OneShotBufferizeOp( + payload_mod, + allow_return_allocs_from_loops=True, + bufferize_function_boundaries=True, + function_boundary_type_conversion=identity_layout, + ).result + payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") + transform.apply_cse(payload_mod) + canonicalize(payload_mod) + + # convert forall to parallel + wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) + + # convert to scf.parallel to gpu.launch + func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") + func = apply_registered_pass(func, "lower-affine") + transform.apply_cse(func) + canonicalize(func) + # Required: yield to end the transform sequence transform.yield_() From 51d494e23aa34e3166210e080aa23663e98924c2 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 20 Mar 2026 17:10:46 +0000 Subject: [PATCH 06/38] save work --- 
examples/xegpu/softmax.py | 67 +++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 6d6f87eb..be29cdd0 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -15,7 +15,7 @@ from mlir.execution_engine import ExecutionEngine from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func, math from mlir.dialects import transform -from mlir.dialects.transform import structured, loop +from mlir.dialects.transform import structured, loop, xegpu from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype @@ -338,7 +338,7 @@ def schedule_modules( last_op, num_threads=[], tile_sizes=[], - static_tile_sizes=(64,), + static_tile_sizes=(parameters["wg_rows"],), ) # Fuse the producer operations into the forall loop @@ -393,6 +393,39 @@ def schedule_modules( func = apply_registered_pass(func, "lower-affine") transform.apply_cse(func) canonicalize(func) + # set the number of threads for the gpu.launch operation + launch_op = match_and_split(func, ops={"gpu.launch"}) + num_threads = parameters["sg_rows"] * parameters["subgroup_size"] + xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) + + # outline gpu func + func = apply_registered_pass(func, "lower-affine") + canonicalize(func) + func = apply_registered_pass(func, "gpu-launch-sink-index-computations") + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") + # transform.PrintOp(target=payload_mod, name="before_gpu_outlining") + # transform.apply_cse(payload_mod) + + # set xevm target + # payload_mod = apply_registered_pass( + # payload_mod, + # "xevm-attach-target", + # options={"O": "3", "chip": "bmg"}, + # ) + + # # convert vector to xegpu + # gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) + # for gpu_mod in gpu_mod_ops: + # gpu_func = match(gpu_mod, ops={"gpu.func"}) + # gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + # transform.apply_cse(gpu_func) + # Required: yield to end the transform sequence @@ -413,15 +446,26 @@ def parse_cli(): "--sizes", type=int, nargs=2, - default=[1024, 512], + default=[1024, 64], help="M,N matrix sizes (MxN)", ) parser.add_argument( - "--wg-tile", + "--wg-rows", type=int, - nargs=2, - default=[64, 32], - help="Workgroup tile size M,N.", + default=64, + help="Number of rows per workgroup.", + ) + parser.add_argument( + "--sg-rows", + type=int, + default=8, + help="Number of rows per subgroup.", + ) + parser.add_argument( + "--subgroup-size", + type=int, + default=16, + help="Subgroup size.", ) parser.add_argument( "--nruns", @@ -468,8 +512,9 @@ def parse_cli(): args = parse_cli() params = { - "wg_m": args.wg_tile[0], - "wg_n": args.wg_tile[1], + "wg_rows": args.wg_rows, + "sg_rows": args.sg_rows, + "subgroup_size": args.subgroup_size, } M, N = args.sizes @@ -504,7 +549,9 @@ def list2str(a): parts = [ f"sizes={list2str(args.sizes)}", f"dt={dtype}", - f"wg-tile={list2str(args.wg_tile)}", + f"wg-rows={args.wg_rows}", + f"sg-rows={args.sg_rows}", + f"subgroup-size={args.subgroup_size}", f"time(us): {elapsed:.2f}", f"GFLOPS: {gflops:.2f}", ] From 7ac8852412f06100c2cf188c37c2f18254e38c83 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 20 Mar 2026 21:37:04 +0000 Subject: [PATCH 07/38] save work --- 
examples/xegpu/softmax.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index be29cdd0..56b43c21 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -408,23 +408,34 @@ def schedule_modules( op_name="builtin.module", deduplicate=True, ) - payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") + # payload = match(payload_mod, ops={"func.func"}) # transform.PrintOp(target=payload_mod, name="before_gpu_outlining") - # transform.apply_cse(payload_mod) + payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") + transform.apply_cse(payload_mod) # set xevm target - # payload_mod = apply_registered_pass( - # payload_mod, - # "xevm-attach-target", - # options={"O": "3", "chip": "bmg"}, - # ) - - # # convert vector to xegpu - # gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) + payload_mod = apply_registered_pass( + payload_mod, + "xevm-attach-target", + options={"O": "3", "chip": "bmg"}, + ) + + # convert vector to xegpu + gpu_mod = match_and_split(payload_mod, ops={"gpu.module"}) # for gpu_mod in gpu_mod_ops: - # gpu_func = match(gpu_mod, ops={"gpu.func"}) - # gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - # transform.apply_cse(gpu_func) + gpu_func = match(gpu_mod[0], ops={"gpu.func"}) + gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + + # Set layout attributes for xegpu.store_nd operations + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + # for store_op in store_ops: + xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) + + payload_mod = apply_registered_pass( + payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + From d65bf9fe47d2c95c5fdc986177e604ec4c0b6f82 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 20 Mar 2026 22:58:52 +0000 Subject: [PATCH 08/38] save work --- examples/xegpu/softmax.py | 88 ++++++++++++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 20 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 56b43c21..2debc8c8 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -311,14 +311,14 @@ def schedule_modules( generic_ops = structured.structured_match( transform.AnyOpType.get(), payload_mod, - ops=["linalg.generic"] + ops=["linalg.generic", "linalg.fill"] ) # Split the handle into individual operation handles # For softmax, we have 5 operations anytype = transform.AnyOpType.get() split_ops = transform.split_handle( - (anytype, anytype, anytype, anytype, anytype), # 5 result types + (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types generic_ops ) @@ -350,20 +350,23 @@ def schedule_modules( producer_op, current_forall ) - - func = transform.get_parent_op( - anytype, - current_forall, - op_name="func.func", - deduplicate=True, - ) - transform.apply_cse(func) - canonicalize(func) - func = apply_registered_pass(func, "eliminate-empty-tensors") + transform.annotate(current_forall, "gpu_loop") + + transform.apply_cse(payload_mod) + # canonicalize(payload_mod) + + # Vectorize and bufferize sequence + func = match(payload_mod, ops={"func.func"}) func = structured.VectorizeChildrenAndApplyPatternsOp( func, fold_type_extensions_into_contract=True, ).result + loops = match_and_split(payload_mod, ops={"scf.forall"}) + 
loop.loop_hoist_loop_invariant_subsets(loops[0]) + transform.apply_cse(payload_mod) + canonicalize(payload_mod) + # transform.PrintOp(target=payload_mod, name="vectorize") + identity_layout = LayoutMapOption.IdentityLayoutMap payload_mod = transform.get_parent_op( anytype, @@ -377,28 +380,73 @@ def schedule_modules( bufferize_function_boundaries=True, function_boundary_type_conversion=identity_layout, ).result + # payload_mod = transform_bufferization.OneShotBufferizeOp( + # payload_mod, + # allow_return_allocs_from_loops=False, + # bufferize_function_boundaries=True, + # function_boundary_type_conversion=identity_layout, + # ).result payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") + payload_mod = apply_registered_pass(payload_mod, "drop-equivalent-buffer-results") + payload_mod = apply_registered_pass( + payload_mod, + "buffer-results-to-out-params", + options={ + "add-result-attr": "true", + "hoist-dynamic-allocs": "true", + "hoist-static-allocs": "true", + "modify-public-functions": "true" + } + ) transform.apply_cse(payload_mod) canonicalize(payload_mod) + # # # transform.PrintOp(target=payload_mod, name="bufferize") + + # # func = match(payload_mod, ops={"func.func"}) + gpu_loop = match(payload_mod, op_attrs={"gpu_loop": ir.UnitAttr.get()}) + # # gpu_loop = transform.split_handle(anytype, gpu_loop) + gpu_loop = loop.loop_forall_to_parallel([anytype], gpu_loop) + + # # func = apply_registered_pass(payload_mod, "eliminate-empty-tensors") + # # func = structured.VectorizeChildrenAndApplyPatternsOp( + # # func, + # # fold_type_extensions_into_contract=True, + # # ).result + # # identity_layout = LayoutMapOption.IdentityLayoutMap + payload_mod = transform.get_parent_op( + anytype, + gpu_loop, + op_name="func.func", + deduplicate=True, + ) + # payload_mod = transform_bufferization.OneShotBufferizeOp( + # payload_mod, + # allow_return_allocs_from_loops=True, + # bufferize_function_boundaries=True, + # function_boundary_type_conversion=identity_layout, + # ).result + # payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") + # transform.apply_cse(payload_mod) + # canonicalize(payload_mod) - # convert forall to parallel - wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) - for wg_loop in wg_loops: - wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - func = transform.get_parent_op(anytype, wg_loop) + # # convert forall to parallel + # wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) + # for wg_loop in wg_loops: + # wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + # func = transform.get_parent_op(anytype, wg_loop) # convert to scf.parallel to gpu.launch - func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(payload_mod, "gpu-map-parallel-loops") func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") func = apply_registered_pass(func, "lower-affine") transform.apply_cse(func) canonicalize(func) - # set the number of threads for the gpu.launch operation + # # set the number of threads for the gpu.launch operation launch_op = match_and_split(func, ops={"gpu.launch"}) num_threads = parameters["sg_rows"] * parameters["subgroup_size"] xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - # outline gpu func + # # outline gpu func func = apply_registered_pass(func, "lower-affine") canonicalize(func) func = apply_registered_pass(func, "gpu-launch-sink-index-computations") From 0bf3eb32a7fa9fbd66b0a7a238fb302607c2896a Mon Sep 17 00:00:00 2001 From: 
Charitha Saumya Date: Tue, 24 Mar 2026 20:32:41 +0000 Subject: [PATCH 09/38] save working version --- examples/xegpu/softmax.py | 138 ++++++++++++-------------------------- 1 file changed, 42 insertions(+), 96 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 2debc8c8..e95286b5 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -17,6 +17,7 @@ from mlir.dialects import transform from mlir.dialects.transform import structured, loop, xegpu +from lighthouse import dialects as lh_dialects from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype from lighthouse.utils.numpy import numpy_to_ctype @@ -54,7 +55,6 @@ def softmax_complexity(M: int, N: int, nbytes: int): Total: 3*N operations per row, but with transcendental (exp) operations """ # Approximation: 5 FLOPs per element (max, sub, exp, sum, div) - # exp is expensive but we count it as ~1 FLOP for simplicity flop_count = M * N * 5 memory_reads = M * N * nbytes # read input memory_writes = M * N * nbytes # write output @@ -303,8 +303,15 @@ def schedule_modules( ) with ir.InsertionPoint(named_sequence.body): - # Get the input module (bodyTarget) - payload_mod = named_sequence.bodyTarget + # match the payload module + anytype = transform.AnyOpType.get() + func = match(named_sequence.bodyTarget, ops={"func.func"}) + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) # Match all linalg.generic operations # We have 5 generic ops in softmax: max, sub, exp, sum, div @@ -315,7 +322,7 @@ def schedule_modules( ) # Split the handle into individual operation handles - # For softmax, we have 5 operations + # For softmax, we have 7 operations anytype = transform.AnyOpType.get() split_ops = transform.split_handle( (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types @@ -350,117 +357,59 @@ def schedule_modules( producer_op, current_forall ) - transform.annotate(current_forall, "gpu_loop") - - transform.apply_cse(payload_mod) - # canonicalize(payload_mod) + + func = transform.get_parent_op( + anytype, + current_forall, + op_name="func.func", + deduplicate=True, + ) + transform.apply_cse(func) + canonicalize(func) - # Vectorize and bufferize sequence - func = match(payload_mod, ops={"func.func"}) func = structured.VectorizeChildrenAndApplyPatternsOp( func, fold_type_extensions_into_contract=True, ).result - loops = match_and_split(payload_mod, ops={"scf.forall"}) - loop.loop_hoist_loop_invariant_subsets(loops[0]) - transform.apply_cse(payload_mod) - canonicalize(payload_mod) - # transform.PrintOp(target=payload_mod, name="vectorize") - + transform.apply_cse(func) + canonicalize(func) + payload_mod = apply_registered_pass(payload_mod, "eliminate-empty-tensors") identity_layout = LayoutMapOption.IdentityLayoutMap - payload_mod = transform.get_parent_op( - anytype, - func, - op_name="builtin.module", - deduplicate=True, - ) payload_mod = transform_bufferization.OneShotBufferizeOp( payload_mod, allow_return_allocs_from_loops=True, bufferize_function_boundaries=True, function_boundary_type_conversion=identity_layout, ).result - # payload_mod = transform_bufferization.OneShotBufferizeOp( - # payload_mod, - # allow_return_allocs_from_loops=False, - # bufferize_function_boundaries=True, - # function_boundary_type_conversion=identity_layout, - # ).result + # fold memref.subviews into vector.transfer_read/write ops payload_mod = 
apply_registered_pass(payload_mod, "fold-memref-alias-ops") - payload_mod = apply_registered_pass(payload_mod, "drop-equivalent-buffer-results") - payload_mod = apply_registered_pass( - payload_mod, - "buffer-results-to-out-params", - options={ - "add-result-attr": "true", - "hoist-dynamic-allocs": "true", - "hoist-static-allocs": "true", - "modify-public-functions": "true" - } - ) transform.apply_cse(payload_mod) canonicalize(payload_mod) - # # # transform.PrintOp(target=payload_mod, name="bufferize") - - # # func = match(payload_mod, ops={"func.func"}) - gpu_loop = match(payload_mod, op_attrs={"gpu_loop": ir.UnitAttr.get()}) - # # gpu_loop = transform.split_handle(anytype, gpu_loop) - gpu_loop = loop.loop_forall_to_parallel([anytype], gpu_loop) - # # func = apply_registered_pass(payload_mod, "eliminate-empty-tensors") - # # func = structured.VectorizeChildrenAndApplyPatternsOp( - # # func, - # # fold_type_extensions_into_contract=True, - # # ).result - # # identity_layout = LayoutMapOption.IdentityLayoutMap - payload_mod = transform.get_parent_op( - anytype, - gpu_loop, - op_name="func.func", - deduplicate=True, - ) - # payload_mod = transform_bufferization.OneShotBufferizeOp( - # payload_mod, - # allow_return_allocs_from_loops=True, - # bufferize_function_boundaries=True, - # function_boundary_type_conversion=identity_layout, - # ).result - # payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") - # transform.apply_cse(payload_mod) - # canonicalize(payload_mod) - - # # convert forall to parallel - # wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) - # for wg_loop in wg_loops: - # wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - # func = transform.get_parent_op(anytype, wg_loop) - + # convert forall to parallel + wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) # convert to scf.parallel to gpu.launch - func = apply_registered_pass(payload_mod, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "gpu-map-parallel-loops") func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") func = apply_registered_pass(func, "lower-affine") transform.apply_cse(func) canonicalize(func) - # # set the number of threads for the gpu.launch operation + + # set the number of threads for the gpu.launch operation launch_op = match_and_split(func, ops={"gpu.launch"}) num_threads = parameters["sg_rows"] * parameters["subgroup_size"] xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - # # outline gpu func + # outline gpu func func = apply_registered_pass(func, "lower-affine") canonicalize(func) func = apply_registered_pass(func, "gpu-launch-sink-index-computations") - payload_mod = transform.get_parent_op( - anytype, - func, - op_name="builtin.module", - deduplicate=True, - ) - # payload = match(payload_mod, ops={"func.func"}) - # transform.PrintOp(target=payload_mod, name="before_gpu_outlining") payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") transform.apply_cse(payload_mod) - + # set xevm target payload_mod = apply_registered_pass( payload_mod, @@ -469,12 +418,12 @@ def schedule_modules( ) # convert vector to xegpu - gpu_mod = match_and_split(payload_mod, ops={"gpu.module"}) - # for gpu_mod in gpu_mod_ops: - gpu_func = match(gpu_mod[0], ops={"gpu.func"}) - gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - 
transform.apply_cse(gpu_func) - + gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) + for gpu_mod in gpu_mod_ops: + gpu_func = match(gpu_mod, ops={"gpu.func"}) + gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + # Set layout attributes for xegpu.store_nd operations store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) # for store_op in store_ops: @@ -483,14 +432,10 @@ def schedule_modules( payload_mod = apply_registered_pass( payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} ) - - - - # Required: yield to end the transform sequence transform.yield_() - return [mod] + return [get_bench_wrapper_schedule(self), mod] def shared_libs(self) -> list[str]: return ["libmlir_levelzero_runtime.so"] @@ -580,6 +525,7 @@ def parse_cli(): dtype = "f32" with ir.Context(), ir.Location.unknown(): + lh_dialects.register_and_load() wload = XeGPUSoftmax(M=M, N=N, dtype=dtype) if args.dump_kernel or args.dump_schedule: From fabd656c1267437174b7c7b9ed472c0b76556ee9 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 24 Mar 2026 21:48:44 +0000 Subject: [PATCH 10/38] save working version --- examples/xegpu/softmax.py | 288 +----------------- .../ingress/mlir_gen/gpu_softmax_payload.py | 121 ++++++++ lighthouse/schedule/xegpu/softmax_schedule.py | 195 ++++++++++++ 3 files changed, 332 insertions(+), 272 deletions(-) create mode 100644 lighthouse/ingress/mlir_gen/gpu_softmax_payload.py create mode 100644 lighthouse/schedule/xegpu/softmax_schedule.py diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index e95286b5..474eb1a3 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -13,36 +13,17 @@ import numpy as np from mlir import ir from mlir.execution_engine import ExecutionEngine -from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func, math -from mlir.dialects import transform -from mlir.dialects.transform import structured, loop, xegpu from lighthouse import dialects as lh_dialects from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype from lighthouse.utils.numpy import numpy_to_ctype -from lighthouse.utils.mlir import func_cif from lighthouse.ingress.mlir_gen import get_mlir_elem_type -from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor -from lighthouse.pipeline.helper import ( - apply_registered_pass, - canonicalize, - match, -) -from mlir.dialects.transform import bufferization as transform_bufferization -from mlir.dialects.bufferization import LayoutMapOption +from lighthouse.ingress.mlir_gen.gpu_softmax_payload import generate_gpu_softmax_payload +from lighthouse.schedule.xegpu.softmax_schedule import get_softmax_schedule_module from xegpu_workload import XeGPUWorkload -def match_and_split(*args, nhandles=1, **kwargs): - """Helper function that splits matched handles.""" - matched = match(*args, **kwargs) - anytype = transform.AnyOpType.get() - matched_ops = transform.split_handle((anytype,) * nhandles, matched) - if nhandles == 1: - matched_ops = [matched_ops] - return matched_ops - def softmax_complexity(M: int, N: int, nbytes: int): """ @@ -180,262 +161,25 @@ def get_complexity(self) -> tuple[int, int, int]: def payload_module(self) -> ir.Module: """Generate MLIR module for softmax payload.""" - mod = ir.Module.create() dtype = get_mlir_elem_type(self.dtype_str) - memref_t = ir.MemRefType.get(self.shape, dtype) - 
- with ir.InsertionPoint(mod.body): - # Function signature: payload(output, input) - @func_cif(memref_t, memref_t, name=self.payload_function_name) - def payload(output, input_arg): - # Convert memrefs to tensors - output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) - input_tensor = emit_buf_to_tensor(input_arg, restrict=True) - - M, N = self.shape - - # Define affine maps for indexing - # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) - # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) - d0 = ir.AffineDimExpr.get(0) - d1 = ir.AffineDimExpr.get(1) - map_2d = ir.AffineMap.get(2, 0, [d0, d1]) - map_1d = ir.AffineMap.get(2, 0, [d0]) - - # Step 1: Find max - linalg.generic reduction - neg_inf = arith.constant(dtype, float('-inf')) - max_init = tensor.empty((M,), dtype) - max_filled = linalg.fill(neg_inf, outs=[max_init]) - - @linalg.generic( - [input_tensor], # inputs - [max_filled], # outputs - [map_2d, map_1d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types - ) - def row_max(in_val, acc): - return arith.maximumf(in_val, acc) - - # Step 2: Subtract max (broadcast) - linalg.generic elementwise - output_init = tensor.empty((M, N), dtype) - - @linalg.generic( - [input_tensor, row_max], # inputs - [output_init], # outputs - [map_2d, map_1d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types - ) - def shifted(in_val, max_val, out): - return arith.subf(in_val, max_val) - - # Step 3: Compute exp - linalg.generic elementwise - @linalg.generic( - [shifted], # inputs - [output_init], # outputs - [map_2d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types - ) - def exp_vals(in_val, out): - return math.exp(in_val) - - # Step 4: Sum exp values - linalg.generic reduction - # Create collapsed tensor for sum init - # sum_init_2d = tensor.empty((M, 1), dtype) - sum_init = tensor.empty((M,), dtype) - # tensor.CollapseShapeOp(sum_init, sum_init_2d, [[0, 1]]) - - - zero = arith.constant(dtype, 0.0) - sum_filled = linalg.fill(zero, outs=[sum_init]) - - @linalg.generic( - [exp_vals], # inputs - [sum_filled], # outputs - [map_2d, map_1d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types - ) - def row_sum(in_val, acc): - return arith.addf(in_val, acc) - - # Step 5: Divide by sum (broadcast) - linalg.generic elementwise - @linalg.generic( - [exp_vals, row_sum], # inputs - [output_init], # outputs - [map_2d, map_1d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types - ) - def result(exp_val, sum_val, out): - return arith.divf(exp_val, sum_val) - - # Materialize result back to output memref - bufferization.materialize_in_destination( - None, result, output, restrict=True, writable=True - ) - - # Emit utility functions for GPU memory management - emit_gpu_util_funcs(dtype, rank=2) - - return mod + return generate_gpu_softmax_payload( + func_name=self.payload_function_name, + M=self.M, + N=self.N, + dtype=dtype, + ) def schedule_modules( self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None ) -> list[ir.Module]: - """ - Generate transform schedule for softmax. - - For now, returns an empty schedule. In the future, this would contain - tiling, vectorization, and XeGPU-specific lowering transformations. 
- """ - # TODO: Implement proper transform schedule - # For now, create a minimal schedule that prints the last linalg operation - mod = ir.Module.create() - mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() - - with ir.InsertionPoint(mod.body): - - - # Create a transform sequence with proper signature - named_sequence = transform.named_sequence( - "__transform_main", - [transform.AnyOpType.get()], # input: module - [], # no outputs - arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}] - ) - - with ir.InsertionPoint(named_sequence.body): - # match the payload module - anytype = transform.AnyOpType.get() - func = match(named_sequence.bodyTarget, ops={"func.func"}) - payload_mod = transform.get_parent_op( - anytype, - func, - op_name="builtin.module", - deduplicate=True, - ) - - # Match all linalg.generic operations - # We have 5 generic ops in softmax: max, sub, exp, sum, div - generic_ops = structured.structured_match( - transform.AnyOpType.get(), - payload_mod, - ops=["linalg.generic", "linalg.fill"] - ) - - # Split the handle into individual operation handles - # For softmax, we have 7 operations - anytype = transform.AnyOpType.get() - split_ops = transform.split_handle( - (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types - generic_ops - ) - - # Reverse split_ops to have operations in reverse order - split_ops = list(reversed(split_ops)) - - # The first operation (after reversal) is the division - this is the consumer - last_op = split_ops[0] - - # Print the last operation before tiling - # transform.print_(target=last_op, name="last_linalg_generic_before_tiling") - - # Tile the last operation using tile_using_forall - # Tile sizes: [64, 64] for the two parallel dimensions (M, N) - tiled_op, for_op = structured.structured_tile_using_forall( - anytype, anytype, - last_op, - num_threads=[], - tile_sizes=[], - static_tile_sizes=(parameters["wg_rows"],), - ) - - # Fuse the producer operations into the forall loop - # Iterate through remaining operations (already in reverse order) - current_forall = for_op - for producer_op in split_ops[1:]: - fused_op, current_forall = structured.structured_fuse_into_containing_op( - anytype, anytype, - producer_op, - current_forall - ) - - func = transform.get_parent_op( - anytype, - current_forall, - op_name="func.func", - deduplicate=True, - ) - transform.apply_cse(func) - canonicalize(func) - - func = structured.VectorizeChildrenAndApplyPatternsOp( - func, - fold_type_extensions_into_contract=True, - ).result - transform.apply_cse(func) - canonicalize(func) - payload_mod = apply_registered_pass(payload_mod, "eliminate-empty-tensors") - identity_layout = LayoutMapOption.IdentityLayoutMap - payload_mod = transform_bufferization.OneShotBufferizeOp( - payload_mod, - allow_return_allocs_from_loops=True, - bufferize_function_boundaries=True, - function_boundary_type_conversion=identity_layout, - ).result - # fold memref.subviews into vector.transfer_read/write ops - payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") - transform.apply_cse(payload_mod) - canonicalize(payload_mod) - - # convert forall to parallel - wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) - for wg_loop in wg_loops: - wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - func = transform.get_parent_op(anytype, wg_loop) - # convert to scf.parallel to gpu.launch - func = apply_registered_pass(func, "gpu-map-parallel-loops") - func = apply_registered_pass(func, 
"convert-parallel-loops-to-gpu") - func = apply_registered_pass(func, "lower-affine") - transform.apply_cse(func) - canonicalize(func) - - # set the number of threads for the gpu.launch operation - launch_op = match_and_split(func, ops={"gpu.launch"}) - num_threads = parameters["sg_rows"] * parameters["subgroup_size"] - xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - - # outline gpu func - func = apply_registered_pass(func, "lower-affine") - canonicalize(func) - func = apply_registered_pass(func, "gpu-launch-sink-index-computations") - payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") - transform.apply_cse(payload_mod) - - # set xevm target - payload_mod = apply_registered_pass( - payload_mod, - "xevm-attach-target", - options={"O": "3", "chip": "bmg"}, - ) - - # convert vector to xegpu - gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) - for gpu_mod in gpu_mod_ops: - gpu_func = match(gpu_mod, ops={"gpu.func"}) - gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - transform.apply_cse(gpu_func) - - # Set layout attributes for xegpu.store_nd operations - store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) - # for store_op in store_ops: - xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) - - payload_mod = apply_registered_pass( - payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - # Required: yield to end the transform sequence - transform.yield_() - - return [get_bench_wrapper_schedule(self), mod] + """Generate transform schedule for softmax.""" + return [ + get_bench_wrapper_schedule(self), + get_softmax_schedule_module( + stop_at_stage=stop_at_stage, + parameters=parameters, + ), + ] def shared_libs(self) -> list[str]: return ["libmlir_levelzero_runtime.so"] diff --git a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py new file mode 100644 index 00000000..26fd9042 --- /dev/null +++ b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py @@ -0,0 +1,121 @@ +"""Generate MLIR payload for GPU softmax operation.""" + +from mlir import ir +from mlir.dialects import linalg, bufferization, arith, tensor, func, math + +from lighthouse.utils.mlir import func_cif +from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor + + +def generate_gpu_softmax_payload( + func_name: str, + M: int, + N: int, + dtype: ir.Type, +) -> ir.Module: + """ + Generate MLIR module for softmax payload. + + Computes softmax along the last dimension (rows): + output[i, j] = exp(input[i, j] - max_i) / sum_i(exp(input[i, j] - max_i)) + + where max_i and sum_i are computed over row i. 
+ + Args: + func_name: Name of the payload function + M: Number of rows + N: Number of columns + dtype: MLIR element type (e.g., F32Type) + + Returns: + MLIR module containing the softmax payload function + """ + mod = ir.Module.create() + shape = (M, N) + memref_t = ir.MemRefType.get(shape, dtype) + + with ir.InsertionPoint(mod.body): + # Function signature: payload(output, input) + @func_cif(memref_t, memref_t, name=func_name) + def payload(output, input_arg): + # Convert memrefs to tensors + output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) + input_tensor = emit_buf_to_tensor(input_arg, restrict=True) + + # Define affine maps for indexing + # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) + # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) + d0 = ir.AffineDimExpr.get(0) + d1 = ir.AffineDimExpr.get(1) + map_2d = ir.AffineMap.get(2, 0, [d0, d1]) + map_1d = ir.AffineMap.get(2, 0, [d0]) + + # Step 1: Find max - linalg.generic reduction + neg_inf = arith.constant(dtype, float('-inf')) + max_init = tensor.empty((M,), dtype) + max_filled = linalg.fill(neg_inf, outs=[max_init]) + + @linalg.generic( + [input_tensor], # inputs + [max_filled], # outputs + [map_2d, map_1d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + ) + def row_max(in_val, acc): + return arith.maximumf(in_val, acc) + + # Step 2: Subtract max (broadcast) - linalg.generic elementwise + output_init = tensor.empty((M, N), dtype) + + @linalg.generic( + [input_tensor, row_max], # inputs + [output_init], # outputs + [map_2d, map_1d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def shifted(in_val, max_val, out): + return arith.subf(in_val, max_val) + + # Step 3: Compute exp - linalg.generic elementwise + @linalg.generic( + [shifted], # inputs + [output_init], # outputs + [map_2d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def exp_vals(in_val, out): + return math.exp(in_val) + + # Step 4: Sum exp values - linalg.generic reduction + sum_init = tensor.empty((M,), dtype) + zero = arith.constant(dtype, 0.0) + sum_filled = linalg.fill(zero, outs=[sum_init]) + + @linalg.generic( + [exp_vals], # inputs + [sum_filled], # outputs + [map_2d, map_1d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + ) + def row_sum(in_val, acc): + return arith.addf(in_val, acc) + + # Step 5: Divide by sum (broadcast) - linalg.generic elementwise + @linalg.generic( + [exp_vals, row_sum], # inputs + [output_init], # outputs + [map_2d, map_1d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def result(exp_val, sum_val, out): + return arith.divf(exp_val, sum_val) + + # Materialize result back to output memref + bufferization.materialize_in_destination( + None, result, output, restrict=True, writable=True + ) + + # Emit utility functions for GPU memory management + emit_gpu_util_funcs(dtype, rank=2) + + return mod diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py new file mode 100644 index 00000000..e01f9af6 --- /dev/null +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -0,0 +1,195 @@ +"""Generate MLIR transform schedule for XeGPU softmax operation.""" + +from typing import Optional + +from mlir import ir +from mlir.dialects import transform +from 
mlir.dialects.transform import structured, loop, xegpu +from mlir.dialects.transform import bufferization as transform_bufferization +from mlir.dialects.bufferization import LayoutMapOption + +from lighthouse.pipeline.helper import ( + apply_registered_pass, + canonicalize, + match, +) + + +def match_and_split(*args, nhandles=1, **kwargs): + """Helper function that splits matched handles.""" + matched = match(*args, **kwargs) + anytype = transform.AnyOpType.get() + matched_ops = transform.split_handle((anytype,) * nhandles, matched) + if nhandles == 1: + matched_ops = [matched_ops] + return matched_ops + + +def get_softmax_schedule_module( + stop_at_stage: Optional[str] = None, + parameters: Optional[dict] = None, +) -> ir.Module: + """ + Generate transform schedule for softmax operation. + + The schedule performs the following transformations: + 1. Tile the consumer operation (division) using forall + 2. Fuse producer operations into the forall loop + 3. Vectorize operations + 4. Bufferize tensors + 5. Convert to GPU dialect + 6. Lower to XeGPU operations + + Args: + stop_at_stage: Optional stage name to stop early (for debugging) + parameters: Dictionary with scheduling parameters: + - wg_rows: Number of rows per workgroup + - sg_rows: Number of rows per subgroup + - subgroup_size: Size of subgroup + + Returns: + MLIR module containing the transform schedule + """ + assert parameters is not None, "Schedule parameters must be provided" + + mod = ir.Module.create() + mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() + + with ir.InsertionPoint(mod.body): + # Create a transform sequence with proper signature + named_sequence = transform.named_sequence( + "__transform_main", + [transform.AnyOpType.get()], # input: module + [], # no outputs + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}] + ) + + with ir.InsertionPoint(named_sequence.body): + # match the payload module + anytype = transform.AnyOpType.get() + func = match(named_sequence.bodyTarget, ops={"func.func"}) + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + + # Match all linalg.generic and linalg.fill operations + # We have 7 operations in softmax: + # fill(max_init), max, sub, exp, fill(sum_init), sum, div + generic_ops = structured.structured_match( + transform.AnyOpType.get(), + payload_mod, + ops=["linalg.generic", "linalg.fill"] + ) + + # Split the handle into individual operation handles + anytype = transform.AnyOpType.get() + split_ops = transform.split_handle( + (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types + generic_ops + ) + + # Reverse split_ops to have operations in reverse order + split_ops = list(reversed(split_ops)) + + # The first operation (after reversal) is the division - this is the consumer + last_op = split_ops[0] + + # Tile the last operation using tile_using_forall + tiled_op, for_op = structured.structured_tile_using_forall( + anytype, anytype, + last_op, + num_threads=[], + tile_sizes=[], + static_tile_sizes=(parameters["wg_rows"],), + ) + + # Fuse the producer operations into the forall loop + # Iterate through remaining operations (already in reverse order) + current_forall = for_op + for producer_op in split_ops[1:]: + fused_op, current_forall = structured.structured_fuse_into_containing_op( + anytype, anytype, + producer_op, + current_forall + ) + + func = transform.get_parent_op( + anytype, + current_forall, + op_name="func.func", + deduplicate=True, + ) + 
transform.apply_cse(func) + canonicalize(func) + + func = structured.VectorizeChildrenAndApplyPatternsOp( + func, + fold_type_extensions_into_contract=True, + ).result + transform.apply_cse(func) + canonicalize(func) + payload_mod = apply_registered_pass(payload_mod, "eliminate-empty-tensors") + identity_layout = LayoutMapOption.IdentityLayoutMap + payload_mod = transform_bufferization.OneShotBufferizeOp( + payload_mod, + allow_return_allocs_from_loops=True, + bufferize_function_boundaries=True, + function_boundary_type_conversion=identity_layout, + ).result + # fold memref.subviews into vector.transfer_read/write ops + payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") + transform.apply_cse(payload_mod) + canonicalize(payload_mod) + + # convert forall to parallel + wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) + # convert scf.parallel to gpu.launch + func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") + func = apply_registered_pass(func, "lower-affine") + transform.apply_cse(func) + canonicalize(func) + + # set the number of threads for the gpu.launch operation + launch_op = match_and_split(func, ops={"gpu.launch"}) + num_threads = parameters["sg_rows"] * parameters["subgroup_size"] + xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) + + # outline gpu func + func = apply_registered_pass(func, "lower-affine") + canonicalize(func) + func = apply_registered_pass(func, "gpu-launch-sink-index-computations") + payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") + transform.apply_cse(payload_mod) + + # set xevm target + payload_mod = apply_registered_pass( + payload_mod, + "xevm-attach-target", + options={"O": "3", "chip": "bmg"}, + ) + + # convert vector to xegpu + gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) + for gpu_mod in gpu_mod_ops: + gpu_func = match(gpu_mod, ops={"gpu.func"}) + gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + + # Set layout attributes for xegpu.store_nd operations + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) + + payload_mod = apply_registered_pass( + payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + # Required: yield to end the transform sequence + transform.yield_() + + return mod From 1e63d7de4e88047d2251643e5649a477fd0a7c1b Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 24 Mar 2026 22:15:39 +0000 Subject: [PATCH 11/38] save working version --- lighthouse/pipeline/helper.py | 16 + lighthouse/schedule/xegpu/mlp_schedule.py | 18 +- lighthouse/schedule/xegpu/softmax_schedule.py | 307 +++++++++++------- 3 files changed, 203 insertions(+), 138 deletions(-) diff --git a/lighthouse/pipeline/helper.py b/lighthouse/pipeline/helper.py index 213b45df..9c4820d5 100644 --- a/lighthouse/pipeline/helper.py +++ b/lighthouse/pipeline/helper.py @@ -35,3 +35,19 @@ def cleanup_func(target): func = structured.MatchOp.match_op_names(target, ["func.func"]).result transform.apply_cse(func) canonicalize(func) + + +class PipelineInterrupt(Exception): + """Exception to signal early termination of the transform schedule.""" + + pass + + +def match_and_split(*args, nhandles=1, 
**kwargs): + """Helper function that splits matched handles.""" + matched = match(*args, **kwargs) + anytype = transform.AnyOpType.get() + matched_ops = transform.split_handle((anytype,) * nhandles, matched) + if nhandles == 1: + matched_ops = [matched_ops] + return matched_ops diff --git a/lighthouse/schedule/xegpu/mlp_schedule.py b/lighthouse/schedule/xegpu/mlp_schedule.py index e9fa7909..94cbdf01 100644 --- a/lighthouse/schedule/xegpu/mlp_schedule.py +++ b/lighthouse/schedule/xegpu/mlp_schedule.py @@ -11,6 +11,8 @@ apply_registered_pass, canonicalize, match, + match_and_split, + PipelineInterrupt, ) from lighthouse.dialects import smt_ext, transform_smt_ext as td_smt_ext @@ -33,22 +35,6 @@ MIN_NB_THREADS = 16 -class PipelineInterrupt(Exception): - """Exception to signal early termination of the transform schedule.""" - - pass - - -def match_and_split(*args, nhandles=1, **kwargs): - """Helper function that splits matched handles.""" - matched = match(*args, **kwargs) - anytype = transform.AnyOpType.get() - matched_ops = transform.split_handle((anytype,) * nhandles, matched) - if nhandles == 1: - matched_ops = [matched_ops] - return matched_ops - - @KnobValue.ast_rewrite(in_exprs=True) def params_with_constraints_imposed( params: dict[str, int | None], knob_name_prefix="" diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index e01f9af6..98acd4e6 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -12,19 +12,11 @@ apply_registered_pass, canonicalize, match, + match_and_split, + PipelineInterrupt, ) -def match_and_split(*args, nhandles=1, **kwargs): - """Helper function that splits matched handles.""" - matched = match(*args, **kwargs) - anytype = transform.AnyOpType.get() - matched_ops = transform.split_handle((anytype,) * nhandles, matched) - if nhandles == 1: - matched_ops = [matched_ops] - return matched_ops - - def get_softmax_schedule_module( stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None, @@ -75,121 +67,192 @@ def get_softmax_schedule_module( deduplicate=True, ) - # Match all linalg.generic and linalg.fill operations - # We have 7 operations in softmax: - # fill(max_init), max, sub, exp, fill(sum_init), sum, div - generic_ops = structured.structured_match( - transform.AnyOpType.get(), + xegpu_softmax_transform_schedule( payload_mod, - ops=["linalg.generic", "linalg.fill"] - ) - - # Split the handle into individual operation handles - anytype = transform.AnyOpType.get() - split_ops = transform.split_handle( - (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types - generic_ops - ) - - # Reverse split_ops to have operations in reverse order - split_ops = list(reversed(split_ops)) - - # The first operation (after reversal) is the division - this is the consumer - last_op = split_ops[0] - - # Tile the last operation using tile_using_forall - tiled_op, for_op = structured.structured_tile_using_forall( - anytype, anytype, - last_op, - num_threads=[], - tile_sizes=[], - static_tile_sizes=(parameters["wg_rows"],), + parameters=parameters, + stop_at_stage=stop_at_stage or "", ) + + return mod - # Fuse the producer operations into the forall loop - # Iterate through remaining operations (already in reverse order) - current_forall = for_op - for producer_op in split_ops[1:]: - fused_op, current_forall = structured.structured_fuse_into_containing_op( - anytype, anytype, - producer_op, - current_forall - ) - - func = 
transform.get_parent_op( - anytype, - current_forall, - op_name="func.func", - deduplicate=True, - ) - transform.apply_cse(func) - canonicalize(func) - - func = structured.VectorizeChildrenAndApplyPatternsOp( - func, - fold_type_extensions_into_contract=True, - ).result - transform.apply_cse(func) - canonicalize(func) - payload_mod = apply_registered_pass(payload_mod, "eliminate-empty-tensors") - identity_layout = LayoutMapOption.IdentityLayoutMap - payload_mod = transform_bufferization.OneShotBufferizeOp( - payload_mod, - allow_return_allocs_from_loops=True, - bufferize_function_boundaries=True, - function_boundary_type_conversion=identity_layout, - ).result - # fold memref.subviews into vector.transfer_read/write ops - payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") - transform.apply_cse(payload_mod) - canonicalize(payload_mod) - - # convert forall to parallel - wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) - for wg_loop in wg_loops: - wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - func = transform.get_parent_op(anytype, wg_loop) - # convert scf.parallel to gpu.launch - func = apply_registered_pass(func, "gpu-map-parallel-loops") - func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") - func = apply_registered_pass(func, "lower-affine") - transform.apply_cse(func) - canonicalize(func) - - # set the number of threads for the gpu.launch operation - launch_op = match_and_split(func, ops={"gpu.launch"}) - num_threads = parameters["sg_rows"] * parameters["subgroup_size"] - xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - - # outline gpu func - func = apply_registered_pass(func, "lower-affine") - canonicalize(func) - func = apply_registered_pass(func, "gpu-launch-sink-index-computations") - payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") - transform.apply_cse(payload_mod) - - # set xevm target - payload_mod = apply_registered_pass( - payload_mod, - "xevm-attach-target", - options={"O": "3", "chip": "bmg"}, - ) - # convert vector to xegpu - gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) - for gpu_mod in gpu_mod_ops: - gpu_func = match(gpu_mod, ops={"gpu.func"}) - gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - transform.apply_cse(gpu_func) - - # Set layout attributes for xegpu.store_nd operations - store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) - xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) - - payload_mod = apply_registered_pass( - payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - # Required: yield to end the transform sequence - transform.yield_() +def xegpu_softmax_transform_schedule( + mod: ir.Value[transform.AnyOpType], + parameters: dict, + stop_at_stage: str = "", +): + """Transform schedule for softmax payload.""" + try: + mod = bundle_xegpu_softmax_schedule( + mod, + parameters=parameters, + stop_at_stage=stop_at_stage, + ) + + mod = bundle_xegpu_to_binary( + mod, + stop_at_stage=stop_at_stage, + ) + except PipelineInterrupt: + pass + finally: + transform.yield_() + + +def bundle_xegpu_softmax_schedule( + mod: ir.Value[transform.AnyOpType], + parameters: dict, + stop_at_stage: str = "", +) -> ir.Value[transform.AnyOpType]: + """Schedule for lowering softmax payload to xegpu wg level.""" + if stop_at_stage == "initial": + raise PipelineInterrupt() + + anytype = transform.AnyOpType.get() + + # Match all linalg.generic and linalg.fill 
operations + # We have 7 operations in softmax: + # fill(max_init), max, sub, exp, fill(sum_init), sum, div + generic_ops = structured.structured_match( + transform.AnyOpType.get(), + mod, + ops=["linalg.generic", "linalg.fill"] + ) + + # Split the handle into individual operation handles + split_ops = transform.split_handle( + (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types + generic_ops + ) + + # Reverse split_ops to have operations in reverse order + split_ops = list(reversed(split_ops)) + + # The first operation (after reversal) is the division - this is the consumer + last_op = split_ops[0] + + # Tile the last operation using tile_using_forall + tiled_op, for_op = structured.structured_tile_using_forall( + anytype, anytype, + last_op, + num_threads=[], + tile_sizes=[], + static_tile_sizes=(parameters["wg_rows"],), + ) + + # Fuse the producer operations into the forall loop + # Iterate through remaining operations (already in reverse order) + current_forall = for_op + for producer_op in split_ops[1:]: + fused_op, current_forall = structured.structured_fuse_into_containing_op( + anytype, anytype, + producer_op, + current_forall + ) + + func = transform.get_parent_op( + anytype, + current_forall, + op_name="func.func", + deduplicate=True, + ) + transform.apply_cse(func) + canonicalize(func) + + if stop_at_stage == "tiled": + raise PipelineInterrupt() + + # vectorize + func = structured.VectorizeChildrenAndApplyPatternsOp( + func, + fold_type_extensions_into_contract=True, + ).result + transform.apply_cse(func) + canonicalize(func) + + if stop_at_stage == "vectorized": + raise PipelineInterrupt() + + # bufferize + mod = apply_registered_pass(mod, "eliminate-empty-tensors") + identity_layout = LayoutMapOption.IdentityLayoutMap + mod = transform_bufferization.OneShotBufferizeOp( + mod, + allow_return_allocs_from_loops=True, + bufferize_function_boundaries=True, + function_boundary_type_conversion=identity_layout, + ).result + # fold memref.subviews into vector.transfer_read/write ops + mod = apply_registered_pass(mod, "fold-memref-alias-ops") + transform.apply_cse(mod) + canonicalize(mod) + + if stop_at_stage == "bufferized": + raise PipelineInterrupt() + + # convert forall to parallel + wg_loops = match_and_split(mod, ops={"scf.forall"}) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) + + # convert scf.parallel to gpu.launch + func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") + func = apply_registered_pass(func, "lower-affine") + transform.apply_cse(func) + canonicalize(func) + + # set the number of threads for the gpu.launch operation + launch_op = match_and_split(func, ops={"gpu.launch"}) + num_threads = parameters["sg_rows"] * parameters["subgroup_size"] + xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) + + # outline gpu func + func = apply_registered_pass(func, "lower-affine") + canonicalize(func) + func = apply_registered_pass(func, "gpu-launch-sink-index-computations") + mod = apply_registered_pass(mod, "gpu-kernel-outlining") + transform.apply_cse(mod) + + # set xevm target + mod = apply_registered_pass( + mod, + "xevm-attach-target", + options={"O": "3", "chip": "bmg"}, + ) + + # convert vector to xegpu + gpu_mod_ops = match_and_split(mod, ops={"gpu.module"}) + for gpu_mod in gpu_mod_ops: + gpu_func = match(gpu_mod, ops={"gpu.func"}) + gpu_func = 
apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + + if stop_at_stage == "xegpu-initial": + raise PipelineInterrupt() + + # Set layout attributes for xegpu.store_nd operations + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) + + if stop_at_stage == "xegpu-wg": + raise PipelineInterrupt() + + return mod + + +def bundle_xegpu_to_binary( + mod: ir.Value, stop_at_stage: str = "" +) -> ir.Value[transform.AnyOpType]: + """Schedule for lowering xegpu wg level to binary.""" + # upstream xegpu/xevm pipeline is payload independent. + mod = apply_registered_pass( + mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + + if stop_at_stage == "final": + raise PipelineInterrupt() + return mod From 64b5d73f52813059ad51293e028c0c1f2cabd459 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 24 Mar 2026 22:47:05 +0000 Subject: [PATCH 12/38] save working version --- examples/xegpu/softmax.py | 1 + lighthouse/schedule/xegpu/softmax_schedule.py | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 474eb1a3..c31af47a 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -260,6 +260,7 @@ def parse_cli(): args = parse_cli() params = { + "sizes": args.sizes, "wg_rows": args.wg_rows, "sg_rows": args.sg_rows, "subgroup_size": args.subgroup_size, diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 98acd4e6..26bc5f3e 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -206,7 +206,8 @@ def bundle_xegpu_softmax_schedule( # set the number of threads for the gpu.launch operation launch_op = match_and_split(func, ops={"gpu.launch"}) - num_threads = parameters["sg_rows"] * parameters["subgroup_size"] + num_subgroups = parameters["wg_rows"] // parameters["sg_rows"] + num_threads = num_subgroups * parameters["subgroup_size"] xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) # outline gpu func @@ -233,9 +234,12 @@ def bundle_xegpu_softmax_schedule( if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() - # Set layout attributes for xegpu.store_nd operations + # Set layout attributes for xegpu.store_nd operations. + # FIXME: currently ecah subgroup is handling the entire row. 
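+    # Illustration (hypothetical numbers): with sg_rows=8 and sizes=(1024, 64),
+    # sg_layout=[8, 1] arranges the subgroups in an 8x1 grid, and
+    # sg_data=[8, 64] assigns each subgroup an 8x64 block, i.e. 8 full rows.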
store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) - xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) + sg_layout = [parameters["sg_rows"], 1] + sg_data = [parameters["sg_rows"], parameters["sizes"][1]] + xegpu.set_op_layout_attr(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() From 108f2c09e8ee111b6b96f97553bc9bacead0013e Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 25 Mar 2026 20:48:48 +0000 Subject: [PATCH 13/38] save working version --- lighthouse/schedule/xegpu/mlp_schedule.py | 13 +------------ lighthouse/schedule/xegpu/softmax_schedule.py | 17 ++--------------- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/lighthouse/schedule/xegpu/mlp_schedule.py b/lighthouse/schedule/xegpu/mlp_schedule.py index 94cbdf01..ab11a75c 100644 --- a/lighthouse/schedule/xegpu/mlp_schedule.py +++ b/lighthouse/schedule/xegpu/mlp_schedule.py @@ -14,6 +14,7 @@ match_and_split, PipelineInterrupt, ) +from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary from lighthouse.dialects import smt_ext, transform_smt_ext as td_smt_ext from lighthouse.dialects.transform_tune_ext import knob, KnobValue @@ -600,15 +601,3 @@ def annotate_ab_load(tile, layout_load, layout_dpas): canonicalize(gpu_func) transform.apply_cse(gpu_func) - - -def bundle_xegpu_to_binary( - mod: ir.Value, stop_at_stage: str = "" -) -> ir.Value[transform.AnyOpType]: - """Schedule for lowering xegpu wg level to binary.""" - # upstream xegpu/xevm pipeline is payload independent. - mod = apply_registered_pass( - mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - - return mod diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 26bc5f3e..9e5226b9 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -15,6 +15,7 @@ match_and_split, PipelineInterrupt, ) +from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary def get_softmax_schedule_module( @@ -38,6 +39,7 @@ def get_softmax_schedule_module( - wg_rows: Number of rows per workgroup - sg_rows: Number of rows per subgroup - subgroup_size: Size of subgroup + - sizes: Tuple with the sizes of the input tensors (e.g. (M, N)) Returns: MLIR module containing the transform schedule @@ -245,18 +247,3 @@ def bundle_xegpu_softmax_schedule( raise PipelineInterrupt() return mod - - -def bundle_xegpu_to_binary( - mod: ir.Value, stop_at_stage: str = "" -) -> ir.Value[transform.AnyOpType]: - """Schedule for lowering xegpu wg level to binary.""" - # upstream xegpu/xevm pipeline is payload independent. 
- mod = apply_registered_pass( - mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - - if stop_at_stage == "final": - raise PipelineInterrupt() - - return mod From a7e1e6c7535c5dd085c14ebea39ae13ac25eeb69 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 25 Mar 2026 20:58:29 +0000 Subject: [PATCH 14/38] save working version --- lighthouse/schedule/xegpu/helper.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 lighthouse/schedule/xegpu/helper.py diff --git a/lighthouse/schedule/xegpu/helper.py b/lighthouse/schedule/xegpu/helper.py new file mode 100644 index 00000000..1452301e --- /dev/null +++ b/lighthouse/schedule/xegpu/helper.py @@ -0,0 +1,21 @@ +"""Helper functions for XeGPU scheduling.""" + +from mlir import ir +from mlir.dialects import transform + +from lighthouse.pipeline.helper import apply_registered_pass, PipelineInterrupt + + +def bundle_xegpu_to_binary( + mod: ir.Value, stop_at_stage: str = "" +) -> ir.Value[transform.AnyOpType]: + """Schedule for lowering xegpu wg level to binary.""" + # upstream xegpu/xevm pipeline is payload independent. + mod = apply_registered_pass( + mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + + if stop_at_stage == "final": + raise PipelineInterrupt() + + return mod From df53caa54ff2c066393f3fca9479203cc0ef471a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 25 Mar 2026 21:09:29 +0000 Subject: [PATCH 15/38] precommit issues --- examples/xegpu/softmax.py | 12 +-- .../ingress/mlir_gen/gpu_softmax_payload.py | 68 +++++++++------ lighthouse/schedule/xegpu/helper.py | 2 +- lighthouse/schedule/xegpu/softmax_schedule.py | 87 ++++++++++--------- 4 files changed, 97 insertions(+), 72 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index c31af47a..e3cf5840 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -28,7 +28,7 @@ def softmax_complexity(M: int, N: int, nbytes: int): """ Complexity of softmax operation. - + For each row: - O(N) to find max - O(N) to compute exp(x - max) and sum @@ -127,7 +127,7 @@ def check_correctness( output_ref = self._reference_solution output_computed = output_host.astype(np.float32) - + if verbose > 1: print("Reference solution (first 5 rows):") print(output_ref[:5]) @@ -137,10 +137,10 @@ def check_correctness( # Check row sums are close to 1.0 row_sums = np.sum(output_computed, axis=1) sums_ok = np.allclose(row_sums, 1.0, rtol=1e-5, atol=1e-6) - + # Check values match reference values_ok = np.allclose(output_computed, output_ref, rtol=1e-4, atol=1e-6) - + success = sums_ok and values_ok if verbose: @@ -149,7 +149,9 @@ def check_correctness( else: print("FAILED!") if not sums_ok: - print(f" Row sums check failed. Min: {row_sums.min():.6f}, Max: {row_sums.max():.6f}") + print( + f" Row sums check failed. Min: {row_sums.min():.6f}, Max: {row_sums.max():.6f}" + ) if not values_ok: max_diff = np.abs(output_computed - output_ref).max() print(f" Values mismatch. 
Max abs diff: {max_diff:.6e}") diff --git a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py index 26fd9042..4568448e 100644 --- a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py +++ b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py @@ -1,10 +1,13 @@ """Generate MLIR payload for GPU softmax operation.""" from mlir import ir -from mlir.dialects import linalg, bufferization, arith, tensor, func, math +from mlir.dialects import linalg, bufferization, arith, tensor, math from lighthouse.utils.mlir import func_cif -from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor +from lighthouse.ingress.mlir_gen.gpu_utils import ( + emit_gpu_util_funcs, + emit_buf_to_tensor, +) def generate_gpu_softmax_payload( @@ -15,33 +18,33 @@ def generate_gpu_softmax_payload( ) -> ir.Module: """ Generate MLIR module for softmax payload. - + Computes softmax along the last dimension (rows): output[i, j] = exp(input[i, j] - max_i) / sum_i(exp(input[i, j] - max_i)) - + where max_i and sum_i are computed over row i. - + Args: func_name: Name of the payload function M: Number of rows - N: Number of columns + N: Number of columns dtype: MLIR element type (e.g., F32Type) - + Returns: MLIR module containing the softmax payload function """ mod = ir.Module.create() shape = (M, N) memref_t = ir.MemRefType.get(shape, dtype) - + with ir.InsertionPoint(mod.body): # Function signature: payload(output, input) @func_cif(memref_t, memref_t, name=func_name) def payload(output, input_arg): # Convert memrefs to tensors - output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) + emit_buf_to_tensor(output, restrict=True, writable=True) input_tensor = emit_buf_to_tensor(input_arg, restrict=True) - + # Define affine maps for indexing # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) @@ -49,67 +52,82 @@ def payload(output, input_arg): d1 = ir.AffineDimExpr.get(1) map_2d = ir.AffineMap.get(2, 0, [d0, d1]) map_1d = ir.AffineMap.get(2, 0, [d0]) - + # Step 1: Find max - linalg.generic reduction - neg_inf = arith.constant(dtype, float('-inf')) + neg_inf = arith.constant(dtype, float("-inf")) max_init = tensor.empty((M,), dtype) max_filled = linalg.fill(neg_inf, outs=[max_init]) - + @linalg.generic( [input_tensor], # inputs [max_filled], # outputs [map_2d, map_1d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.reduction, + ], # iterator_types ) def row_max(in_val, acc): return arith.maximumf(in_val, acc) - + # Step 2: Subtract max (broadcast) - linalg.generic elementwise output_init = tensor.empty((M, N), dtype) - + @linalg.generic( [input_tensor, row_max], # inputs [output_init], # outputs [map_2d, map_1d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.parallel, + ], # iterator_types ) def shifted(in_val, max_val, out): return arith.subf(in_val, max_val) - + # Step 3: Compute exp - linalg.generic elementwise @linalg.generic( [shifted], # inputs [output_init], # outputs [map_2d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.parallel, + ], # iterator_types ) def exp_vals(in_val, out): return math.exp(in_val) - + # Step 4: Sum 
exp values - linalg.generic reduction sum_init = tensor.empty((M,), dtype) zero = arith.constant(dtype, 0.0) sum_filled = linalg.fill(zero, outs=[sum_init]) - + @linalg.generic( [exp_vals], # inputs [sum_filled], # outputs [map_2d, map_1d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.reduction, + ], # iterator_types ) def row_sum(in_val, acc): return arith.addf(in_val, acc) - + # Step 5: Divide by sum (broadcast) - linalg.generic elementwise @linalg.generic( [exp_vals, row_sum], # inputs [output_init], # outputs [map_2d, map_1d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.parallel, + ], # iterator_types ) def result(exp_val, sum_val, out): return arith.divf(exp_val, sum_val) - + # Materialize result back to output memref bufferization.materialize_in_destination( None, result, output, restrict=True, writable=True diff --git a/lighthouse/schedule/xegpu/helper.py b/lighthouse/schedule/xegpu/helper.py index 1452301e..0c1d93a7 100644 --- a/lighthouse/schedule/xegpu/helper.py +++ b/lighthouse/schedule/xegpu/helper.py @@ -14,7 +14,7 @@ def bundle_xegpu_to_binary( mod = apply_registered_pass( mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} ) - + if stop_at_stage == "final": raise PipelineInterrupt() diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 9e5226b9..d9701b84 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -24,7 +24,7 @@ def get_softmax_schedule_module( ) -> ir.Module: """ Generate transform schedule for softmax operation. - + The schedule performs the following transformations: 1. Tile the consumer operation (division) using forall 2. Fuse producer operations into the forall loop @@ -32,32 +32,32 @@ def get_softmax_schedule_module( 4. Bufferize tensors 5. Convert to GPU dialect 6. Lower to XeGPU operations - + Args: stop_at_stage: Optional stage name to stop early (for debugging) parameters: Dictionary with scheduling parameters: - wg_rows: Number of rows per workgroup - - sg_rows: Number of rows per subgroup + - sg_rows: Number of rows per subgroup - subgroup_size: Size of subgroup - sizes: Tuple with the sizes of the input tensors (e.g. 
(M, N)) - + Returns: MLIR module containing the transform schedule """ assert parameters is not None, "Schedule parameters must be provided" - + mod = ir.Module.create() mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() - + with ir.InsertionPoint(mod.body): # Create a transform sequence with proper signature named_sequence = transform.named_sequence( "__transform_main", [transform.AnyOpType.get()], # input: module [], # no outputs - arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}] + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}], ) - + with ir.InsertionPoint(named_sequence.body): # match the payload module anytype = transform.AnyOpType.get() @@ -68,13 +68,13 @@ def get_softmax_schedule_module( op_name="builtin.module", deduplicate=True, ) - + xegpu_softmax_transform_schedule( payload_mod, parameters=parameters, stop_at_stage=stop_at_stage or "", ) - + return mod @@ -107,36 +107,43 @@ def bundle_xegpu_softmax_schedule( stop_at_stage: str = "", ) -> ir.Value[transform.AnyOpType]: """Schedule for lowering softmax payload to xegpu wg level.""" - + if stop_at_stage == "initial": raise PipelineInterrupt() - + anytype = transform.AnyOpType.get() - + # Match all linalg.generic and linalg.fill operations - # We have 7 operations in softmax: + # We have 7 operations in softmax: # fill(max_init), max, sub, exp, fill(sum_init), sum, div generic_ops = structured.structured_match( - transform.AnyOpType.get(), - mod, - ops=["linalg.generic", "linalg.fill"] + transform.AnyOpType.get(), mod, ops=["linalg.generic", "linalg.fill"] ) - + # Split the handle into individual operation handles split_ops = transform.split_handle( - (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types - generic_ops + ( + anytype, + anytype, + anytype, + anytype, + anytype, + anytype, + anytype, + ), # 7 result types + generic_ops, ) - + # Reverse split_ops to have operations in reverse order split_ops = list(reversed(split_ops)) - + # The first operation (after reversal) is the division - this is the consumer last_op = split_ops[0] # Tile the last operation using tile_using_forall tiled_op, for_op = structured.structured_tile_using_forall( - anytype, anytype, + anytype, + anytype, last_op, num_threads=[], tile_sizes=[], @@ -148,11 +155,9 @@ def bundle_xegpu_softmax_schedule( current_forall = for_op for producer_op in split_ops[1:]: fused_op, current_forall = structured.structured_fuse_into_containing_op( - anytype, anytype, - producer_op, - current_forall + anytype, anytype, producer_op, current_forall ) - + func = transform.get_parent_op( anytype, current_forall, @@ -161,10 +166,10 @@ def bundle_xegpu_softmax_schedule( ) transform.apply_cse(func) canonicalize(func) - + if stop_at_stage == "tiled": raise PipelineInterrupt() - + # vectorize func = structured.VectorizeChildrenAndApplyPatternsOp( func, @@ -172,10 +177,10 @@ def bundle_xegpu_softmax_schedule( ).result transform.apply_cse(func) canonicalize(func) - + if stop_at_stage == "vectorized": raise PipelineInterrupt() - + # bufferize mod = apply_registered_pass(mod, "eliminate-empty-tensors") identity_layout = LayoutMapOption.IdentityLayoutMap @@ -189,36 +194,36 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "fold-memref-alias-ops") transform.apply_cse(mod) canonicalize(mod) - + if stop_at_stage == "bufferized": raise PipelineInterrupt() - + # convert forall to parallel wg_loops = match_and_split(mod, ops={"scf.forall"}) for wg_loop in wg_loops: wg_loop = 
loop.loop_forall_to_parallel([anytype], wg_loop) func = transform.get_parent_op(anytype, wg_loop) - + # convert scf.parallel to gpu.launch func = apply_registered_pass(func, "gpu-map-parallel-loops") func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") func = apply_registered_pass(func, "lower-affine") transform.apply_cse(func) canonicalize(func) - + # set the number of threads for the gpu.launch operation launch_op = match_and_split(func, ops={"gpu.launch"}) num_subgroups = parameters["wg_rows"] // parameters["sg_rows"] num_threads = num_subgroups * parameters["subgroup_size"] xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - + # outline gpu func func = apply_registered_pass(func, "lower-affine") canonicalize(func) func = apply_registered_pass(func, "gpu-launch-sink-index-computations") mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) - + # set xevm target mod = apply_registered_pass( mod, @@ -232,18 +237,18 @@ def bundle_xegpu_softmax_schedule( gpu_func = match(gpu_mod, ops={"gpu.func"}) gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") transform.apply_cse(gpu_func) - + if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() - + # Set layout attributes for xegpu.store_nd operations. # FIXME: currently ecah subgroup is handling the entire row. store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) sg_layout = [parameters["sg_rows"], 1] sg_data = [parameters["sg_rows"], parameters["sizes"][1]] xegpu.set_op_layout_attr(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) - + if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() - + return mod From 9bcc6538adc23d8cb2f9186bfb24cb581919c68d Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 27 Mar 2026 22:36:53 +0000 Subject: [PATCH 16/38] use linalg.softmax --- .../ingress/mlir_gen/gpu_softmax_payload.py | 90 ++----------------- lighthouse/schedule/xegpu/softmax_schedule.py | 60 ++++--------- 2 files changed, 27 insertions(+), 123 deletions(-) diff --git a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py index 4568448e..05dc148a 100644 --- a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py +++ b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py @@ -1,7 +1,7 @@ """Generate MLIR payload for GPU softmax operation.""" from mlir import ir -from mlir.dialects import linalg, bufferization, arith, tensor, math +from mlir.dialects import linalg, bufferization, tensor from lighthouse.utils.mlir import func_cif from lighthouse.ingress.mlir_gen.gpu_utils import ( @@ -45,88 +45,16 @@ def payload(output, input_arg): emit_buf_to_tensor(output, restrict=True, writable=True) input_tensor = emit_buf_to_tensor(input_arg, restrict=True) - # Define affine maps for indexing - # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) - # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) - d0 = ir.AffineDimExpr.get(0) - d1 = ir.AffineDimExpr.get(1) - map_2d = ir.AffineMap.get(2, 0, [d0, d1]) - map_1d = ir.AffineMap.get(2, 0, [d0]) + # Create output tensor and fill with zeros + output_init = tensor.empty(shape, dtype) - # Step 1: Find max - linalg.generic reduction - neg_inf = arith.constant(dtype, float("-inf")) - max_init = tensor.empty((M,), dtype) - max_filled = linalg.fill(neg_inf, outs=[max_init]) - - @linalg.generic( - [input_tensor], # inputs - [max_filled], # outputs - [map_2d, map_1d], # indexing_maps - [ - linalg.IteratorType.parallel, - 
linalg.IteratorType.reduction, - ], # iterator_types - ) - def row_max(in_val, acc): - return arith.maximumf(in_val, acc) - - # Step 2: Subtract max (broadcast) - linalg.generic elementwise - output_init = tensor.empty((M, N), dtype) - - @linalg.generic( - [input_tensor, row_max], # inputs - [output_init], # outputs - [map_2d, map_1d, map_2d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.parallel, - ], # iterator_types - ) - def shifted(in_val, max_val, out): - return arith.subf(in_val, max_val) - - # Step 3: Compute exp - linalg.generic elementwise - @linalg.generic( - [shifted], # inputs - [output_init], # outputs - [map_2d, map_2d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.parallel, - ], # iterator_types - ) - def exp_vals(in_val, out): - return math.exp(in_val) - - # Step 4: Sum exp values - linalg.generic reduction - sum_init = tensor.empty((M,), dtype) - zero = arith.constant(dtype, 0.0) - sum_filled = linalg.fill(zero, outs=[sum_init]) - - @linalg.generic( - [exp_vals], # inputs - [sum_filled], # outputs - [map_2d, map_1d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.reduction, - ], # iterator_types - ) - def row_sum(in_val, acc): - return arith.addf(in_val, acc) - - # Step 5: Divide by sum (broadcast) - linalg.generic elementwise - @linalg.generic( - [exp_vals, row_sum], # inputs - [output_init], # outputs - [map_2d, map_1d, map_2d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.parallel, - ], # iterator_types + # Apply softmax along dimension 1 (last dimension) + result = linalg.softmax( + result=[ir.RankedTensorType.get(shape, dtype)], + input=input_tensor, + output=output_init, + dimension=1, ) - def result(exp_val, sum_val, out): - return arith.divf(exp_val, sum_val) # Materialize result back to output memref bufferization.materialize_in_destination( diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index d9701b84..35c3ab4c 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -26,12 +26,11 @@ def get_softmax_schedule_module( Generate transform schedule for softmax operation. The schedule performs the following transformations: - 1. Tile the consumer operation (division) using forall - 2. Fuse producer operations into the forall loop - 3. Vectorize operations - 4. Bufferize tensors - 5. Convert to GPU dialect - 6. Lower to XeGPU operations + 1. Tile the linalg.softmax operation using forall + 2. Vectorize operations + 3. Bufferize tensors + 4. Convert to GPU dialect + 5. 
Lower to XeGPU operations Args: stop_at_stage: Optional stage name to stop early (for debugging) @@ -113,57 +112,34 @@ def bundle_xegpu_softmax_schedule( anytype = transform.AnyOpType.get() - # Match all linalg.generic and linalg.fill operations - # We have 7 operations in softmax: - # fill(max_init), max, sub, exp, fill(sum_init), sum, div - generic_ops = structured.structured_match( - transform.AnyOpType.get(), mod, ops=["linalg.generic", "linalg.fill"] + # Match linalg.softmax operation + # We have only 1 operation: linalg.softmax + softmax_op = structured.structured_match( + transform.AnyOpType.get(), mod, ops=["linalg.softmax"] ) - # Split the handle into individual operation handles - split_ops = transform.split_handle( - ( - anytype, - anytype, - anytype, - anytype, - anytype, - anytype, - anytype, - ), # 7 result types - generic_ops, - ) - - # Reverse split_ops to have operations in reverse order - split_ops = list(reversed(split_ops)) - - # The first operation (after reversal) is the division - this is the consumer - last_op = split_ops[0] - - # Tile the last operation using tile_using_forall + # Tile the softmax operation using tile_using_forall tiled_op, for_op = structured.structured_tile_using_forall( anytype, anytype, - last_op, + softmax_op, num_threads=[], tile_sizes=[], static_tile_sizes=(parameters["wg_rows"],), ) - # Fuse the producer operations into the forall loop - # Iterate through remaining operations (already in reverse order) - current_forall = for_op - for producer_op in split_ops[1:]: - fused_op, current_forall = structured.structured_fuse_into_containing_op( - anytype, anytype, producer_op, current_forall - ) - func = transform.get_parent_op( anytype, - current_forall, + for_op, op_name="func.func", deduplicate=True, ) + # Decompose softmax into linalg.generic operations + softmax_ops = structured.structured_match( + transform.AnyOpType.get(), func, ops=["linalg.softmax"] + ) + structured.structured_decompose_interface(anytype, softmax_ops) + transform.apply_cse(func) canonicalize(func) From 3f5cbceacfca0cf15ea47bab5e29e1b62de48b4e Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 30 Mar 2026 19:28:26 +0000 Subject: [PATCH 17/38] save work --- examples/xegpu/softmax.py | 17 +++++++++-- lighthouse/schedule/xegpu/softmax_schedule.py | 29 +++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index e3cf5840..27a58ec2 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -217,6 +217,12 @@ def parse_cli(): default=16, help="Subgroup size.", ) + parser.add_argument( + "--reduction-step-size", + type=int, + default=16, + help="Step size for reduction loop tiling (optional).", + ) parser.add_argument( "--nruns", type=int, @@ -266,6 +272,7 @@ def parse_cli(): "wg_rows": args.wg_rows, "sg_rows": args.sg_rows, "subgroup_size": args.subgroup_size, + "reduction_step_size": args.reduction_step_size, } M, N = args.sizes @@ -304,7 +311,13 @@ def list2str(a): f"wg-rows={args.wg_rows}", f"sg-rows={args.sg_rows}", f"subgroup-size={args.subgroup_size}", - f"time(us): {elapsed:.2f}", - f"GFLOPS: {gflops:.2f}", ] + if args.reduction_step_size is not None: + parts.append(f"reduction-step-size={args.reduction_step_size}") + parts.extend( + [ + f"time(us): {elapsed:.2f}", + f"GFLOPS: {gflops:.2f}", + ] + ) print(" ".join(parts)) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 35c3ab4c..11410612 100644 --- 
a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -39,6 +39,7 @@ def get_softmax_schedule_module( - sg_rows: Number of rows per subgroup - subgroup_size: Size of subgroup - sizes: Tuple with the sizes of the input tensors (e.g. (M, N)) + - reduction_step_size: Optional step size for tiling reduction loops Returns: MLIR module containing the transform schedule @@ -140,6 +141,34 @@ def bundle_xegpu_softmax_schedule( ) structured.structured_decompose_interface(anytype, softmax_ops) + linalg_ops = match_and_split( + func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 + ) + init_max_reduction = linalg_ops[0] + max_reduction = linalg_ops[1] + max_center_and_exp_op = linalg_ops[2] + init_sum_reduction = linalg_ops[3] + sum_reduction = linalg_ops[4] + div_op = linalg_ops[5] + + reduction_step_size = parameters["reduction_step_size"] + + # Tile the max reduction using TileReductionUsingFor + _, _, _, for_op = structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=max_reduction, + tile_sizes=[0, reduction_step_size], + ) + + # Fuse the init_max_reduction into the for loop + # fused_init, new_for_loop = structured.structured_fuse_into_containing_op( + # anytype, anytype, init_max_reduction, for_op + # ) + transform.PrintOp(target=init_max_reduction) + transform.apply_cse(func) canonicalize(func) From 6204d6c2a59cb8d816f87b2fc811d96163a613a3 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 30 Mar 2026 22:21:34 +0000 Subject: [PATCH 18/38] add inner dim tiling --- lighthouse/schedule/xegpu/softmax_schedule.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 11410612..9bda77fc 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -144,30 +144,28 @@ def bundle_xegpu_softmax_schedule( linalg_ops = match_and_split( func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 ) - init_max_reduction = linalg_ops[0] max_reduction = linalg_ops[1] max_center_and_exp_op = linalg_ops[2] - init_sum_reduction = linalg_ops[3] sum_reduction = linalg_ops[4] div_op = linalg_ops[5] reduction_step_size = parameters["reduction_step_size"] - # Tile the max reduction using TileReductionUsingFor - _, _, _, for_op = structured.structured_tile_reduction_using_for( - [anytype], - anytype, - anytype, - anytype, - target=max_reduction, - tile_sizes=[0, reduction_step_size], - ) - - # Fuse the init_max_reduction into the for loop - # fused_init, new_for_loop = structured.structured_fuse_into_containing_op( - # anytype, anytype, init_max_reduction, for_op - # ) - transform.PrintOp(target=init_max_reduction) + # Tile all reduction ops using the same step size + reduction_ops = [max_reduction, sum_reduction] + for reduction_op in reduction_ops: + structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=reduction_op, + tile_sizes=[0, reduction_step_size], + ) + # Tile elementwise ops to match the reduction tile size + elementwise_ops = [max_center_and_exp_op, div_op] + for elementwise_op in elementwise_ops: + structured.TileUsingForOp(elementwise_op, sizes=[0, reduction_step_size]) transform.apply_cse(func) canonicalize(func) From 1feb0d48d94e6cc46fb6cbf7c01dc9825e43f73a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 1 Apr 2026 22:09:26 +0000 Subject: [PATCH 19/38] save fused 
version --- examples/xegpu/softmax.py | 1 + lighthouse/schedule/xegpu/softmax_schedule.py | 76 +++++++++++++++---- 2 files changed, 62 insertions(+), 15 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 324d6e0c..8049f0f4 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -212,6 +212,7 @@ def parse_cli(): "tiled", "vectorized", "bufferized", + "gpu-outlining", "xegpu-initial", "xegpu-wg", "final", diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 361dfdfe..b42b6479 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -140,6 +140,7 @@ def bundle_xegpu_softmax_schedule( transform.AnyOpType.get(), func, ops=["linalg.softmax"] ) structured.structured_decompose_interface(anytype, softmax_ops) + # transform.print_(target=func, name="After structured_decompose_interface") linalg_ops = match_and_split( func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 @@ -151,22 +152,60 @@ def bundle_xegpu_softmax_schedule( reduction_step_size = parameters["reduction_step_size"] - # Tile all reduction ops using the same step size - reduction_ops = [max_reduction, sum_reduction] - for reduction_op in reduction_ops: - structured.structured_tile_reduction_using_for( - [anytype], - anytype, - anytype, - anytype, - target=reduction_op, - tile_sizes=[0, reduction_step_size], - ) - # Tile elementwise ops to match the reduction tile size - elementwise_ops = [max_center_and_exp_op, div_op] - for elementwise_op in elementwise_ops: - structured.TileUsingForOp(elementwise_op, sizes=[0, reduction_step_size]) + # Tile the division op and fuse the sub+exp producer into it + _, div_loop = structured.TileUsingForOp( + div_op, sizes=[0, reduction_step_size] + ).results + + # Fuse max_center_and_exp_op into the div loop + _, fused_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=max_center_and_exp_op, + containing_op=div_loop, + ) + + # Tile the sum reduction and fuse the sub+exp producer into it + _, _, _, sum_loop = structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=sum_reduction, + tile_sizes=[0, reduction_step_size], + ) + + func = transform.get_parent_op( + anytype, + fused_loop, + op_name="func.func", + deduplicate=True, + ) + # Re-match and split linalg generic ops, there are 5 at this point + linalg_ops = match_and_split(func, ops={"linalg.generic"}, nhandles=5) + max_center_and_exp_op = linalg_ops[1] + + # Fuse max_center_and_exp_op into the sum reduction loop + _, fused_sum_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=max_center_and_exp_op, + containing_op=sum_loop, + ) + + # Tile the max reduction. + max_reduction = linalg_ops[0] + structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=max_reduction, + tile_sizes=[0, reduction_step_size], + ) + + # Cleanup after tiling and fusion transform.apply_cse(func) canonicalize(func) @@ -227,6 +266,9 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) + if stop_at_stage == "gpu-outlining": + raise PipelineInterrupt() + # set xevm target mod = apply_registered_pass( mod, @@ -241,6 +283,10 @@ def bundle_xegpu_softmax_schedule( gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") transform.apply_cse(gpu_func) + # Cleanup. 
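+    # Module-scope CSE/canonicalization, following the per-gpu_func cleanup above.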
+ transform.apply_cse(mod) + canonicalize(mod) + if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() From a28cf4a1035cf12956aa9922ff0fbd3a46e51314 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 1 Apr 2026 22:30:40 +0000 Subject: [PATCH 20/38] save work --- lighthouse/schedule/xegpu/softmax_schedule.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index b42b6479..18186b89 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -237,6 +237,17 @@ def bundle_xegpu_softmax_schedule( transform.apply_cse(mod) canonicalize(mod) + # promote memref.alloc to memref.alloca in payload function + func = match(mod, ops={"func.func"}) + func = apply_registered_pass( + func, + "promote-buffers-to-stack", + options={ + "max-alloc-size-in-bytes": "8192", + "max-rank-of-allocated-memref": "2", + }, + ) + if stop_at_stage == "bufferized": raise PipelineInterrupt() From 79e2f737caae45431f207744187a7119c5504b12 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 19:16:35 +0000 Subject: [PATCH 21/38] save work --- lighthouse/schedule/xegpu/softmax_schedule.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 18186b89..bda5f819 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -303,10 +303,10 @@ def bundle_xegpu_softmax_schedule( # Set layout attributes for xegpu.store_nd operations. # FIXME: currently ecah subgroup is handling the entire row. - store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=5) sg_layout = [parameters["sg_rows"], 1] - sg_data = [parameters["sg_rows"], parameters["sizes"][1]] - xegpu.set_anchor_layout(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) + sg_data = [parameters["sg_rows"], parameters["reduction_step_size"]] + xegpu.set_anchor_layout(store_ops[-1], sg_layout=sg_layout, sg_data=sg_data) if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() From 55c175c0f69c511f6d753b0f7c55b5480d5303b3 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 22:24:15 +0000 Subject: [PATCH 22/38] save work --- docs/softmax_lowering.md | 402 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 docs/softmax_lowering.md diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md new file mode 100644 index 00000000..f67779ae --- /dev/null +++ b/docs/softmax_lowering.md @@ -0,0 +1,402 @@ +# Linalg softmax lowering in XeGPU pipeline + +## Overview + +The lowering process consists of seven stages: +1. **initial** - High-level tensor operations +2. **tiled-softmax** - Tiled softmax operations +3. **decomposed** - Decomposition into constituent operations +4. **vectorized** - Vector operations +5. **bufferized** - Memory-based representation +6. **xegpu-initial** - GPU kernel with XeGPU operations +7. **xegpu-wg** - Work-group optimized XeGPU + +--- + +## Stage 1: Initial + +**Code:** +```mlir +func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { + // ... + %2 = tensor.empty() : tensor<1024x64xf32> + %3 = linalg.softmax dimension(1) ins(%1 : tensor<1024x64xf32>) + outs(%2 : tensor<1024x64xf32>) -> tensor<1024x64xf32> + // ... 
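+  // (elided: the bufferization.to_tensor / materialize_in_destination glue
+  //  emitted by the payload generator)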
+  return
+}
+```
+---
+
+## Stage 2: Tiled Softmax
+
+**Key Characteristics:**
+- Work distribution via `scf.forall` (16 parallel iterations)
+- Each tile processes 64x64 elements
+
+**Code:**
+```mlir
+func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
+  // ...
+  %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) {
+    %4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
+    // Extract 64x64 input slice
+    %extracted_slice = tensor.extract_slice ...
+    // Extract 64x64 output slice
+    %extracted_slice_0 = tensor.extract_slice ...
+    // Apply softmax to the tile
+    %5 = linalg.softmax dimension(1) ins(%extracted_slice : tensor<64x64xf32>)
+         outs(%extracted_slice_0 : tensor<64x64xf32>) -> tensor<64x64xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %5 into %arg3[%4, %c0] [64, 64] [1, 1] :
+        tensor<64x64xf32> into tensor<1024x64xf32>
+    }
+  }
+  // ...
+  return
+}
+```
+
+---
+
+## Stage 3: Decomposed
+
+**Key Characteristics:**
+- Softmax decomposed into 4 constituent `linalg.generic` ops: max, sub+exp, sum, divide
+- Decomposition is driven by `structured.structured_decompose_interface`, which invokes the decomposition interface implemented by `linalg.softmax`
+
+**Code:**
+```mlir
+func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_0 = arith.constant 0xFFC00000 : f32 // NaN: neutral element for maxnumf
+  %0 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32>
+  %1 = tensor.empty() : tensor<1024x64xf32>
+
+  %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x64xf32>) {
+    %3 = affine.apply #map(%arg2) // %3 = %arg2 * 64
+    %extracted_slice = tensor.extract_slice %0[%3, 0] [64, 64] [1, 1] :
+      tensor<1024x64xf32> to tensor<64x64xf32>
+    // Output slice, used as the init of the elementwise steps below
+    %extracted_slice_1 = tensor.extract_slice %arg3[%3, 0] [64, 64] [1, 1] :
+      tensor<1024x64xf32> to tensor<64x64xf32>
+
+    // Step 1: Find max along dimension 1
+    %4 = tensor.empty() : tensor<64xf32>
+    %5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32>
+    %6 = linalg.generic {indexing_maps = [#map1, #map2],
+                         iterator_types = ["parallel", "reduction"]}
+        ins(%extracted_slice : tensor<64x64xf32>) outs(%5 : tensor<64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %11 = arith.maxnumf %in, %out : f32
+      linalg.yield %11 : f32
+    } -> tensor<64xf32>
+
+    // Step 2: Subtract max and exponentiate
+    %7 = linalg.generic {indexing_maps = [#map1, #map2, #map1],
+                         iterator_types = ["parallel", "parallel"]}
+        ins(%extracted_slice, %6 : tensor<64x64xf32>, tensor<64xf32>)
+        outs(%extracted_slice_1 : tensor<64x64xf32>) {
+    ^bb0(%in: f32, %in_2: f32, %out: f32):
+      %11 = arith.subf %in, %in_2 : f32
+      %12 = math.exp %11 : f32
+      linalg.yield %12 : f32
+    } -> tensor<64x64xf32>
+
+    // Step 3: Sum exponentials
+    %8 = linalg.fill ins(%cst : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32>
+    %9 = linalg.generic {indexing_maps = [#map1, #map2],
+                         iterator_types = ["parallel", "reduction"]}
+        ins(%7 : tensor<64x64xf32>) outs(%8 : tensor<64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %11 = arith.addf %in, %out : f32
+      linalg.yield %11 : f32
+    } -> tensor<64xf32>
+
+    // Step 4: Normalize by sum
+    %10 = linalg.generic {indexing_maps = [#map1, #map2, #map1],
+                          iterator_types = ["parallel", "parallel"]}
+        ins(%7, %9 : tensor<64x64xf32>, tensor<64xf32>)
+        outs(%extracted_slice_1 : tensor<64x64xf32>) {
+    ^bb0(%in: f32, %in_2: f32, %out: f32):
+      %11 = arith.divf %in, %in_2 : f32
+      linalg.yield %11 : f32
+    } -> tensor<64x64xf32>
+
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %10 into %arg3[%3, 0] [64, 64] [1, 1] :
+        tensor<64x64xf32> into tensor<1024x64xf32>
+    }
+  }
+  return
+}
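+// Summary of the SSA values above: %6 holds the row maxima, %7 = exp(x - max),
+// %9 holds the row sums, and %10 = %7 / %9 is the numerically stable softmax
+// of one 64x64 tile.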
+```

**What Happens:**
- The 1024x64 input is divided into 16 tiles of 64x64 each
- Softmax algorithm made explicit:
  1. **Max reduction**: Find maximum value per row (for numerical stability)
  2. **Exp**: Compute exp(x - max) for each element
  3. **Sum reduction**: Sum exponentials per row
  4. **Normalize**: Divide each element by its row sum
- Each tile is processed independently, enabling parallelization
- Results are inserted back into the output tensor

---

## Stage 4: Vectorized

**Key Characteristics:**
- `linalg.generic` operations replaced with vector operations
- SIMD-friendly representation using `vector<64x64xf32>`
- Explicit vector multi-reductions
- Vector transfers for reading/writing data

**Code:**
```mlir
func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
  %cst = arith.constant dense<0.000000e+00> : vector<64xf32>
  %0 = ub.poison : f32
  %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32>
  %c0 = arith.constant 0 : index
  %1 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32>
  %2 = tensor.empty() : tensor<1024x64xf32>

  %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) {
    %4 = affine.apply #map(%arg2) // %4 = %arg2 * 64
    %extracted_slice = tensor.extract_slice %arg3[%4, 0] [64, 64] [1, 1]

    // Vector read: Load 64x64 tile
    %5 = vector.transfer_read %1[%4, %c0], %0 {in_bounds = [true, true]} :
      tensor<1024x64xf32>, vector<64x64xf32>

    // Max reduction: Reduce dimension 1 -> vector<64xf32>
    %6 = vector.multi_reduction <maxnumf>, %5, %cst_0 [1] :
      vector<64x64xf32> to vector<64xf32>

    // Broadcast max values back to 64x64 and transpose
    %7 = vector.broadcast %6 : vector<64xf32> to vector<64x64xf32>
    %8 = vector.transpose %7, [1, 0] : vector<64x64xf32> to vector<64x64xf32>

    // Subtract max and exponentiate
    %9 = arith.subf %5, %8 : vector<64x64xf32>
    %10 = math.exp %9 : vector<64x64xf32>

    // Sum reduction: Reduce dimension 1 -> vector<64xf32>
    %11 = vector.multi_reduction <add>, %10, %cst [1] :
      vector<64x64xf32> to vector<64xf32>

    // Broadcast sums back to 64x64 and transpose
    %12 = vector.broadcast %11 : vector<64xf32> to vector<64x64xf32>
    %13 = vector.transpose %12, [1, 0] : vector<64x64xf32> to vector<64x64xf32>

    // Normalize
    %14 = arith.divf %10, %13 : vector<64x64xf32>

    // Vector write
    %15 = vector.transfer_write %14, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} :
      vector<64x64xf32>, tensor<64x64xf32>

    scf.forall.in_parallel {
      tensor.parallel_insert_slice %15 into %arg3[%4, 0] [64, 64] [1, 1]
    }
  }
  return
}
```

**What Happens:**
- Linalg operations converted to vector dialect operations
- `vector.transfer_read` loads entire 64x64 tile at once
- `vector.multi_reduction` performs SIMD reductions (max and sum)
- `vector.broadcast` and `vector.transpose` handle dimension alignment
- All arithmetic operations work on vectors, enabling SIMD execution
- `vector.transfer_write` stores results back

---

## Stage 5: Bufferized

**Key Characteristics:**
- Tensors eliminated, working directly with memrefs
- Vector operations read/write directly from/to memory
- No more tensor extract/insert operations
- Simplified control flow

**Code:**
```mlir
func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
  %cst = arith.constant dense<0.000000e+00> : vector<64xf32>
  %0 = ub.poison : f32
  %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32>
  %c0 = arith.constant 0 : index

  scf.forall (%arg2) in (16) {
    %1 = affine.apply #map(%arg2) // %1 = %arg2 * 64

    // Direct memref read
    %2 = vector.transfer_read %arg1[%1, %c0], %0 {in_bounds = [true, true]} :
      memref<1024x64xf32>, vector<64x64xf32>

    // Max reduction
    %3 = vector.multi_reduction <maxnumf>, %2, %cst_0 [1] :
      vector<64x64xf32> to vector<64xf32>
    %4 = vector.broadcast %3 : vector<64xf32> to vector<64x64xf32>
    %5 = vector.transpose %4, [1, 0] : vector<64x64xf32> to vector<64x64xf32>

    // Subtract and exp
    %6 = arith.subf %2, %5 : vector<64x64xf32>
    %7 = math.exp %6 : vector<64x64xf32>

    // Sum reduction
    %8 = vector.multi_reduction <add>, %7, %cst [1] :
      vector<64x64xf32> to vector<64xf32>
    %9 = vector.broadcast %8 : vector<64xf32> to vector<64x64xf32>
    %10 = vector.transpose %9, [1, 0] : vector<64x64xf32> to vector<64x64xf32>

    // Normalize
    %11 = arith.divf %7, %10 : vector<64x64xf32>

    // Direct memref write
    vector.transfer_write %11, %arg0[%1, %c0] {in_bounds = [true, true]} :
      vector<64x64xf32>, memref<1024x64xf32>
  }
  return
}
```

**What Happens:**
- All tensor operations converted to memref-based operations
- `scf.forall` no longer uses `shared_outs`, simplified to pure side effects
- Vector transfers work directly on input/output memrefs
- Memory layout is now explicit
- This representation is ready for GPU kernel extraction

---

## Stage 6: XeGPU-Initial

**Key Characteristics:**
- GPU kernel separated from host code
- `gpu.launch_func` invocation with grid/block dimensions
- XeGPU tensor descriptors for memory access
- Block-based load/store operations

**Code:**

**Host Side:**
```mlir
func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
  %c1 = arith.constant 1 : index
  %c16 = arith.constant 16 : index
  %c128 = arith.constant 128 : index
  gpu.launch_func @payload_kernel::@payload_kernel
    blocks in (%c16, %c1, %c1)
    threads in (%c128, %c1, %c1)
    args(%arg1 : memref<1024x64xf32>, %arg0 : memref<1024x64xf32>)
  return
}
```

**GPU Kernel:**
```mlir
gpu.module @payload_kernel [#xevm.target] {
  gpu.func @payload_kernel(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) kernel
    attributes {known_block_size = array<i32: 128, 1, 1>,
                known_grid_size = array<i32: 16, 1, 1>} {
    %cst = arith.constant dense<0.000000e+00> : vector<64xf32>
    %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32>
    %c64 = arith.constant 64 : index
    %block_id_x = gpu.block_id x
    %0 = arith.muli %block_id_x, %c64 overflow<nsw> : index

    // Create XeGPU tensor descriptor for load
    %1 = xegpu.create_nd_tdesc %arg0 : memref<1024x64xf32> ->
      !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>>

    // XeGPU block load
    %2 = xegpu.load_nd %1[%0, 0] :
      !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>> ->
      vector<64x64xf32>

    // Same compute operations as before
    %3 = vector.multi_reduction <maxnumf>, %2, %cst_0 [1] :
      vector<64x64xf32> to vector<64xf32>
    %4 = vector.broadcast %3 : vector<64xf32> to vector<64x64xf32>
    %5 = vector.transpose %4, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
    %6 = arith.subf %2, %5 : vector<64x64xf32>
    %7 = math.exp %6 : vector<64x64xf32>
    %8 = vector.multi_reduction <add>, %7, %cst [1] :
      vector<64x64xf32> to vector<64xf32>
    %9 = vector.broadcast %8 : vector<64xf32> to vector<64x64xf32>
    %10 = vector.transpose %9, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
    %11 = arith.divf %7, %10 : vector<64x64xf32>

    // Create XeGPU tensor descriptor for store
    %12 = xegpu.create_nd_tdesc %arg1 : memref<1024x64xf32> ->
      !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>>

    // XeGPU block store
    xegpu.store_nd %11, %12[%0, 0] :
      vector<64x64xf32>,
      !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>>

    gpu.return
  }
}
```

**What Happens:**
- Host function now calls `gpu.launch_func` with 16 blocks, 128 threads per block
- Separate `gpu.module` contains the kernel code
- `gpu.block_id x` replaces the loop iterator
- `xegpu.create_nd_tdesc` creates tensor descriptors for memory regions
- `xegpu.load_nd` performs hardware-optimized block loads
- `xegpu.store_nd` performs hardware-optimized block stores
- XeGPU operations map directly to Intel GPU instructions
- Boundary checking disabled for performance (sizes known at compile time)

---

## Stage 7: XeGPU-WG (Work-Group Optimized)

**Key Characteristics:**
- Additional layout hints for work-group optimization
- Sub-group layout specification: `sg_layout` and `sg_data`
- Optimized memory access patterns for Intel XeGPU

**Code (differences from xegpu-initial):**
```mlir
// Store operation now includes layout hints
xegpu.store_nd %11, %12[%0, 0]
  <{layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [8, 64]>}> :
  vector<64x64xf32>,
  !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
```

**What Happens:**
- The `layout` attribute provides explicit sub-group (SG) tiling information:
  - `sg_layout = [8, 1]`: Data is distributed across 8 sub-groups in the first dimension
  - `sg_data = [8, 64]`: Each sub-group handles an 8x64 slice of data
- This layout specification:
  - Matches the 64x64 total data size (8 sub-groups × 8 rows = 64 rows, 64 columns)
  - Optimizes coalesced memory accesses
  - Enables efficient SIMD execution within each sub-group
  - Aligns with Intel GPU hardware execution model (128 threads = 8 sub-groups of 16 threads)
- The layout is applied to the store operation to optimize write patterns
- Load operation remains unchanged as reads are typically more flexible

---

## Summary

The lowering pipeline progressively transforms abstract operations into hardware-specific instructions:

| Stage | Abstraction Level | Key Operations |
|-------|------------------|----------------|
| **initial** | High-level ML | `linalg.softmax` on full tensor |
| **tiled-softmax** | Tiled high-level | `linalg.softmax` on tiles, `scf.forall` |
| **decomposed** | Tiled computation | `linalg.generic` with reductions |
| **vectorized** | Vector operations | `vector.multi_reduction`, `vector.transfer_read/write` |
| **bufferized** | Memory-based | Direct memref operations with vectors |
| **xegpu-initial** | GPU-specific | `xegpu.load_nd`, `xegpu.store_nd`, `gpu.launch_func` |
| **xegpu-wg** | Hardware-optimized | Layout hints for sub-group optimization |

Each stage maintains the same computational semantics while providing increasingly detailed control over execution and memory access patterns, ultimately targeting efficient execution on Intel XeGPU hardware.
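
For reference, the row-wise computation that all seven stages preserve can be summarized in a short numpy sketch (the helper name and the explicit tile loop are illustrative only; in the pipeline the tile loop is the parallel `scf.forall` and, from stage 6 on, the GPU grid):

```python
import numpy as np

def tiled_softmax(x: np.ndarray, tile_rows: int = 64) -> np.ndarray:
    """Row-wise softmax over a (1024, 64) input, one 64x64 tile at a time."""
    out = np.empty_like(x)
    for r in range(0, x.shape[0], tile_rows):        # 16 tiles, scf.forall in the IR
        tile = x[r:r + tile_rows]                    # 64x64 slice of the input
        row_max = tile.max(axis=1, keepdims=True)    # Step 1: max reduction
        p = np.exp(tile - row_max)                   # Step 2: subtract + exp
        row_sum = p.sum(axis=1, keepdims=True)       # Step 3: sum reduction
        out[r:r + tile_rows] = p / row_sum           # Step 4: normalize
    return out

x = np.random.rand(1024, 64).astype(np.float32)
assert np.allclose(tiled_softmax(x).sum(axis=1), 1.0, atol=1e-5)
```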
From bf3a8c66a058cf4f7e8443d7247f2494437672a1 Mon Sep 17 00:00:00 2001
From: Charitha Saumya 
Date: Fri, 3 Apr 2026 22:51:42 +0000
Subject: [PATCH 23/38] save work

---
 docs/softmax_lowering.md | 158 ++++++++------------------------------
 1 file changed, 31 insertions(+), 127 deletions(-)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index f67779ae..c5608e37 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -1,7 +1,10 @@
-# Linalg softmax lowering in XeGPU pipeline
+# Linalg softmax lowering to XeGPU (currently supported in lighthouse)
 
 ## Overview
 
+**Assumptions:**
+The softmax dimension size is small (64 in this example).
+
 The lowering process consists of seven stages:
 1. **initial** - High-level tensor operations
 2. **tiled-softmax** - Tiled softmax operations
@@ -30,7 +33,7 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
 
 ## Stage 2: Tiled Softmax
 
-**Key Characteristics:**
+**Notes:**
 - Work distribution via `scf.forall` (16 parallel iterations)
 - Each tile processes 64x64 elements
 
@@ -61,63 +64,45 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
 
 ## Stage 3: Decomposed
 
-**Key Characteristics:**
+**Notes:**
 - Softmax decomposed into 4 constituent `linalg.generic` ops: max, sub+exp, sum, divide
 - Uses `structured.structured_decompose_interface` implemented by `linalg.softmax`
 
 **Code:**
 ```mlir
 func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
-  %cst = arith.constant 0.000000e+00 : f32
-  %cst_0 = arith.constant 0xFFC00000 : f32 // NaN: the identity for arith.maxnumf
-  %0 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32>
-  %1 = tensor.empty() : tensor<1024x64xf32>
+  // ...
 
   %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x64xf32>) {
     %3 = affine.apply #map(%arg2) // %3 = %arg2 * 64
-    %extracted_slice = tensor.extract_slice %0[%3, 0] [64, 64] [1, 1] :
-      tensor<1024x64xf32> to tensor<64x64xf32>
-    %extracted_slice_1 = tensor.extract_slice %arg3[%3, 0] [64, 64] [1, 1] :
-      tensor<1024x64xf32> to tensor<64x64xf32>
+    %extracted_slice = tensor.extract_slice ...
 
     // Step 1: Find max along dimension 1
     %4 = tensor.empty() : tensor<64xf32>
     %5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32>
-    %6 = linalg.generic {indexing_maps = [#map1, #map2],
-                         iterator_types = ["parallel", "reduction"]}
-      ins(%extracted_slice : tensor<64x64xf32>) outs(%5 : tensor<64xf32>) {
-      ^bb0(%in: f32, %out: f32):
+    %6 = linalg.generic // ...
       %11 = arith.maxnumf %in, %out : f32
-      linalg.yield %11 : f32
+      // ...
     } -> tensor<64xf32>
 
     // Step 2: Subtract max and exponentiate
-    %7 = linalg.generic {indexing_maps = [#map1, #map2, #map1],
-                         iterator_types = ["parallel", "parallel"]}
-      ins(%extracted_slice, %6 : tensor<64x64xf32>, tensor<64xf32>)
-      outs(%extracted_slice_1 : tensor<64x64xf32>) {
-      ^bb0(%in: f32, %in_2: f32, %out: f32):
+    %7 = linalg.generic // ...
      %11 = arith.subf %in, %in_2 : f32
      %12 = math.exp %11 : f32
-      linalg.yield %12 : f32
+      // ...
     } -> tensor<64x64xf32>
 
     // Step 3: Sum exponentials
     %8 = linalg.fill ins(%cst : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32>
-    %9 = linalg.generic {indexing_maps = [#map1, #map2],
-                         iterator_types = ["parallel", "reduction"]}
-      ins(%7 : tensor<64x64xf32>) outs(%8 : tensor<64xf32>) {
-      ^bb0(%in: f32, %out: f32):
+    %9 = linalg.generic // ...
      %11 = arith.addf %in, %out : f32
-      linalg.yield %11 : f32
+      // ...
 } -> tensor<64xf32>
 
     // Step 4: Normalize by sum
-    %10 = linalg.generic {indexing_maps = [#map1, #map2, #map1],
-                          iterator_types = ["parallel", "parallel"]}
-      ins(%7, %9 : tensor<64x64xf32>, tensor<64xf32>)
-      outs(%extracted_slice_1 : tensor<64x64xf32>) {
-      ^bb0(%in: f32, %in_2: f32, %out: f32):
+    %10 = linalg.generic // ...
      %11 = arith.divf %in, %in_2 : f32
-      linalg.yield %11 : f32
+      // ...
     } -> tensor<64x64xf32>
 
     scf.forall.in_parallel {
@@ -129,39 +114,21 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
 }
 ```
 
-**What Happens:**
-- The 1024x64 input is divided into 16 tiles of 64x64 each
-- Softmax algorithm made explicit:
-  1. **Max reduction**: Find maximum value per row (for numerical stability)
-  2. **Exp**: Compute exp(x - max) for each element
-  3. **Sum reduction**: Sum exponentials per row
-  4. **Normalize**: Divide each element by its row sum
-- Each tile is processed independently, enabling parallelization
-- Results are inserted back into the output tensor
-
 ---
 
 ## Stage 4: Vectorized
 
-**Key Characteristics:**
+**Notes:**
 - `linalg.generic` operations replaced with vector operations
-- SIMD-friendly representation using `vector<64x64xf32>`
-- Explicit vector multi-reductions
 - Vector transfers for reading/writing data
 
 **Code:**
 ```mlir
 func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
-  %cst = arith.constant dense<0.000000e+00> : vector<64xf32>
-  %0 = ub.poison : f32
-  %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32>
-  %c0 = arith.constant 0 : index
-  %1 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32>
-  %2 = tensor.empty() : tensor<1024x64xf32>
-
+  // ...
   %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) {
     %4 = affine.apply #map(%arg2) // %4 = %arg2 * 64
-    %extracted_slice = tensor.extract_slice %arg3[%4, 0] [64, 64] [1, 1]
+    %extracted_slice = tensor.extract_slice ...
 
     // Vector read: Load 64x64 tile
     %5 = vector.transfer_read %1[%4, %c0], %0 {in_bounds = [true, true]} :
@@ -201,32 +168,17 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
   return
 }
 ```
-
-**What Happens:**
-- Linalg operations converted to vector dialect operations
-- `vector.transfer_read` loads entire 64x64 tile at once
-- `vector.multi_reduction` performs SIMD reductions (max and sum)
-- `vector.broadcast` and `vector.transpose` handle dimension alignment
-- All arithmetic operations work on vectors, enabling SIMD execution
-- `vector.transfer_write` stores results back
-
 ---
 
 ## Stage 5: Bufferized
 
-**Key Characteristics:**
+**Notes:**
 - Tensors eliminated, working directly with memrefs
-- Vector operations read/write directly from/to memory
-- No more tensor extract/insert operations
-- Simplified control flow
 
 **Code:**
 ```mlir
 func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
-  %cst = arith.constant dense<0.000000e+00> : vector<64xf32>
-  %0 = ub.poison : f32
-  %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32>
-  %c0 = arith.constant 0 : index
+  // ...
scf.forall (%arg2) in (16) { %1 = affine.apply #map(%arg2) // %1 = %arg2 * 64 @@ -262,31 +214,21 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { } ``` -**What Happens:** -- All tensor operations converted to memref-based operations -- `scf.forall` no longer uses `shared_outs`, simplified to pure side effects -- Vector transfers work directly on input/output memrefs -- Memory layout is now explicit -- This representation is ready for GPU kernel extraction - --- ## Stage 6: XeGPU-Initial -**Key Characteristics:** -- GPU kernel separated from host code +**Notes** +- GPU kernel separated from host code (Gpu Outlining) - `gpu.launch_func` invocation with grid/block dimensions -- XeGPU tensor descriptors for memory access -- Block-based load/store operations +- Use `vector-to-xegpu` **Code:** **Host Side:** ```mlir func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index - %c128 = arith.constant 128 : index + // ... gpu.launch_func @payload_kernel::@payload_kernel blocks in (%c16, %c1, %c1) threads in (%c128, %c1, %c1) @@ -301,9 +243,7 @@ gpu.module @payload_kernel [#xevm.target] { gpu.func @payload_kernel(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) kernel attributes {known_block_size = array, known_grid_size = array} { - %cst = arith.constant dense<0.000000e+00> : vector<64xf32> - %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32> - %c64 = arith.constant 64 : index + // ... %block_id_x = gpu.block_id x %0 = arith.muli %block_id_x, %c64 overflow : index @@ -343,24 +283,14 @@ gpu.module @payload_kernel [#xevm.target] { } ``` -**What Happens:** -- Host function now calls `gpu.launch_func` with 16 blocks, 128 threads per block -- Separate `gpu.module` contains the kernel code -- `gpu.block_id x` replaces the loop iterator -- `xegpu.create_nd_tdesc` creates tensor descriptors for memory regions -- `xegpu.load_nd` performs hardware-optimized block loads -- `xegpu.store_nd` performs hardware-optimized block stores -- XeGPU operations map directly to Intel GPU instructions -- Boundary checking disabled for performance (sizes known at compile time) - --- ## Stage 7: XeGPU-WG (Work-Group Optimized) -**Key Characteristics:** -- Additional layout hints for work-group optimization -- Sub-group layout specification: `sg_layout` and `sg_data` -- Optimized memory access patterns for Intel XeGPU +**Notes** +- Sets the layout for anchor xegpu ops. Each Wg consistes of [8, 1] subgroups + doing 8x64 softmax slice. +- Only sets the layotu for `store_nd`. Layout propagation does the rest. 
**Code (differences from xegpu-initial):**
 
 ```mlir
 // Store operation now includes layout hints
 xegpu.store_nd %11, %12[%0, 0]
   <{layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [8, 64]>}> :
   vector<64x64xf32>,
   !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
 ```
 
-**What Happens:**
-- The `layout` attribute provides explicit sub-group (SG) tiling information:
-  - `sg_layout = [8, 1]`: Data is distributed across 8 sub-groups in the first dimension
-  - `sg_data = [8, 64]`: Each sub-group handles an 8x64 slice of data
-- This layout specification:
-  - Matches the 64x64 total data size (8 sub-groups × 8 rows = 64 rows, 64 columns)
-  - Optimizes coalesced memory accesses
-  - Enables efficient SIMD execution within each sub-group
-  - Aligns with Intel GPU hardware execution model (128 threads = 8 sub-groups of 16 threads)
-- The layout is applied to the store operation to optimize write patterns
-- Load operation remains unchanged as reads are typically more flexible
-
 ---
-## Summary
-
-The lowering pipeline progressively transforms abstract operations into hardware-specific instructions:
-
-| Stage | Abstraction Level | Key Operations |
-|-------|------------------|----------------|
-| **initial** | High-level ML | `linalg.softmax` on full tensor |
-| **tiled-softmax** | Tiled high-level | `linalg.softmax` on tiles, `scf.forall` |
-| **decomposed** | Tiled computation | `linalg.generic` with reductions |
-| **vectorized** | Vector operations | `vector.multi_reduction`, `vector.transfer_read/write` |
-| **bufferized** | Memory-based | Direct memref operations with vectors |
-| **xegpu-initial** | GPU-specific | `xegpu.load_nd`, `xegpu.store_nd`, `gpu.launch_func` |
-| **xegpu-wg** | Hardware-optimized | Layout hints for sub-group optimization |
-
-Each stage maintains the same computational semantics while providing increasingly detailed control over execution and memory access patterns, ultimately targeting efficient execution on Intel XeGPU hardware.
+# Supporting larger Softmax dimension sizes.

From b083887154b083d184430c0c87baf887d155dcca Mon Sep 17 00:00:00 2001
From: Charitha Saumya 
Date: Fri, 3 Apr 2026 23:03:15 +0000
Subject: [PATCH 24/38] save work

---
 examples/xegpu/softmax.py                     |  8 --
 lighthouse/schedule/xegpu/softmax_schedule.py | 90 +------------------
 2 files changed, 3 insertions(+), 95 deletions(-)

diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py
index 8049f0f4..afca86e3 100644
--- a/examples/xegpu/softmax.py
+++ b/examples/xegpu/softmax.py
@@ -181,12 +181,6 @@ def parse_cli():
         default=16,
         help="Subgroup size.",
     )
-    parser.add_argument(
-        "--reduction-step-size",
-        type=int,
-        default=16,
-        help="Step size for reduction loop tiling (optional).",
-    )
     parser.add_argument(
         "--nruns",
         type=int,
@@ -212,7 +206,6 @@ def parse_cli():
             "tiled",
             "vectorized",
             "bufferized",
-            "gpu-outlining",
             "xegpu-initial",
             "xegpu-wg",
             "final",
@@ -244,7 +237,6 @@ def parse_cli():
         "wg_rows": args.wg_rows,
         "sg_rows": args.sg_rows,
         "subgroup_size": args.subgroup_size,
-        "reduction_step_size": args.reduction_step_size,
     }
 
     M, N = args.sizes
diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py
index bda5f819..0907bc5a 100644
--- a/lighthouse/schedule/xegpu/softmax_schedule.py
+++ b/lighthouse/schedule/xegpu/softmax_schedule.py
@@ -39,7 +39,6 @@ def get_softmax_schedule_module(
         - sg_rows: Number of rows per subgroup
         - subgroup_size: Size of subgroup
         - sizes: Tuple with the sizes of the input tensors (e.g. 
(M, N)) - - reduction_step_size: Optional step size for tiling reduction loops Returns: MLIR module containing the transform schedule @@ -140,72 +139,7 @@ def bundle_xegpu_softmax_schedule( transform.AnyOpType.get(), func, ops=["linalg.softmax"] ) structured.structured_decompose_interface(anytype, softmax_ops) - # transform.print_(target=func, name="After structured_decompose_interface") - linalg_ops = match_and_split( - func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 - ) - max_reduction = linalg_ops[1] - max_center_and_exp_op = linalg_ops[2] - sum_reduction = linalg_ops[4] - div_op = linalg_ops[5] - - reduction_step_size = parameters["reduction_step_size"] - - # Tile the division op and fuse the sub+exp producer into it - _, div_loop = structured.TileUsingForOp( - div_op, sizes=[0, reduction_step_size] - ).results - - # Fuse max_center_and_exp_op into the div loop - _, fused_loop = structured.structured_fuse_into_containing_op( - anytype, - anytype, - producer_op=max_center_and_exp_op, - containing_op=div_loop, - ) - - # Tile the sum reduction and fuse the sub+exp producer into it - _, _, _, sum_loop = structured.structured_tile_reduction_using_for( - [anytype], - anytype, - anytype, - anytype, - target=sum_reduction, - tile_sizes=[0, reduction_step_size], - ) - - func = transform.get_parent_op( - anytype, - fused_loop, - op_name="func.func", - deduplicate=True, - ) - - # Re-match and split linalg generic ops, there are 5 at this point - linalg_ops = match_and_split(func, ops={"linalg.generic"}, nhandles=5) - max_center_and_exp_op = linalg_ops[1] - - # Fuse max_center_and_exp_op into the sum reduction loop - _, fused_sum_loop = structured.structured_fuse_into_containing_op( - anytype, - anytype, - producer_op=max_center_and_exp_op, - containing_op=sum_loop, - ) - - # Tile the max reduction. - max_reduction = linalg_ops[0] - structured.structured_tile_reduction_using_for( - [anytype], - anytype, - anytype, - anytype, - target=max_reduction, - tile_sizes=[0, reduction_step_size], - ) - - # Cleanup after tiling and fusion transform.apply_cse(func) canonicalize(func) @@ -237,17 +171,6 @@ def bundle_xegpu_softmax_schedule( transform.apply_cse(mod) canonicalize(mod) - # promote memref.alloc to memref.alloca in payload function - func = match(mod, ops={"func.func"}) - func = apply_registered_pass( - func, - "promote-buffers-to-stack", - options={ - "max-alloc-size-in-bytes": "8192", - "max-rank-of-allocated-memref": "2", - }, - ) - if stop_at_stage == "bufferized": raise PipelineInterrupt() @@ -277,9 +200,6 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) - if stop_at_stage == "gpu-outlining": - raise PipelineInterrupt() - # set xevm target mod = apply_registered_pass( mod, @@ -294,19 +214,15 @@ def bundle_xegpu_softmax_schedule( gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") transform.apply_cse(gpu_func) - # Cleanup. - transform.apply_cse(mod) - canonicalize(mod) - if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() # Set layout attributes for xegpu.store_nd operations. # FIXME: currently ecah subgroup is handling the entire row. 
- store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=5) + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) sg_layout = [parameters["sg_rows"], 1] - sg_data = [parameters["sg_rows"], parameters["reduction_step_size"]] - xegpu.set_anchor_layout(store_ops[-1], sg_layout=sg_layout, sg_data=sg_data) + sg_data = [parameters["sg_rows"], parameters["sizes"][1]] + xegpu.set_anchor_layout(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() From 76a02bdac6643f889a51be9e40f3d9f857822eab Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 23:37:26 +0000 Subject: [PATCH 25/38] tiled reduction doc --- docs/softmax_lowering.md | 331 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 330 insertions(+), 1 deletion(-) diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md index c5608e37..a31e6f4c 100644 --- a/docs/softmax_lowering.md +++ b/docs/softmax_lowering.md @@ -303,4 +303,333 @@ xegpu.store_nd %11, %12[%0, 0] --- -# Supporting larger Softmax dimension sizes. +# Supporting larger Softmax dimension sizes + +When the softmax dimension is larger than what can fit efficiently in registers, additional tiling and fusion transformations are applied to the reduction dimension. This section shows the intermediate stages between "decomposed" and "vectorized". + +**Approach:** Tile reductions along dimension 1 (step size = 16) and fuse producers into consumers to enable streaming computation. + +--- + +## Decomposed → Tiled: Stage A - Tile div op + +**Notes:** +- Tile the division operation with step size 16 along dimension 1 +- Creates `scf.for` loop iterating over 64 elements in chunks of 16 + +**Key Changes:** +```mlir +// Before: Single division linalg.generic over 64x64 +%11 = linalg.generic {...} ins(%8, %10 : tensor<64x64xf32>, tensor<64xf32>) + outs(%extracted_slice_0 : tensor<64x64xf32>) { ... } -> tensor<64x64xf32> + +// After: Division tiled into 64x16 chunks +%11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) { + %extracted_slice_3 = tensor.extract_slice %8[0, %arg4] [64, 16] [1, 1] + %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) + outs(%extracted_slice_5 : tensor<64x16xf32>) { ... } -> tensor<64x16xf32> + %inserted_slice = tensor.insert_slice %12 into %arg5[0, %arg4] [64, 16] [1, 1] + scf.yield %inserted_slice +} +``` + +--- + +## Stage B - Fuse sub+exp into div loop + +**Notes:** +- Fuse the `sub+exp` producer (max_center_and_exp_op) into the div loop +- Recomputes exp values on-the-fly instead of materializing full 64x64 tensor + +**Key Changes:** +```mlir +%11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) { + %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1] + + // Fused: sub+exp computed per 16-element chunk + %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) + outs(%extracted_slice_5 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_8: f32, %out: f32): + %14 = arith.subf %in, %in_8 : f32 + %15 = math.exp %14 : f32 + linalg.yield %15 : f32 + } -> tensor<64x16xf32> + + // Division operation + %13 = linalg.generic {...} ins(%12, %extracted_slice_6 : tensor<64x16xf32>, tensor<64xf32>) + outs(%extracted_slice_7 : tensor<64x16xf32>) { ... } -> tensor<64x16xf32> + // ... 
+} +``` + +--- + +## Stage C - Tile sum reduction + +**Notes:** +- Tile the sum reduction using `structured_tile_reduction_using_for` +- Creates intermediate accumulator tensor (64x16) +- Final reduction via `linalg.reduce` over dimension 1 + +**Key Changes:** +```mlir +// Tiled sum reduction with intermediate accumulator +%10 = tensor.empty() : tensor<64x16xf32> +%11 = linalg.fill ins(%cst_2 : f32) outs(%10 : tensor<64x16xf32>) -> tensor<64x16xf32> + +%12 = scf.for %arg4 = %c0_3 to %c64 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) { + %extracted_slice_7 = tensor.extract_slice %8[0, %arg4] [64, 16] [1, 1] + %14 = linalg.generic {...} ins(%extracted_slice_7 : tensor<64x16xf32>) + outs(%extracted_slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %15 = arith.addf %in, %out : f32 + linalg.yield %15 : f32 + } -> tensor<64x16xf32> + // ... +} + +// Final reduction to 64xf32 +%reduced = linalg.reduce ins(%12 : tensor<64x16xf32>) outs(%9 : tensor<64xf32>) dimensions = [1] + (%in: f32, %init: f32) { + %14 = arith.addf %in, %init : f32 + linalg.yield %14 : f32 + } +``` + +--- + +## Stage D - Fuse sub+exp into sum loop + +**Notes:** +- Fuse `sub+exp` into the sum reduction loop +- Stream computation: compute exp and accumulate in same loop + +**Key Changes:** +```mlir +%12 = scf.for %arg4 = %c0_3 to %c64 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) { + %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1] + + // Fused: sub+exp + %14 = linalg.generic {...} ins(%extracted_slice_7, %extracted_slice_8 : tensor<64x16xf32>, tensor<64xf32>) + outs(%extracted_slice_9 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_11: f32, %out: f32): + %16 = arith.subf %in, %in_11 : f32 + %17 = math.exp %16 : f32 + linalg.yield %17 : f32 + } -> tensor<64x16xf32> + + // Accumulate sum + %15 = linalg.generic {...} ins(%14 : tensor<64x16xf32>) + outs(%extracted_slice_10 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %16 = arith.addf %in, %out : f32 + linalg.yield %16 : f32 + } -> tensor<64x16xf32> + // ... +} +``` + +--- + +## Stage E - Tile max reduction + +**Notes:** +- Tile max reduction similar to sum reduction +- Creates 64x16 intermediate accumulator +- Final reduction via `linalg.reduce` with maxnumf + +**Key Changes:** +```mlir +// Tiled max reduction +%7 = tensor.empty() : tensor<64x16xf32> +%8 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<64x16xf32>) -> tensor<64x16xf32> + +%9 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %8) -> (tensor<64x16xf32>) { + %extracted_slice_12 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1] + %16 = linalg.generic {...} ins(%extracted_slice_12 : tensor<64x16xf32>) + outs(%extracted_slice_13 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %17 = arith.maxnumf %in, %out : f32 + linalg.yield %17 : f32 + } -> tensor<64x16xf32> + // ... +} + +// Final max reduction +%reduced = linalg.reduce ins(%9 : tensor<64x16xf32>) outs(%6 : tensor<64xf32>) dimensions = [1] + (%in: f32, %init: f32) { + %16 = arith.maxnumf %in, %init : f32 + linalg.yield %16 : f32 + } +``` + +**Result:** Now all three major computations (max, sum, div) are tiled and operate on 64x16 chunks, with exp computation fused into both sum and div loops. 
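+
+To sanity-check the algebra of Stages A-E, here is a hedged numpy sketch of the same streaming schedule (the helper and chunk loop are illustrative; the pipeline expresses them as the `scf.for` loops above, with `step` playing the role of the reduction step size):
+
+```python
+import numpy as np
+
+def streaming_row_softmax(x: np.ndarray, step: int = 16) -> np.ndarray:
+    rows = x.shape[0]
+    # Stage E: tiled max reduction into a (rows, step) accumulator,
+    # then a final reduce (the linalg.reduce over dimension 1).
+    acc = np.full((rows, step), -np.inf, dtype=x.dtype)
+    for j in range(0, x.shape[1], step):
+        acc = np.maximum(acc, x[:, j:j + step])
+    m = acc.max(axis=1)
+    # Stages C/D: tiled sum of exp(x - max), with exp fused into the loop.
+    acc = np.zeros((rows, step), dtype=x.dtype)
+    for j in range(0, x.shape[1], step):
+        acc += np.exp(x[:, j:j + step] - m[:, None])
+    s = acc.sum(axis=1)
+    # Stages A/B: tiled normalize, recomputing exp per chunk (the fusion).
+    out = np.empty_like(x)
+    for j in range(0, x.shape[1], step):
+        out[:, j:j + step] = np.exp(x[:, j:j + step] - m[:, None]) / s[:, None]
+    return out
+```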
+
+---
+
+## Stage F - Vectorization
+
+**Notes:**
+- Convert tiled linalg operations to vector operations
+- `scf.for` loops remain but operate on vectors
+- Vector size: 64x16 for tiled operations
+
+**Code:**
+```mlir
+func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
+  // ...
+  %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) {
+    // ...
+
+    // Vectorized max reduction loop
+    %6 = vector.transfer_write %cst_1, %5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32>
+    %7 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %6) -> (tensor<64x16xf32>) {
+      %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32>
+      %16 = vector.transfer_read %arg5[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32>
+      %17 = arith.maxnumf %15, %16 : vector<64x16xf32>
+      %18 = vector.transfer_write %17, %arg5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32>
+      scf.yield %18 : tensor<64x16xf32>
+    }
+    %8 = vector.transfer_read %7[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32>
+    %9 = vector.multi_reduction <maxnumf>, %8, %cst_2 [1] : vector<64x16xf32> to vector<64xf32>
+
+    // Vectorized sum reduction loop with fused sub+exp
+    %11 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %10) -> (tensor<64x16xf32>) {
+      %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32>
+      %16 = vector.broadcast %9 : vector<64xf32> to vector<16x64xf32>
+      %17 = vector.transpose %16, [1, 0] : vector<16x64xf32> to vector<64x16xf32>
+      %18 = arith.subf %15, %17 : vector<64x16xf32>
+      %19 = math.exp %18 : vector<64x16xf32>
+      %20 = vector.transfer_read %arg5[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32>
+      %21 = arith.addf %19, %20 : vector<64x16xf32>
+      %22 = vector.transfer_write %21, %arg5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32>
+      scf.yield %22 : tensor<64x16xf32>
+    }
+    %12 = vector.transfer_read %11[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32>
+    %13 = vector.multi_reduction <add>, %12, %cst_0 [1] : vector<64x16xf32> to vector<64xf32>
+
+    // Vectorized div loop with fused sub+exp
+    %14 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %extracted_slice) -> (tensor<64x64xf32>) {
+      %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32>
+      %16 = vector.broadcast %9 : vector<64xf32> to vector<16x64xf32>
+      %17 = vector.transpose %16, [1, 0] : vector<16x64xf32> to vector<64x16xf32>
+      %18 = arith.subf %15, %17 : vector<64x16xf32>
+      %19 = math.exp %18 : vector<64x16xf32>
+      %20 = vector.broadcast %13 : vector<64xf32> to vector<16x64xf32>
+      %21 = vector.transpose %20, [1, 0] : vector<16x64xf32> to vector<64x16xf32>
+      %22 = arith.divf %19, %21 : vector<64x16xf32>
+      %23 = vector.transfer_write %22, %arg5[%c0, %arg4] : vector<64x16xf32>, tensor<64x64xf32>
+      scf.yield %23 : tensor<64x64xf32>
+    }
+  }
+  // ...
+}
+```
+
+---
+
+## Stage G - Bufferization
+
+**Notes:**
+- Convert tensors to memrefs
+- Allocate a buffer for the 64x16 accumulator: `memref.alloc()`
+
+**Code:**
+```mlir
+func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
+  // ...
+  scf.forall (%arg2) in (16) {
+    %1 = affine.apply #map(%arg2)
+    %subview = memref.subview %arg0[%1, 0] [64, 64] [1, 1]
+
+    // Allocate accumulator buffer
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<64x16xf32>
+
+    // Max reduction loop
+    vector.transfer_write %cst_1, %alloc[%c0, %c0] : vector<64x16xf32>, memref<64x16xf32>
+    scf.for %arg3 = %c0 to %c64 step %c16 {
+      %6 = vector.transfer_read %arg1[%1, %arg3], %0 : memref<1024x64xf32>, vector<64x16xf32>
+      %7 = vector.transfer_read %alloc[%c0, %c0], %0 : memref<64x16xf32>, vector<64x16xf32>
+      %8 = arith.maxnumf %6, %7 : vector<64x16xf32>
+      vector.transfer_write %8, %alloc[%c0, %c0] : vector<64x16xf32>, memref<64x16xf32>
+    }
+    %2 = vector.transfer_read %alloc[%c0, %c0], %0 : memref<64x16xf32>, vector<64x16xf32>
+    %3 = vector.multi_reduction <maxnumf>, %2, %cst_2 [1] : vector<64x16xf32> to vector<64xf32>
+
+    // Sum reduction loop (reuses %alloc)
+    // ...
+
+    // Div loop (writes to %subview)
+    // ...
+  }
+}
+```
+
+---
+
+## Stage H - Promote buffers to stack
+
+**Notes:**
+- Convert `memref.alloc()` to `memref.alloca()` for stack allocation
+- Reduces memory allocation overhead
+
+**Code:**
+```mlir
+scf.forall (%arg2) in (16) {
+  %1 = affine.apply #map(%arg2)
+  %subview = memref.subview %arg0[%1, 0] [64, 64] [1, 1]
+
+  // Stack allocation instead of heap
+  %alloca = memref.alloca() {alignment = 64 : i64} : memref<64x16xf32>
+
+  // ... same operations using %alloca ...
+}
+```
+
+---
+
+## Stage I - GPU outlining
+
+**Notes:**
+- Convert `scf.forall` to `scf.parallel`, then to `gpu.launch`
+- Extract GPU kernel into separate `gpu.module`
+- Set thread count: 128 threads = (64 rows / 8 sg_rows) × 16 subgroup_size
+
+**Host Side:**
+```mlir
+func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
+  %c16 = arith.constant 16 : index
+  %c1 = arith.constant 1 : index
+  %c128 = arith.constant 128 : index
+  gpu.launch_func @payload_kernel::@payload_kernel
+    blocks in (%c16, %c1, %c1)
+    threads in (%c128, %c1, %c1)
+    args(%arg0 : memref<1024x64xf32>, %arg1 : memref<1024x64xf32>)
+  return
+}
+```
+
+**GPU Kernel:**
+```mlir
+gpu.module @payload_kernel {
+  gpu.func @payload_kernel(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) kernel
+    attributes {known_block_size = array<i32: 128, 1, 1>,
+                known_grid_size = array<i32: 16, 1, 1>} {
+    %block_id_x = gpu.block_id x
+    %1 = arith.muli %block_id_x, %c64 overflow<nsw> : index
+    %subview = memref.subview %arg0[%1, 0] [64, 64] [1, 1]
+    %alloca = memref.alloca() {alignment = 64 : i64} : memref<64x16xf32>
+
+    // Three reduction loops (max, sum, div) with same structure
+    scf.for %arg2 = %c0 to %c64 step %c16 {
+      // Max: accumulate max values
+      // Sum: compute & accumulate exp(x - max)
+      // Div: compute exp(x - max) / sum
+    }
+
+    gpu.return
+  }
+}
+```
+
+**Summary:** At this stage, the kernel processes 64x16 chunks in streaming fashion through three sequential loops, minimizing memory footprint.
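+
+The launch geometry follows directly from the schedule parameters. A hypothetical helper illustrating the arithmetic (parameter names follow the example driver; not part of the pipeline itself):
+
+```python
+def launch_dims(M: int = 1024, wg_rows: int = 64, sg_rows: int = 8,
+                subgroup_size: int = 16) -> tuple[int, int]:
+    blocks = M // wg_rows                           # 1024 // 64 = 16 workgroups
+    threads = (wg_rows // sg_rows) * subgroup_size  # (64 // 8) * 16 = 128 threads
+    return blocks, threads
+```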
From 4d6ae0a5b491fac31bfb2f4edfb9b1541fc35e3f Mon Sep 17 00:00:00 2001
From: Charitha Saumya 
Date: Fri, 3 Apr 2026 23:55:14 +0000
Subject: [PATCH 26/38] save work

---
 docs/softmax_lowering.md | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index a31e6f4c..311746bc 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -305,9 +305,9 @@ xegpu.store_nd %11, %12[%0, 0]
 
 # Supporting larger Softmax dimension sizes
 
-When the softmax dimension is larger than what can fit efficiently in registers, additional tiling and fusion transformations are applied to the reduction dimension. This section shows the intermediate stages between "decomposed" and "vectorized".
+Previously we tiled all ops in the parallel dimension only (i.e., the non-softmax dim). Handling a larger softmax dimension requires tiling the softmax constituent ops in that dimension.
 
-**Approach:** Tile reductions along dimension 1 (step size = 16) and fuse producers into consumers to enable streaming computation.
+**Approach:** Tile reductions along dimension 1 (using appropriate step size) and fuse producers into consumers to enable streaming computation.
 
 ---
 
@@ -320,16 +320,27 @@
 **Key Changes:**
 ```mlir
 // Before: Single division linalg.generic over 64x64
-%11 = linalg.generic {...} ins(%8, %10 : tensor<64x64xf32>, tensor<64xf32>)
-    outs(%extracted_slice_0 : tensor<64x64xf32>) { ... } -> tensor<64x64xf32>
+scf.forall ... {
+  // Max, Center+Exp, Sum ops ...
+  %11 = linalg.generic {...} ins(%8, %10 : tensor<64x64xf32>, tensor<64xf32>) outs(%extracted_slice_0 : tensor<64x64xf32>) {
+  ^bb0(%in: f32, %in_2: f32, %out: f32):
+    %12 = arith.divf %in, %in_2 : f32
+    linalg.yield %12 : f32
+  } -> tensor<64x64xf32>
+}
 
 // After: Division tiled into 64x16 chunks
-%11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) {
-  %extracted_slice_3 = tensor.extract_slice %8[0, %arg4] [64, 16] [1, 1]
-  %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>)
-    outs(%extracted_slice_5 : tensor<64x16xf32>) { ... } -> tensor<64x16xf32>
-  %inserted_slice = tensor.insert_slice %12 into %arg5[0, %arg4] [64, 16] [1, 1]
-  scf.yield %inserted_slice
+scf.forall ... {
+  %11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) {
+    // Max, Center+Exp, Sum ops ...
+    %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) outs(%extracted_slice_5 : tensor<64x16xf32>) {
+    ^bb0(%in: f32, %in_6: f32, %out: f32):
+      %13 = arith.divf %in, %in_6 : f32
+      linalg.yield %13 : f32
+    } -> tensor<64x16xf32>
+    %inserted_slice = tensor.insert_slice %12 into %arg5[0, %arg4] [64, 16] [1, 1] : tensor<64x16xf32> into tensor<64x64xf32>
+    scf.yield %inserted_slice : tensor<64x64xf32>
+  }
+}
+```

From 5ba1e679f0d04ec37a5de715b9703af0c85dad07 Mon Sep 17 00:00:00 2001
From: Charitha Saumya 
Date: Wed, 8 Apr 2026 01:18:39 +0000
Subject: [PATCH 27/38] add attention lowering doc:

---
 docs/IREEAttentionLowering.md | 323 ++++++++++++++++++++++++++++++++++
 1 file changed, 323 insertions(+)
 create mode 100644 docs/IREEAttentionLowering.md

diff --git a/docs/IREEAttentionLowering.md b/docs/IREEAttentionLowering.md
new file mode 100644
index 00000000..e378e6fa
--- /dev/null
+++ b/docs/IREEAttentionLowering.md
@@ -0,0 +1,323 @@
+# IREE Attention Lowering Pipeline
+
+## 1. ConvertAttentionToOnlineAttentionPass
+
+### Overview
+
+The `ConvertAttentionToOnlineAttentionPass` transforms a standard (offline) attention operation (`iree_linalg_ext.attention`) into an **online attention** operation (`iree_linalg_ext.online_attention`). Online attention computes attention in a **tiled/streaming** fashion, maintaining running max and running sum accumulators to perform numerically stable softmax incrementally — this is the core idea behind **FlashAttention**.
+
+### Why?
+
+Standard attention computes `softmax(Q @ K^T * scale) @ V` in a single monolithic step, requiring the entire attention matrix to be materialized in memory. Online attention tiles the computation over the key/value sequence dimension, updating partial results with running statistics, so the attention matrix is never materialized and the extra memory is **O(1)** in the key/value sequence length.
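+
+To make the running-statistics idea concrete before looking at the IR, here is a hedged numpy sketch of a single online-attention step over one K/V tile (pure illustration; the variable names mirror the decomposition detailed in section 2):
+
+```python
+import numpy as np
+
+def online_attention_step(q, k_tile, v_tile, scale, acc, m_old, s_old):
+    # One K/V tile update, mirroring Steps 1-5 of DecomposeAttentionPass.
+    s_mat = (q @ k_tile.T) * scale                # Step 1: S = Q @ K^T * scale
+    m_new = np.maximum(m_old, s_mat.max(axis=1))  # Step 2: running max
+    p = np.exp(s_mat - m_new[:, None])            # Step 3: P = exp(S - new_max)
+    alpha = np.exp(m_old - m_new)                 #         correction factor
+    s_new = alpha * s_old + p.sum(axis=1)         # Step 4: running sum
+    acc_new = alpha[:, None] * acc + p @ v_tile   # Step 5: rescale acc, add P @ V
+    return acc_new, m_new, s_new
+
+# After streaming over all K/V tiles, normalize once: out = acc / s[:, None]
+```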
+ +### Before the Pass + +A standard `iree_linalg_ext.attention` op with Q, K, V, scale, and output: + +```mlir +%result = iree_linalg_ext.attention { + indexing_maps = [ + affine_map<(b, m, k1, k2, n) -> (b, m, k1)>, // Q + affine_map<(b, m, k1, k2, n) -> (b, k2, k1)>, // K + affine_map<(b, m, k1, k2, n) -> (b, k2, n)>, // V + affine_map<(b, m, k1, k2, n) -> ()>, // scale + affine_map<(b, m, k1, k2, n) -> (b, m, n)> // output + ]} + ins(%Q, %K, %V, %scale : tensor<1x4048x64xf16>, tensor<1x4048x64xf16>, + tensor<1x4048x64xf16>, f16) + outs(%output : tensor<1x4048x64xf32>) + -> tensor<1x4048x64xf32> +``` + +### After the Pass + +The op is converted to `iree_linalg_ext.online_attention` with two additional accumulator outputs — **running max** and **running sum** — initialized to `-inf` and `0` respectively: + +```mlir +// Initialize accumulators +%neg_inf = arith.constant dense<0xFF800000> : tensor<1x4048xf32> // -inf for max +%zeros = arith.constant dense<0.0> : tensor<1x4048xf32> // 0 for sum + +// Online attention with streaming accumulators +%result:3 = iree_linalg_ext.online_attention { + indexing_maps = [ + affine_map<(b, m, k1, k2, n) -> (b, m, k1)>, // Q + affine_map<(b, m, k1, k2, n) -> (b, k2, k1)>, // K + affine_map<(b, m, k1, k2, n) -> (b, k2, n)>, // V + affine_map<(b, m, k1, k2, n) -> ()>, // scale + affine_map<(b, m, k1, k2, n) -> (b, m, n)>, // output (acc) + affine_map<(b, m, k1, k2, n) -> (b, m)>, // running max + affine_map<(b, m, k1, k2, n) -> (b, m)> // running sum + ]} + ins(%Q, %K, %V, %scale : tensor<1x4048x64xf16>, tensor<1x4048x64xf16>, + tensor<1x4048x64xf16>, f16) + outs(%output, %neg_inf, %zeros : tensor<1x4048x64xf32>, + tensor<1x4048xf32>, + tensor<1x4048xf32>) + -> tensor<1x4048x64xf32>, tensor<1x4048xf32>, tensor<1x4048xf32> + +// Final normalization: divide accumulated output by the final sum +%final = linalg.generic { + indexing_maps = [ + affine_map<(b, m, n) -> (b, m, n)>, // accumulated output + affine_map<(b, m, n) -> (b, m)>, // final sum + affine_map<(b, m, n) -> (b, m, n)> // normalized output + ], + iterator_types = ["parallel", "parallel", "parallel"]} + ins(%result#0, %result#2 : ...) + outs(%empty : ...) { + ^bb0(%acc: f32, %sum: f32, %out: f32): + %normalized = arith.divf %acc, %sum : f32 + linalg.yield %normalized : f32 +} -> tensor<1x4048x64xf32> +``` + +### Key Transformations + +| Aspect | Before | After | +|--------|--------|-------| +| **Op** | `iree_linalg_ext.attention` | `iree_linalg_ext.online_attention` | +| **Outputs** | 1 (result) | 3 (result, max, sum) | +| **Max accumulator** | N/A | Initialized to `-inf` | +| **Sum accumulator** | N/A | Initialized to `0.0` | +| **Post-processing** | None | Division by final sum (normalization) | +| **Memory** | Materializes full attention matrix | Streams over K/V tiles | + +### Significance in the Pipeline + +This pass is a critical step in IREE's attention lowering pipeline. After this conversion, subsequent passes can **tile the online_attention op along the K2 (key sequence) dimension**, processing chunks of keys/values at a time while maintaining numerically stable softmax via the running max/sum — exactly the FlashAttention algorithm. + +--- + +## 2. DecomposeAttentionPass + +### Overview + +The `DecomposeAttentionPass` (`iree-linalg-ext-decompose-attention`) runs **after** tiling has been applied to the online attention op. 
It decomposes each tiled `iree_linalg_ext.online_attention` op into a sequence of primitive `linalg.generic` operations that implement the online softmax + attention algorithm explicitly. + +This is the pass that eliminates all custom attention ops and produces standard linalg operations that the rest of the compiler knows how to handle (vectorize, bufferize, map to hardware intrinsics, etc.). + +### Pipeline Context + +By the time `DecomposeAttentionPass` runs, the IR has been through: + +1. `ConvertAttentionToOnlineAttention` — introduced online_attention + max/sum accumulators +2. `TileAndDistributeToWorkgroups` — tiled across batch and query-sequence dims +3. `GPUApplyTilingLevel` (multiple times) — tiled the K2 (key-sequence) reduction dimension into chunks (e.g., tiles of 64 or 128) + +So the input to this pass is a **tiled** online_attention operating on a slice of K/V. + +### Before the Pass (Tiled Online Attention) + +After tiling, the online attention operates on a K2-tile (e.g., 64 keys at a time): + +```mlir +// Inside an scf.for loop over K2 tiles: +%results:3 = iree_linalg_ext.online_attention { + indexing_maps = [ + affine_map<(m, k1, k2, n) -> (m, k1)>, // Q tile + affine_map<(m, k1, k2, n) -> (k2, k1)>, // K tile (64 x 64) + affine_map<(m, k1, k2, n) -> (k2, n)>, // V tile (64 x 64) + affine_map<(m, k1, k2, n) -> ()>, // scale + affine_map<(m, k1, k2, n) -> (m, n)>, // acc output + affine_map<(m, k1, k2, n) -> (m)>, // running max + affine_map<(m, k1, k2, n) -> (m)> // running sum + ]} + ins(%q_tile, %k_tile, %v_tile, %scale : ...) + outs(%acc, %old_max, %old_sum : tensor<64x64xf32>, + tensor<64xf32>, + tensor<64xf32>) + -> tensor<64x64xf32>, tensor<64xf32>, tensor<64xf32> +``` + +### After the Pass (Decomposed to linalg.generic) + +The pass decomposes the single online_attention op into **5 steps**: + +#### Step 1: Compute S = Q @ K^T * scale (matmul + scale) + +```mlir +// S[m, k2] = sum_k1(Q[m, k1] * K[k2, k1]) * scale +%empty_S = tensor.empty() : tensor<64x64xf32> +%zero_S = linalg.fill ins(%cst_0) outs(%empty_S) +%S = linalg.generic { + indexing_maps = [ + affine_map<(m, k2, k1) -> (m, k1)>, // Q + affine_map<(m, k2, k1) -> (k2, k1)>, // K + affine_map<(m, k2, k1) -> ()>, // scale + affine_map<(m, k2, k1) -> (m, k2)> // S (output) + ], + iterator_types = ["parallel", "parallel", "reduction"]} + ins(%q_tile, %k_tile, %scale : ...) 
+ outs(%zero_S : tensor<64x64xf32>) { + ^bb0(%q: f16, %k: f16, %s: f16, %out: f32): + %q_ext = arith.extf %q : f16 to f32 + %k_ext = arith.extf %k : f16 to f32 + %s_ext = arith.extf %s : f16 to f32 + %mul = arith.mulf %q_ext, %k_ext : f32 + %scaled = arith.mulf %mul, %s_ext : f32 + %add = arith.addf %scaled, %out : f32 + linalg.yield %add : f32 +} -> tensor<64x64xf32> +``` + +#### Step 2: Compute new_max = max(old_max, rowmax(S)) + +```mlir +// Row-wise max of S, then element-wise max with old_max +%new_max = linalg.generic { + indexing_maps = [ + affine_map<(m, k2) -> (m, k2)>, // S + affine_map<(m, k2) -> (m)> // max accumulator + ], + iterator_types = ["parallel", "reduction"]} + ins(%S : tensor<64x64xf32>) + outs(%old_max : tensor<64xf32>) { + ^bb0(%s_val: f32, %cur_max: f32): + %m = arith.maximumf %s_val, %cur_max : f32 + linalg.yield %m : f32 +} -> tensor<64xf32> +``` + +#### Step 3: Compute P = exp(S - new_max) and correction factor alpha = exp(old_max - new_max) + +```mlir +// Subtract new_max from S and exponentiate: P[m, k2] = exp(S[m, k2] - new_max[m]) +%P = linalg.generic { + indexing_maps = [ + affine_map<(m, k2) -> (m, k2)>, // S + affine_map<(m, k2) -> (m)>, // new_max + affine_map<(m, k2) -> (m, k2)> // P (output) + ], + iterator_types = ["parallel", "parallel"]} + ins(%S, %new_max : ...) + outs(%empty_S : tensor<64x64xf32>) { + ^bb0(%s_val: f32, %max_val: f32, %out: f32): + %sub = arith.subf %s_val, %max_val : f32 + %exp = math.exp %sub : f32 + linalg.yield %exp : f32 +} -> tensor<64x64xf32> + +// Correction factor: alpha[m] = exp(old_max[m] - new_max[m]) +%alpha = linalg.generic { + indexing_maps = [ + affine_map<(m) -> (m)>, // old_max + affine_map<(m) -> (m)>, // new_max + affine_map<(m) -> (m)> // alpha + ], + iterator_types = ["parallel"]} + ins(%old_max, %new_max : ...) + outs(%empty_alpha : tensor<64xf32>) { + ^bb0(%old_m: f32, %new_m: f32, %out: f32): + %sub = arith.subf %old_m, %new_m : f32 + %exp = math.exp %sub : f32 + linalg.yield %exp : f32 +} -> tensor<64xf32> +``` + +#### Step 4: Update sum = alpha * old_sum + rowsum(P) + +```mlir +// Scale old sum by correction factor, then add row sums of P +// new_sum[m] = alpha[m] * old_sum[m] + sum_k2(P[m, k2]) +%scaled_sum = linalg.generic { + indexing_maps = [ + affine_map<(m) -> (m)>, // old_sum + affine_map<(m) -> (m)>, // alpha + affine_map<(m) -> (m)> // output + ], + iterator_types = ["parallel"]} + ins(%old_sum, %alpha : ...) + outs(%empty_sum : tensor<64xf32>) { + ^bb0(%s: f32, %a: f32, %out: f32): + %mul = arith.mulf %s, %a : f32 + linalg.yield %mul : f32 +} -> tensor<64xf32> + +%new_sum = linalg.generic { + indexing_maps = [ + affine_map<(m, k2) -> (m, k2)>, // P + affine_map<(m, k2) -> (m)> // sum accumulator + ], + iterator_types = ["parallel", "reduction"]} + ins(%P : tensor<64x64xf32>) + outs(%scaled_sum : tensor<64xf32>) { + ^bb0(%p_val: f32, %cur_sum: f32): + %add = arith.addf %p_val, %cur_sum : f32 + linalg.yield %add : f32 +} -> tensor<64xf32> +``` + +#### Step 5: Update output = alpha * old_acc + P @ V + +```mlir +// Scale old accumulator by alpha: corrected_acc[m, n] = alpha[m] * old_acc[m, n] +%corrected_acc = linalg.generic { + indexing_maps = [ + affine_map<(m, n) -> (m)>, // alpha + affine_map<(m, n) -> (m, n)>, // old_acc + affine_map<(m, n) -> (m, n)> // output + ], + iterator_types = ["parallel", "parallel"]} + ins(%alpha, %old_acc : ...) 
+ outs(%empty_acc : tensor<64x64xf32>) { + ^bb0(%a: f32, %acc: f32, %out: f32): + %mul = arith.mulf %a, %acc : f32 + linalg.yield %mul : f32 +} -> tensor<64x64xf32> + +// new_acc[m, n] = corrected_acc[m, n] + sum_k2(P[m, k2] * V[k2, n]) +%new_acc = linalg.generic { + indexing_maps = [ + affine_map<(m, n, k2) -> (m, k2)>, // P + affine_map<(m, n, k2) -> (k2, n)>, // V + affine_map<(m, n, k2) -> (m, n)> // acc (output) + ], + iterator_types = ["parallel", "parallel", "reduction"]} + ins(%P, %v_tile : ...) + outs(%corrected_acc : tensor<64x64xf32>) { + ^bb0(%p_val: f32, %v_val: f16, %acc: f32): + %v_ext = arith.extf %v_val : f16 to f32 + %mul = arith.mulf %p_val, %v_ext : f32 + %add = arith.addf %mul, %acc : f32 + linalg.yield %add : f32 +} -> tensor<64x64xf32> +``` + +### Summary of Decomposition + +The online attention op is decomposed into these primitive operations: + +``` +┌─────────────────────────────────────────────────┐ +│ iree_linalg_ext.online_attention (1 tiled op) │ +└────────────────────┬────────────────────────────┘ + │ DecomposeAttentionPass + ▼ +┌─────────────────────────────────────────────────┐ +│ 1. S = Q @ K^T * scale (linalg.generic) │ +│ 2. new_max = max(old_max, rowmax(S)) (generic) │ +│ 3. P = exp(S - new_max) (generic) │ +│ alpha = exp(old_max - new_max) (generic) │ +│ 4. new_sum = alpha*old_sum + rowsum(P) (generic) │ +│ 5. new_acc = alpha*old_acc + P @ V (generic) │ +└─────────────────────────────────────────────────┘ +``` + +| Step | Operation | Type | Dims | +|------|-----------|------|------| +| 1 | `S = Q @ K^T * scale` | Matmul + scale | `[m, k2]` ← `[m, k1] × [k2, k1]` | +| 2 | `new_max = max(old_max, rowmax(S))` | Row reduction | `[m]` ← `[m, k2]` | +| 3a | `P = exp(S - new_max)` | Elementwise | `[m, k2]` | +| 3b | `alpha = exp(old_max - new_max)` | Elementwise | `[m]` | +| 4 | `new_sum = alpha * old_sum + Σ P` | Scale + row reduction | `[m]` | +| 5 | `new_acc = alpha * old_acc + P @ V` | Scale + matmul | `[m, n]` ← `[m, k2] × [k2, n]` | + +### Why This Matters + +After decomposition, all ops are standard `linalg.generic` operations. 
This enables: + +- **Vectorization** via IREE's vector distribution pipeline +- **Mapping to MMA intrinsics** (e.g., MFMA on MI300X) for the two matmuls (Steps 1 and 5) +- **Register-level tiling** and shared memory promotion for GPU targets +- The `scf.for` loop around these ops implements the streaming/online iteration over K/V chunks From 2b5f9ae508182ecf5658da40811276f823180761 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 8 Apr 2026 21:13:23 +0000 Subject: [PATCH 28/38] save work --- docs/IREEAttentionLowering.md | 176 ++++++++++++++++++---------------- 1 file changed, 94 insertions(+), 82 deletions(-) diff --git a/docs/IREEAttentionLowering.md b/docs/IREEAttentionLowering.md index e378e6fa..d17b6fe6 100644 --- a/docs/IREEAttentionLowering.md +++ b/docs/IREEAttentionLowering.md @@ -17,16 +17,16 @@ A standard `iree_linalg_ext.attention` op with Q, K, V, scale, and output: ```mlir %result = iree_linalg_ext.attention { indexing_maps = [ - affine_map<(b, m, k1, k2, n) -> (b, m, k1)>, // Q - affine_map<(b, m, k1, k2, n) -> (b, k2, k1)>, // K - affine_map<(b, m, k1, k2, n) -> (b, k2, n)>, // V - affine_map<(b, m, k1, k2, n) -> ()>, // scale - affine_map<(b, m, k1, k2, n) -> (b, m, n)> // output + affine_map<(d0, d1, d2, d3) -> (d0, d1)>, // Q: (m, k1) + affine_map<(d0, d1, d2, d3) -> (d2, d1)>, // K: (k2, k1) + affine_map<(d0, d1, d2, d3) -> (d2, d3)>, // V: (k2, n) + affine_map<(d0, d1, d2, d3) -> ()>, // scale + affine_map<(d0, d1, d2, d3) -> (d0, d3)> // output: (m, n) ]} - ins(%Q, %K, %V, %scale : tensor<1x4048x64xf16>, tensor<1x4048x64xf16>, - tensor<1x4048x64xf16>, f16) - outs(%output : tensor<1x4048x64xf32>) - -> tensor<1x4048x64xf32> + ins(%Q, %K, %V, %scale : tensor<16x64xf32>, tensor<4048x64xf32>, + tensor<4048x64xf32>, f32) + outs(%output : tensor<16x64xf32>) + -> tensor<16x64xf32> ``` ### After the Pass @@ -35,41 +35,52 @@ The op is converted to `iree_linalg_ext.online_attention` with two additional ac ```mlir // Initialize accumulators -%neg_inf = arith.constant dense<0xFF800000> : tensor<1x4048xf32> // -inf for max -%zeros = arith.constant dense<0.0> : tensor<1x4048xf32> // 0 for sum +%empty_output = tensor.empty() : tensor<16x64xf32> +%empty_max = tensor.empty() : tensor<16xf32> +%cst_0 = arith.constant 0.000000e+00 : f32 // 0 for output +%cst_neg_inf = arith.constant -3.40282347E+38 : f32 // -inf for max +%cst_zero = arith.constant 0.000000e+00 : f32 // 0 for sum + +%output_acc = linalg.fill ins(%cst_0 : f32) outs(%empty_output) -> tensor<16x64xf32> +%max_init = linalg.fill ins(%cst_neg_inf : f32) outs(%empty_max) -> tensor<16xf32> +%sum_init = linalg.fill ins(%cst_zero : f32) outs(%empty_max) -> tensor<16xf32> // Online attention with streaming accumulators %result:3 = iree_linalg_ext.online_attention { indexing_maps = [ - affine_map<(b, m, k1, k2, n) -> (b, m, k1)>, // Q - affine_map<(b, m, k1, k2, n) -> (b, k2, k1)>, // K - affine_map<(b, m, k1, k2, n) -> (b, k2, n)>, // V - affine_map<(b, m, k1, k2, n) -> ()>, // scale - affine_map<(b, m, k1, k2, n) -> (b, m, n)>, // output (acc) - affine_map<(b, m, k1, k2, n) -> (b, m)>, // running max - affine_map<(b, m, k1, k2, n) -> (b, m)> // running sum + affine_map<(d0, d1, d2, d3) -> (d0, d1)>, // Q: (m, k1) + affine_map<(d0, d1, d2, d3) -> (d2, d1)>, // K: (k2, k1) + affine_map<(d0, d1, d2, d3) -> (d2, d3)>, // V: (k2, n) + affine_map<(d0, d1, d2, d3) -> ()>, // scale + affine_map<(d0, d1, d2, d3) -> (d0, d3)>, // output: (m, n) + affine_map<(d0, d1, d2, d3) -> (d0)>, // running max: (m) + affine_map<(d0, d1, d2, d3) 
-> (d0)> // running sum: (m) ]} - ins(%Q, %K, %V, %scale : tensor<1x4048x64xf16>, tensor<1x4048x64xf16>, - tensor<1x4048x64xf16>, f16) - outs(%output, %neg_inf, %zeros : tensor<1x4048x64xf32>, - tensor<1x4048xf32>, - tensor<1x4048xf32>) - -> tensor<1x4048x64xf32>, tensor<1x4048xf32>, tensor<1x4048xf32> + ins(%Q, %K, %V, %scale : tensor<16x64xf32>, tensor<4048x64xf32>, + tensor<4048x64xf32>, f32) + outs(%output_acc, %max_init, %sum_init : tensor<16x64xf32>, + tensor<16xf32>, + tensor<16xf32>) { + ^bb0(%arg: f32): + iree_linalg_ext.yield %arg : f32 +} -> tensor<16x64xf32>, tensor<16xf32>, tensor<16xf32> // Final normalization: divide accumulated output by the final sum %final = linalg.generic { indexing_maps = [ - affine_map<(b, m, n) -> (b, m, n)>, // accumulated output - affine_map<(b, m, n) -> (b, m)>, // final sum - affine_map<(b, m, n) -> (b, m, n)> // normalized output + affine_map<(d0, d1) -> (d0)>, // final sum: (m) + affine_map<(d0, d1) -> (d0, d1)>, // accumulated output: (m, n) + affine_map<(d0, d1) -> (d0, d1)> // normalized output: (m, n) ], - iterator_types = ["parallel", "parallel", "parallel"]} - ins(%result#0, %result#2 : ...) - outs(%empty : ...) { - ^bb0(%acc: f32, %sum: f32, %out: f32): - %normalized = arith.divf %acc, %sum : f32 + iterator_types = ["parallel", "parallel"]} + ins(%result#2, %result#0 : tensor<16xf32>, tensor<16x64xf32>) + outs(%empty_output : tensor<16x64xf32>) { + ^bb0(%sum: f32, %acc: f32, %out: f32): + %cst_1 = arith.constant 1.000000e+00 : f32 + %inv_sum = arith.divf %cst_1, %sum : f32 + %normalized = arith.mulf %inv_sum, %acc : f32 linalg.yield %normalized : f32 -} -> tensor<1x4048x64xf32> +} -> tensor<16x64xf32> ``` ### Key Transformations @@ -77,8 +88,11 @@ The op is converted to `iree_linalg_ext.online_attention` with two additional ac | Aspect | Before | After | |--------|--------|-------| | **Op** | `iree_linalg_ext.attention` | `iree_linalg_ext.online_attention` | -| **Outputs** | 1 (result) | 3 (result, max, sum) | -| **Max accumulator** | N/A | Initialized to `-inf` | +| **Q shape** | `tensor<16x64xf32>` | `tensor<16x64xf32>` (unchanged) | +| **K shape** | `tensor<4048x64xf32>` | `tensor<4048x64xf32>` (unchanged) | +| **V shape** | `tensor<4048x64xf32>` | `tensor<4048x64xf32>` (unchanged) | +| **Outputs** | 1 (result: `16x64xf32`) | 3 (result: `16x64xf32`, max: `16xf32`, sum: `16xf32`) | +| **Max accumulator** | N/A | Initialized to `-inf` (`-3.40282347E+38`) | | **Sum accumulator** | N/A | Initialized to `0.0` | | **Post-processing** | None | Division by final sum (normalization) | | **Memory** | Materializes full attention matrix | Streams over K/V tiles | @@ -109,25 +123,27 @@ So the input to this pass is a **tiled** online_attention operating on a slice o ### Before the Pass (Tiled Online Attention) -After tiling, the online attention operates on a K2-tile (e.g., 64 keys at a time): +After tiling, the online attention operates on a K2-tile (e.g., 16 keys at a time). 
This example shows a 16x64 Q-tile processing a 16x64 K-tile and V-tile: ```mlir // Inside an scf.for loop over K2 tiles: %results:3 = iree_linalg_ext.online_attention { indexing_maps = [ - affine_map<(m, k1, k2, n) -> (m, k1)>, // Q tile - affine_map<(m, k1, k2, n) -> (k2, k1)>, // K tile (64 x 64) - affine_map<(m, k1, k2, n) -> (k2, n)>, // V tile (64 x 64) - affine_map<(m, k1, k2, n) -> ()>, // scale - affine_map<(m, k1, k2, n) -> (m, n)>, // acc output - affine_map<(m, k1, k2, n) -> (m)>, // running max - affine_map<(m, k1, k2, n) -> (m)> // running sum + affine_map<(d0, d1, d2, d3) -> (d0, d1)>, // Q tile: (m, k1) + affine_map<(d0, d1, d2, d3) -> (d2, d1)>, // K tile: (k2, k1) + affine_map<(d0, d1, d2, d3) -> (d2, d3)>, // V tile: (k2, n) + affine_map<(d0, d1, d2, d3) -> ()>, // scale + affine_map<(d0, d1, d2, d3) -> (d0, d3)>, // acc output: (m, n) + affine_map<(d0, d1, d2, d3) -> (d0)>, // running max: (m) + affine_map<(d0, d1, d2, d3) -> (d0)> // running sum: (m) ]} - ins(%q_tile, %k_tile, %v_tile, %scale : ...) - outs(%acc, %old_max, %old_sum : tensor<64x64xf32>, - tensor<64xf32>, - tensor<64xf32>) - -> tensor<64x64xf32>, tensor<64xf32>, tensor<64xf32> + ins(%q_tile, %k_tile, %v_tile, %scale : tensor<16x64xf32>, + tensor<16x64xf32>, + tensor<16x64xf32>, f32) + outs(%acc, %old_max, %old_sum : tensor<16x64xf32>, + tensor<16xf32>, + tensor<16xf32>) + -> tensor<16x64xf32>, tensor<16xf32>, tensor<16xf32> ``` ### After the Pass (Decomposed to linalg.generic) @@ -138,7 +154,7 @@ The pass decomposes the single online_attention op into **5 steps**: ```mlir // S[m, k2] = sum_k1(Q[m, k1] * K[k2, k1]) * scale -%empty_S = tensor.empty() : tensor<64x64xf32> +%empty_S = tensor.empty() : tensor<16x16xf32> %zero_S = linalg.fill ins(%cst_0) outs(%empty_S) %S = linalg.generic { indexing_maps = [ @@ -149,16 +165,13 @@ The pass decomposes the single online_attention op into **5 steps**: ], iterator_types = ["parallel", "parallel", "reduction"]} ins(%q_tile, %k_tile, %scale : ...) - outs(%zero_S : tensor<64x64xf32>) { - ^bb0(%q: f16, %k: f16, %s: f16, %out: f32): - %q_ext = arith.extf %q : f16 to f32 - %k_ext = arith.extf %k : f16 to f32 - %s_ext = arith.extf %s : f16 to f32 - %mul = arith.mulf %q_ext, %k_ext : f32 - %scaled = arith.mulf %mul, %s_ext : f32 + outs(%zero_S : tensor<16x16xf32>) { + ^bb0(%q: f32, %k: f32, %s: f32, %out: f32): + %mul = arith.mulf %q, %k : f32 + %scaled = arith.mulf %mul, %s : f32 %add = arith.addf %scaled, %out : f32 linalg.yield %add : f32 -} -> tensor<64x64xf32> +} -> tensor<16x16xf32> ``` #### Step 2: Compute new_max = max(old_max, rowmax(S)) @@ -171,12 +184,12 @@ The pass decomposes the single online_attention op into **5 steps**: affine_map<(m, k2) -> (m)> // max accumulator ], iterator_types = ["parallel", "reduction"]} - ins(%S : tensor<64x64xf32>) - outs(%old_max : tensor<64xf32>) { + ins(%S : tensor<16x16xf32>) + outs(%old_max : tensor<16xf32>) { ^bb0(%s_val: f32, %cur_max: f32): %m = arith.maximumf %s_val, %cur_max : f32 linalg.yield %m : f32 -} -> tensor<64xf32> +} -> tensor<16xf32> ``` #### Step 3: Compute P = exp(S - new_max) and correction factor alpha = exp(old_max - new_max) @@ -191,12 +204,12 @@ The pass decomposes the single online_attention op into **5 steps**: ], iterator_types = ["parallel", "parallel"]} ins(%S, %new_max : ...) 
- outs(%empty_S : tensor<64x64xf32>) { + outs(%empty_S : tensor<16x16xf32>) { ^bb0(%s_val: f32, %max_val: f32, %out: f32): %sub = arith.subf %s_val, %max_val : f32 %exp = math.exp %sub : f32 linalg.yield %exp : f32 -} -> tensor<64x64xf32> +} -> tensor<16x16xf32> // Correction factor: alpha[m] = exp(old_max[m] - new_max[m]) %alpha = linalg.generic { @@ -207,12 +220,12 @@ The pass decomposes the single online_attention op into **5 steps**: ], iterator_types = ["parallel"]} ins(%old_max, %new_max : ...) - outs(%empty_alpha : tensor<64xf32>) { + outs(%empty_alpha : tensor<16xf32>) { ^bb0(%old_m: f32, %new_m: f32, %out: f32): %sub = arith.subf %old_m, %new_m : f32 %exp = math.exp %sub : f32 linalg.yield %exp : f32 -} -> tensor<64xf32> +} -> tensor<16xf32> ``` #### Step 4: Update sum = alpha * old_sum + rowsum(P) @@ -228,11 +241,11 @@ The pass decomposes the single online_attention op into **5 steps**: ], iterator_types = ["parallel"]} ins(%old_sum, %alpha : ...) - outs(%empty_sum : tensor<64xf32>) { + outs(%empty_sum : tensor<16xf32>) { ^bb0(%s: f32, %a: f32, %out: f32): %mul = arith.mulf %s, %a : f32 linalg.yield %mul : f32 -} -> tensor<64xf32> +} -> tensor<16xf32> %new_sum = linalg.generic { indexing_maps = [ @@ -240,12 +253,12 @@ The pass decomposes the single online_attention op into **5 steps**: affine_map<(m, k2) -> (m)> // sum accumulator ], iterator_types = ["parallel", "reduction"]} - ins(%P : tensor<64x64xf32>) - outs(%scaled_sum : tensor<64xf32>) { + ins(%P : tensor<16x16xf32>) + outs(%scaled_sum : tensor<16xf32>) { ^bb0(%p_val: f32, %cur_sum: f32): %add = arith.addf %p_val, %cur_sum : f32 linalg.yield %add : f32 -} -> tensor<64xf32> +} -> tensor<16xf32> ``` #### Step 5: Update output = alpha * old_acc + P @ V @@ -260,11 +273,11 @@ The pass decomposes the single online_attention op into **5 steps**: ], iterator_types = ["parallel", "parallel"]} ins(%alpha, %old_acc : ...) - outs(%empty_acc : tensor<64x64xf32>) { + outs(%empty_acc : tensor<16x64xf32>) { ^bb0(%a: f32, %acc: f32, %out: f32): %mul = arith.mulf %a, %acc : f32 linalg.yield %mul : f32 -} -> tensor<64x64xf32> +} -> tensor<16x64xf32> // new_acc[m, n] = corrected_acc[m, n] + sum_k2(P[m, k2] * V[k2, n]) %new_acc = linalg.generic { @@ -275,13 +288,12 @@ The pass decomposes the single online_attention op into **5 steps**: ], iterator_types = ["parallel", "parallel", "reduction"]} ins(%P, %v_tile : ...) 
- outs(%corrected_acc : tensor<64x64xf32>) { - ^bb0(%p_val: f32, %v_val: f16, %acc: f32): - %v_ext = arith.extf %v_val : f16 to f32 - %mul = arith.mulf %p_val, %v_ext : f32 + outs(%corrected_acc : tensor<16x64xf32>) { + ^bb0(%p_val: f32, %v_val: f32, %acc: f32): + %mul = arith.mulf %p_val, %v_val : f32 %add = arith.addf %mul, %acc : f32 linalg.yield %add : f32 -} -> tensor<64x64xf32> +} -> tensor<16x64xf32> ``` ### Summary of Decomposition @@ -306,12 +318,12 @@ The online attention op is decomposed into these primitive operations: | Step | Operation | Type | Dims | |------|-----------|------|------| -| 1 | `S = Q @ K^T * scale` | Matmul + scale | `[m, k2]` ← `[m, k1] × [k2, k1]` | -| 2 | `new_max = max(old_max, rowmax(S))` | Row reduction | `[m]` ← `[m, k2]` | -| 3a | `P = exp(S - new_max)` | Elementwise | `[m, k2]` | -| 3b | `alpha = exp(old_max - new_max)` | Elementwise | `[m]` | -| 4 | `new_sum = alpha * old_sum + Σ P` | Scale + row reduction | `[m]` | -| 5 | `new_acc = alpha * old_acc + P @ V` | Scale + matmul | `[m, n]` ← `[m, k2] × [k2, n]` | +| 1 | `S = Q @ K^T * scale` | Matmul + scale | `[16, 16]` ← `[16, 64] × [16, 64]` | +| 2 | `new_max = max(old_max, rowmax(S))` | Row reduction | `[16]` ← `[16, 16]` | +| 3a | `P = exp(S - new_max)` | Elementwise | `[16, 16]` | +| 3b | `alpha = exp(old_max - new_max)` | Elementwise | `[16]` | +| 4 | `new_sum = alpha * old_sum + Σ P` | Scale + row reduction | `[16]` | +| 5 | `new_acc = alpha * old_acc + P @ V` | Scale + matmul | `[16, 64]` ← `[16, 16] × [16, 64]` | ### Why This Matters From 35598a8258224797a28b9c6bf9d5d616988e84ac Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 8 Apr 2026 22:18:55 +0000 Subject: [PATCH 29/38] save work --- docs/IREEAttentionLowering.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/IREEAttentionLowering.md b/docs/IREEAttentionLowering.md index d17b6fe6..2b863bb1 100644 --- a/docs/IREEAttentionLowering.md +++ b/docs/IREEAttentionLowering.md @@ -6,9 +6,6 @@ The `ConvertAttentionToOnlineAttentionPass` transforms a standard (offline) attention operation (`iree_linalg_ext.attention`) into an **online attention** operation (`iree_linalg_ext.online_attention`). Online attention computes attention in a **tiled/streaming** fashion, maintaining running max and running sum accumulators to perform numerically stable softmax incrementally — this is the core idea behind **FlashAttention**. -### Why? - -Standard attention computes `softmax(Q @ K^T / scale) @ V` in a single monolithic step, requiring the entire attention matrix to be materialized in memory. Online attention tiles the computation over the key/value sequence dimension, updating partial results with running statistics, enabling **O(1) memory** in the sequence length. 
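The streaming recurrence behind that **O(1) memory** claim can be sketched in a few lines of NumPy. This is an illustration only, not anything the pass emits: `online_attention` is a hypothetical helper, the 16x64 / 4096x64 shapes follow the running example, and the 0.125 scale (1/sqrt(64)) is an assumed value.

```python
import numpy as np

def online_attention(Q, K, V, scale, tile=16):
    """Attention with the softmax computed tile-by-tile over K/V (sketch)."""
    m, n = Q.shape[0], V.shape[1]
    acc = np.zeros((m, n), np.float32)          # running unnormalized output
    run_max = np.full(m, -np.inf, np.float32)   # running row-wise max
    run_sum = np.zeros(m, np.float32)           # running row-wise sum
    for s in range(0, K.shape[0], tile):
        S = (Q @ K[s:s + tile].T) * scale               # Step 1: S = Q @ K^T * scale
        new_max = np.maximum(run_max, S.max(axis=1))    # Step 2: new_max
        P = np.exp(S - new_max[:, None])                # Step 3a: P
        alpha = np.exp(run_max - new_max)               # Step 3b: correction factor
        run_sum = alpha * run_sum + P.sum(axis=1)       # Step 4: new_sum
        acc = alpha[:, None] * acc + P @ V[s:s + tile]  # Step 5: new_acc
        run_max = new_max
    return acc / run_sum[:, None]                       # final normalization

rng = np.random.default_rng(0)
Q = rng.random((16, 64)).astype(np.float32)
K = rng.random((4096, 64)).astype(np.float32)
V = rng.random((4096, 64)).astype(np.float32)
S = (Q @ K.T) * 0.125
P = np.exp(S - S.max(axis=1, keepdims=True))
ref = (P / P.sum(axis=1, keepdims=True)) @ V            # monolithic reference
assert np.allclose(online_attention(Q, K, V, 0.125), ref, atol=1e-4)
```

Only the running statistics, the 16x64 accumulator, and one 16x16 tile of `S` are live at any point in the loop, so the full 16x4096 attention matrix is never materialized.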
 ### Before the Pass
 
@@ -23,8 +20,8 @@ A standard `iree_linalg_ext.attention` op with Q, K, V, scale, and output:
     affine_map<(d0, d1, d2, d3) -> ()>,       // scale
     affine_map<(d0, d1, d2, d3) -> (d0, d3)>   // output: (m, n)
   ]}
-  ins(%Q, %K, %V, %scale : tensor<16x64xf32>, tensor<4048x64xf32>,
-                           tensor<4048x64xf32>, f32)
+  ins(%Q, %K, %V, %scale : tensor<16x64xf32>, tensor<4096x64xf32>,
+                           tensor<4096x64xf32>, f32)
   outs(%output : tensor<16x64xf32>)
   -> tensor<16x64xf32>
 ```
@@ -56,8 +53,8 @@ The op is converted to `iree_linalg_ext.online_attention` with two additional ac
     affine_map<(d0, d1, d2, d3) -> (d0)>,      // running max: (m)
     affine_map<(d0, d1, d2, d3) -> (d0)>       // running sum: (m)
   ]}
-  ins(%Q, %K, %V, %scale : tensor<16x64xf32>, tensor<4048x64xf32>,
-                           tensor<4048x64xf32>, f32)
+  ins(%Q, %K, %V, %scale : tensor<16x64xf32>, tensor<4096x64xf32>,
+                           tensor<4096x64xf32>, f32)
   outs(%output_acc, %max_init, %sum_init : tensor<16x64xf32>,
                                            tensor<16xf32>,
                                            tensor<16xf32>) {
@@ -83,14 +80,16 @@ The op is converted to `iree_linalg_ext.online_attention` with two additional ac
 } -> tensor<16x64xf32>
 ```
 
+**Question**: Not sure why it is done this way; why not embed the normalization inside the op as well?
+
 ### Key Transformations
 
 | Aspect | Before | After |
 |--------|--------|-------|
 | **Op** | `iree_linalg_ext.attention` | `iree_linalg_ext.online_attention` |
 | **Q shape** | `tensor<16x64xf32>` | `tensor<16x64xf32>` (unchanged) |
-| **K shape** | `tensor<4048x64xf32>` | `tensor<4048x64xf32>` (unchanged) |
-| **V shape** | `tensor<4048x64xf32>` | `tensor<4048x64xf32>` (unchanged) |
+| **K shape** | `tensor<4096x64xf32>` | `tensor<4096x64xf32>` (unchanged) |
+| **V shape** | `tensor<4096x64xf32>` | `tensor<4096x64xf32>` (unchanged) |
 | **Outputs** | 1 (result: `16x64xf32`) | 3 (result: `16x64xf32`, max: `16xf32`, sum: `16xf32`) |
 | **Max accumulator** | N/A | Initialized to `-inf` (`-3.40282347E+38`) |
 | **Sum accumulator** | N/A | Initialized to `0.0` |
 | **Post-processing** | None | Division by final sum (normalization) |
 | **Memory** | Materializes full attention matrix | Streams over K/V tiles |
@@ -127,6 +126,7 @@
 ```mlir
 // Inside an scf.for loop over K2 tiles:
+// Step size is 16.
 %results:3 = iree_linalg_ext.online_attention {
   indexing_maps = [
     affine_map<(d0, d1, d2, d3) -> (d0, d1)>,  // Q tile: (m, k1)

From a54fa4baf2531db023d6189a04accac9aefb1a06 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Wed, 8 Apr 2026 23:28:24 +0000
Subject: [PATCH 30/38] save work

---
 docs/softmax_lowering.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index 311746bc..e0c5a0e8 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -301,6 +301,33 @@
 xegpu.store_nd %11, %12[%0, 0]
   !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr>
 ```
+---
+## What is reused between matmul and softmax?
+
+The softmax implementation leverages several reusable lighthouse infrastructure components. The workload-specific code is minimal (payload generation plus the tiling/decomposition strategy); most infrastructure (execution, benchmarking, XeGPU lowering) is shared across operations such as matmul and softmax. The shared components are:
+
+### Pipeline Infrastructure (`lighthouse.pipeline`)
+- **`TransformDriver`**: Orchestrates application of transform schedules to payload modules
+- **`apply_registered_pass`**: Applies named MLIR passes (e.g., `eliminate-empty-tensors`, `gpu-kernel-outlining`)
+- **`canonicalize`, `match`, `match_and_split`, `PipelineInterrupt`**: Helper utilities for constructing transform sequences
+
+### Execution Infrastructure (`lighthouse.execution`)
+- **`execute`**: Runs compiled MLIR modules on GPU with memory management
+- **`benchmark`**: Benchmarks kernel execution with warmup and timing utilities
+- **`GPUMemoryManager`**: Manages host-device memory transfers for GPU execution
+- **`get_bench_wrapper_schedule`**: Wraps payload functions with benchmarking infrastructure
+
+### XeGPU Lowering (`lighthouse.schedule.xegpu.helper`)
+- **`bundle_xegpu_to_binary`**: Common lowering path from XeGPU to executable binary (shared with matmul and other XeGPU workloads)
+  - Handles XeGPU peephole optimizations, layout propagation, and GPU-to-SPIRV/binary compilation
+
+### Payload Generation (`lighthouse.ingress.mlir_gen`)
+- **`get_mlir_elem_type`**: Type conversion utilities for constructing MLIR types
+
+**Key Insight**: Only the payload generation and the tiling/decomposition schedule are softmax-specific; execution, benchmarking, and the XeGPU-to-binary lowering path are reused as-is from the matmul workload (see the composition sketch after this patch).
+
 ---
 
 # Supporting larger Softmax dimension sizes
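To make the reuse concrete, here is a hypothetical composition sketch in Python. The module paths and names come from the section above, but every signature and call shape is assumed for illustration; the real wiring lives in the workload classes.

```python
# Hypothetical glue code: signatures are NOT taken from the lighthouse
# sources; only the module/function names come from the section above.
from lighthouse.pipeline import TransformDriver
from lighthouse.execution import benchmark
from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary

def run_xegpu_workload(payload_module, schedule):
    # 1. Workload-specific part: apply the tiling/decomposition schedule.
    driver = TransformDriver(payload_module)   # constructor shape assumed
    tiled = driver.apply(schedule)             # method name assumed
    # 2. Shared part: the same XeGPU -> binary lowering path matmul uses.
    binary = bundle_xegpu_to_binary(tiled)     # argument list assumed
    # 3. Shared part: benchmarking with warmup and timing.
    return benchmark(binary)                   # argument list assumed
```

Swapping softmax for matmul would change only `payload_module` and `schedule`; steps 2 and 3 stay identical.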
From 6e1c7f5eec3068674c7f483e5fec3098d0f69087 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Wed, 8 Apr 2026 23:35:40 +0000
Subject: [PATCH 31/38] save work

---
 docs/softmax_lowering.pptx | Bin 0 -> 45031 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 docs/softmax_lowering.pptx

diff --git a/docs/softmax_lowering.pptx b/docs/softmax_lowering.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..42e7908862d02c963e89ddf7ea4bd97ed5e6955f

From 6a21b02e88c153e59db0ba7df3dfa0d6bf6f7c06 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Wed, 8 Apr 2026 23:37:26 +0000
Subject: [PATCH 32/38] save work

---
 docs/softmax_lowering.pptx | Bin 45031 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 docs/softmax_lowering.pptx

diff --git a/docs/softmax_lowering.pptx b/docs/softmax_lowering.pptx
deleted file mode 100644
index 42e7908862d02c963e89ddf7ea4bd97ed5e6955f..0000000000000000000000000000000000000000
z`#f%ot|{l|bds?a%ehXmXY)??4~>#lP+|xKF$8I4)~3DvJr+N7?=Vrab*Jea9oiTAT8yLy;A%BLFpuoT}o7JeTr}C`TPR;)Q*Ir1u|QR)<woB_J1I2It@Z1f1bT@{kO53g;ulJD*|4dD3H-RENhCjs$<`dd78<=3AUk3xVtk(rQ~d+xd%G4*?8RLHIAga zK!AsRATn9MF$%KJmjY_HDNpM{Pe?foOZP#M6!R3*9L@cOCnSH7EP%1zj!G0rTjpYx zQ|`c348B4$Ki>B}S=*wd-Iix7%~osrdtvboeFfNiaXANFRDO5}d?i=i^A6ARxtVgx z)o*(DXghxzl7RcupQ~*NhgR$EWvyL1UQ+QFh8^BXIRsi&CV@;aNAiu2w#M;$WeoH3 zh%jimg%8A<_bJlgI|}zy^Tm`xy69AcQptPD@q1KA&bOkdQN8;lZzS(tJ~Z(tlsqiU zi=5657skzxV?!+NmT^N;droK0XE#fSwWG!S*=@Yno;x>lhoz%{NnOvSW(ghdrA3LV zrin)h9sk-(M(6i|iH+Yp9Uo95B7tU{QhAb;i9;Z9yyLF`Zny75nlM0Fcojk$G(XwT zQ7mUB*j`Ft*jaeE)&+_{T#cc$AP zfKBb&1>wNC4^lg;1Yi;iG;P{o2VRhJV&76cE@NZ43fAP=R!-DjC^XJJO5xnqh+WAu z(z!2*Twraia2T}J)DhW?-}^JHTg*w8l-K3(Yjkf^RW=(Rn{o@?9_zO`WK$)%Azz?@ z_yU#uKnn3(c(L)f2f|cQ5v?Xr#09Rm%(aF#OUY0;w(-PsM+faXTx9OO!!|F`wxx&_qzRCvpW_h?SqNXAg^9OV0H&)Mb9n^ zn^H`r5M`1ZUqMScwV1&{E3`jf<0>uZTT<|$BnYmaLjG_tEb6Pj!MMeCBtX1aO!*X;_UNwjNF)9~BG*uf|PoOtc<*D`)u^e8O zKE^Ys-mxEKAyJ2FcnC)^fywbw50>=$jv&xdl=NDbkVH_IIpAc38->zjP*CRm%7-bQ z22f#qedJ-3nv?@)KMgRWyhqPMlT$dM<)==|F0Fnmum~vX1WD9GO?3I_R^R;BwCQ9> zpoxB^4e(!X;eUhXZ_Dr(Wg6p0|8fgrkCJa9dmYw1QnyM8G?qaj{XyV90ZE*dg-^t* zzC`_9sKPL4?CUk@(?=Uz?yTkaQg9oA*rQ2fz?ueI-z(+YWdFtUo9 z6}(6h!!C!D%+%Wh5lVJvn8c1AWrfm@g{t#%y&9GlqB&$MBj?U|l+1?j&gOatC($e5 z5!5WMIsaR%ygvig($z$qmS2qAqjND3fvXz`otCvnmw5r0+Ho(1Nj?E3G%&ljuFP6P z)wC{(gpd*X!^nhh9nsFHg6)%$_s&=D?$tZO{a-_F&O#BL{uS~d(0>@Tze4V2Y3ly} zggjcgM}GH9`K$W{G&+M5po9TwPC^bsS_PpUxaMg%Tuu9qGNX%^WOkOYd=BXxyPkgb zgIOE)UVRHl-+8oe-)MH>Dx4~tbjS76hen~zu?lSq9Wksr!S^2PBL$T8E7^S7!NkEZ zXfJ!0*E4@GT#Dj8CZGRGquHh=0Xi|HX(r?~}ZQlJJ5T^BTu$m8! z29O2B#r5whd9T$!Z`M2mNwOM0mXeWv`yh2^lbB^6RsA$8WETmY{L_LjDgJ~4AjO{mrWj{i)yVJ73DUf_oHR?nM$mX?hfrTtND3S}$P3m-15Nc*Ch9`e|JjYiN zZL?D;bTeOM9!p}ancM|3Q`X!Lg{Nhx;v!As#WnrzmpZ#8x;c#8I@Gl5No9>; z2RK2R+{H;xkXa=)wbGRj_#pIaH)6e;R#BI1M>sG)2-mAlI zlKR;c=*-igv_8Q0L0jH@L>Y9yPi$j7P1{=S8s~Em#IIxaxaAbx$2ioUVN1KQ@j$I3 z$bRDLBJU%!($2jy$ld&@5!~l<H`6hmO zCY64;o0I3*>MQ}VuZzp`>1=PCkusJbllN^|&;xWRQ2y!WmGR@_LHTZR_EL}kV@KHrWXfw+o6ut$a)axoMfWZJnk5Hh5s+};RHVB5 zu03W5MnXK=vEOBnzCt)5{uOmw6^{X3l$$6jKTDy2b$SgsA)<{V*ZWsrXCH+kIyD*} z6S2MFgzFx=z)>itbQJe2=Ugq}FsP(rlv^t^O;0&``~)!qrykCQcVfxy{DUq|$N`vC zcGQnjGN|f>-H>P|KqQ5}0A$=S&R|#HauTpfGHw8Pe}(uB%cnWmLA#!cyeau59~tu1zs! zk*C{WNa}tkHLWSD8BF3pJ=p6;FXXDyO!F>d)xM>egYP}I1~;$FiVPL)iFzvCbcUF% zd5a`=veJqYK11!RGs~WRQg1)bk^w|II@$INKNzvqws4C`*jlew{>Hd*m}wGsun96u z1QJMf!x;-|vGt_kxH{9ysP|aUP?aCoHFFRXB`H`LWF9N6lbBtkxt9HGTa|rx_y&QM zHf*KEsx|Rh@`I01ix~%5fnGCJF4EkmB76z)os`wYxHe4a(kx4tKWXatbQTb>W`_Y# zr(ersKhWPqV++SXn>Yxbw$u5$s2~;ga5gl1J<9+S z6RVJGrK#mRaTr5l$lNLH(j$vz4rWlDq^dBTFLvgMG=jM=?O_~8E0Kyft6-3Dv2rev zi8{JlR?k@J>FREx;ReVzMM}CLY=hg z#_KUKw#oWuw#K4rTkuo+qFx%@8j32rrj`qabRzhlg(}IfJO9+@ADj)Mm}fIw+-IKS_?%~5Sb5iq+MW}{ zk7Il!?({(tdE-`b7$;Z%0W*6xP&a`3+Dh{}Re)W{e+~w{HJVfZen$LPNm+W2jxGGM&rM&~U(?XP z%z=xck&UUci>HmL^IwVrKPH6`^tJBW?f|%3eE8CfD3YP|64AP!0MjHA9}8OKqwKDm zKHtPMjtfZVrD&X)u!4SW0i(3BtrtK-#CiuC3obaWJ6bjv)WE=cHfU%O(=`#^VlxFgmQ)f2#ENXGBUF`#1a9NK*r%-H9g=M%SOojlc6|+^BaCCH5oH!F>_GXgu59b{2uy*Uril$=xoYxyI zlo+E4B;z;BTYE#Nj*Q&3G*Y*THzel43Epa%KF%*G&8A&D$LUgA`GfN+-}O@zY9MfR>D+V4eckocvI{NgEg2nF1|Fix05BIQ zXwOKaS&xHJsWeYTb{Q(>Rn19x1x6{<+w*uU?9)L1!CwvGc4i1?E%Tno)<{2{nOZR} z$xqWF4mg1RgS`Ym(@MeHd9|UAv|O-IVJt23Ag|uzGhaDVdbg+p`oa0br&)93^?IbK z><6Bs_^qnOpIPL|cw89_3>g+nHqUCqfD<5+d0{2SVzD~jj)I_%8sb*RxQFO#fHBDN zwZo12c0tXFIO>W7D>dY@oRz=ScTXg~Kd#x~G_ptl(9hG@GE7LN(x09C<>Al0-AWs? 
zj;fX5$;N8h+B4qU@#=KcH*-5@$=BxL5{zj6e3SjRg|Cc>{0zy$6WR0xT`E58>omTb z)X{R%*~+b*LQk)f$0|Xtgv6Ls+3cm!on(!m+1*{ybK+T-)k>>~;bDIkvzM)3f6U66 z@pbNJMc|K^Bcbcirq`8%=(f%(e@y*EjmAfk`dHIRfcbe@h}>> zQgELGfNgImflO>g*^gqDKD?&R&b1|bSqKV>YC%jhUkwHd+&HNRJ3rZ#ie;lwNEzrKkYlyMlPrL};=DPuRgZ)DutWgu< zAHTCzS$g{3MVZ#kJ+jjfeuNR;QDJD3M3ns+;gG7eNW+)ceCrq0G-=!1Zk3xA6$$x? zJ7g%?-ZG3jHO|}EB)>W&+42;VykK(BGr_=jb%`G}gNr3+COQ2xk60`iSq>#xX3GL1 zGziXgU>7&So@(iPO)ioMaVQEF%-k=o$s1gUP_LzhhAsbUXfu^XtP&K=yH1UfN6TQ* zFr`O)Uc8+@hz@YAv+8Kq{PzSt9fLA!2#hN|r>-9F`0XWqk?vP7_3I`0WLK zxEEXS-0jl{8#R5nms`d8gs7iaTF3#b#yQCib7=>*U`eYq}F z45==O9zy|d;ONWC70m{@N-CH`)HZ#fulAAtcP%y#oNeEFEM}L=*(u`7;yN0=s=XWM zUY9{J8B1CM@X_)$u#@+{7xk#}i7>f`FFpuJx-=UyF&L0qNAsg%?>Dkgj8133&<17) zzCX`aKTxRL1Mjfqj1UIX_`{XGS(e-(H9CZB`Of?R6(b`H0~=yQ+!CAy?he%Ae8WJP zmAKdn%-iao1?|uxUnM2*i-f$`n!d9@#EPYV-X;JdXc-7@@E}SFdHQ+%G}(oSiZIiQ z?6O5ffPjDm zgPa~|bZDjV0Lx`=SLC9@leNv+Qy*a*W2`mn>Zf1$PoKlS@kIN1L;4m~g1-`nn^xMsNN7D6@BuopGk& zD{vcjLQJ`Ya&YJ&=N)hSJT+CpxEK@B^3}SB;bJmQig$6Ndc$xQ5eIXT^6-W_9uEXd zt-XGQj5YwH+iBTSZuc#?&4nL`xCln_GM08scHfhrW|9v*&|!^PV1X z2i_oRedj>=CX;q73C$&A8QH_MJcy47v_XbD*;OV8Cqby)zgB9xv$OEs+bABLm1sI zuMTQ#EoEX@y=d=?=vk455f42Th24$co>mfkWay<%y&hsi9rQ^Zi_Kl_WfJlGk0zt) zFuwe{d1NpI1E>?=&tOyJeObF9Eo5S*jkg0_x6ek3 zCN3lOs#3J`0Z=31ATLgub9Ex+lS1X-NkGPCgi1=QX)*LuV&kv_M+GB?xUi`nwV@nU zuCOR3EXny1iA+}n!jVx-COgPbp$9JcvcD=0gOq&LEB#ULEBw&R9#-%31;whSwa{Rj z8CRIrWSKdhEMAn|WyI5eP~D@?38mNWYgm-~*AjrDP+3FNA5PmmJCLwzZ)AoKu7h~k zD`u*ST$plaxoV-gr|3!OEIjhQQYj7)f2$NpIAq9^($haRJq65s$JogOI>d>rkjKZ5 zc;p(M1^JQ7ahFNqYQGise)OGo5*p&5V}U3fgySm6nB3%d$Vxx&=E*YQ77UDqxN&6s zBRvbVk<$2Fvf}OcQ$i7GK_=3gNjk0NOEZl4LTN1W`3XeV^cs@aa5M>4Ml>&xf;F7%?Rtr@AWBsmslO5l?8vP52H^fpQrB~`Mi#ijv|KWruK=$LmQS!CeWuSxseN zh%XwLm7WSq;wPJQ9vIjF`u%P$5zu|l2hFXZG;zM@JnsGR4Nv7em{K0@gzvqfo0S{i%gN z&QKe#*^aOOhSdPm?eONWzGweJEB*he|NjZA|4f0uxf(s$fzV5Y1bMYyOhhLud}*Kr8av9oZA(-7jddZ%^~^416RVX z+NgTGx^>=bLRs#~y_{Bm%CjTjcI0h-g*0X?p6xG6m6kEU@_kT6LlV?%yzuRUG1*F! zq{Rxi(WCht=wAs4^nM0U^R?Ry{&g(iUshrNCj$Pp(j3>&jag+!^1Gyly6;hdxfhXY zK(;p+F3;;Wb4RvE=uWcGh9WN`J71y(+0#0Abc$C>H1YIb;SK-G}0+3BIMxj|B*e;}|ppD!ujUA?Qlk}`gu!(@^J#sW1d z<7a83T3MDLRXrO$z^Bhz>)2j~K|1fQT%$+l=j4jZaLyqoF`bm6W@aa+STyRy{4C7-ghwJG_QICN3_>YS!&h1P62w4cWCfJ4n(itM%hT-$AY{rU%P0)@0myPo;>~g4Ox- zG&ir#62Qu3lj z(-}DBaw5&(kpt!t>-v#8&(mv8V>!6bu59sqBhPgf+fU?1Yva1+dGzu zO1(~Q)xt;=UHjF*Sf-=ig^kiwCVQYc*nOa8?=~nXpF-FM5%=l^bfY$M%5>)*vfZNF zTe5EyGzy}-2if4v&?hw@7}$Xm=z$lHI|Z5(4@g2HfOJpaVq6T=fEUPaO3z|?8~Ex8 z$^lChQ_E$yjgT;tz?}2K)c@t<4VCbo@*{%^<^p^8D6L*6_-fMe=1|)ET};0gTKl26 z>vnTU;4M&lUTL!$m+=E`$AsP8!9agk)516kgeNQCng^g5KbB#TL2+d5uRYmVUPfi9 zRly##VVmemqs?$fa`Xa9iqrkf1??rTG?DJYX$GDdoK(nwCo)QiOL7Op$MWg??G@Q< z0ypl;fzcL+%HcOPVfFwy=O|y&{>oi>?iXxl{(|l8^3sC4`ntdn)T*&Uu{|C{7RMk} zucMcw*0P75rAzU_$)mmh`^)?4ruiRLkF}~3+{b8@btoIrTztT;;}JP$lQCu?ds-~1 zcXDW8CIr8*RFpZu|Gm-DTbFehjLr7$(sMK5_mLQ^-rua9PmfB`Wkj_8NR3FtEXs7x ztODj(ZN3a$0t5lQ;>+P^f3jK_u12M!8Se0vUN%ukN!M>vn3XY5kIwL zvLTt|YU{NU3{iuKL zhMdG7(z8yEbX;IbLx;9^gfo5@vzm1i%B$&z zSHB3eRi!zzIg^0%wQIZ@76priE49FO-7!M7+mjdQ)%G@@UYrMntw7}jRou1eW3UNi z{15(ee!bypjSr!q&|@Fj7h5myNOp6Un=)fPY^vho9-ok~2~&mS6^e=_Y)W^phDZzX zLakE#frlUIVeleH9@H~!ZVtPh+FXJQG=*4f-okTY(o!G5R1gkAXdcl8y&y{lcltTR z@H`_cIvY+{A_Dp=h{2ml0}t{u?=r?0hm^x(-e!#@&Jb z=v^kK1XGXWBB@_Ru#9PDmQRIU{4kkiBy1CZ-nF()8(RtF%OenjosVJ(Y4Sj^Iw=(O z6QaJ|ViKs4x9UCM;e>KrD639`h?8j@1A3R zcJ*UQIwWE-T;IMz(#m_X&<$`3(K7od7@F6jgwYS=e+sL=JN7>ZR*?qhTE<_tP~-o? 
z7P9=q7OvX;wWYnO40ZCO8chks73Ksj8P67UhOA^3(tvCQLmFRvrm7=XP~X}+gMV03 zx(<)Sa(x2UiBxIM|LFtt`F8P?BRw9+y4OQdeK*b*LIR?+j+?P;z~kB0jc7>Kzzf4dV+n|K^xxsrWIxF$;UHrOiBQ30obq_yU7jpH)-4 zX9&Bt9=|ZW7r22!Xj=cN43mKEliad%#36JS=FWMR48pFgA{D@TJi^%dnljEMuF0Y9SA<5g^)~znJY2% znU_*Fb0X<~3Xmf9-B_heJF-(D7{!F4tK1CHFrn$|5JOAg<1u?ZB3p^ZrWg{_)umNt zek|mQ#nW^Rz;*HbTqCh+!1Ov5B77?DyS(VFUh|Q06i;SLO7C%ykAoh0EmCQ~{^L(W z{I!A%%>$pefVIVznv*uc+F$gh80;SG3IQP?;rBfR(qOajHI}9M*vuaJE+Lauo5xF!@OJShwyUu0rLpIwjjT0u~9cxaE)fFacjrAiO<5 z_XxVyvrz4sw%{S#Va~2!&$+HcK{qTAt1rPBox8*o7JPo_p`1m$`nQ9Y2p3laz4hR4 znV;~UoW7yK-sDXDE-txkWpc$!{x$LHh1jKg)njb>EJP2%(>1Xlt*d#Rf=QV`4aaxN zobjZFMKT#td^96%lt_kpBxV`Je~&N7Acrm{HDgq3v}Z;>X5428N(t1&;_YF}heaZ4ffie!xbIjpSx(^Nyj`SaKns?_+;8qTENAU&il>trMf=< zr*n`F`0IK#uXk0;k=bAb`pi$x7#~V#W7;M~t6WPojfA})8#v{e9*T|SFF%R)0u(_X)Lgfp1QvMV zQdHge%IO=Q6cSCG3*Mx200H|v>;4gYe*3UqmSZ5*@<)^FtVLZO@+Y&YrXD{6Pj(Ri z-!0&0%NUa>Ikww^sLO`#=R*??RNDFuo3>;QmGrl65o-ohwt(*i&PoQ312jry-YkQ# zbo2TjP%JUoXs$xJhn&E44tiSmo_-^q?CJZ|*!3=oKb7nxIde5=?gx0QMf=n;d(|9C z(4x6Is^fLkO)Q2f-5#%s!Qo;U=VPpOX=i7l#{~hfjo*BYh|CIASPzm*iZYNoXi~av zt(4ZCop<_fvVwVM;qIki7dWi2*6^5NpU$3)>-&S}qcC5}c!AYKMr*sZK7~6Z)KCi2 z#~aq=4&5PZj^7S|aM(e%V!qqXi?^m6ctmr}>L`J%RjNjJ=Czp~n*&Q1olPIukFAoh zv!7bC*{@ZVRt&^LBM}b|XxVYI$hn7F1OB&oda7m#%P^4R}UgSAJ zH}~%)yGSv{+KViNl7W|@8ox;R7{g$URi4oE#-%GPcAMIF<0W{Y3&=qg<-%ECsbF%B5#aa{GHx{@MQgalQ$=N3(_xRYKWY`STuz&Z7zrkmt8GU}z%2AZ^2X@pz_) z^5nMTjvSHifaY%BdD0UU2|-P1rgSZ_Fvh8KXEdU?M{OQ7zNb18;iUsY`ylEu;KcIH zG1F_Xw*O^%3YHHsm;YqmP^pbgJ)eFpibpo$`v-x#yG#Pv56_7Kv+|MAA#MpRs5NY4 z-A~B>snz`(CjU7cJuc{v-TmSyJKlfKQMSK1s<|GY_cup#mSC%~punIOmXzt&?QB&Q z+!v2{1#yf<5s=scO%n?$u)RElReqlOE}l4>+u)fna>9Ga2%4w8=k6(vBg`U|D(KXtfL>78N{Q_0p zFiljSRfr9RQusBzl}MIWHARdR8NH3_k&0M~K)@(ez?PVijYb;P+Iwte`*r=IVy(*P zU*J+){M5ZETT&~8&JcL_@|e7Z5G+x{822Q!&*X+Y zpWb;5z9IbKbcl45$IK(R_9)YD+63-lW6doq4G`H&azLsagT-69tbPfrZ`t^oBsNzA zZud%=TFHX7%5Nc8oH-!DB@FbO#S-~o7+jUDr*3Qb(~i&|@D;xyB5gwC16^@7sskHs z_cV*Uz2GalAWlV!UHjd7Uf}7h4A9M9f1U@3!Q0e8oVI>Q5yL((r>%&>RFs^GhANxt zpx|v9v}7B8E2-AYYyGKakCf_+6Gtr|=fGWMF~D5y7_<&iWIlat$9s|0 zk|iPI`A&wfO?{-)gX5nnomB}qNYm!B_nb2|Xu`zxveDo=?E$y%4_%5vtJ*=N&3N(Q z2erxZ*izcvs}jAj6wubO7kB{v9AyfM%I)pit6>kym)74b?)L5wlZ%<9U?0vAsj{UJ zyBWJ&6TwxczahQZwqy=|-&v*TozIlNXM3|Yk9iJ%qIK2`w0O<+8$=`Cnt8#8MV#jDtQ<09eYaMUOO5^xE;pXSZ{m`TT#Lx1Bm*p5A z(ltlL=mu(;0^)bSt!3g9Twfqah<0e7U9YXy!ERR~xYr|Y7rU-f780P}$fWx{vzLy( zCg5A;BIR@dtbG@{4Kdbo`L3~`P9(Ld0$w0cHCGe1zIff>ZV-kW{OX8!mCDYPk+f0g za%K^w2V_jddS;2b8#c^jWwCexb?TY$U)GR*>YxAPX~^H4{LhgH`UZ-7l&{fk3)Fuv z5$u0SM5Efe-3rr})mGM623IdlKqAWN5fx^Uvyxz2fxJQioyJ?OhYyxZdeB46qTk~> z?tlj7tCJAn4e5Q4;$5qw2A&BpQdYLZp^RWQ0_xOLS8CqalWCU{V{36-@E2zGmKw?` zh0AxIcpz{pvx4oNN*8<>2ucc1zyWJkgXLoL&fBldo4a^=$c%^e%+KfvJC7DtwZzAR z3ep}WsO_Ej>JE!|ykZiC;z!mm%dCc;9fusAoFud; zid*&*6f^}$N6Rg0_fKmrg>LNj!-o4Vm?$tcWe^|` zmE$|>#~2YR#Va}Uw$!jg`MA66u%&S8H#zCO&N(q-lIF^Id7c5YtC(AY>w7usW)XU=Z(LsmL*Af~7AZUz z_8eLGB%Oqogxh22fR(1*JGeoF12%2oU}F%lP=!EeXI6ZQR8$4Xu-A>9yD(1YiVM*X z5sBn2+e*-n8MgnQn$Ex5>OY4lrTI#0~s zFF!X`Rlp#)< znX0M>2^Y~e@l6~l&)wx#>Iys?cGtTx6}EiL)dtr%|3Tt3G>xcUh_N?i9V52sN69@5 zn#OAF`O1R4e_T^sWnJCiFFx)1`Fq^X<&zzF`#IeJ1cv#Nv>m;^YaULPig31S2XVEE z5kO5lUw4(uFr%g$BmD?anpvJu+oQ$SX;QV;iL((7iCZp=r_)HZXri@%Go)PkXn*UOOI9gt<(&RpnvZWP)QJ`*&rO3Mc1MZVF~wt! 
z*wEbK7^t78>nyJDxxj~wWFquLV7)-|maLB1w@p?)ZX@@L?ssAGA` z=4l9!^4v#|LMUOg1^(ALL*AFQbbMuGBm7!(IQnGG0yrDq=qnEMC^O5AHJ3qdm?^6pMtZg9m zs1zGs0IYS+HVwI^o?KWkV=Mp}j&)EmDK|*PgHywh{mJ8gH0&iFeVJY>j%yK2Y=B|< zd$yyVej0_0d5h>-&|C{jmmCy5GN%6V+!)mhMagJy1N|tAOrp9MPu2sr0!0Gvz#>^@ zO3MyB^Z@vd(X&nRVuf+}Yt1|wi%`C}fIf=;yhlkwu>e6zTakeaF?WC;If=rMSdnT8 z8veJ^7K5qIeV?L%l4Mz!z7vUvS{v2KO$)v%!V(;>W{(`LEUB_&VUATotzXMwIy6pQ3Ie`=vuM~*q{8m1$)okda7hy^C zymOx9-wCKgbs&(Oauq-xZjnndzZzMtOhxOG@ZQO?NR(6xSx8{+OSkE;2yaybRlOAh zw1gup+2&w%A51-NG+k=D`jzNfi;5Yp*g7rkujg(=2E+!YKmfiayCYA`3v6mPE!`zr z)x5X`(D6DX=pWq{5DWSF* zGgxNX)xXlizOZ4C$*0j0?aHpv0%>h)ybW8{D8nbECD`1iX}fAnTmEKK*<-ajTzth?88k2;CzvlHl$Ea2H#j@nG&QZ=&!g zBVx9ik&ictKl{v7pgFzg9-5+Xqlx(tFD&&ANf-SJ4jr}Q8|N!uF>XK%Y=y`W?wzoK z=OsI5rRben2jQM7g^wQle93dB2PLcf17!f@TXo?{-qC70A}z4xo{`hflI{jtUxtJa>gs%DKjOZ&iy z=sV6Z#dqgC*ALdP_upC4$S zQ$sRXZ-9-HofrZi@Zq+}CIlju+%-_0aVn?UEe~BC;3zN(6@7_w*;Gp8v=x1(Biq$IOf6CzNjYxWu zYs_>dSzc*d@%{O7qKePw*&|q!wXq5_@NSi9y`heCOoTuI)!i9KSQJd(u>f9*`{N(33a17tJeDS=6j(!i*PUQw|DgV98$!$31Zd#xX+ ztesbb-~k?of3%Tz;iOHZyuzb&E^SOEoGQ-WTkzY9Aeux84y&q;ldjAP_7eY>iqWF7 z*I(3#|L+%dez?8=J04$irY_zM;PDy%4UfGU4f|2@%K~3MR3=X_tUq*stf zrfWXRmNPr%#u3pa)TzsmXPsj}q?twT5Or&4#2y*RE$;UA+n$#>CX=eU>e)1r5x_V{ z7d@t3AL9U_S4l)KX=ua=wrM4^3~E0YP{+zYQ)FMmEfya6;S>d}M-~YUq>LJ4C%9qR zIshV&f$?yTTko`r&@87)3C++*rMrr!r~@9HzRd z{b99`aYH_rx6rdfq$yr}Bv|Dx5pjvCp>yWAvZcV{@Y`*&g*j0UHWV%kJvOsEg5n!H z7SB1h%Lf|?#|KoTDQBg%+4}Zp@S2#fP3Q|5_6u*TY;~)n7ECZimmXd5G$Da_qUL^7A1=X4-cr=NWM;FO7x!u0 z7c;C?6ArQyCaFM_TTD^{)oi-LWvBTfG=I~jj`Q#N@Y%-0hFF?VRVI9cHe|_ zLxtG&&FELq(H1?Q5t#8x1z1nvVOl!sumer=-esrRai(#;XBbB^o#;y611&idWMAt=FDRoIpr15+jJY{7=w9q8X{3AZd5t1T` z+C!0+YEvD$xo|s5hNSG4mLSrCe9-Xe%|b|FFGvFdO;JRPo&NJ$x20nZg;eUfx9Tcm zP0end>}b|o>3%qClXVW0?nS=*nKF+UfIjuy$wr8ZBRz{VGCV zcMyJASYT~N6&h%;xr780HR%rvIu5$=k(Oscy$L>0A=D(!@j+!BPW^(XP*{iaIj$}R z4c*gOC$5N*R-<{>)IOE{4lkPuAG%^hS;tG=#8KmOzo05^Ajrxk2)WrbiKk`Oh?cyb zb}{nsp_ulXl}g;Mqyq7rNxwCb2!jL#jWEN zDXwDt!unSb=M!DwaO@7ZYZHp`PmL?#d2dwG!rZ4}tNihDod_jW`DESk(VMewV%hTP zpfOl8Zk(0x*4LkJM59^h&xd5o-++W_#tipqVG$2vo*0Ty=T-&gvwCX5s33t>9V4ZL zdqNzB?@&MmHxZTJF25Wnz8E$e{ShbBnObHc2zM~_?a*Z`;bw7zNmEt^ja>1P9{x&_ z;hG(xVa2=qbrwQZ@Ek&re!oqawM5^b8?FfIMxZl*ks3-2wuNw+X5B5!M!@Q|Pe2u&)tLVjo#9_)i zXnPlAlfKD3(8m z2&p|9LhuKB>|&99Rsh@k1P`geBDsA+GIl3YX1&W=&0YV8J3Bcz-}v6aU$NaQ3u_a| zZM}@8Ne$tbe`JPfEOA_9kp6UO_2E?RdaG*!^5sWt@|$hvax`t^U{G!)?sNTD>|uuQ zIZ5T}If3X>jw+$0rxKr0vr6Lqn;Tf*mPH+ezRoeg1Sovq#U8Lu^VMSKYG^v5)Csfu z7CrP7MIAq~=9sD#Kgr(6dS&E~@Obgb=YQ~K|7Kdy50n1C7UUe~1k&yx$ z**XuI>QuQoY;-l4!{H02M}GUpnIi}=3PT|7Jjrp6irrWGPQF`=r_r!niu3!8^Sf|E z?eFPL#|Xlp(rvI6rD`l_Zfy3|>>ZP$s#UAAbXuf4LU#@Cy_l<%H~N8<#->sPI`b@y zftEYYiGW&E)mR~4atf|6fwb4mN^dh5Uzvewm!v}s=-0>+s^0@iOL__~d3FUPJpip4 zL=WKChIRmlG;%GPR7{s#`tQ@?8Df1YT72IEJy-=4jcMl z2yVVGc-bcylm76WH#&4m_EzGEnzVry5DVSB6Ap`X>hG2Xs8*IJAheqb2ndtwNs*-S zEBYNhhc=!N7Cnd%KAW_*F;vOaS_XS@_X)7u`Ohm>+;9%m*EQfCeKJcwTophE$ptoq zn|gyd=t0UVWhYfFbz>Jan>z`+diaekvRZi3k=7eV5~OWPek>RZo$UQb$8V%KU_5v< ztyr-hw*J=K6EnVqM?uc8b{PB9}-wzSURrcReKsO&yg>9#DeWX8^T7tlHv zVX;O{yeLGYBXj$7a-?HX17}zlC7F9zT?L2N?Jz0i3rqJLhHm0hme0fp=uBcohWE+& zq+DoJQbl?9$%RfEF5#a_jd#JaH8|SEGiTYJ?jmZwtCAKdPFW_^)uXW8O{s#Fn5j;e zzc|5As$$L7iSP06;Ho2=!_5LsvB#+%2`eP>6@YktK0@~35?GJ7SD*L>ZI3~<+rpF* z{IOcSH)k>00;S6(lTGcE8eO*cRVY)OBiT3M5E1{N-L*X>zF0T--Imq+OqbA;D^v8b z2$Cv0iaaH-ntcI;8Q&nvR(W`<&m)8Wm|Y<5aU`Tv>Sgn9zwIx%wa015E zV|ez6&)E%G^)IT7j5a>9uni2-ckq&CSVul6j=ixTBL{u(H6PBna45)N$|!z`uHXt! 
z^6^9LtkCBSfL9AguamEcS-;@xq7wU6jr26RoL`$+-d4X=4DP!mcZ&8{hj( z8tm$wMVQk*Nf+KbZ#lap$}6UzE2fW8&>N`VIgFAxj^VZ}J9-WuAC@pezq}q_vH$!$ z*$(B${+VY=@bmNhOWM@Xz@^$4g0utetq(;vb&%tZDt0Ak(H>eK z+OLN{F*7^V06D1I^bJbvwPP{vU%^JRfn}M#4f!Kqo&;XVfLzf8d7A}gCe4&cV z=R-D&L5MtdX}d@-zO$0w2DbkdFqOFwjO1%>7eL_tBHFIYYCOAqm zW^mT3;$1md<%D`{kHwA!k!M8zve9eimBOd10@ciD$inyob#smEpsiOle$-C^5|GX3 zh7{G$vQw?(Abh8H#NTzB%O=DQlIx*y+59!e3c!k_^se8a`nNE^;kEH{LgWCkx3M9w zwPIL2ocA$n=SZ5nVn8WYO-;}Ix=xt6E*i*uiAe79U%=6XR0JP|bB6xDHd{e?>2 z`MU)K+UpnAxOK~?k96g_*oIfR(cqzvYC7g$FXXFxSK_2RddE(w9HuRIr*sGOf(z|j zlyIfhJ!Q!|e;MfE3CJe>?{@i9ZqHxsQW+A-eEDzfvc*e|*u#JbqYeG-Pj(6Zr(HH^ z%&FlJro6@0CQ(I{OGDrs3{W(JNcS}gJsi!4HTTyZnJt6b2j(rcRykFEWOJ=$SX#2k z9Cz;hH|(-5iPXQWP&LBk(=X-3dM@$ymjQP90GO)&&MtrG~Dd`qwoMVtb<-t!wT;nizODt<3uW_ceaV5SVa{wFGjLu( zYzQ)LjC#{T9MvFNd7^lB9gJ(Vkft#&86zCgqaUM4Qg*M|VXL;kS&&7peNqTVjpQ3o z=bJi)P6-E$@0#vzpt;7NZ8EqO#q{FV>k-&NK9=eXOsvwH;Ej?MO?7WCo9L8{#SU== zM5pluSMW3iqYRM6{EWNa7yC0o#pE82bqz2B!Nn6`a{Bs=5ht+~2`dlj=4TS6UVi1# z^_(yCt0Ztvi$EDsCzfrTF2DCQ&r4wtWLJaf73$i8c2w5v`1E$j@(4Tj4TE)|$#7@_ z%~}ccZvH(4&m;K<)|xB&(`632ZZjLwrnf4(c+5fQ9}9F5`!4k-on1ips}9q2(?HIQ z$dm%4{CgN5`$}>?Y6RLhp`Si#e>S&ai`#rH`Hku}DGJB=nZ63dd*wLE zIK!0G0EWf6kQq^zkhsM!U5AQ^T{Admk#JWBt*@ zMX{65#);al-nqofiM7SRECx*{9S)k6=NpAV8_6E;)`#p3-+P;bcfYPNo3vPa{qG*& zr__<3Jph{dF_Hf!$s>@MgLQy4vV+3cl}}Ut0XtqV-?hD5V?h7m0jU4W0~GzyGoStF znGpd!GhvueZE+O`KOOsQ>yMuK-0_c|`Gy1JN6-9r8Vb-eD*<|DF6EQb470(c}bgHtRvGQ{OBspW$HG>sU}rf zOUWSc=ZGlKngu4)Anh?t1Y|y5v|L^}4oHcxq6K4;v>*t3!HX-YlxOp?_t%J)qYEpX z@qc4YSmqua3df>9)ftr%DX|Wlvm`aWs&Lq6FKdKRPxXj{^q*z9Ys+T-U?adi#Oj~1 z6~RrX*${n~vxD~-UpXc>hMl{@S(#l)0bGmr`-?- z98w7!bx=m(wfuhj5;%OL5uaTcvL*<1g)4y7rAGYhLG8&Pg6 z-d?k(yhK`Netv@E+$J)jA%^MD$y@!@NbU4~v`BFDqFY?t#xW|IMSrZ-cAQUM~FhAA4!7*CxehmD^|K-a%C+l+;O?w_u8(b%C2EG zL{}c!nc?>!Rw zlW&O$$HBH>^&06-S6?oxO(z=j4qd5Hs@d#h&P;4!R#;QLYKUTn>S(iYORq0hNLG>u zLHnY+En8_HrN7w5po=Z(XVZ^o4&bN}s(8g2)-xDj-$hHjgPT^uMBTz4BzTb~`0U-SmS(0-BJwWWp(!k=?}*I}oF+_|^HyosCA^+Ij;{g=u$ z|DNCaDOKY?-){}egmdx(MlLmgvZ258TR&vS6)J1C*o<&K8+rz7qtIi{N&1Es zb8{p`#``Lfon#WAdrn-|#H3+)KK|JS4>WCvnDY@iln=76QdM#@1MO3IH8x^Q3&b1C zJwdjDp+4==I>^S0&?BP;y|z!Ssa5?5E2)sAOZS8?$6cz4(*2r0?SQ5+d!fH{enT}Y4WI7H(6WH9q@-;ON`;gg;0UwI!5 zFkUKH<8)z6(qOfCt9{<)jt!Ca zW18wgGc#_Yi$H$kPUnfZ}-Bucz1_OThPq!>x@TqwLOryL*ntq$38+!t0?V># zGwFls65ZabPFY?EIF>kG-ko6GS`NG{c;j!vDnZ^Nw2pB+#Utj@pb5uGAhwpQEd8FU zRkb?|#)A)3b)5%iD-DG?VKVPF=GCGVR~s%Ag62BYkAAVUe{Y8HlivMjJNr+ZA@FCn zQ7i%MjPcFy?d*pbcTa5tFhfA}KB(dlDTP6vHD7kpsr%qBy8;^-7HzsqCTlrtA9?!W z^t=1GY|lVT4sRqww^aK5**l+u^KJY=@)g0ZVDnz7)SHliY6R&*9v0NTgUP?0e<`$>A(c`**UlDX7ULVspA6&}cmnC>0YF6NT&?>CnnmuEw^J+-6<{@=M zJ^lzSoKN&tdbiEz8OFu8rTt7D1($3KL6(dzU7?rVgoU_W@PiyUF}ah3lbB+DsADq} zTxlF0;qVFen6mAl&HW0?i;k-3o4WD*wX+bFSvK#mwOSkKDz%bLwL{a=&5Jp&mc}(w z77f`J(hQNr%NY=iXP-t4wL#UmIRhA+4P5-)Mn?E7ga)qYL*Lsll$GRlesJ9REZNO0 z*6njR=Rn6|RV?n$fD=WQK-S7_lOI&(SOR&0z^$d;V#pA_l~`b$BiYO3%1h6DJMf^1 zKh^gTxgsr5_H8FT=8PJTw{yFff#+ki5BUQ}q@8NZ%&KQqu{XpVW9?*KQvo}a#rw{@{8Tkm4u|#+=>LJLZOgWnk@NLgGIe>vMW;SwpBlfN9e^UbQ@ro({#O!(b>JFk+X@pE1AqHer&r8*b3@q z;=R*>Ij8)#wHFP0$)36N%WV1I>u7(Hr~m8#{u3t%%VcU9;eZK(AfWum?>gEaI@>k% zl|L4+0i`hEz&O0^DP&qDJ;uOU}tnLvani4(SgdcnK!gynX2(Hb$dxvVclvfO{qkUn0320n{IJh$Ykjq zozKbN6%}NH2qFB|F@(LOVyPxHBz>x;Tl&;>5JlYHD%*aX9S1i_u5>jy*VLtyJWW{a zP;nsTE1iEfcv1i!MILnFkBujb-nk84A3No$M>vzX+m4P%&e(A~)W?4eXWMt0^Yy{( zBIh9OGjHC^yb~q;Zf5Q3A$j5X!U^feW6RTU6ZK z$?N2b0FOvOu)x+_#otRDM?`67(YeN}Namo$eN`k> zKmMTloVw=jTbDkeuJH~d zDDiR^aNU1be#&Im3y!g0<9_XIx)p-W`qt0+K7v`mm65hMJG>yfc$cIAYn7?CTV#R> zs!6ig=KHHDQLhq_DWC7q0}otE3=$_S#ugD~w8V>x%Q<^m~sSwCX+UukYx 
z#3Ne+#q>^&n`#_wVI=h{u$_RCpelLNgPYF6jgKQeTp4lJ?FCF3`p0Mby1Xcr7^_q< z4%VT-TJLW5dq?+^!N_qb=~P|Y0BdB1??U%)dO#s|eqmE8hbLGq3T8{B8-d2zKLulI zFkZTd2tDWPc!;GrVaMAzamH&sheKfIDaM3Nx-)xT>O>xWpj_wmUTqybe@QI66aN0$ z8j?N^x0G&$Y*=_IBj~)T=~RyZ<5v;ozvr=jlFI+*d#rQ{BIXEy$Kv@LkM%<*y7#BY zQs!6(dV6B1;hb(3U=pxfAQ9mHgbD^*DZC^s5l%{}B^&265gYHawnn0!9TrvSJ!u^r zV!97!>-|`chBL4HR&~@9GbWc%USpOtVbt?}uzysnmJn0QNWzuY{N1V|v#0azj1nt* z8<-W%So&`Fh$bt7kWzUufr&CT)s*U-CSpXDJ63X<6{7M5Ls%{iYT$6TZPnR~uDL~V zT_4fZgx}$l@2pBc%wsMcwx2p~XyIx=8k!OrNAFNLIQ`mMF|169NBoJJChe@&>6q=j zEo(Is!Zr{I5(|QVHT#W?!NL$B8`SUp`|{O!7#@52n^)9L5$Xuw27$y@{S4 z=*X=Hx{HNOLua$w7=BcH&%KZl@kDcy5_uh+epJGFwKuO(brC&f#nMd_t8RcdQmls^ zaq8YQK8)4$8MbwAR!Q~yG?t|1OEkwms3@!BO_jC{kLsAl5yd@5dmGmQ_I%)h0BY2r z#=KbL0s7fyi+T(EbAKi+jfUdKFI#R(D$2D8bsuEsTW{ZZ7PlhyW(?^iyL)OK@2*JU zicJ|Md&``xKXev2rE@)&%Z=mK(-pJSv+;AHNFz&)DlN`F%B6>78_{8R1ih)HMpQP_ zO!cWScGnucbeVKul9*XqsKY6m`H+SqD}Ajdbki8i+ym9dJU)c{Rdp&H_2bxe%T%O= zs}Q@FydF+YVX-L39?JNZ5&sR78x)iivMIvW`NYtZPBRdk;gH{EDe+M#MbO@DEkP-{ zGXWPQGz;8T5Wd2TJcx`0G=D(eZU-(+$|<2+6m@=5p()HD0|4svK1k5qpYkK`X(yIno2{ z#zfdQOM9$Xb|o13Nm0dIA`oF^SnuhHT`@Xey8}4}Wf)s_Z1By>qO*nfO{Ah=uZCFE zR8qb|e~ICNzr~V`Bg7+DC6#|HqKV;n z+%UVC0Cx@u%z@;bp0hxm#`vVIO?F*w5Fqdr2$an8%Yu45-~~ ze$Brqp0xJ6(-wR^0A!y0ygvHxxvZbG-v9Y7D~WVe#0@aZ>iL`X(H{!Q3YBh)w~T+N z1cTd~f|hG^pt}3uSqjW^tfe#fZ3Q&O+(qi*Cx@allsXEs=IM{bEeCUC2xUCuRMS%L zMCG{i@^)0->xcQ()$WXEFPsA{Pg7bS?2Pw644xT4Qb?BN+oKO=uqqz7UL6m`3{h+TLs3~;k$g)R8*LZ~5*5`AZk88VXE1x;SG~c97&Y6l*!NV__v9PMHZm0U z4gxt7WV=){<-R*9BKWgrtS?!BVg)Z6$~z5CW~#B>2~fkwn$ej}Q*Z2=hZP5G{xW+; zimnleK@mc>;HXb#0aqdS#Gc=~f;>;|*>CJ%y(;qrFjtxDD&vJZo37HjxCUfP>Xh&B zXnoSW-q@dFQIQoJeo|x#58167qpDozu9qcI<2jJwWI{Eyw`QRC#CE{8XNqu$wqx+e zcjan7^lsp?Fpn0lDPY{4GOeMlS;TwU0BKu+vD>!V9rZ7XBb^_ofMf7X#nG&Art^?D zObF;X<~EgUg!5iJH+}Zl5v5Irs$^7W;XgEvMM6{VW|ij-?@9`};Nr}n&7Df7-X9}Y z{LXj2O7AX8XQiR1%F`1S&H5!kb$g`RE}KP7*aD;uqXkxQyFpPDqh-oTZRi;>pl&vq z7^OY%pxN9@5<5YaV|kPk*n90I5XBs>I}Zc_o?ukU)_{R0P#rSySOkWTeQwRCaU$;^ z9+FPZ0T@*gtL}iBd{YZzCm*>_YL)o?_XKz7$7!lf7xUT#w`p{jm&w~}-kR|y6nkqn zZ=_3LP;vh<5fy{UM(u$_uB`;@tVLcr!@KoFGS8&%R;eWq;1QUzSIGx#)kmh>7;#yf zzb5Sddts2DG`&9?=zn4u#N@0()fh0QXhHx2LjPH{@nfR}#vh`;H8tx$)=9p1=-H!e za)BbB^j992sI-g;)f#GHe4%8}$AlB*S`ws;C#D=ctA6bu288ss;a?_~yPo|LU0o)4 z2U`<{G3eRfgs@sDNyQ=TD6oK`2GLl4{t{warJErWqECZDx}&K7ZZ#LHgXeWSXatVl zFEVIY9k0AxMFTU0zP)B=NE3#6Od{%VM80w@oGDs*Sj?}VQE9c@L1%0>5>TuTCaTLa$Y?1&qWb|d=ib%&Td4)wiE z-X=?h(@~!&oe3g_XG~F@fwZnJt(atqLb{l)K)}O_B(9u}cfl$16DZQ$gD&93%-_?? zvu)yC@xM0!Z5*HH!&)ghQ(mo!t+E*Bb|@^`A4<_8Z!b*EZJvq@=a&z1qsZ&9rd|4G zml2O*_x`|nbV^Cp`w)y&wG%noO-?}u_ru;;0xlrrs!NuhiUt6nUuuJ0? 
z|C6lLN8{6+!*vUViRbv-69*UHjw0m8h`WbhwhsI2aN(!yc}M!c%Bg&6?>OQSL)jvjc4{!4;*UJHQKC;IGG9K@9F7rNgyOQ64_H8O)RsVk~y+tKXqa z8d^{(2AC|$d%7-MNRX0HYI6%P%;}q;R@-B}Ho_7$>ms&y$|JsgE5X!OM&Le7fYpbf zhV*!yUd*R}LMXb~^$`c&gz_lS$Tf}Iuu`&AKSsAsVD0f75?DG?9+gg#SxaTId{^|9 zsM$v)tV~5ui_w%Pq@Ln<8!@XqNC!#|U-M0uk_lU917v(pf~fAbwM+}glR=*zOzzBC zuVvde0Vhtz{JD8K*(`LE>rc4)Hy2D8_vcW?aur$nStUj7i;@qJkhe_|c{$p!PRU9c zFZrdg9p_r+cgI%Ud)8De)X(hha%J?*aK(A&vK5_s36y$&K zDSxZcREGRd^eIg*S6@;9bC6&_WuxB?aeipu`c=nl07~PIb4oD1X%L-s=Pr3L8JP05 zaNtvP7Ru^RIg63B%E@4ovfG)rG#+)l!xT$4kBzxn%wLI?$Q~|d$Q5p;E?W|`0k>*Qn-qGlHD9)W+Lph{=%{Z2Pb%(OX!GEZ)nnF^p+l*|>^+*ux8tYG#ngyhGC zL~u!MBhSeiaj6@>XV)&1r_SKKyUMAvIkte_>c1zPwA+y?rpZDXoUrg)cg z(Z`un%|x84MksH#<`!SY_g>v-6EE}ooAI%f8idx&`rGvVf8XMZe zutr0^m5@_Vr*<_!3g3yeqxZ7`|8PHVakUmw|w9bGehDkPmd91Ka7=6h@S06MQ35;K~rJZV!g*cCP zng~ZVtFGgnk8wEoKp$-#|GwKw%RCYF6QvS^SQS}>Qtug59pa@xks~HXSVhK!m%!H@gF)0z;K!pH81gM^Kfem}|OZ;c?m$lhPV2W6Qg|QTv^;Wh2T2wg|nMcsvpfos#Zx9<%*2?Z^>ZsDr7JrY7GeJy#Q; z8F)~l%~9oT-??N2Xj%9>;89tSJm^1vTxBa=-Igwpl4C0HxF2%wY5A-L14qBC=MTq@ znW9O(Q`}(JD;*3mV6hL=;Oi($_Y`Ct^FHgSicH%n+s1W!Ak=5&GUxOA;{3enZt-d` zM^eRYZr2P>9-VTfhE1so!u};SMcZ@e=i(XY`!fC0H3n0`>@U@jS!OW`IU>e66|~ZC zVY{z9JckFPBwm@l3ha&S##PMy%&k$SuJELrx=>k|b=zG@6eT`;!bUZ^-U;JjN{^C! zwY;dH_OLm!46OVJjOYEm1oOOUlp1Z>Za?;s>{yYEX3|)d=DuUmNMecZ` zt3WAs(zB#7RjneCoChObCMYXPK=s}_vsf@ih1@&BP;X}%Ujj>Wk#ntc`PACHvgoul znldY3`r_h*J-wbf#hTMA23E&S`jVCr-cpoGBb(bNEZScyx3&1_neO64%9EP)eRP;o zF!~Xm^ym^?nV1S!H>u5RF;g1v2KBE5Gyt4&Y8E(58Ln*fd?X~?#ii<|yM7x!c~9ArpiInh{QFI{$LaUX5|3m#=u{H>(1 z#s1+!`F2k=H%NsD4_*|R{a8~Zm(L5bkM(5khc61*6Mi34e2Z&#IWx=JN}$WY87V-D zE1DXnBAG@paxZG5%*^H7PoOx zXdWw}YRKFT-me}~G+67g1`ozR4M7b4q{CP%s9rohXO++s56fku&e4lg*)aII@iGrO z4?dqe*p<9{BgnNAWm{T8ECj-_+o3#>VzTT0)o81KUw=A)NCyr)nUBKVLvC|siemGe zoqG;FI?e(=Ko<&J&(Q}|z#LOEgiDId*5Jw@L7Jcn+s#ssk9fDUCbQ6PN*!!vwXHh} z!diK1mxMe;_CcMWqQHZM=VZE0<_$#H=mvY!PUTginFS4p3T$;cor%zbLMq1NV&D}9FjnNh>wf=Bk|+fjKO?T)p$4+q_$@jpW<$ZP0H@Vy+?e*suFwia z84*1j1UrsSkW)%LBotC^4y?!Ka$K*sZNYK!Xj$~5`;t^(IYoQCz=IN75z4S0 zYfr4Z$Z<#Vu!F13cNNb7IgJm{2ix$M>US?q`o`2W-`T_-knCr3ek%t>xs{Aad#diE zqL1AGm#?@=@P3u4Y-U*+**M)kM}&$zcn9`~3UvlzII0NKRGm8yTm~*651qI_t-uC9 z(x;4OPsT3fdvA=Ma#(2NZ~k;`=r4_)_m$&)uqCFkM#r*x>RyG?;}Cwc8iMpteXx&E zR(X!;G`0TV+u->`$b}fyRb*y1{Xu!n(kPKpd{cJQ4twAcv0^J+#nmtP@iSahot~Bv zWbvzkoG-heBHqVu!Ogd}V|y-k%qv(2O+F5a;4ntfHGLO$BQ-^%VV4!(uFR<5Q0oWHG6H)Pz9Jnp5>#vD zROslNTb;gg9A=!&c|NJ{t=vgbJol}oh#11s=R)$xb!y8uwa_(^xw%gdb^`x@8ZrML z6E|vR!&l^i0|D&-qWTzssJ@ZydwB<2J4gBtW=__IHU{QabQX5Trl+cwb|@;S?^-?B zDXqs4DQHn4ds>;g@@WOBLCDOP3lrJ$X{+pmB@$#%(#Q;4W|Czy;vV^5h!Hk?Xu#2M zO>va#vZE`AvhzSg!C~OuBh}wy^UkE=3p z6)CAe#SHdy@FZ~1T(PcIS6#xLD`=3U<^rl)$;gS#Vg+IWH=0r~VWZLeJe_?Ci9Wy& zu`3unLmDLQ>_MuZr`=Sk!vz9|i!dR5`nr(3g-(J2ln~5un5-=mss-%YNlgN%dC6=> zZxJUHTJJ(Mj?5^~CmQ}%SzFM)Jv@avT{8rnaZKdkifb)&1!%4k8TUCK%RBu>W2AXqUb0|JaK(qy` zX{FRf3g_(o>UykI%zoD!h-Z}~3+Y@G5|KQ^l=k|)V{Im}0U70k_#`~N650`4gomWs z(B?R6!$k#*J)95W^GvMp%>-b_xr`4}Hm1-Or!;-A`jk6h!2a?v?XVzrSa6%ZObk1i zAV|D;7k>D(AnxFG_aC{lk*rJHoP+YV!SV!BdBsh4s7g4GVNCPV35E$p6+_Q?6eac( zh!o)n2%Op>_v(>wto%32optS3q_6F|Har2HS`2 z7ON4n5zD8oaD=j~&wAh40pG5*X>IjDf7885_e{u!{tn_!cUpJt-m~vccj~Q@6mN+I zt*@TxeAJxGbkgC#N#P+C%H;>n7EhgK8*KN1fje5ab|cz%>V973r6miBZcc~a1R`?z z6{&E1>le~r53QJ@cTrYYiyA^R_b`Rt&n22k>rYBApjuwBqM;66%}5LBG2YIX$!45D zzyy-0){Li)l;oKvjoYYtB16?oQq4rnBg0jodA-qHlGkS$XZ?7gM2cwzyikq>QM6$z zuN^OdN+Mu|zFO<+4LL{sm@D+q!MFs8%;v_}^{jU>&h@dn0uHo|4tV#A?BdmhEY3F~ z{yQ8sb*t$02e@l+-(5-?c^~7nOB*h)geAVn ziK8C8quz7LBVhgo^bj)lKumgw-CUH;-h#Yxm`c6TUTT@-I(l6pTIhfbV?F}6ce>V5 zG-f!1$w_lvXU24c*VIPgk~bOI(p;mtBDb)?=7?!kH%04$#nr2flyh};x~1VXCj{T^ 
z!^`^^zR%{@N^*SHUmYhpC+nS7r06?qUgj3oJx@+E&f8Yn0h0*)^9Q5zj+ZR&+r6!- z$7mehc^}Ws8$d_?a`x0+%a`71ce$`Z_ws!(%SS;+*|fZ1Hb_KYU63QUparub2>B*3 zE4B*DKr`2=5{vV*zK&+H+fFiWdoLLp*LYMZ210?UTqZBW9#2vsMJuZT?yHoBwz}Yg z{m|}3*AXW#O(3<}Rd#|G18#6|j)55_c5v`3hw$Sopf>{sK?Owx_;4s7rnXS+Xbec1 zTR_!>RTLl~z|VglzJSkwfV6~c0p&|?P{857GSAfem>|cQZ|GWhl(E*ZU z|NHO-CI@`-kENv_`T4(|7=ik{;|4f!1@LbG0os2bzQFQ;1)KlwVcy$182|3NnUm3O z=YZ3#0hRv%DS7{W_yXSpKKbv{4eacGcXqWsYoagU!Vd62Kz}sGKM!AEG{6+n&h7)C zQj4)8;LZ&`nA_U?4k1nfsL>s8{b9gOF{&DZW;{3KPw9~~VUSF1|#?6aa3%0r30tw+J2}h5mtHU}$Bm^uf)_*ztFF zP)M80dIq2{P4|!5-v4Y+U*O&U0{EdGkTGxsRMYvdf*${P^IJYxqn8hF01Yw#c$j}| zocqti7Z?d}kALv_3GOcopnn1V?URxUlUM5kG;;-@VTt|;3wZSfHpBZ1@;^7|{c)MU z?t=bn%l)-_=O1O0{&@iK0Pz3K^q+WtVft6p-%|Z`oAN)QEr@>u{a32LdqjU_`g5Dk zkpBkmUzz>^`dgyE?(+L5(OHV$ApeEvkHGZ*)TBQX^##2B-<$N$X!ozkzwYq)Cs9AZ zv-=n1zYzTqZ~GPQ?^~3Z={Ioy(xQI`@qR`Ab)UOGiIOn?2Kg^Uf5aMoh5LJ=mh8WQ z`xl~r1~7g_{&gd%KZ#m#{08|iM1Rb%euevcqS3s+f%_Mte@@$eMgDa=lRt^Zy!#FE zUx@xlbNLnS?}=^-{08n{i2j*7^DFYNyCnQcbW89z$bTXFL*4!>+}{(05d96@zYzUX zGXE>`uZ7zGBnm0^8|1$b{h?|774Gke-b?)k?q7)hDU|&c`Pcfwe-eF={tfb9i2l&_ z{R;Q@L_aJ12JT;o{wWyz75UeCntu|lQTz?^Ux@xcg`B-g13?sp-9lf%(x1{&()t88 zR$7>#f(i)=iYO>(R)q)&2obc>2e9@T?0pG)AHc$KhUMOKyvLAYNwFVaW|w=iJ2SVe z%4424#cb-Y2fRO~g&LN8?v-b%*Bb>*o$|uYc`VE=z>GSz>4belFh+3KDMNg;0pWhF zt#4o&w{tSudk1LwT7+rl-x1F@=ci-(gRl(Qjaa)ZBSXWznK+8g5I%%QBE+CwcP9)whZ$h&@pk<*jh4dvG9st4#>Ez*! z#U|801X>mf(?*Zb=m-!_NGA+$bTpy*7-(52OcgD(ata71q?3gAF`3Zv8PKv&m>yc_ z>l_eHNGAsGQZS*<3!r78FeUUoG`<9c6Vl1RtFldKa0Rq16sLh7qJ;*Kj&(J60}N;M zPXh4D%!Zhrz;6Lpl)}e6t#mQ~hF8+nniqDNQuhvUMQLKW#!+ewf#H>Oz2>DVrnEEy zTu}-Ow!f87Y33dnUP)JMo@+OyZ^RX)uw-kcx3N^xHJfJ?P3iRka7Afi(Z*5Qe*}gf kC0(_7#>AA`Pww>;&Y!HUPN#PuX$yJ!yrlOit>@owKR-4{djJ3c From 69a3e450223e74f606553843b9868ee7b783cd30 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 8 Apr 2026 23:51:41 +0000 Subject: [PATCH 33/38] save work --- docs/softmax_lowering.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md index e0c5a0e8..f462eb9b 100644 --- a/docs/softmax_lowering.md +++ b/docs/softmax_lowering.md @@ -3,7 +3,7 @@ ## Overview **Assumptions:** -Softmax dimension size is small (64 in this example). +Softmax dimension size is small (64 in this example). No tiling in reduction dim. The lowering process consists of seven stages: 1. **initial** - High-level tensor operations @@ -302,7 +302,7 @@ xegpu.store_nd %11, %12[%0, 0] ``` --- -## What is reused in between matmul and softmax? +## What is reused form the lighthouse project? The softmax implementation leverages several reusable lighthouse infrastructure components: @@ -326,7 +326,6 @@ The workload-specific code is minimal (payload generation + tiling/decomposition ### Payload Generation (`lighthouse.ingress.mlir_gen`) - **`get_mlir_elem_type`**: Type conversion utilities for constructing MLIR types -**Key Insight**: --- From c5e5d374abec1e989290a0b298369fbd35c7a540 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 9 Apr 2026 00:08:36 +0000 Subject: [PATCH 34/38] save work --- docs/softmax_lowering.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md index f462eb9b..e3f6518a 100644 --- a/docs/softmax_lowering.md +++ b/docs/softmax_lowering.md @@ -8,7 +8,7 @@ Softmax dimension size is small (64 in this example). No tiling in reduction dim The lowering process consists of seven stages: 1. **initial** - High-level tensor operations 2. 
From c5e5d374abec1e989290a0b298369fbd35c7a540 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 9 Apr 2026 00:08:36 +0000
Subject: [PATCH 34/38] save work

---
 docs/softmax_lowering.md | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index f462eb9b..e3f6518a 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -8,7 +8,7 @@ Softmax dimension size is small (64 in this example). No tiling in the reduction
 
 The lowering process consists of seven stages:
 1. **initial** - High-level tensor operations
 2. **tiled-softmax** - Tiled softmax operations
-3. **decomposed** - Decomposition into constituent operations
+3. **decomposed** - Decomposition into generic operations
 4. **vectorized** - Vector operations
 5. **bufferized** - Memory-based representation
 6. **xegpu-initial** - GPU kernel with XeGPU operations
@@ -65,7 +65,7 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
 ## Stage 3: Decomposed
 
 **Notes**
-- Softmax decomposed into 4 constituent `linalg.generic` ops : max, sub+exp, sum, divide
+- Softmax decomposed into 4 `linalg.generic` ops: max, sub+exp, sum, divide
 - Uses `structured.structured_decompose_interface` implemented by `linalg.softmax`
 
 **Code:**
@@ -331,9 +331,9 @@ The workload-specific code is minimal (payload generation + tiling/decomposition
 
 # Supporting larger Softmax dimension sizes
 
-Previsouly we tiled all ops in the parallel dimension only (i.e. non softmax dim). Handling a larger softmax dimension require tiling the softmax contituent ops in that dimension.
+Previously we tiled all ops in the parallel dimension only (i.e. the non-softmax dim). Handling a larger softmax reduction dimension requires tiling the softmax constituent ops in the reduction dimension.
 
-**Approach:** Tile reductions along dimension 1 (using appropriate step size) and fuse producers into consumers to enable streaming computation.
+**Approach:** Tile reductions along dimension 1 (using an appropriate step size) and fuse producers into consumers to avoid extra memory buffers.
 
 ---
@@ -357,8 +357,8 @@ scf.forall ... {
 
 // After: Division tiled into 64x16 chunks
 scf.forall ... {
+  // Tiled Max, Center+Exp, Sum ops ...
   %11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) {
-    // Max, Center+Exp, Sum ops ...
     %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) outs(%extracted_slice_5 : tensor<64x16xf32>) {
     ^bb0(%in: f32, %in_6: f32, %out: f32):
       %13 = arith.divf %in, %in_6 : f32
@@ -380,10 +380,11 @@ scf.forall ... {
 
 **Key Changes:**
 ```mlir
+// Original tiled div loop.
 %11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) {
   %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1]
 
-  // Fused: sub+exp computed per 16-element chunk
+  // Fused recomputation: sub+exp computed per 16-element chunk
   %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) outs(%extracted_slice_5 : tensor<64x16xf32>) {
   ^bb0(%in: f32, %in_8: f32, %out: f32):
@@ -443,10 +444,11 @@ scf.forall ... {
 
 **Key Changes:**
 ```mlir
+// Original tiled sum loop.
 %12 = scf.for %arg4 = %c0_3 to %c64 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) {
   %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1]
 
-  // Fused: sub+exp
+  // Fused recomputation: sub+exp
   %14 = linalg.generic {...} ins(%extracted_slice_7, %extracted_slice_8 : tensor<64x16xf32>, tensor<64xf32>) outs(%extracted_slice_9 : tensor<64x16xf32>) {
   ^bb0(%in: f32, %in_11: f32, %out: f32):
@@ -608,7 +610,7 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
 
 **Notes:**
 - Convert `memref.alloc()` to `memref.alloca()` for stack allocation
-- Reduces memory allocation overhead
+- **Why?** SLM can only be allocated inside the GPU kernel.
 
 **Code:**
 ```mlir
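The reduction-dimension tiling and producer fusion in the patch above amount to a three-loop streaming softmax: a max loop, a sum loop that recomputes sub+exp per chunk, and a normalization loop that recomputes it again. Here is a minimal NumPy sketch of that schedule, assuming a 16-wide reduction tile; the helper name and NumPy itself are illustrative only:

```python
import numpy as np

TILE = 16  # reduction-dim step size, matching the scf.for step above

def softmax_streamed(x: np.ndarray, tile: int = TILE) -> np.ndarray:
    m, n = x.shape
    out = np.empty_like(x)
    # Loop 1: streaming row-max over tile-wide chunks of the reduction dim.
    row_max = np.full(m, -np.inf, dtype=x.dtype)
    for j in range(0, n, tile):
        row_max = np.maximum(row_max, x[:, j:j + tile].max(axis=1))
    # Loop 2: streaming row-sum; sub+exp is recomputed per chunk instead of
    # materializing a full m x n intermediate (the fused producer).
    row_sum = np.zeros(m, dtype=x.dtype)
    for j in range(0, n, tile):
        row_sum += np.exp(x[:, j:j + tile] - row_max[:, None]).sum(axis=1)
    # Loop 3: normalization, again recomputing sub+exp chunk by chunk.
    for j in range(0, n, tile):
        out[:, j:j + tile] = np.exp(x[:, j:j + tile] - row_max[:, None]) / row_sum[:, None]
    return out
```

The trade-off is deliberate: each input chunk is re-read (and sub+exp recomputed) in the later loops, buying a working set of one tile per row instead of a full-width intermediate buffer.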
From c001e19e3a74846985295b1f6bea94c969986405 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 9 Apr 2026 00:15:15 +0000
Subject: [PATCH 35/38] save work

---
 docs/softmax_lowering.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index e3f6518a..37a5b659 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -512,15 +512,19 @@ scf.forall ... {
 - Convert tiled linalg operations to vector operations
 - `scf.for` loops remain but operate on vectors
 - Vector size: 64x16 for tiled operations
+- The same buffer is used for the max and sum streaming reductions.
 
 **Code:**
 ```mlir
 func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
-  // ...
+  // ...
+  %cst = arith.constant dense<0.000000e+00> : vector<64x16xf32> // 0.0
+  %cst_1 = arith.constant dense<0xFFC00000> : vector<64x16xf32> // NaN: identity for maxnumf
   %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) {
     // ...
 
     // Vectorized max reduction loop
+    %5 = tensor.empty() : tensor<64x16xf32>
     %6 = vector.transfer_write %cst_1, %5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32>
     %7 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %6) -> (tensor<64x16xf32>) {
       %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32>
       ...
     }
     %8 = vector.transfer_read %7[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32>
     %9 = vector.multi_reduction <maxnumf>, %8, %cst_2 [1] : vector<64x16xf32> to vector<64xf32>
-
+    %10 = vector.transfer_write %cst, %5[%c0, %c0] {in_bounds = [true, true]} : vector<64x16xf32>, tensor<64x16xf32>
     // Vectorized sum reduction loop with fused sub+exp
     %11 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %10) -> (tensor<64x16xf32>) {
       %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32>

From a60dbe940e7e1a39e4de0568db706afe1834ce89 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 9 Apr 2026 00:18:04 +0000
Subject: [PATCH 36/38] save work

---
 docs/softmax_lowering.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index 37a5b659..d769b08c 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -337,7 +337,7 @@ Previously we tiled all ops in the parallel dimension only (i.e. the non-softmax
 
 ---
 
-## Decomposed → Tiled: Stage A - Tile div op
+## Stage A - Tile div op
 
 **Notes:**
 - Tile the division operation with step size 16 along dimension 1
@@ -357,7 +357,7 @@ scf.forall ... {
 
 // After: Division tiled into 64x16 chunks
 scf.forall ... {
-  // Tiled Max, Center+Exp, Sum ops ...
+  // Max, Center+Exp, Sum ops ...
   %11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) {
     %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) outs(%extracted_slice_5 : tensor<64x16xf32>) {
     ^bb0(%in: f32, %in_6: f32, %out: f32):
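The buffer-reuse pattern in the vectorized stage above (initialize a 64x16 accumulator, fold chunks in element-wise, then collapse across the [1] axis with `vector.multi_reduction`) can be sketched in NumPy as follows. The `-inf` initializer stands in for the NaN constant `0xFFC00000`, which acts as the identity under a maxnum-style reduction; shapes and names mirror the document's 64x16 tiles but are otherwise illustrative assumptions:

```python
import numpy as np

M, N, TILE = 64, 64, 16  # workgroup tile shape and reduction step from the doc

def rowwise_max_via_accumulator(x: np.ndarray) -> np.ndarray:
    # The (M, TILE) accumulator plays the role of the reused 64x16 buffer:
    # each loop iteration folds one chunk in element-wise ...
    acc = np.full((M, TILE), -np.inf, dtype=x.dtype)
    for j in range(0, N, TILE):
        acc = np.maximum(acc, x[:, j:j + TILE])
    # ... and one final reduction over axis 1 (the multi_reduction step)
    # collapses the accumulator to one value per row.
    return acc.max(axis=1)
```

The sum loop then re-initializes the same buffer to zero and accumulates with addition instead of maximum.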
From 996a91168244217e00bd0ec6202a88343c6a673a Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 9 Apr 2026 00:22:15 +0000
Subject: [PATCH 37/38] save work

---
 docs/softmax_lowering.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index d769b08c..b56a80f6 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -676,3 +676,8 @@ gpu.module @payload_kernel {
 ```
 
 **Summary:** At this stage, the kernel processes 64x16 chunks in streaming fashion through three sequential loops, minimizing memory footprint.
+
+## XeGPU improvements needed for e2e support (WIP)
+
+* Change the `memref.alloca()` address space to SLM.
+* Support lowering SLM loads/stores to `xegpu.load/store_matrix` instead of `xegpu.load/store_nd`.

From 14176ac5acbd928bfd893244d4b0ef46125a8d7b Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 9 Apr 2026 14:48:46 +0000
Subject: [PATCH 38/38] save work

---
 docs/softmax_lowering.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index b56a80f6..d15d2e85 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -290,7 +290,7 @@ gpu.module @payload_kernel [#xevm.target] {
 
 **Notes**
 - Sets the layout for anchor xegpu ops. Each Wg consists of [8, 1] subgroups, each computing an 8x64 softmax slice.
-- Only sets the layotu for `store_nd`. Layout propagation does the rest.
+- Only sets the layout for `store_nd`. Layout propagation does the rest.
 
 **Code (differences from xegpu-initial):**
 ```mlir
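For context on the `[8, 1]` layout in the final patch, here is a small Python sketch of how a 64x64 workgroup tile splits so that each subgroup covers an 8x64 softmax slice; the row-major linearization of subgroup ids is an assumption made for illustration only:

```python
WG_TILE = (64, 64)   # rows x cols handled by one workgroup
SG_LAYOUT = (8, 1)   # subgroup grid, from the layout set on store_nd
SG_TILE = (WG_TILE[0] // SG_LAYOUT[0], WG_TILE[1] // SG_LAYOUT[1])  # (8, 64)

# Each subgroup owns a contiguous block of 8 full rows, so the row-wise
# softmax reductions never cross a subgroup boundary.
for sg_id in range(SG_LAYOUT[0] * SG_LAYOUT[1]):
    r = (sg_id // SG_LAYOUT[1]) * SG_TILE[0]
    c = (sg_id % SG_LAYOUT[1]) * SG_TILE[1]
    print(f"subgroup {sg_id}: rows {r}..{r + SG_TILE[0] - 1}, "
          f"cols {c}..{c + SG_TILE[1] - 1}")
```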