From eecee073a311b1ad4aad4bd24b97fdfc2caf3d03 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 13 Mar 2026 23:32:23 +0000 Subject: [PATCH 01/51] save work --- examples/xegpu/softmax.py | 336 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 336 insertions(+) create mode 100644 examples/xegpu/softmax.py diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py new file mode 100644 index 00000000..bb90f812 --- /dev/null +++ b/examples/xegpu/softmax.py @@ -0,0 +1,336 @@ +# RUN: %PYTHON %s --dump-kernel=xegpu-wg | FileCheck %s +# CHECK: module attributes {gpu.container_module} { + +""" +XeGPU softmax benchmark. +""" + +import argparse +import ctypes +from typing import Optional +from functools import cached_property + +import numpy as np +from mlir import ir +from mlir.execution_engine import ExecutionEngine +from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func + +from lighthouse.workload import benchmark +from lighthouse.utils.memref import to_ctype as memref_to_ctype +from lighthouse.utils.numpy import numpy_to_ctype +from lighthouse.utils.mlir import func_cif +from lighthouse.ingress.mlir_gen import get_mlir_elem_type +from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor + +from xegpu_workload import XeGPUWorkload + + +def softmax_complexity(M: int, N: int, nbytes: int): + """ + Complexity of softmax operation. + + For each row: + - O(N) to find max + - O(N) to compute exp(x - max) and sum + - O(N) to normalize + Total: 3*N operations per row, but with transcendental (exp) operations + """ + # Approximation: 5 FLOPs per element (max, sub, exp, sum, div) + # exp is expensive but we count it as ~1 FLOP for simplicity + flop_count = M * N * 5 + memory_reads = M * N * nbytes # read input + memory_writes = M * N * nbytes # write output + return flop_count, memory_reads, memory_writes + + +class XeGPUSoftmax(XeGPUWorkload): + """ + Softmax workload on XeGPU. 
+ + Computes softmax along the last dimension (rows): + output[i, j] = exp(input[i, j] - max_i) / sum_i(exp(input[i, j] - max_i)) + + where max_i and sum_i are computed over row i. + """ + + def __init__( + self, + M: int, + N: int, + dtype: str = "f32", + ): + super().__init__() + self.M = M + self.N = N + self.shape = (M, N) + assert dtype == "f32", "Only f32 type is supported for softmax" + self.dtype_str = dtype + type_str_to_numpy = { + "f16": np.float16, + "f32": np.float32, + } + self.dtype = type_str_to_numpy[dtype] + + @cached_property + def _initial_host_arrays(self) -> tuple[np.ndarray]: + """Generate initial values on host with numpy.""" + np.random.seed(42) + # Use values in range [-0.5, 0.5] to avoid numerical issues + input_arr = np.random.uniform(-0.5, 0.5, self.shape).astype(self.dtype) + return (input_arr,) + + @cached_property + def _reference_solution(self) -> np.ndarray: + """Compute reference solution on host with numpy.""" + (input_arr,) = self._initial_host_arrays + # Use float32 for computation + x = input_arr.astype(np.float32) + # Compute softmax along axis 1 (each row independently) + # Numerically stable version: subtract max before exp + max_vals = np.max(x, axis=1, keepdims=True) + exp_vals = np.exp(x - max_vals) + sum_vals = np.sum(exp_vals, axis=1, keepdims=True) + output = exp_vals / sum_vals + return output.astype(self.dtype) + + def _get_input_arrays( + self, execution_engine: ExecutionEngine + ) -> list[ctypes.Structure]: + # Allocate device memory for input and output + input_gpu = self._allocate_array( + "input", self.shape, self.dtype_str, execution_engine + ) + output_gpu = self._allocate_array( + "output", self.shape, self.dtype_str, execution_engine + ) + + # Copy input to device + (input_host,) = self._initial_host_arrays + copy_fn = f"gpu_copy_2d_{self.dtype_str}" + execution_engine.invoke( + copy_fn, numpy_to_ctype(input_host), memref_to_ctype(input_gpu) + ) + + # Return memrefs: [output, input] + return [output_gpu, 
input_gpu] + + def check_correctness( + self, execution_engine: ExecutionEngine, verbose: int = 0 + ) -> bool: + # Copy result from device to host + output_gpu = self.gpu_memrefs[("output", self.dtype_str)] + output_host = np.zeros(self.shape, dtype=self.dtype) + execution_engine.invoke( + f"gpu_copy_2d_{self.dtype_str}", + memref_to_ctype(output_gpu), + numpy_to_ctype(output_host), + ) + + output_ref = self._reference_solution + output_computed = output_host.astype(np.float32) + + if verbose > 1: + print("Reference solution (first 5 rows):") + print(output_ref[:5]) + print("Computed solution (first 5 rows):") + print(output_computed[:5]) + + # Check row sums are close to 1.0 + row_sums = np.sum(output_computed, axis=1) + sums_ok = np.allclose(row_sums, 1.0, rtol=1e-5, atol=1e-6) + + # Check values match reference + values_ok = np.allclose(output_computed, output_ref, rtol=1e-4, atol=1e-6) + + success = sums_ok and values_ok + + if verbose: + if success: + print("PASSED") + else: + print("FAILED!") + if not sums_ok: + print(f" Row sums check failed. Min: {row_sums.min():.6f}, Max: {row_sums.max():.6f}") + if not values_ok: + max_diff = np.abs(output_computed - output_ref).max() + print(f" Values mismatch. 
Max abs diff: {max_diff:.6e}") + return success + + def get_complexity(self) -> tuple[int, int, int]: + nbytes = np.dtype(self.dtype).itemsize + return softmax_complexity(self.M, self.N, nbytes) + + def payload_module(self) -> ir.Module: + """Generate MLIR module for softmax payload.""" + mod = ir.Module.create() + dtype = get_mlir_elem_type(self.dtype_str) + memref_t = ir.MemRefType.get(self.shape, dtype) + + with ir.InsertionPoint(mod.body): + # Function signature: payload(output, input) + @func_cif(memref_t, memref_t, name=self.payload_function_name) + def payload(output, input_arg): + # Convert memrefs to tensors + output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) + input_tensor = emit_buf_to_tensor(input_arg, restrict=True) + + # Create intermediate buffer for softmax (used internally by linalg.softmax) + # This stores the sum of exp values + M, N = self.shape + softmax_buf_type = ir.MemRefType.get((M,N), dtype) + softmax_buf = gpu.alloc(softmax_buf_type, None, [], [], []) + softmax_buf_tensor = emit_buf_to_tensor(softmax_buf, restrict=True, writable=True) + + # Compute softmax along dimension 1 (rows) + # linalg.softmax performs: exp(x - max(x)) / sum(exp(x - max(x))) + result = linalg.softmax( + (input_tensor.type,), input_tensor, softmax_buf_tensor, dimension=1 + ) + + # Materialize result back to output memref + bufferization.materialize_in_destination( + None, result, output, restrict=True, writable=True + ) + + # Cleanup + gpu.dealloc(None, [], softmax_buf) + + # Emit utility functions for GPU memory management + emit_gpu_util_funcs(dtype, rank=2) + + return mod + + def schedule_module( + self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None + ) -> ir.Module: + """ + Generate transform schedule for softmax. + + For now, returns an empty schedule. In the future, this would contain + tiling, vectorization, and XeGPU-specific lowering transformations. 
+ """ + # TODO: Implement proper transform schedule + # For now, create a minimal schedule that just applies bufferization + mod = ir.Module.create() + with ir.InsertionPoint(mod.body): + from mlir.dialects import transform + + # Create a simple transform sequence + @func_cif(name="__transform_main") + def transform_main(): + # Empty transform - just identity + # In a full implementation, this would tile, vectorize, + # and lower to XeGPU operations + pass + + return mod + + def shared_libs(self) -> list[str]: + return ["libmlir_levelzero_runtime.so"] + + +def parse_cli(): + parser = argparse.ArgumentParser( + description="Softmax using MLIR XeGPU", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--sizes", + type=int, + nargs=2, + default=[1024, 512], + help="M,N matrix sizes (MxN)", + ) + parser.add_argument( + "--wg-tile", + type=int, + nargs=2, + default=[64, 32], + help="Workgroup tile size M,N.", + ) + parser.add_argument( + "--nruns", + type=int, + default=1000, + help="Number of runs to average the execution time.", + ) + parser.add_argument( + "--nwarmup", + type=int, + default=20, + help="Number of warm-up iterations before benchmarking.", + ) + parser.add_argument( + "--check-result", + action="store_true", + help="Check the result of the softmax computation.", + ) + parser.add_argument( + "--dump-kernel", + type=str, + choices=[ + "initial", + "tiled", + "vectorized", + "bufferized", + "xegpu-initial", + "xegpu-wg", + "final", + ], + help="Dump kernel IR at different stages of lowering and exit without " + "executing the kernel.", + ) + parser.add_argument( + "--dump-schedule", + action="store_true", + help="Dump transform schedule.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = parse_cli() + + params = { + "wg_m": args.wg_tile[0], + "wg_n": args.wg_tile[1], + } + + M, N = args.sizes + dtype = "f32" + + with ir.Context(), ir.Location.unknown(): + wload = XeGPUSoftmax(M=M, 
N=N, dtype=dtype) + + if args.dump_kernel or args.dump_schedule: + wload.lower_payload( + dump_payload=args.dump_kernel, + dump_schedule=args.dump_schedule, + schedule_parameters=params, + ) + else: + times = benchmark( + wload, + nruns=args.nruns, + nwarmup=args.nwarmup, + schedule_parameters=params, + check_correctness=args.check_result, + verbose=1, + ) + times *= 1e6 # convert to microseconds + elapsed = np.mean(times) + flop_count = wload.get_complexity()[0] + gflops = flop_count / (elapsed * 1e-6) / 1e9 + + def list2str(a): + return ",".join(map(str, a)) + + parts = [ + f"sizes={list2str(args.sizes)}", + f"dt={dtype}", + f"wg-tile={list2str(args.wg_tile)}", + f"time(us): {elapsed:.2f}", + f"GFLOPS: {gflops:.2f}", + ] + print(" ".join(parts)) From f991027da04fd09f164f10b771ea470b9b577519 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 16 Mar 2026 22:46:40 +0000 Subject: [PATCH 02/51] save work --- examples/xegpu/softmax.py | 93 ++++++++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 17 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index bb90f812..ab61f1e5 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -13,9 +13,9 @@ import numpy as np from mlir import ir from mlir.execution_engine import ExecutionEngine -from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func +from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func, math -from lighthouse.workload import benchmark +from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype from lighthouse.utils.numpy import numpy_to_ctype from lighthouse.utils.mlir import func_cif @@ -174,35 +174,94 @@ def payload(output, input_arg): output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) input_tensor = emit_buf_to_tensor(input_arg, restrict=True) - # Create intermediate buffer for softmax (used internally by 
linalg.softmax) - # This stores the sum of exp values M, N = self.shape - softmax_buf_type = ir.MemRefType.get((M,N), dtype) - softmax_buf = gpu.alloc(softmax_buf_type, None, [], [], []) - softmax_buf_tensor = emit_buf_to_tensor(softmax_buf, restrict=True, writable=True) - # Compute softmax along dimension 1 (rows) - # linalg.softmax performs: exp(x - max(x)) / sum(exp(x - max(x))) - result = linalg.softmax( - (input_tensor.type,), input_tensor, softmax_buf_tensor, dimension=1 + # Define affine maps for indexing + # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) + # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) + d0 = ir.AffineDimExpr.get(0) + d1 = ir.AffineDimExpr.get(1) + map_2d = ir.AffineMap.get(2, 0, [d0, d1]) + map_1d = ir.AffineMap.get(2, 0, [d0]) + + # Step 1: Find max - linalg.generic reduction + neg_inf = arith.constant(dtype, float('-inf')) + max_init = tensor.empty((M,), dtype) + max_filled = linalg.fill(neg_inf, outs=[max_init]) + + @linalg.generic( + [input_tensor], # inputs + [max_filled], # outputs + [map_2d, map_1d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + ) + def row_max(in_val, acc): + return arith.maximumf(in_val, acc) + + # Step 2: Subtract max (broadcast) - linalg.generic elementwise + output_init = tensor.empty((M, N), dtype) + + @linalg.generic( + [input_tensor, row_max], # inputs + [output_init], # outputs + [map_2d, map_1d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def shifted(in_val, max_val, out): + return arith.subf(in_val, max_val) + + # Step 3: Compute exp - linalg.generic elementwise + @linalg.generic( + [shifted], # inputs + [output_init], # outputs + [map_2d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def exp_vals(in_val, out): + return math.exp(in_val) + + # Step 4: Sum exp values - linalg.generic reduction + 
# Create collapsed tensor for sum init + # sum_init_2d = tensor.empty((M, 1), dtype) + sum_init = tensor.empty((M,), dtype) + # sum_init = tensor.CollapseShapeOp(sum_init_2d, [[0, 1]]) + + + zero = arith.constant(dtype, 0.0) + sum_filled = linalg.fill(zero, outs=[sum_init]) + + @linalg.generic( + [exp_vals], # inputs + [sum_filled], # outputs + [map_2d, map_1d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types ) + def row_sum(in_val, acc): + return arith.addf(in_val, acc) + + # Step 5: Divide by sum (broadcast) - linalg.generic elementwise + @linalg.generic( + [exp_vals, row_sum], # inputs + [output_init], # outputs + [map_2d, map_1d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def result(exp_val, sum_val, out): + return arith.divf(exp_val, sum_val) # Materialize result back to output memref bufferization.materialize_in_destination( None, result, output, restrict=True, writable=True ) - - # Cleanup - gpu.dealloc(None, [], softmax_buf) # Emit utility functions for GPU memory management emit_gpu_util_funcs(dtype, rank=2) return mod - def schedule_module( + def schedule_modules( self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None - ) -> ir.Module: + ) -> list[ir.Module]: """ Generate transform schedule for softmax. 
@@ -223,7 +282,7 @@ def transform_main(): # and lower to XeGPU operations pass - return mod + return [get_bench_wrapper_schedule(self), mod] def shared_libs(self) -> list[str]: return ["libmlir_levelzero_runtime.so"] From 22415bb54f34727337b66392f4585f09110d2b6a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 16 Mar 2026 22:48:34 +0000 Subject: [PATCH 03/51] save work --- examples/xegpu/softmax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index ab61f1e5..dff8bd99 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -224,7 +224,7 @@ def exp_vals(in_val, out): # Create collapsed tensor for sum init # sum_init_2d = tensor.empty((M, 1), dtype) sum_init = tensor.empty((M,), dtype) - # sum_init = tensor.CollapseShapeOp(sum_init_2d, [[0, 1]]) + # tensor.CollapseShapeOp(sum_init, sum_init_2d, [[0, 1]]) zero = arith.constant(dtype, 0.0) From cb9ead174134465610ef797dbb33fbee0196e1cd Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 17 Mar 2026 22:05:47 +0000 Subject: [PATCH 04/51] save work --- examples/xegpu/softmax.py | 64 +++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index dff8bd99..97abd49a 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -269,20 +269,66 @@ def schedule_modules( tiling, vectorization, and XeGPU-specific lowering transformations. 
""" # TODO: Implement proper transform schedule - # For now, create a minimal schedule that just applies bufferization + # For now, create a minimal schedule that prints the last linalg operation mod = ir.Module.create() + mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() + with ir.InsertionPoint(mod.body): from mlir.dialects import transform + from mlir.dialects.transform import structured + + # Create a transform sequence with proper signature + named_sequence = transform.named_sequence( + "__transform_main", + [transform.AnyOpType.get()], # input: module + [], # no outputs + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}] + ) - # Create a simple transform sequence - @func_cif(name="__transform_main") - def transform_main(): - # Empty transform - just identity - # In a full implementation, this would tile, vectorize, - # and lower to XeGPU operations - pass + with ir.InsertionPoint(named_sequence.body): + # Get the input module (bodyTarget) + payload_mod = named_sequence.bodyTarget + + # Match all linalg.generic operations + # We have 5 generic ops in softmax: max, sub, exp, sum, div + generic_ops = structured.structured_match( + transform.AnyOpType.get(), + payload_mod, + ops=["linalg.generic"] + ) + + # Split the handle into individual operation handles + # For softmax, we have 5 operations + anytype = transform.AnyOpType.get() + split_ops = transform.split_handle( + (anytype, anytype, anytype, anytype, anytype), # 5 result types + generic_ops + ) + + # The last operation (index 4) is the division + last_op = split_ops[-1] + + # Print the last operation before tiling + # transform.print_(target=last_op, name="last_linalg_generic_before_tiling") + + # Tile the last operation using tile_using_forall + # Tile sizes: [64, 64] for the two parallel dimensions (M, N) + tiled_op, for_op = structured.structured_tile_using_forall( + anytype, anytype, + last_op, + num_threads=[], + tile_sizes=[], + static_tile_sizes=[64, 64], + ) + + # 
Print the tiled operation + # transform.print_(target=tiled_op, name="tiled_linalg_generic") + # transform.print_(target=for_op, name="forall_op") + + # Required: yield to end the transform sequence + transform.yield_() - return [get_bench_wrapper_schedule(self), mod] + return [mod] def shared_libs(self) -> list[str]: return ["libmlir_levelzero_runtime.so"] From ac39be33c3b90e6eb77c16237611fd33d4490695 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 18 Mar 2026 22:41:32 +0000 Subject: [PATCH 05/51] save work --- examples/xegpu/softmax.py | 86 +++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 8 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 97abd49a..6d6f87eb 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -14,6 +14,8 @@ from mlir import ir from mlir.execution_engine import ExecutionEngine from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func, math +from mlir.dialects import transform +from mlir.dialects.transform import structured, loop from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype @@ -21,9 +23,25 @@ from lighthouse.utils.mlir import func_cif from lighthouse.ingress.mlir_gen import get_mlir_elem_type from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor +from lighthouse.pipeline.helper import ( + apply_registered_pass, + canonicalize, + match, +) +from mlir.dialects.transform import bufferization as transform_bufferization +from mlir.dialects.bufferization import LayoutMapOption from xegpu_workload import XeGPUWorkload +def match_and_split(*args, nhandles=1, **kwargs): + """Helper function that splits matched handles.""" + matched = match(*args, **kwargs) + anytype = transform.AnyOpType.get() + matched_ops = transform.split_handle((anytype,) * nhandles, matched) + if nhandles == 1: + matched_ops = [matched_ops] + return 
matched_ops + def softmax_complexity(M: int, N: int, nbytes: int): """ @@ -274,8 +292,7 @@ def schedule_modules( mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() with ir.InsertionPoint(mod.body): - from mlir.dialects import transform - from mlir.dialects.transform import structured + # Create a transform sequence with proper signature named_sequence = transform.named_sequence( @@ -305,8 +322,11 @@ def schedule_modules( generic_ops ) - # The last operation (index 4) is the division - last_op = split_ops[-1] + # Reverse split_ops to have operations in reverse order + split_ops = list(reversed(split_ops)) + + # The first operation (after reversal) is the division - this is the consumer + last_op = split_ops[0] # Print the last operation before tiling # transform.print_(target=last_op, name="last_linalg_generic_before_tiling") @@ -318,12 +338,62 @@ def schedule_modules( last_op, num_threads=[], tile_sizes=[], - static_tile_sizes=[64, 64], + static_tile_sizes=(64,), ) - # Print the tiled operation - # transform.print_(target=tiled_op, name="tiled_linalg_generic") - # transform.print_(target=for_op, name="forall_op") + # Fuse the producer operations into the forall loop + # Iterate through remaining operations (already in reverse order) + current_forall = for_op + for producer_op in split_ops[1:]: + fused_op, current_forall = structured.structured_fuse_into_containing_op( + anytype, anytype, + producer_op, + current_forall + ) + + func = transform.get_parent_op( + anytype, + current_forall, + op_name="func.func", + deduplicate=True, + ) + transform.apply_cse(func) + canonicalize(func) + func = apply_registered_pass(func, "eliminate-empty-tensors") + func = structured.VectorizeChildrenAndApplyPatternsOp( + func, + fold_type_extensions_into_contract=True, + ).result + identity_layout = LayoutMapOption.IdentityLayoutMap + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + payload_mod = 
transform_bufferization.OneShotBufferizeOp( + payload_mod, + allow_return_allocs_from_loops=True, + bufferize_function_boundaries=True, + function_boundary_type_conversion=identity_layout, + ).result + payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") + transform.apply_cse(payload_mod) + canonicalize(payload_mod) + + # convert forall to parallel + wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) + + # convert to scf.parallel to gpu.launch + func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") + func = apply_registered_pass(func, "lower-affine") + transform.apply_cse(func) + canonicalize(func) + # Required: yield to end the transform sequence transform.yield_() From 51d494e23aa34e3166210e080aa23663e98924c2 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 20 Mar 2026 17:10:46 +0000 Subject: [PATCH 06/51] save work --- examples/xegpu/softmax.py | 67 +++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 6d6f87eb..be29cdd0 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -15,7 +15,7 @@ from mlir.execution_engine import ExecutionEngine from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func, math from mlir.dialects import transform -from mlir.dialects.transform import structured, loop +from mlir.dialects.transform import structured, loop, xegpu from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype @@ -338,7 +338,7 @@ def schedule_modules( last_op, num_threads=[], tile_sizes=[], - static_tile_sizes=(64,), + static_tile_sizes=(parameters["wg_rows"],), ) # Fuse the producer operations into 
the forall loop @@ -393,6 +393,39 @@ def schedule_modules( func = apply_registered_pass(func, "lower-affine") transform.apply_cse(func) canonicalize(func) + # set the number of threads for the gpu.launch operation + launch_op = match_and_split(func, ops={"gpu.launch"}) + num_threads = parameters["sg_rows"] * parameters["subgroup_size"] + xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) + + # outline gpu func + func = apply_registered_pass(func, "lower-affine") + canonicalize(func) + func = apply_registered_pass(func, "gpu-launch-sink-index-computations") + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") + # transform.PrintOp(target=payload_mod, name="before_gpu_outlining") + # transform.apply_cse(payload_mod) + + # set xevm target + # payload_mod = apply_registered_pass( + # payload_mod, + # "xevm-attach-target", + # options={"O": "3", "chip": "bmg"}, + # ) + + # # convert vector to xegpu + # gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) + # for gpu_mod in gpu_mod_ops: + # gpu_func = match(gpu_mod, ops={"gpu.func"}) + # gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + # transform.apply_cse(gpu_func) + # Required: yield to end the transform sequence @@ -413,15 +446,26 @@ def parse_cli(): "--sizes", type=int, nargs=2, - default=[1024, 512], + default=[1024, 64], help="M,N matrix sizes (MxN)", ) parser.add_argument( - "--wg-tile", + "--wg-rows", type=int, - nargs=2, - default=[64, 32], - help="Workgroup tile size M,N.", + default=64, + help="Number of rows per workgroup.", + ) + parser.add_argument( + "--sg-rows", + type=int, + default=8, + help="Number of rows per subgroup.", + ) + parser.add_argument( + "--subgroup-size", + type=int, + default=16, + help="Subgroup size.", ) parser.add_argument( "--nruns", @@ -468,8 +512,9 @@ def parse_cli(): args = parse_cli() params = 
{ - "wg_m": args.wg_tile[0], - "wg_n": args.wg_tile[1], + "wg_rows": args.wg_rows, + "sg_rows": args.sg_rows, + "subgroup_size": args.subgroup_size, } M, N = args.sizes @@ -504,7 +549,9 @@ def list2str(a): parts = [ f"sizes={list2str(args.sizes)}", f"dt={dtype}", - f"wg-tile={list2str(args.wg_tile)}", + f"wg-rows={args.wg_rows}", + f"sg-rows={args.sg_rows}", + f"subgroup-size={args.subgroup_size}", f"time(us): {elapsed:.2f}", f"GFLOPS: {gflops:.2f}", ] From 7ac8852412f06100c2cf188c37c2f18254e38c83 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 20 Mar 2026 21:37:04 +0000 Subject: [PATCH 07/51] save work --- examples/xegpu/softmax.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index be29cdd0..56b43c21 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -408,23 +408,34 @@ def schedule_modules( op_name="builtin.module", deduplicate=True, ) - payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") + # payload = match(payload_mod, ops={"func.func"}) # transform.PrintOp(target=payload_mod, name="before_gpu_outlining") - # transform.apply_cse(payload_mod) + payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") + transform.apply_cse(payload_mod) # set xevm target - # payload_mod = apply_registered_pass( - # payload_mod, - # "xevm-attach-target", - # options={"O": "3", "chip": "bmg"}, - # ) - - # # convert vector to xegpu - # gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) + payload_mod = apply_registered_pass( + payload_mod, + "xevm-attach-target", + options={"O": "3", "chip": "bmg"}, + ) + + # convert vector to xegpu + gpu_mod = match_and_split(payload_mod, ops={"gpu.module"}) # for gpu_mod in gpu_mod_ops: - # gpu_func = match(gpu_mod, ops={"gpu.func"}) - # gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - # transform.apply_cse(gpu_func) + gpu_func = 
match(gpu_mod[0], ops={"gpu.func"}) + gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + + # Set layout attributes for xegpu.store_nd operations + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + # for store_op in store_ops: + xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) + + payload_mod = apply_registered_pass( + payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + From d65bf9fe47d2c95c5fdc986177e604ec4c0b6f82 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 20 Mar 2026 22:58:52 +0000 Subject: [PATCH 08/51] save work --- examples/xegpu/softmax.py | 88 ++++++++++++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 20 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 56b43c21..2debc8c8 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -311,14 +311,14 @@ def schedule_modules( generic_ops = structured.structured_match( transform.AnyOpType.get(), payload_mod, - ops=["linalg.generic"] + ops=["linalg.generic", "linalg.fill"] ) # Split the handle into individual operation handles # For softmax, we have 5 operations anytype = transform.AnyOpType.get() split_ops = transform.split_handle( - (anytype, anytype, anytype, anytype, anytype), # 5 result types + (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types generic_ops ) @@ -350,20 +350,23 @@ def schedule_modules( producer_op, current_forall ) - - func = transform.get_parent_op( - anytype, - current_forall, - op_name="func.func", - deduplicate=True, - ) - transform.apply_cse(func) - canonicalize(func) - func = apply_registered_pass(func, "eliminate-empty-tensors") + transform.annotate(current_forall, "gpu_loop") + + transform.apply_cse(payload_mod) + # canonicalize(payload_mod) + + # Vectorize and bufferize sequence + func = match(payload_mod, ops={"func.func"}) func = 
structured.VectorizeChildrenAndApplyPatternsOp( func, fold_type_extensions_into_contract=True, ).result + loops = match_and_split(payload_mod, ops={"scf.forall"}) + loop.loop_hoist_loop_invariant_subsets(loops[0]) + transform.apply_cse(payload_mod) + canonicalize(payload_mod) + # transform.PrintOp(target=payload_mod, name="vectorize") + identity_layout = LayoutMapOption.IdentityLayoutMap payload_mod = transform.get_parent_op( anytype, @@ -377,28 +380,73 @@ def schedule_modules( bufferize_function_boundaries=True, function_boundary_type_conversion=identity_layout, ).result + # payload_mod = transform_bufferization.OneShotBufferizeOp( + # payload_mod, + # allow_return_allocs_from_loops=False, + # bufferize_function_boundaries=True, + # function_boundary_type_conversion=identity_layout, + # ).result payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") + payload_mod = apply_registered_pass(payload_mod, "drop-equivalent-buffer-results") + payload_mod = apply_registered_pass( + payload_mod, + "buffer-results-to-out-params", + options={ + "add-result-attr": "true", + "hoist-dynamic-allocs": "true", + "hoist-static-allocs": "true", + "modify-public-functions": "true" + } + ) transform.apply_cse(payload_mod) canonicalize(payload_mod) + # # # transform.PrintOp(target=payload_mod, name="bufferize") + + # # func = match(payload_mod, ops={"func.func"}) + gpu_loop = match(payload_mod, op_attrs={"gpu_loop": ir.UnitAttr.get()}) + # # gpu_loop = transform.split_handle(anytype, gpu_loop) + gpu_loop = loop.loop_forall_to_parallel([anytype], gpu_loop) + + # # func = apply_registered_pass(payload_mod, "eliminate-empty-tensors") + # # func = structured.VectorizeChildrenAndApplyPatternsOp( + # # func, + # # fold_type_extensions_into_contract=True, + # # ).result + # # identity_layout = LayoutMapOption.IdentityLayoutMap + payload_mod = transform.get_parent_op( + anytype, + gpu_loop, + op_name="func.func", + deduplicate=True, + ) + # payload_mod = 
transform_bufferization.OneShotBufferizeOp( + # payload_mod, + # allow_return_allocs_from_loops=True, + # bufferize_function_boundaries=True, + # function_boundary_type_conversion=identity_layout, + # ).result + # payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") + # transform.apply_cse(payload_mod) + # canonicalize(payload_mod) - # convert forall to parallel - wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) - for wg_loop in wg_loops: - wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - func = transform.get_parent_op(anytype, wg_loop) + # # convert forall to parallel + # wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) + # for wg_loop in wg_loops: + # wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + # func = transform.get_parent_op(anytype, wg_loop) # convert to scf.parallel to gpu.launch - func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(payload_mod, "gpu-map-parallel-loops") func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") func = apply_registered_pass(func, "lower-affine") transform.apply_cse(func) canonicalize(func) - # set the number of threads for the gpu.launch operation + # # set the number of threads for the gpu.launch operation launch_op = match_and_split(func, ops={"gpu.launch"}) num_threads = parameters["sg_rows"] * parameters["subgroup_size"] xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - # outline gpu func + # # outline gpu func func = apply_registered_pass(func, "lower-affine") canonicalize(func) func = apply_registered_pass(func, "gpu-launch-sink-index-computations") From 0bf3eb32a7fa9fbd66b0a7a238fb302607c2896a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 24 Mar 2026 20:32:41 +0000 Subject: [PATCH 09/51] save working version --- examples/xegpu/softmax.py | 138 ++++++++++++-------------------------- 1 file changed, 42 insertions(+), 96 deletions(-) diff --git 
a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 2debc8c8..e95286b5 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -17,6 +17,7 @@ from mlir.dialects import transform from mlir.dialects.transform import structured, loop, xegpu +from lighthouse import dialects as lh_dialects from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype from lighthouse.utils.numpy import numpy_to_ctype @@ -54,7 +55,6 @@ def softmax_complexity(M: int, N: int, nbytes: int): Total: 3*N operations per row, but with transcendental (exp) operations """ # Approximation: 5 FLOPs per element (max, sub, exp, sum, div) - # exp is expensive but we count it as ~1 FLOP for simplicity flop_count = M * N * 5 memory_reads = M * N * nbytes # read input memory_writes = M * N * nbytes # write output @@ -303,8 +303,15 @@ def schedule_modules( ) with ir.InsertionPoint(named_sequence.body): - # Get the input module (bodyTarget) - payload_mod = named_sequence.bodyTarget + # match the payload module + anytype = transform.AnyOpType.get() + func = match(named_sequence.bodyTarget, ops={"func.func"}) + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) # Match all linalg.generic operations # We have 5 generic ops in softmax: max, sub, exp, sum, div @@ -315,7 +322,7 @@ def schedule_modules( ) # Split the handle into individual operation handles - # For softmax, we have 5 operations + # For softmax, we have 7 operations anytype = transform.AnyOpType.get() split_ops = transform.split_handle( (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types @@ -350,117 +357,59 @@ def schedule_modules( producer_op, current_forall ) - transform.annotate(current_forall, "gpu_loop") - - transform.apply_cse(payload_mod) - # canonicalize(payload_mod) + + func = transform.get_parent_op( + anytype, + current_forall, + op_name="func.func", + 
deduplicate=True, + ) + transform.apply_cse(func) + canonicalize(func) - # Vectorize and bufferize sequence - func = match(payload_mod, ops={"func.func"}) func = structured.VectorizeChildrenAndApplyPatternsOp( func, fold_type_extensions_into_contract=True, ).result - loops = match_and_split(payload_mod, ops={"scf.forall"}) - loop.loop_hoist_loop_invariant_subsets(loops[0]) - transform.apply_cse(payload_mod) - canonicalize(payload_mod) - # transform.PrintOp(target=payload_mod, name="vectorize") - + transform.apply_cse(func) + canonicalize(func) + payload_mod = apply_registered_pass(payload_mod, "eliminate-empty-tensors") identity_layout = LayoutMapOption.IdentityLayoutMap - payload_mod = transform.get_parent_op( - anytype, - func, - op_name="builtin.module", - deduplicate=True, - ) payload_mod = transform_bufferization.OneShotBufferizeOp( payload_mod, allow_return_allocs_from_loops=True, bufferize_function_boundaries=True, function_boundary_type_conversion=identity_layout, ).result - # payload_mod = transform_bufferization.OneShotBufferizeOp( - # payload_mod, - # allow_return_allocs_from_loops=False, - # bufferize_function_boundaries=True, - # function_boundary_type_conversion=identity_layout, - # ).result + # fold memref.subviews into vector.transfer_read/write ops payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") - payload_mod = apply_registered_pass(payload_mod, "drop-equivalent-buffer-results") - payload_mod = apply_registered_pass( - payload_mod, - "buffer-results-to-out-params", - options={ - "add-result-attr": "true", - "hoist-dynamic-allocs": "true", - "hoist-static-allocs": "true", - "modify-public-functions": "true" - } - ) transform.apply_cse(payload_mod) canonicalize(payload_mod) - # # # transform.PrintOp(target=payload_mod, name="bufferize") - - # # func = match(payload_mod, ops={"func.func"}) - gpu_loop = match(payload_mod, op_attrs={"gpu_loop": ir.UnitAttr.get()}) - # # gpu_loop = transform.split_handle(anytype, gpu_loop) - 
gpu_loop = loop.loop_forall_to_parallel([anytype], gpu_loop) - # # func = apply_registered_pass(payload_mod, "eliminate-empty-tensors") - # # func = structured.VectorizeChildrenAndApplyPatternsOp( - # # func, - # # fold_type_extensions_into_contract=True, - # # ).result - # # identity_layout = LayoutMapOption.IdentityLayoutMap - payload_mod = transform.get_parent_op( - anytype, - gpu_loop, - op_name="func.func", - deduplicate=True, - ) - # payload_mod = transform_bufferization.OneShotBufferizeOp( - # payload_mod, - # allow_return_allocs_from_loops=True, - # bufferize_function_boundaries=True, - # function_boundary_type_conversion=identity_layout, - # ).result - # payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") - # transform.apply_cse(payload_mod) - # canonicalize(payload_mod) - - # # convert forall to parallel - # wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) - # for wg_loop in wg_loops: - # wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - # func = transform.get_parent_op(anytype, wg_loop) - + # convert forall to parallel + wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) # convert to scf.parallel to gpu.launch - func = apply_registered_pass(payload_mod, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "gpu-map-parallel-loops") func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") func = apply_registered_pass(func, "lower-affine") transform.apply_cse(func) canonicalize(func) - # # set the number of threads for the gpu.launch operation + + # set the number of threads for the gpu.launch operation launch_op = match_and_split(func, ops={"gpu.launch"}) num_threads = parameters["sg_rows"] * parameters["subgroup_size"] xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - # # outline gpu func + # outline gpu func func = 
apply_registered_pass(func, "lower-affine") canonicalize(func) func = apply_registered_pass(func, "gpu-launch-sink-index-computations") - payload_mod = transform.get_parent_op( - anytype, - func, - op_name="builtin.module", - deduplicate=True, - ) - # payload = match(payload_mod, ops={"func.func"}) - # transform.PrintOp(target=payload_mod, name="before_gpu_outlining") payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") transform.apply_cse(payload_mod) - + # set xevm target payload_mod = apply_registered_pass( payload_mod, @@ -469,12 +418,12 @@ def schedule_modules( ) # convert vector to xegpu - gpu_mod = match_and_split(payload_mod, ops={"gpu.module"}) - # for gpu_mod in gpu_mod_ops: - gpu_func = match(gpu_mod[0], ops={"gpu.func"}) - gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - transform.apply_cse(gpu_func) - + gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) + for gpu_mod in gpu_mod_ops: + gpu_func = match(gpu_mod, ops={"gpu.func"}) + gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + # Set layout attributes for xegpu.store_nd operations store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) # for store_op in store_ops: @@ -483,14 +432,10 @@ def schedule_modules( payload_mod = apply_registered_pass( payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} ) - - - - # Required: yield to end the transform sequence transform.yield_() - return [mod] + return [get_bench_wrapper_schedule(self), mod] def shared_libs(self) -> list[str]: return ["libmlir_levelzero_runtime.so"] @@ -580,6 +525,7 @@ def parse_cli(): dtype = "f32" with ir.Context(), ir.Location.unknown(): + lh_dialects.register_and_load() wload = XeGPUSoftmax(M=M, N=N, dtype=dtype) if args.dump_kernel or args.dump_schedule: From fabd656c1267437174b7c7b9ed472c0b76556ee9 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 24 Mar 2026 21:48:44 +0000 
Subject: [PATCH 10/51] save working version --- examples/xegpu/softmax.py | 288 +----------------- .../ingress/mlir_gen/gpu_softmax_payload.py | 121 ++++++++ lighthouse/schedule/xegpu/softmax_schedule.py | 195 ++++++++++++ 3 files changed, 332 insertions(+), 272 deletions(-) create mode 100644 lighthouse/ingress/mlir_gen/gpu_softmax_payload.py create mode 100644 lighthouse/schedule/xegpu/softmax_schedule.py diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index e95286b5..474eb1a3 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -13,36 +13,17 @@ import numpy as np from mlir import ir from mlir.execution_engine import ExecutionEngine -from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func, math -from mlir.dialects import transform -from mlir.dialects.transform import structured, loop, xegpu from lighthouse import dialects as lh_dialects from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype from lighthouse.utils.numpy import numpy_to_ctype -from lighthouse.utils.mlir import func_cif from lighthouse.ingress.mlir_gen import get_mlir_elem_type -from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor -from lighthouse.pipeline.helper import ( - apply_registered_pass, - canonicalize, - match, -) -from mlir.dialects.transform import bufferization as transform_bufferization -from mlir.dialects.bufferization import LayoutMapOption +from lighthouse.ingress.mlir_gen.gpu_softmax_payload import generate_gpu_softmax_payload +from lighthouse.schedule.xegpu.softmax_schedule import get_softmax_schedule_module from xegpu_workload import XeGPUWorkload -def match_and_split(*args, nhandles=1, **kwargs): - """Helper function that splits matched handles.""" - matched = match(*args, **kwargs) - anytype = transform.AnyOpType.get() - matched_ops = transform.split_handle((anytype,) * nhandles, matched) - if nhandles == 1: 
- matched_ops = [matched_ops] - return matched_ops - def softmax_complexity(M: int, N: int, nbytes: int): """ @@ -180,262 +161,25 @@ def get_complexity(self) -> tuple[int, int, int]: def payload_module(self) -> ir.Module: """Generate MLIR module for softmax payload.""" - mod = ir.Module.create() dtype = get_mlir_elem_type(self.dtype_str) - memref_t = ir.MemRefType.get(self.shape, dtype) - - with ir.InsertionPoint(mod.body): - # Function signature: payload(output, input) - @func_cif(memref_t, memref_t, name=self.payload_function_name) - def payload(output, input_arg): - # Convert memrefs to tensors - output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) - input_tensor = emit_buf_to_tensor(input_arg, restrict=True) - - M, N = self.shape - - # Define affine maps for indexing - # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) - # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) - d0 = ir.AffineDimExpr.get(0) - d1 = ir.AffineDimExpr.get(1) - map_2d = ir.AffineMap.get(2, 0, [d0, d1]) - map_1d = ir.AffineMap.get(2, 0, [d0]) - - # Step 1: Find max - linalg.generic reduction - neg_inf = arith.constant(dtype, float('-inf')) - max_init = tensor.empty((M,), dtype) - max_filled = linalg.fill(neg_inf, outs=[max_init]) - - @linalg.generic( - [input_tensor], # inputs - [max_filled], # outputs - [map_2d, map_1d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types - ) - def row_max(in_val, acc): - return arith.maximumf(in_val, acc) - - # Step 2: Subtract max (broadcast) - linalg.generic elementwise - output_init = tensor.empty((M, N), dtype) - - @linalg.generic( - [input_tensor, row_max], # inputs - [output_init], # outputs - [map_2d, map_1d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types - ) - def shifted(in_val, max_val, out): - return arith.subf(in_val, max_val) - - # Step 3: Compute exp - linalg.generic elementwise - @linalg.generic( 
- [shifted], # inputs - [output_init], # outputs - [map_2d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types - ) - def exp_vals(in_val, out): - return math.exp(in_val) - - # Step 4: Sum exp values - linalg.generic reduction - # Create collapsed tensor for sum init - # sum_init_2d = tensor.empty((M, 1), dtype) - sum_init = tensor.empty((M,), dtype) - # tensor.CollapseShapeOp(sum_init, sum_init_2d, [[0, 1]]) - - - zero = arith.constant(dtype, 0.0) - sum_filled = linalg.fill(zero, outs=[sum_init]) - - @linalg.generic( - [exp_vals], # inputs - [sum_filled], # outputs - [map_2d, map_1d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types - ) - def row_sum(in_val, acc): - return arith.addf(in_val, acc) - - # Step 5: Divide by sum (broadcast) - linalg.generic elementwise - @linalg.generic( - [exp_vals, row_sum], # inputs - [output_init], # outputs - [map_2d, map_1d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types - ) - def result(exp_val, sum_val, out): - return arith.divf(exp_val, sum_val) - - # Materialize result back to output memref - bufferization.materialize_in_destination( - None, result, output, restrict=True, writable=True - ) - - # Emit utility functions for GPU memory management - emit_gpu_util_funcs(dtype, rank=2) - - return mod + return generate_gpu_softmax_payload( + func_name=self.payload_function_name, + M=self.M, + N=self.N, + dtype=dtype, + ) def schedule_modules( self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None ) -> list[ir.Module]: - """ - Generate transform schedule for softmax. - - For now, returns an empty schedule. In the future, this would contain - tiling, vectorization, and XeGPU-specific lowering transformations. 
- """ - # TODO: Implement proper transform schedule - # For now, create a minimal schedule that prints the last linalg operation - mod = ir.Module.create() - mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() - - with ir.InsertionPoint(mod.body): - - - # Create a transform sequence with proper signature - named_sequence = transform.named_sequence( - "__transform_main", - [transform.AnyOpType.get()], # input: module - [], # no outputs - arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}] - ) - - with ir.InsertionPoint(named_sequence.body): - # match the payload module - anytype = transform.AnyOpType.get() - func = match(named_sequence.bodyTarget, ops={"func.func"}) - payload_mod = transform.get_parent_op( - anytype, - func, - op_name="builtin.module", - deduplicate=True, - ) - - # Match all linalg.generic operations - # We have 5 generic ops in softmax: max, sub, exp, sum, div - generic_ops = structured.structured_match( - transform.AnyOpType.get(), - payload_mod, - ops=["linalg.generic", "linalg.fill"] - ) - - # Split the handle into individual operation handles - # For softmax, we have 7 operations - anytype = transform.AnyOpType.get() - split_ops = transform.split_handle( - (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types - generic_ops - ) - - # Reverse split_ops to have operations in reverse order - split_ops = list(reversed(split_ops)) - - # The first operation (after reversal) is the division - this is the consumer - last_op = split_ops[0] - - # Print the last operation before tiling - # transform.print_(target=last_op, name="last_linalg_generic_before_tiling") - - # Tile the last operation using tile_using_forall - # Tile sizes: [64, 64] for the two parallel dimensions (M, N) - tiled_op, for_op = structured.structured_tile_using_forall( - anytype, anytype, - last_op, - num_threads=[], - tile_sizes=[], - static_tile_sizes=(parameters["wg_rows"],), - ) - - # Fuse the producer operations into the 
forall loop - # Iterate through remaining operations (already in reverse order) - current_forall = for_op - for producer_op in split_ops[1:]: - fused_op, current_forall = structured.structured_fuse_into_containing_op( - anytype, anytype, - producer_op, - current_forall - ) - - func = transform.get_parent_op( - anytype, - current_forall, - op_name="func.func", - deduplicate=True, - ) - transform.apply_cse(func) - canonicalize(func) - - func = structured.VectorizeChildrenAndApplyPatternsOp( - func, - fold_type_extensions_into_contract=True, - ).result - transform.apply_cse(func) - canonicalize(func) - payload_mod = apply_registered_pass(payload_mod, "eliminate-empty-tensors") - identity_layout = LayoutMapOption.IdentityLayoutMap - payload_mod = transform_bufferization.OneShotBufferizeOp( - payload_mod, - allow_return_allocs_from_loops=True, - bufferize_function_boundaries=True, - function_boundary_type_conversion=identity_layout, - ).result - # fold memref.subviews into vector.transfer_read/write ops - payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") - transform.apply_cse(payload_mod) - canonicalize(payload_mod) - - # convert forall to parallel - wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) - for wg_loop in wg_loops: - wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - func = transform.get_parent_op(anytype, wg_loop) - # convert to scf.parallel to gpu.launch - func = apply_registered_pass(func, "gpu-map-parallel-loops") - func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") - func = apply_registered_pass(func, "lower-affine") - transform.apply_cse(func) - canonicalize(func) - - # set the number of threads for the gpu.launch operation - launch_op = match_and_split(func, ops={"gpu.launch"}) - num_threads = parameters["sg_rows"] * parameters["subgroup_size"] - xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - - # outline gpu func - func = apply_registered_pass(func, "lower-affine") 
- canonicalize(func) - func = apply_registered_pass(func, "gpu-launch-sink-index-computations") - payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") - transform.apply_cse(payload_mod) - - # set xevm target - payload_mod = apply_registered_pass( - payload_mod, - "xevm-attach-target", - options={"O": "3", "chip": "bmg"}, - ) - - # convert vector to xegpu - gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) - for gpu_mod in gpu_mod_ops: - gpu_func = match(gpu_mod, ops={"gpu.func"}) - gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - transform.apply_cse(gpu_func) - - # Set layout attributes for xegpu.store_nd operations - store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) - # for store_op in store_ops: - xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) - - payload_mod = apply_registered_pass( - payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - # Required: yield to end the transform sequence - transform.yield_() - - return [get_bench_wrapper_schedule(self), mod] + """Generate transform schedule for softmax.""" + return [ + get_bench_wrapper_schedule(self), + get_softmax_schedule_module( + stop_at_stage=stop_at_stage, + parameters=parameters, + ), + ] def shared_libs(self) -> list[str]: return ["libmlir_levelzero_runtime.so"] diff --git a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py new file mode 100644 index 00000000..26fd9042 --- /dev/null +++ b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py @@ -0,0 +1,121 @@ +"""Generate MLIR payload for GPU softmax operation.""" + +from mlir import ir +from mlir.dialects import linalg, bufferization, arith, tensor, func, math + +from lighthouse.utils.mlir import func_cif +from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor + + +def generate_gpu_softmax_payload( + func_name: str, + M: int, + N: 
int, + dtype: ir.Type, +) -> ir.Module: + """ + Generate MLIR module for softmax payload. + + Computes softmax along the last dimension (rows): + output[i, j] = exp(input[i, j] - max_i) / sum_i(exp(input[i, j] - max_i)) + + where max_i and sum_i are computed over row i. + + Args: + func_name: Name of the payload function + M: Number of rows + N: Number of columns + dtype: MLIR element type (e.g., F32Type) + + Returns: + MLIR module containing the softmax payload function + """ + mod = ir.Module.create() + shape = (M, N) + memref_t = ir.MemRefType.get(shape, dtype) + + with ir.InsertionPoint(mod.body): + # Function signature: payload(output, input) + @func_cif(memref_t, memref_t, name=func_name) + def payload(output, input_arg): + # Convert memrefs to tensors + output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) + input_tensor = emit_buf_to_tensor(input_arg, restrict=True) + + # Define affine maps for indexing + # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) + # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) + d0 = ir.AffineDimExpr.get(0) + d1 = ir.AffineDimExpr.get(1) + map_2d = ir.AffineMap.get(2, 0, [d0, d1]) + map_1d = ir.AffineMap.get(2, 0, [d0]) + + # Step 1: Find max - linalg.generic reduction + neg_inf = arith.constant(dtype, float('-inf')) + max_init = tensor.empty((M,), dtype) + max_filled = linalg.fill(neg_inf, outs=[max_init]) + + @linalg.generic( + [input_tensor], # inputs + [max_filled], # outputs + [map_2d, map_1d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + ) + def row_max(in_val, acc): + return arith.maximumf(in_val, acc) + + # Step 2: Subtract max (broadcast) - linalg.generic elementwise + output_init = tensor.empty((M, N), dtype) + + @linalg.generic( + [input_tensor, row_max], # inputs + [output_init], # outputs + [map_2d, map_1d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def 
shifted(in_val, max_val, out): + return arith.subf(in_val, max_val) + + # Step 3: Compute exp - linalg.generic elementwise + @linalg.generic( + [shifted], # inputs + [output_init], # outputs + [map_2d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def exp_vals(in_val, out): + return math.exp(in_val) + + # Step 4: Sum exp values - linalg.generic reduction + sum_init = tensor.empty((M,), dtype) + zero = arith.constant(dtype, 0.0) + sum_filled = linalg.fill(zero, outs=[sum_init]) + + @linalg.generic( + [exp_vals], # inputs + [sum_filled], # outputs + [map_2d, map_1d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + ) + def row_sum(in_val, acc): + return arith.addf(in_val, acc) + + # Step 5: Divide by sum (broadcast) - linalg.generic elementwise + @linalg.generic( + [exp_vals, row_sum], # inputs + [output_init], # outputs + [map_2d, map_1d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def result(exp_val, sum_val, out): + return arith.divf(exp_val, sum_val) + + # Materialize result back to output memref + bufferization.materialize_in_destination( + None, result, output, restrict=True, writable=True + ) + + # Emit utility functions for GPU memory management + emit_gpu_util_funcs(dtype, rank=2) + + return mod diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py new file mode 100644 index 00000000..e01f9af6 --- /dev/null +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -0,0 +1,195 @@ +"""Generate MLIR transform schedule for XeGPU softmax operation.""" + +from typing import Optional + +from mlir import ir +from mlir.dialects import transform +from mlir.dialects.transform import structured, loop, xegpu +from mlir.dialects.transform import bufferization as transform_bufferization +from mlir.dialects.bufferization import LayoutMapOption 
+ +from lighthouse.pipeline.helper import ( + apply_registered_pass, + canonicalize, + match, +) + + +def match_and_split(*args, nhandles=1, **kwargs): + """Helper function that splits matched handles.""" + matched = match(*args, **kwargs) + anytype = transform.AnyOpType.get() + matched_ops = transform.split_handle((anytype,) * nhandles, matched) + if nhandles == 1: + matched_ops = [matched_ops] + return matched_ops + + +def get_softmax_schedule_module( + stop_at_stage: Optional[str] = None, + parameters: Optional[dict] = None, +) -> ir.Module: + """ + Generate transform schedule for softmax operation. + + The schedule performs the following transformations: + 1. Tile the consumer operation (division) using forall + 2. Fuse producer operations into the forall loop + 3. Vectorize operations + 4. Bufferize tensors + 5. Convert to GPU dialect + 6. Lower to XeGPU operations + + Args: + stop_at_stage: Optional stage name to stop early (for debugging) + parameters: Dictionary with scheduling parameters: + - wg_rows: Number of rows per workgroup + - sg_rows: Number of rows per subgroup + - subgroup_size: Size of subgroup + + Returns: + MLIR module containing the transform schedule + """ + assert parameters is not None, "Schedule parameters must be provided" + + mod = ir.Module.create() + mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() + + with ir.InsertionPoint(mod.body): + # Create a transform sequence with proper signature + named_sequence = transform.named_sequence( + "__transform_main", + [transform.AnyOpType.get()], # input: module + [], # no outputs + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}] + ) + + with ir.InsertionPoint(named_sequence.body): + # match the payload module + anytype = transform.AnyOpType.get() + func = match(named_sequence.bodyTarget, ops={"func.func"}) + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + + # Match all linalg.generic and 
linalg.fill operations + # We have 7 operations in softmax: + # fill(max_init), max, sub, exp, fill(sum_init), sum, div + generic_ops = structured.structured_match( + transform.AnyOpType.get(), + payload_mod, + ops=["linalg.generic", "linalg.fill"] + ) + + # Split the handle into individual operation handles + anytype = transform.AnyOpType.get() + split_ops = transform.split_handle( + (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types + generic_ops + ) + + # Reverse split_ops to have operations in reverse order + split_ops = list(reversed(split_ops)) + + # The first operation (after reversal) is the division - this is the consumer + last_op = split_ops[0] + + # Tile the last operation using tile_using_forall + tiled_op, for_op = structured.structured_tile_using_forall( + anytype, anytype, + last_op, + num_threads=[], + tile_sizes=[], + static_tile_sizes=(parameters["wg_rows"],), + ) + + # Fuse the producer operations into the forall loop + # Iterate through remaining operations (already in reverse order) + current_forall = for_op + for producer_op in split_ops[1:]: + fused_op, current_forall = structured.structured_fuse_into_containing_op( + anytype, anytype, + producer_op, + current_forall + ) + + func = transform.get_parent_op( + anytype, + current_forall, + op_name="func.func", + deduplicate=True, + ) + transform.apply_cse(func) + canonicalize(func) + + func = structured.VectorizeChildrenAndApplyPatternsOp( + func, + fold_type_extensions_into_contract=True, + ).result + transform.apply_cse(func) + canonicalize(func) + payload_mod = apply_registered_pass(payload_mod, "eliminate-empty-tensors") + identity_layout = LayoutMapOption.IdentityLayoutMap + payload_mod = transform_bufferization.OneShotBufferizeOp( + payload_mod, + allow_return_allocs_from_loops=True, + bufferize_function_boundaries=True, + function_boundary_type_conversion=identity_layout, + ).result + # fold memref.subviews into vector.transfer_read/write ops + payload_mod 
= apply_registered_pass(payload_mod, "fold-memref-alias-ops") + transform.apply_cse(payload_mod) + canonicalize(payload_mod) + + # convert forall to parallel + wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) + # convert scf.parallel to gpu.launch + func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") + func = apply_registered_pass(func, "lower-affine") + transform.apply_cse(func) + canonicalize(func) + + # set the number of threads for the gpu.launch operation + launch_op = match_and_split(func, ops={"gpu.launch"}) + num_threads = parameters["sg_rows"] * parameters["subgroup_size"] + xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) + + # outline gpu func + func = apply_registered_pass(func, "lower-affine") + canonicalize(func) + func = apply_registered_pass(func, "gpu-launch-sink-index-computations") + payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") + transform.apply_cse(payload_mod) + + # set xevm target + payload_mod = apply_registered_pass( + payload_mod, + "xevm-attach-target", + options={"O": "3", "chip": "bmg"}, + ) + + # convert vector to xegpu + gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) + for gpu_mod in gpu_mod_ops: + gpu_func = match(gpu_mod, ops={"gpu.func"}) + gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + + # Set layout attributes for xegpu.store_nd operations + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) + + payload_mod = apply_registered_pass( + payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + # Required: yield to end the transform sequence + 
transform.yield_() + + return mod From 1e63d7de4e88047d2251643e5649a477fd0a7c1b Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 24 Mar 2026 22:15:39 +0000 Subject: [PATCH 11/51] save working version --- lighthouse/pipeline/helper.py | 16 + lighthouse/schedule/xegpu/mlp_schedule.py | 18 +- lighthouse/schedule/xegpu/softmax_schedule.py | 307 +++++++++++------- 3 files changed, 203 insertions(+), 138 deletions(-) diff --git a/lighthouse/pipeline/helper.py b/lighthouse/pipeline/helper.py index 213b45df..9c4820d5 100644 --- a/lighthouse/pipeline/helper.py +++ b/lighthouse/pipeline/helper.py @@ -35,3 +35,19 @@ def cleanup_func(target): func = structured.MatchOp.match_op_names(target, ["func.func"]).result transform.apply_cse(func) canonicalize(func) + + +class PipelineInterrupt(Exception): + """Exception to signal early termination of the transform schedule.""" + + pass + + +def match_and_split(*args, nhandles=1, **kwargs): + """Helper function that splits matched handles.""" + matched = match(*args, **kwargs) + anytype = transform.AnyOpType.get() + matched_ops = transform.split_handle((anytype,) * nhandles, matched) + if nhandles == 1: + matched_ops = [matched_ops] + return matched_ops diff --git a/lighthouse/schedule/xegpu/mlp_schedule.py b/lighthouse/schedule/xegpu/mlp_schedule.py index e9fa7909..94cbdf01 100644 --- a/lighthouse/schedule/xegpu/mlp_schedule.py +++ b/lighthouse/schedule/xegpu/mlp_schedule.py @@ -11,6 +11,8 @@ apply_registered_pass, canonicalize, match, + match_and_split, + PipelineInterrupt, ) from lighthouse.dialects import smt_ext, transform_smt_ext as td_smt_ext @@ -33,22 +35,6 @@ MIN_NB_THREADS = 16 -class PipelineInterrupt(Exception): - """Exception to signal early termination of the transform schedule.""" - - pass - - -def match_and_split(*args, nhandles=1, **kwargs): - """Helper function that splits matched handles.""" - matched = match(*args, **kwargs) - anytype = transform.AnyOpType.get() - matched_ops = 
transform.split_handle((anytype,) * nhandles, matched) - if nhandles == 1: - matched_ops = [matched_ops] - return matched_ops - - @KnobValue.ast_rewrite(in_exprs=True) def params_with_constraints_imposed( params: dict[str, int | None], knob_name_prefix="" diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index e01f9af6..98acd4e6 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -12,19 +12,11 @@ apply_registered_pass, canonicalize, match, + match_and_split, + PipelineInterrupt, ) -def match_and_split(*args, nhandles=1, **kwargs): - """Helper function that splits matched handles.""" - matched = match(*args, **kwargs) - anytype = transform.AnyOpType.get() - matched_ops = transform.split_handle((anytype,) * nhandles, matched) - if nhandles == 1: - matched_ops = [matched_ops] - return matched_ops - - def get_softmax_schedule_module( stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None, @@ -75,121 +67,192 @@ def get_softmax_schedule_module( deduplicate=True, ) - # Match all linalg.generic and linalg.fill operations - # We have 7 operations in softmax: - # fill(max_init), max, sub, exp, fill(sum_init), sum, div - generic_ops = structured.structured_match( - transform.AnyOpType.get(), + xegpu_softmax_transform_schedule( payload_mod, - ops=["linalg.generic", "linalg.fill"] - ) - - # Split the handle into individual operation handles - anytype = transform.AnyOpType.get() - split_ops = transform.split_handle( - (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types - generic_ops - ) - - # Reverse split_ops to have operations in reverse order - split_ops = list(reversed(split_ops)) - - # The first operation (after reversal) is the division - this is the consumer - last_op = split_ops[0] - - # Tile the last operation using tile_using_forall - tiled_op, for_op = structured.structured_tile_using_forall( - anytype, 
anytype, - last_op, - num_threads=[], - tile_sizes=[], - static_tile_sizes=(parameters["wg_rows"],), + parameters=parameters, + stop_at_stage=stop_at_stage or "", ) + + return mod - # Fuse the producer operations into the forall loop - # Iterate through remaining operations (already in reverse order) - current_forall = for_op - for producer_op in split_ops[1:]: - fused_op, current_forall = structured.structured_fuse_into_containing_op( - anytype, anytype, - producer_op, - current_forall - ) - - func = transform.get_parent_op( - anytype, - current_forall, - op_name="func.func", - deduplicate=True, - ) - transform.apply_cse(func) - canonicalize(func) - - func = structured.VectorizeChildrenAndApplyPatternsOp( - func, - fold_type_extensions_into_contract=True, - ).result - transform.apply_cse(func) - canonicalize(func) - payload_mod = apply_registered_pass(payload_mod, "eliminate-empty-tensors") - identity_layout = LayoutMapOption.IdentityLayoutMap - payload_mod = transform_bufferization.OneShotBufferizeOp( - payload_mod, - allow_return_allocs_from_loops=True, - bufferize_function_boundaries=True, - function_boundary_type_conversion=identity_layout, - ).result - # fold memref.subviews into vector.transfer_read/write ops - payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") - transform.apply_cse(payload_mod) - canonicalize(payload_mod) - - # convert forall to parallel - wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) - for wg_loop in wg_loops: - wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - func = transform.get_parent_op(anytype, wg_loop) - # convert scf.parallel to gpu.launch - func = apply_registered_pass(func, "gpu-map-parallel-loops") - func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") - func = apply_registered_pass(func, "lower-affine") - transform.apply_cse(func) - canonicalize(func) - - # set the number of threads for the gpu.launch operation - launch_op = match_and_split(func, ops={"gpu.launch"}) 
- num_threads = parameters["sg_rows"] * parameters["subgroup_size"] - xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - - # outline gpu func - func = apply_registered_pass(func, "lower-affine") - canonicalize(func) - func = apply_registered_pass(func, "gpu-launch-sink-index-computations") - payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") - transform.apply_cse(payload_mod) - - # set xevm target - payload_mod = apply_registered_pass( - payload_mod, - "xevm-attach-target", - options={"O": "3", "chip": "bmg"}, - ) - # convert vector to xegpu - gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) - for gpu_mod in gpu_mod_ops: - gpu_func = match(gpu_mod, ops={"gpu.func"}) - gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - transform.apply_cse(gpu_func) - - # Set layout attributes for xegpu.store_nd operations - store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) - xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) - - payload_mod = apply_registered_pass( - payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - # Required: yield to end the transform sequence - transform.yield_() +def xegpu_softmax_transform_schedule( + mod: ir.Value[transform.AnyOpType], + parameters: dict, + stop_at_stage: str = "", +): + """Transform schedule for softmax payload.""" + try: + mod = bundle_xegpu_softmax_schedule( + mod, + parameters=parameters, + stop_at_stage=stop_at_stage, + ) + + mod = bundle_xegpu_to_binary( + mod, + stop_at_stage=stop_at_stage, + ) + except PipelineInterrupt: + pass + finally: + transform.yield_() + + +def bundle_xegpu_softmax_schedule( + mod: ir.Value[transform.AnyOpType], + parameters: dict, + stop_at_stage: str = "", +) -> ir.Value[transform.AnyOpType]: + """Schedule for lowering softmax payload to xegpu wg level.""" + if stop_at_stage == "initial": + raise PipelineInterrupt() + + anytype = 
transform.AnyOpType.get() + + # Match all linalg.generic and linalg.fill operations + # We have 7 operations in softmax: + # fill(max_init), max, sub, exp, fill(sum_init), sum, div + generic_ops = structured.structured_match( + transform.AnyOpType.get(), + mod, + ops=["linalg.generic", "linalg.fill"] + ) + + # Split the handle into individual operation handles + split_ops = transform.split_handle( + (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types + generic_ops + ) + + # Reverse split_ops to have operations in reverse order + split_ops = list(reversed(split_ops)) + + # The first operation (after reversal) is the division - this is the consumer + last_op = split_ops[0] + + # Tile the last operation using tile_using_forall + tiled_op, for_op = structured.structured_tile_using_forall( + anytype, anytype, + last_op, + num_threads=[], + tile_sizes=[], + static_tile_sizes=(parameters["wg_rows"],), + ) + + # Fuse the producer operations into the forall loop + # Iterate through remaining operations (already in reverse order) + current_forall = for_op + for producer_op in split_ops[1:]: + fused_op, current_forall = structured.structured_fuse_into_containing_op( + anytype, anytype, + producer_op, + current_forall + ) + + func = transform.get_parent_op( + anytype, + current_forall, + op_name="func.func", + deduplicate=True, + ) + transform.apply_cse(func) + canonicalize(func) + + if stop_at_stage == "tiled": + raise PipelineInterrupt() + + # vectorize + func = structured.VectorizeChildrenAndApplyPatternsOp( + func, + fold_type_extensions_into_contract=True, + ).result + transform.apply_cse(func) + canonicalize(func) + + if stop_at_stage == "vectorized": + raise PipelineInterrupt() + + # bufferize + mod = apply_registered_pass(mod, "eliminate-empty-tensors") + identity_layout = LayoutMapOption.IdentityLayoutMap + mod = transform_bufferization.OneShotBufferizeOp( + mod, + allow_return_allocs_from_loops=True, + bufferize_function_boundaries=True, 
+ function_boundary_type_conversion=identity_layout, + ).result + # fold memref.subviews into vector.transfer_read/write ops + mod = apply_registered_pass(mod, "fold-memref-alias-ops") + transform.apply_cse(mod) + canonicalize(mod) + + if stop_at_stage == "bufferized": + raise PipelineInterrupt() + + # convert forall to parallel + wg_loops = match_and_split(mod, ops={"scf.forall"}) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) + + # convert scf.parallel to gpu.launch + func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") + func = apply_registered_pass(func, "lower-affine") + transform.apply_cse(func) + canonicalize(func) + + # set the number of threads for the gpu.launch operation + launch_op = match_and_split(func, ops={"gpu.launch"}) + num_threads = parameters["sg_rows"] * parameters["subgroup_size"] + xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) + + # outline gpu func + func = apply_registered_pass(func, "lower-affine") + canonicalize(func) + func = apply_registered_pass(func, "gpu-launch-sink-index-computations") + mod = apply_registered_pass(mod, "gpu-kernel-outlining") + transform.apply_cse(mod) + + # set xevm target + mod = apply_registered_pass( + mod, + "xevm-attach-target", + options={"O": "3", "chip": "bmg"}, + ) + + # convert vector to xegpu + gpu_mod_ops = match_and_split(mod, ops={"gpu.module"}) + for gpu_mod in gpu_mod_ops: + gpu_func = match(gpu_mod, ops={"gpu.func"}) + gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + + if stop_at_stage == "xegpu-initial": + raise PipelineInterrupt() + + # Set layout attributes for xegpu.store_nd operations + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) + + if 
stop_at_stage == "xegpu-wg": + raise PipelineInterrupt() + + return mod + + +def bundle_xegpu_to_binary( + mod: ir.Value, stop_at_stage: str = "" +) -> ir.Value[transform.AnyOpType]: + """Schedule for lowering xegpu wg level to binary.""" + # upstream xegpu/xevm pipeline is payload independent. + mod = apply_registered_pass( + mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + + if stop_at_stage == "final": + raise PipelineInterrupt() + return mod From 64b5d73f52813059ad51293e028c0c1f2cabd459 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 24 Mar 2026 22:47:05 +0000 Subject: [PATCH 12/51] save working version --- examples/xegpu/softmax.py | 1 + lighthouse/schedule/xegpu/softmax_schedule.py | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 474eb1a3..c31af47a 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -260,6 +260,7 @@ def parse_cli(): args = parse_cli() params = { + "sizes": args.sizes, "wg_rows": args.wg_rows, "sg_rows": args.sg_rows, "subgroup_size": args.subgroup_size, diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 98acd4e6..26bc5f3e 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -206,7 +206,8 @@ def bundle_xegpu_softmax_schedule( # set the number of threads for the gpu.launch operation launch_op = match_and_split(func, ops={"gpu.launch"}) - num_threads = parameters["sg_rows"] * parameters["subgroup_size"] + num_subgroups = parameters["wg_rows"] // parameters["sg_rows"] + num_threads = num_subgroups * parameters["subgroup_size"] xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) # outline gpu func @@ -233,9 +234,12 @@ def bundle_xegpu_softmax_schedule( if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() - # Set layout attributes for 
xegpu.store_nd operations + # Set layout attributes for xegpu.store_nd operations. + # FIXME: currently ecah subgroup is handling the entire row. store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) - xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) + sg_layout = [parameters["sg_rows"], 1] + sg_data = [parameters["sg_rows"], parameters["sizes"][1]] + xegpu.set_op_layout_attr(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() From 108f2c09e8ee111b6b96f97553bc9bacead0013e Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 25 Mar 2026 20:48:48 +0000 Subject: [PATCH 13/51] save working version --- lighthouse/schedule/xegpu/mlp_schedule.py | 13 +------------ lighthouse/schedule/xegpu/softmax_schedule.py | 17 ++--------------- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/lighthouse/schedule/xegpu/mlp_schedule.py b/lighthouse/schedule/xegpu/mlp_schedule.py index 94cbdf01..ab11a75c 100644 --- a/lighthouse/schedule/xegpu/mlp_schedule.py +++ b/lighthouse/schedule/xegpu/mlp_schedule.py @@ -14,6 +14,7 @@ match_and_split, PipelineInterrupt, ) +from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary from lighthouse.dialects import smt_ext, transform_smt_ext as td_smt_ext from lighthouse.dialects.transform_tune_ext import knob, KnobValue @@ -600,15 +601,3 @@ def annotate_ab_load(tile, layout_load, layout_dpas): canonicalize(gpu_func) transform.apply_cse(gpu_func) - - -def bundle_xegpu_to_binary( - mod: ir.Value, stop_at_stage: str = "" -) -> ir.Value[transform.AnyOpType]: - """Schedule for lowering xegpu wg level to binary.""" - # upstream xegpu/xevm pipeline is payload independent. 
- mod = apply_registered_pass( - mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - - return mod diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 26bc5f3e..9e5226b9 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -15,6 +15,7 @@ match_and_split, PipelineInterrupt, ) +from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary def get_softmax_schedule_module( @@ -38,6 +39,7 @@ def get_softmax_schedule_module( - wg_rows: Number of rows per workgroup - sg_rows: Number of rows per subgroup - subgroup_size: Size of subgroup + - sizes: Tuple with the sizes of the input tensors (e.g. (M, N)) Returns: MLIR module containing the transform schedule @@ -245,18 +247,3 @@ def bundle_xegpu_softmax_schedule( raise PipelineInterrupt() return mod - - -def bundle_xegpu_to_binary( - mod: ir.Value, stop_at_stage: str = "" -) -> ir.Value[transform.AnyOpType]: - """Schedule for lowering xegpu wg level to binary.""" - # upstream xegpu/xevm pipeline is payload independent. 
- mod = apply_registered_pass( - mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - - if stop_at_stage == "final": - raise PipelineInterrupt() - - return mod From a7e1e6c7535c5dd085c14ebea39ae13ac25eeb69 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 25 Mar 2026 20:58:29 +0000 Subject: [PATCH 14/51] save working version --- lighthouse/schedule/xegpu/helper.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 lighthouse/schedule/xegpu/helper.py diff --git a/lighthouse/schedule/xegpu/helper.py b/lighthouse/schedule/xegpu/helper.py new file mode 100644 index 00000000..1452301e --- /dev/null +++ b/lighthouse/schedule/xegpu/helper.py @@ -0,0 +1,21 @@ +"""Helper functions for XeGPU scheduling.""" + +from mlir import ir +from mlir.dialects import transform + +from lighthouse.pipeline.helper import apply_registered_pass, PipelineInterrupt + + +def bundle_xegpu_to_binary( + mod: ir.Value, stop_at_stage: str = "" +) -> ir.Value[transform.AnyOpType]: + """Schedule for lowering xegpu wg level to binary.""" + # upstream xegpu/xevm pipeline is payload independent. 
+ mod = apply_registered_pass( + mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + + if stop_at_stage == "final": + raise PipelineInterrupt() + + return mod From df53caa54ff2c066393f3fca9479203cc0ef471a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 25 Mar 2026 21:09:29 +0000 Subject: [PATCH 15/51] precommit issues --- examples/xegpu/softmax.py | 12 +-- .../ingress/mlir_gen/gpu_softmax_payload.py | 68 +++++++++------ lighthouse/schedule/xegpu/helper.py | 2 +- lighthouse/schedule/xegpu/softmax_schedule.py | 87 ++++++++++--------- 4 files changed, 97 insertions(+), 72 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index c31af47a..e3cf5840 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -28,7 +28,7 @@ def softmax_complexity(M: int, N: int, nbytes: int): """ Complexity of softmax operation. - + For each row: - O(N) to find max - O(N) to compute exp(x - max) and sum @@ -127,7 +127,7 @@ def check_correctness( output_ref = self._reference_solution output_computed = output_host.astype(np.float32) - + if verbose > 1: print("Reference solution (first 5 rows):") print(output_ref[:5]) @@ -137,10 +137,10 @@ def check_correctness( # Check row sums are close to 1.0 row_sums = np.sum(output_computed, axis=1) sums_ok = np.allclose(row_sums, 1.0, rtol=1e-5, atol=1e-6) - + # Check values match reference values_ok = np.allclose(output_computed, output_ref, rtol=1e-4, atol=1e-6) - + success = sums_ok and values_ok if verbose: @@ -149,7 +149,9 @@ def check_correctness( else: print("FAILED!") if not sums_ok: - print(f" Row sums check failed. Min: {row_sums.min():.6f}, Max: {row_sums.max():.6f}") + print( + f" Row sums check failed. Min: {row_sums.min():.6f}, Max: {row_sums.max():.6f}" + ) if not values_ok: max_diff = np.abs(output_computed - output_ref).max() print(f" Values mismatch. 
Max abs diff: {max_diff:.6e}") diff --git a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py index 26fd9042..4568448e 100644 --- a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py +++ b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py @@ -1,10 +1,13 @@ """Generate MLIR payload for GPU softmax operation.""" from mlir import ir -from mlir.dialects import linalg, bufferization, arith, tensor, func, math +from mlir.dialects import linalg, bufferization, arith, tensor, math from lighthouse.utils.mlir import func_cif -from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor +from lighthouse.ingress.mlir_gen.gpu_utils import ( + emit_gpu_util_funcs, + emit_buf_to_tensor, +) def generate_gpu_softmax_payload( @@ -15,33 +18,33 @@ def generate_gpu_softmax_payload( ) -> ir.Module: """ Generate MLIR module for softmax payload. - + Computes softmax along the last dimension (rows): output[i, j] = exp(input[i, j] - max_i) / sum_i(exp(input[i, j] - max_i)) - + where max_i and sum_i are computed over row i. 
- + Args: func_name: Name of the payload function M: Number of rows - N: Number of columns + N: Number of columns dtype: MLIR element type (e.g., F32Type) - + Returns: MLIR module containing the softmax payload function """ mod = ir.Module.create() shape = (M, N) memref_t = ir.MemRefType.get(shape, dtype) - + with ir.InsertionPoint(mod.body): # Function signature: payload(output, input) @func_cif(memref_t, memref_t, name=func_name) def payload(output, input_arg): # Convert memrefs to tensors - output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) + emit_buf_to_tensor(output, restrict=True, writable=True) input_tensor = emit_buf_to_tensor(input_arg, restrict=True) - + # Define affine maps for indexing # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) @@ -49,67 +52,82 @@ def payload(output, input_arg): d1 = ir.AffineDimExpr.get(1) map_2d = ir.AffineMap.get(2, 0, [d0, d1]) map_1d = ir.AffineMap.get(2, 0, [d0]) - + # Step 1: Find max - linalg.generic reduction - neg_inf = arith.constant(dtype, float('-inf')) + neg_inf = arith.constant(dtype, float("-inf")) max_init = tensor.empty((M,), dtype) max_filled = linalg.fill(neg_inf, outs=[max_init]) - + @linalg.generic( [input_tensor], # inputs [max_filled], # outputs [map_2d, map_1d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.reduction, + ], # iterator_types ) def row_max(in_val, acc): return arith.maximumf(in_val, acc) - + # Step 2: Subtract max (broadcast) - linalg.generic elementwise output_init = tensor.empty((M, N), dtype) - + @linalg.generic( [input_tensor, row_max], # inputs [output_init], # outputs [map_2d, map_1d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.parallel, + ], # iterator_types ) def 
shifted(in_val, max_val, out): return arith.subf(in_val, max_val) - + # Step 3: Compute exp - linalg.generic elementwise @linalg.generic( [shifted], # inputs [output_init], # outputs [map_2d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.parallel, + ], # iterator_types ) def exp_vals(in_val, out): return math.exp(in_val) - + # Step 4: Sum exp values - linalg.generic reduction sum_init = tensor.empty((M,), dtype) zero = arith.constant(dtype, 0.0) sum_filled = linalg.fill(zero, outs=[sum_init]) - + @linalg.generic( [exp_vals], # inputs [sum_filled], # outputs [map_2d, map_1d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.reduction, + ], # iterator_types ) def row_sum(in_val, acc): return arith.addf(in_val, acc) - + # Step 5: Divide by sum (broadcast) - linalg.generic elementwise @linalg.generic( [exp_vals, row_sum], # inputs [output_init], # outputs [map_2d, map_1d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.parallel, + ], # iterator_types ) def result(exp_val, sum_val, out): return arith.divf(exp_val, sum_val) - + # Materialize result back to output memref bufferization.materialize_in_destination( None, result, output, restrict=True, writable=True diff --git a/lighthouse/schedule/xegpu/helper.py b/lighthouse/schedule/xegpu/helper.py index 1452301e..0c1d93a7 100644 --- a/lighthouse/schedule/xegpu/helper.py +++ b/lighthouse/schedule/xegpu/helper.py @@ -14,7 +14,7 @@ def bundle_xegpu_to_binary( mod = apply_registered_pass( mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} ) - + if stop_at_stage == "final": raise PipelineInterrupt() diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py 
b/lighthouse/schedule/xegpu/softmax_schedule.py index 9e5226b9..d9701b84 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -24,7 +24,7 @@ def get_softmax_schedule_module( ) -> ir.Module: """ Generate transform schedule for softmax operation. - + The schedule performs the following transformations: 1. Tile the consumer operation (division) using forall 2. Fuse producer operations into the forall loop @@ -32,32 +32,32 @@ def get_softmax_schedule_module( 4. Bufferize tensors 5. Convert to GPU dialect 6. Lower to XeGPU operations - + Args: stop_at_stage: Optional stage name to stop early (for debugging) parameters: Dictionary with scheduling parameters: - wg_rows: Number of rows per workgroup - - sg_rows: Number of rows per subgroup + - sg_rows: Number of rows per subgroup - subgroup_size: Size of subgroup - sizes: Tuple with the sizes of the input tensors (e.g. (M, N)) - + Returns: MLIR module containing the transform schedule """ assert parameters is not None, "Schedule parameters must be provided" - + mod = ir.Module.create() mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() - + with ir.InsertionPoint(mod.body): # Create a transform sequence with proper signature named_sequence = transform.named_sequence( "__transform_main", [transform.AnyOpType.get()], # input: module [], # no outputs - arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}] + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}], ) - + with ir.InsertionPoint(named_sequence.body): # match the payload module anytype = transform.AnyOpType.get() @@ -68,13 +68,13 @@ def get_softmax_schedule_module( op_name="builtin.module", deduplicate=True, ) - + xegpu_softmax_transform_schedule( payload_mod, parameters=parameters, stop_at_stage=stop_at_stage or "", ) - + return mod @@ -107,36 +107,43 @@ def bundle_xegpu_softmax_schedule( stop_at_stage: str = "", ) -> ir.Value[transform.AnyOpType]: """Schedule for lowering 
softmax payload to xegpu wg level.""" - + if stop_at_stage == "initial": raise PipelineInterrupt() - + anytype = transform.AnyOpType.get() - + # Match all linalg.generic and linalg.fill operations - # We have 7 operations in softmax: + # We have 7 operations in softmax: # fill(max_init), max, sub, exp, fill(sum_init), sum, div generic_ops = structured.structured_match( - transform.AnyOpType.get(), - mod, - ops=["linalg.generic", "linalg.fill"] + transform.AnyOpType.get(), mod, ops=["linalg.generic", "linalg.fill"] ) - + # Split the handle into individual operation handles split_ops = transform.split_handle( - (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types - generic_ops + ( + anytype, + anytype, + anytype, + anytype, + anytype, + anytype, + anytype, + ), # 7 result types + generic_ops, ) - + # Reverse split_ops to have operations in reverse order split_ops = list(reversed(split_ops)) - + # The first operation (after reversal) is the division - this is the consumer last_op = split_ops[0] # Tile the last operation using tile_using_forall tiled_op, for_op = structured.structured_tile_using_forall( - anytype, anytype, + anytype, + anytype, last_op, num_threads=[], tile_sizes=[], @@ -148,11 +155,9 @@ def bundle_xegpu_softmax_schedule( current_forall = for_op for producer_op in split_ops[1:]: fused_op, current_forall = structured.structured_fuse_into_containing_op( - anytype, anytype, - producer_op, - current_forall + anytype, anytype, producer_op, current_forall ) - + func = transform.get_parent_op( anytype, current_forall, @@ -161,10 +166,10 @@ def bundle_xegpu_softmax_schedule( ) transform.apply_cse(func) canonicalize(func) - + if stop_at_stage == "tiled": raise PipelineInterrupt() - + # vectorize func = structured.VectorizeChildrenAndApplyPatternsOp( func, @@ -172,10 +177,10 @@ def bundle_xegpu_softmax_schedule( ).result transform.apply_cse(func) canonicalize(func) - + if stop_at_stage == "vectorized": raise PipelineInterrupt() - + # 
bufferize mod = apply_registered_pass(mod, "eliminate-empty-tensors") identity_layout = LayoutMapOption.IdentityLayoutMap @@ -189,36 +194,36 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "fold-memref-alias-ops") transform.apply_cse(mod) canonicalize(mod) - + if stop_at_stage == "bufferized": raise PipelineInterrupt() - + # convert forall to parallel wg_loops = match_and_split(mod, ops={"scf.forall"}) for wg_loop in wg_loops: wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) func = transform.get_parent_op(anytype, wg_loop) - + # convert scf.parallel to gpu.launch func = apply_registered_pass(func, "gpu-map-parallel-loops") func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") func = apply_registered_pass(func, "lower-affine") transform.apply_cse(func) canonicalize(func) - + # set the number of threads for the gpu.launch operation launch_op = match_and_split(func, ops={"gpu.launch"}) num_subgroups = parameters["wg_rows"] // parameters["sg_rows"] num_threads = num_subgroups * parameters["subgroup_size"] xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - + # outline gpu func func = apply_registered_pass(func, "lower-affine") canonicalize(func) func = apply_registered_pass(func, "gpu-launch-sink-index-computations") mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) - + # set xevm target mod = apply_registered_pass( mod, @@ -232,18 +237,18 @@ def bundle_xegpu_softmax_schedule( gpu_func = match(gpu_mod, ops={"gpu.func"}) gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") transform.apply_cse(gpu_func) - + if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() - + # Set layout attributes for xegpu.store_nd operations. # FIXME: currently ecah subgroup is handling the entire row. 
store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) sg_layout = [parameters["sg_rows"], 1] sg_data = [parameters["sg_rows"], parameters["sizes"][1]] xegpu.set_op_layout_attr(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) - + if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() - + return mod From 9bcc6538adc23d8cb2f9186bfb24cb581919c68d Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 27 Mar 2026 22:36:53 +0000 Subject: [PATCH 16/51] use linalg.softmax --- .../ingress/mlir_gen/gpu_softmax_payload.py | 90 ++----------------- lighthouse/schedule/xegpu/softmax_schedule.py | 60 ++++--------- 2 files changed, 27 insertions(+), 123 deletions(-) diff --git a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py index 4568448e..05dc148a 100644 --- a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py +++ b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py @@ -1,7 +1,7 @@ """Generate MLIR payload for GPU softmax operation.""" from mlir import ir -from mlir.dialects import linalg, bufferization, arith, tensor, math +from mlir.dialects import linalg, bufferization, tensor from lighthouse.utils.mlir import func_cif from lighthouse.ingress.mlir_gen.gpu_utils import ( @@ -45,88 +45,16 @@ def payload(output, input_arg): emit_buf_to_tensor(output, restrict=True, writable=True) input_tensor = emit_buf_to_tensor(input_arg, restrict=True) - # Define affine maps for indexing - # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) - # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) - d0 = ir.AffineDimExpr.get(0) - d1 = ir.AffineDimExpr.get(1) - map_2d = ir.AffineMap.get(2, 0, [d0, d1]) - map_1d = ir.AffineMap.get(2, 0, [d0]) + # Create output tensor and fill with zeros + output_init = tensor.empty(shape, dtype) - # Step 1: Find max - linalg.generic reduction - neg_inf = arith.constant(dtype, float("-inf")) - max_init = tensor.empty((M,), dtype) - max_filled = 
linalg.fill(neg_inf, outs=[max_init]) - - @linalg.generic( - [input_tensor], # inputs - [max_filled], # outputs - [map_2d, map_1d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.reduction, - ], # iterator_types - ) - def row_max(in_val, acc): - return arith.maximumf(in_val, acc) - - # Step 2: Subtract max (broadcast) - linalg.generic elementwise - output_init = tensor.empty((M, N), dtype) - - @linalg.generic( - [input_tensor, row_max], # inputs - [output_init], # outputs - [map_2d, map_1d, map_2d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.parallel, - ], # iterator_types - ) - def shifted(in_val, max_val, out): - return arith.subf(in_val, max_val) - - # Step 3: Compute exp - linalg.generic elementwise - @linalg.generic( - [shifted], # inputs - [output_init], # outputs - [map_2d, map_2d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.parallel, - ], # iterator_types - ) - def exp_vals(in_val, out): - return math.exp(in_val) - - # Step 4: Sum exp values - linalg.generic reduction - sum_init = tensor.empty((M,), dtype) - zero = arith.constant(dtype, 0.0) - sum_filled = linalg.fill(zero, outs=[sum_init]) - - @linalg.generic( - [exp_vals], # inputs - [sum_filled], # outputs - [map_2d, map_1d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.reduction, - ], # iterator_types - ) - def row_sum(in_val, acc): - return arith.addf(in_val, acc) - - # Step 5: Divide by sum (broadcast) - linalg.generic elementwise - @linalg.generic( - [exp_vals, row_sum], # inputs - [output_init], # outputs - [map_2d, map_1d, map_2d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.parallel, - ], # iterator_types + # Apply softmax along dimension 1 (last dimension) + result = linalg.softmax( + result=[ir.RankedTensorType.get(shape, dtype)], + input=input_tensor, + output=output_init, + dimension=1, ) - def result(exp_val, sum_val, out): - return 
arith.divf(exp_val, sum_val) # Materialize result back to output memref bufferization.materialize_in_destination( diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index d9701b84..35c3ab4c 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -26,12 +26,11 @@ def get_softmax_schedule_module( Generate transform schedule for softmax operation. The schedule performs the following transformations: - 1. Tile the consumer operation (division) using forall - 2. Fuse producer operations into the forall loop - 3. Vectorize operations - 4. Bufferize tensors - 5. Convert to GPU dialect - 6. Lower to XeGPU operations + 1. Tile the linalg.softmax operation using forall + 2. Vectorize operations + 3. Bufferize tensors + 4. Convert to GPU dialect + 5. Lower to XeGPU operations Args: stop_at_stage: Optional stage name to stop early (for debugging) @@ -113,57 +112,34 @@ def bundle_xegpu_softmax_schedule( anytype = transform.AnyOpType.get() - # Match all linalg.generic and linalg.fill operations - # We have 7 operations in softmax: - # fill(max_init), max, sub, exp, fill(sum_init), sum, div - generic_ops = structured.structured_match( - transform.AnyOpType.get(), mod, ops=["linalg.generic", "linalg.fill"] + # Match linalg.softmax operation + # We have only 1 operation: linalg.softmax + softmax_op = structured.structured_match( + transform.AnyOpType.get(), mod, ops=["linalg.softmax"] ) - # Split the handle into individual operation handles - split_ops = transform.split_handle( - ( - anytype, - anytype, - anytype, - anytype, - anytype, - anytype, - anytype, - ), # 7 result types - generic_ops, - ) - - # Reverse split_ops to have operations in reverse order - split_ops = list(reversed(split_ops)) - - # The first operation (after reversal) is the division - this is the consumer - last_op = split_ops[0] - - # Tile the last operation using tile_using_forall + # Tile the 
softmax operation using tile_using_forall tiled_op, for_op = structured.structured_tile_using_forall( anytype, anytype, - last_op, + softmax_op, num_threads=[], tile_sizes=[], static_tile_sizes=(parameters["wg_rows"],), ) - # Fuse the producer operations into the forall loop - # Iterate through remaining operations (already in reverse order) - current_forall = for_op - for producer_op in split_ops[1:]: - fused_op, current_forall = structured.structured_fuse_into_containing_op( - anytype, anytype, producer_op, current_forall - ) - func = transform.get_parent_op( anytype, - current_forall, + for_op, op_name="func.func", deduplicate=True, ) + # Decompose softmax into linalg.generic operations + softmax_ops = structured.structured_match( + transform.AnyOpType.get(), func, ops=["linalg.softmax"] + ) + structured.structured_decompose_interface(anytype, softmax_ops) + transform.apply_cse(func) canonicalize(func) From 3f5cbceacfca0cf15ea47bab5e29e1b62de48b4e Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 30 Mar 2026 19:28:26 +0000 Subject: [PATCH 17/51] save work --- examples/xegpu/softmax.py | 17 +++++++++-- lighthouse/schedule/xegpu/softmax_schedule.py | 29 +++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index e3cf5840..27a58ec2 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -217,6 +217,12 @@ def parse_cli(): default=16, help="Subgroup size.", ) + parser.add_argument( + "--reduction-step-size", + type=int, + default=16, + help="Step size for reduction loop tiling (optional).", + ) parser.add_argument( "--nruns", type=int, @@ -266,6 +272,7 @@ def parse_cli(): "wg_rows": args.wg_rows, "sg_rows": args.sg_rows, "subgroup_size": args.subgroup_size, + "reduction_step_size": args.reduction_step_size, } M, N = args.sizes @@ -304,7 +311,13 @@ def list2str(a): f"wg-rows={args.wg_rows}", f"sg-rows={args.sg_rows}", f"subgroup-size={args.subgroup_size}", - 
f"time(us): {elapsed:.2f}", - f"GFLOPS: {gflops:.2f}", ] + if args.reduction_step_size is not None: + parts.append(f"reduction-step-size={args.reduction_step_size}") + parts.extend( + [ + f"time(us): {elapsed:.2f}", + f"GFLOPS: {gflops:.2f}", + ] + ) print(" ".join(parts)) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 35c3ab4c..11410612 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -39,6 +39,7 @@ def get_softmax_schedule_module( - sg_rows: Number of rows per subgroup - subgroup_size: Size of subgroup - sizes: Tuple with the sizes of the input tensors (e.g. (M, N)) + - reduction_step_size: Optional step size for tiling reduction loops Returns: MLIR module containing the transform schedule @@ -140,6 +141,34 @@ def bundle_xegpu_softmax_schedule( ) structured.structured_decompose_interface(anytype, softmax_ops) + linalg_ops = match_and_split( + func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 + ) + init_max_reduction = linalg_ops[0] + max_reduction = linalg_ops[1] + max_center_and_exp_op = linalg_ops[2] + init_sum_reduction = linalg_ops[3] + sum_reduction = linalg_ops[4] + div_op = linalg_ops[5] + + reduction_step_size = parameters["reduction_step_size"] + + # Tile the max reduction using TileReductionUsingFor + _, _, _, for_op = structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=max_reduction, + tile_sizes=[0, reduction_step_size], + ) + + # Fuse the init_max_reduction into the for loop + # fused_init, new_for_loop = structured.structured_fuse_into_containing_op( + # anytype, anytype, init_max_reduction, for_op + # ) + transform.PrintOp(target=init_max_reduction) + transform.apply_cse(func) canonicalize(func) From 6204d6c2a59cb8d816f87b2fc811d96163a613a3 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 30 Mar 2026 22:21:34 +0000 Subject: [PATCH 18/51] add inner dim 
tiling --- lighthouse/schedule/xegpu/softmax_schedule.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 11410612..9bda77fc 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -144,30 +144,28 @@ def bundle_xegpu_softmax_schedule( linalg_ops = match_and_split( func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 ) - init_max_reduction = linalg_ops[0] max_reduction = linalg_ops[1] max_center_and_exp_op = linalg_ops[2] - init_sum_reduction = linalg_ops[3] sum_reduction = linalg_ops[4] div_op = linalg_ops[5] reduction_step_size = parameters["reduction_step_size"] - # Tile the max reduction using TileReductionUsingFor - _, _, _, for_op = structured.structured_tile_reduction_using_for( - [anytype], - anytype, - anytype, - anytype, - target=max_reduction, - tile_sizes=[0, reduction_step_size], - ) - - # Fuse the init_max_reduction into the for loop - # fused_init, new_for_loop = structured.structured_fuse_into_containing_op( - # anytype, anytype, init_max_reduction, for_op - # ) - transform.PrintOp(target=init_max_reduction) + # Tile all reduction ops using the same step size + reduction_ops = [max_reduction, sum_reduction] + for reduction_op in reduction_ops: + structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=reduction_op, + tile_sizes=[0, reduction_step_size], + ) + # Tile elementwise ops to match the reduction tile size + elementwise_ops = [max_center_and_exp_op, div_op] + for elementwise_op in elementwise_ops: + structured.TileUsingForOp(elementwise_op, sizes=[0, reduction_step_size]) transform.apply_cse(func) canonicalize(func) From 1feb0d48d94e6cc46fb6cbf7c01dc9825e43f73a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 1 Apr 2026 22:09:26 +0000 Subject: [PATCH 19/51] save fused version --- 
examples/xegpu/softmax.py | 1 + lighthouse/schedule/xegpu/softmax_schedule.py | 76 +++++++++++++++---- 2 files changed, 62 insertions(+), 15 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 324d6e0c..8049f0f4 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -212,6 +212,7 @@ def parse_cli(): "tiled", "vectorized", "bufferized", + "gpu-outlining", "xegpu-initial", "xegpu-wg", "final", diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 361dfdfe..b42b6479 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -140,6 +140,7 @@ def bundle_xegpu_softmax_schedule( transform.AnyOpType.get(), func, ops=["linalg.softmax"] ) structured.structured_decompose_interface(anytype, softmax_ops) + # transform.print_(target=func, name="After structured_decompose_interface") linalg_ops = match_and_split( func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 @@ -151,22 +152,60 @@ def bundle_xegpu_softmax_schedule( reduction_step_size = parameters["reduction_step_size"] - # Tile all reduction ops using the same step size - reduction_ops = [max_reduction, sum_reduction] - for reduction_op in reduction_ops: - structured.structured_tile_reduction_using_for( - [anytype], - anytype, - anytype, - anytype, - target=reduction_op, - tile_sizes=[0, reduction_step_size], - ) - # Tile elementwise ops to match the reduction tile size - elementwise_ops = [max_center_and_exp_op, div_op] - for elementwise_op in elementwise_ops: - structured.TileUsingForOp(elementwise_op, sizes=[0, reduction_step_size]) + # Tile the division op and fuse the sub+exp producer into it + _, div_loop = structured.TileUsingForOp( + div_op, sizes=[0, reduction_step_size] + ).results + + # Fuse max_center_and_exp_op into the div loop + _, fused_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + 
producer_op=max_center_and_exp_op, + containing_op=div_loop, + ) + + # Tile the sum reduction and fuse the sub+exp producer into it + _, _, _, sum_loop = structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=sum_reduction, + tile_sizes=[0, reduction_step_size], + ) + + func = transform.get_parent_op( + anytype, + fused_loop, + op_name="func.func", + deduplicate=True, + ) + # Re-match and split linalg generic ops, there are 5 at this point + linalg_ops = match_and_split(func, ops={"linalg.generic"}, nhandles=5) + max_center_and_exp_op = linalg_ops[1] + + # Fuse max_center_and_exp_op into the sum reduction loop + _, fused_sum_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=max_center_and_exp_op, + containing_op=sum_loop, + ) + + # Tile the max reduction. + max_reduction = linalg_ops[0] + structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=max_reduction, + tile_sizes=[0, reduction_step_size], + ) + + # Cleanup after tiling and fusion transform.apply_cse(func) canonicalize(func) @@ -227,6 +266,9 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) + if stop_at_stage == "gpu-outlining": + raise PipelineInterrupt() + # set xevm target mod = apply_registered_pass( mod, @@ -241,6 +283,10 @@ def bundle_xegpu_softmax_schedule( gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") transform.apply_cse(gpu_func) + # Cleanup. 
+ transform.apply_cse(mod) + canonicalize(mod) + if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() From a28cf4a1035cf12956aa9922ff0fbd3a46e51314 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 1 Apr 2026 22:30:40 +0000 Subject: [PATCH 20/51] save work --- lighthouse/schedule/xegpu/softmax_schedule.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index b42b6479..18186b89 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -237,6 +237,17 @@ def bundle_xegpu_softmax_schedule( transform.apply_cse(mod) canonicalize(mod) + # promote memref.alloc to memref.alloca in payload function + func = match(mod, ops={"func.func"}) + func = apply_registered_pass( + func, + "promote-buffers-to-stack", + options={ + "max-alloc-size-in-bytes": "8192", + "max-rank-of-allocated-memref": "2", + }, + ) + if stop_at_stage == "bufferized": raise PipelineInterrupt() From 79e2f737caae45431f207744187a7119c5504b12 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 19:16:35 +0000 Subject: [PATCH 21/51] save work --- lighthouse/schedule/xegpu/softmax_schedule.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 18186b89..bda5f819 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -303,10 +303,10 @@ def bundle_xegpu_softmax_schedule( # Set layout attributes for xegpu.store_nd operations. # FIXME: currently each subgroup is handling the entire row.
- store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=5) sg_layout = [parameters["sg_rows"], 1] - sg_data = [parameters["sg_rows"], parameters["sizes"][1]] - xegpu.set_anchor_layout(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) + sg_data = [parameters["sg_rows"], parameters["reduction_step_size"]] + xegpu.set_anchor_layout(store_ops[-1], sg_layout=sg_layout, sg_data=sg_data) if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() From 55c175c0f69c511f6d753b0f7c55b5480d5303b3 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 22:24:15 +0000 Subject: [PATCH 22/51] save work --- docs/softmax_lowering.md | 402 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 docs/softmax_lowering.md diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md new file mode 100644 index 00000000..f67779ae --- /dev/null +++ b/docs/softmax_lowering.md @@ -0,0 +1,402 @@ +# Linalg softmax lowering in XeGPU pipeline + +## Overview + +The lowering process consists of seven stages: +1. **initial** - High-level tensor operations +2. **tiled-softmax** - Tiled softmax operations +3. **decomposed** - Decomposition into constituent operations +4. **vectorized** - Vector operations +5. **bufferized** - Memory-based representation +6. **xegpu-initial** - GPU kernel with XeGPU operations +7. **xegpu-wg** - Work-group optimized XeGPU + +--- + +## Stage 1: Initial + +**Code:** +```mlir +func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { + // ... + %2 = tensor.empty() : tensor<1024x64xf32> + %3 = linalg.softmax dimension(1) ins(%1 : tensor<1024x64xf32>) + outs(%2 : tensor<1024x64xf32>) -> tensor<1024x64xf32> + // ... 
+ return +} +``` +--- + +## Stage 2: Tiled Softmax + +**Key Characteristics:** +- Work distribution via `scf.forall` (16 parallel iterations) +- Each tile processes 64x64 elements + +**Code:** +```mlir +func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { + // ... + %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) { + %4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2) + // Extract 64x64 input slice + %extracted_slice = tensor.extract_slice ... + // Extract 64x64 output slice + %extracted_slice_0 = tensor.extract_slice ... + // Apply softmax to the tile + %5 = linalg.softmax dimension(1) ins(%extracted_slice : tensor<64x64xf32>) + outs(%extracted_slice_0 : tensor<64x64xf32>) -> tensor<64x64xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %5 into %arg3[%4, %c0] [64, 64] [1, 1] : + tensor<64x64xf32> into tensor<1024x64xf32> + } + } + // ... + return +} +``` + +--- + +## Stage 3: Decomposed + +**Key Characteristics:** +- Softmax decomposed into 4 constituent `linalg.generic` ops : max, sub+exp, sum, divide +- Uses `structured.structured_decompose_interface` implemented by `linalg.softmax` + +**Code:** +```mlir +func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0xFFC00000 : f32 // -inf for max reduction + %0 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32> + %1 = tensor.empty() : tensor<1024x64xf32> + + %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x64xf32>) { + %3 = affine.apply #map(%arg2) // %3 = %arg2 * 64 + %extracted_slice = tensor.extract_slice %0[%3, 0] [64, 64] [1, 1] : + tensor<1024x64xf32> to tensor<64x64xf32> + + // Step 1: Find max along dimension 1 + %4 = tensor.empty() : tensor<64xf32> + %5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32> + %6 = linalg.generic {indexing_maps = [#map1, #map2], + 
iterator_types = ["parallel", "reduction"]} + ins(%extracted_slice : tensor<64x64xf32>) outs(%5 : tensor<64xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = arith.maxnumf %in, %out : f32 + linalg.yield %11 : f32 + } -> tensor<64xf32> + + // Step 2: Subtract max and exponentiate + %7 = linalg.generic {indexing_maps = [#map1, #map2, #map1], + iterator_types = ["parallel", "parallel"]} + ins(%extracted_slice, %6 : tensor<64x64xf32>, tensor<64xf32>) + outs(%extracted_slice_1 : tensor<64x64xf32>) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %11 = arith.subf %in, %in_2 : f32 + %12 = math.exp %11 : f32 + linalg.yield %12 : f32 + } -> tensor<64x64xf32> + + // Step 3: Sum exponentials + %8 = linalg.fill ins(%cst : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32> + %9 = linalg.generic {indexing_maps = [#map1, #map2], + iterator_types = ["parallel", "reduction"]} + ins(%7 : tensor<64x64xf32>) outs(%8 : tensor<64xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = arith.addf %in, %out : f32 + linalg.yield %11 : f32 + } -> tensor<64xf32> + + // Step 4: Normalize by sum + %10 = linalg.generic {indexing_maps = [#map1, #map2, #map1], + iterator_types = ["parallel", "parallel"]} + ins(%7, %9 : tensor<64x64xf32>, tensor<64xf32>) + outs(%extracted_slice_1 : tensor<64x64xf32>) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %11 = arith.divf %in, %in_2 : f32 + linalg.yield %11 : f32 + } -> tensor<64x64xf32> + + scf.forall.in_parallel { + tensor.parallel_insert_slice %10 into %arg3[%3, 0] [64, 64] [1, 1] : + tensor<64x64xf32> into tensor<1024x64xf32> + } + } + return +} +``` + +**What Happens:** +- The 1024x64 input is divided into 16 tiles of 64x64 each +- Softmax algorithm made explicit: + 1. **Max reduction**: Find maximum value per row (for numerical stability) + 2. **Exp**: Compute exp(x - max) for each element + 3. **Sum reduction**: Sum exponentials per row + 4. 
**Normalize**: Divide each element by its row sum +- Each tile is processed independently, enabling parallelization +- Results are inserted back into the output tensor + +--- + +## Stage 4: Vectorized + +**Key Characteristics:** +- `linalg.generic` operations replaced with vector operations +- SIMD-friendly representation using `vector<64x64xf32>` +- Explicit vector multi-reductions +- Vector transfers for reading/writing data + +**Code:** +```mlir +func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { + %cst = arith.constant dense<0.000000e+00> : vector<64xf32> + %0 = ub.poison : f32 + %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32> + %c0 = arith.constant 0 : index + %1 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32> + %2 = tensor.empty() : tensor<1024x64xf32> + + %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) { + %4 = affine.apply #map(%arg2) // %4 = %arg2 * 64 + %extracted_slice = tensor.extract_slice %arg3[%4, 0] [64, 64] [1, 1] + + // Vector read: Load 64x64 tile + %5 = vector.transfer_read %1[%4, %c0], %0 {in_bounds = [true, true]} : + tensor<1024x64xf32>, vector<64x64xf32> + + // Max reduction: Reduce dimension 1 -> vector<64xf32> + %6 = vector.multi_reduction <maxnumf>, %5, %cst_0 [1] : + vector<64x64xf32> to vector<64xf32> + + // Broadcast max values back to 64x64 and transpose + %7 = vector.broadcast %6 : vector<64xf32> to vector<64x64xf32> + %8 = vector.transpose %7, [1, 0] : vector<64x64xf32> to vector<64x64xf32> + + // Subtract max and exponentiate + %9 = arith.subf %5, %8 : vector<64x64xf32> + %10 = math.exp %9 : vector<64x64xf32> + + // Sum reduction: Reduce dimension 1 -> vector<64xf32> + %11 = vector.multi_reduction <add>, %10, %cst [1] : + vector<64x64xf32> to vector<64xf32> + + // Broadcast sums back to 64x64 and transpose + %12 = vector.broadcast %11 : vector<64xf32> to vector<64x64xf32> + %13 = vector.transpose %12, [1, 0] : vector<64x64xf32> to
vector<64x64xf32> + + // Normalize + %14 = arith.divf %10, %13 : vector<64x64xf32> + + // Vector write + %15 = vector.transfer_write %14, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : + vector<64x64xf32>, tensor<64x64xf32> + + scf.forall.in_parallel { + tensor.parallel_insert_slice %15 into %arg3[%4, 0] [64, 64] [1, 1] + } + } + return +} +``` + +**What Happens:** +- Linalg operations converted to vector dialect operations +- `vector.transfer_read` loads entire 64x64 tile at once +- `vector.multi_reduction` performs SIMD reductions (max and sum) +- `vector.broadcast` and `vector.transpose` handle dimension alignment +- All arithmetic operations work on vectors, enabling SIMD execution +- `vector.transfer_write` stores results back + +--- + +## Stage 5: Bufferized + +**Key Characteristics:** +- Tensors eliminated, working directly with memrefs +- Vector operations read/write directly from/to memory +- No more tensor extract/insert operations +- Simplified control flow + +**Code:** +```mlir +func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { + %cst = arith.constant dense<0.000000e+00> : vector<64xf32> + %0 = ub.poison : f32 + %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32> + %c0 = arith.constant 0 : index + + scf.forall (%arg2) in (16) { + %1 = affine.apply #map(%arg2) // %1 = %arg2 * 64 + + // Direct memref read + %2 = vector.transfer_read %arg1[%1, %c0], %0 {in_bounds = [true, true]} : + memref<1024x64xf32>, vector<64x64xf32> + + // Max reduction + %3 = vector.multi_reduction <maxnumf>, %2, %cst_0 [1] : + vector<64x64xf32> to vector<64xf32> + %4 = vector.broadcast %3 : vector<64xf32> to vector<64x64xf32> + %5 = vector.transpose %4, [1, 0] : vector<64x64xf32> to vector<64x64xf32> + + // Subtract and exp + %6 = arith.subf %2, %5 : vector<64x64xf32> + %7 = math.exp %6 : vector<64x64xf32> + + // Sum reduction + %8 = vector.multi_reduction <add>, %7, %cst [1] : + vector<64x64xf32> to vector<64xf32> + %9 = vector.broadcast %8 :
vector<64xf32> to vector<64x64xf32> + %10 = vector.transpose %9, [1, 0] : vector<64x64xf32> to vector<64x64xf32> + + // Normalize + %11 = arith.divf %7, %10 : vector<64x64xf32> + + // Direct memref write + vector.transfer_write %11, %arg0[%1, %c0] {in_bounds = [true, true]} : + vector<64x64xf32>, memref<1024x64xf32> + } + return +} +``` + +**What Happens:** +- All tensor operations converted to memref-based operations +- `scf.forall` no longer uses `shared_outs`, simplified to pure side effects +- Vector transfers work directly on input/output memrefs +- Memory layout is now explicit +- This representation is ready for GPU kernel extraction + +--- + +## Stage 6: XeGPU-Initial + +**Key Characteristics:** +- GPU kernel separated from host code +- `gpu.launch_func` invocation with grid/block dimensions +- XeGPU tensor descriptors for memory access +- Block-based load/store operations + +**Code:** + +**Host Side:** +```mlir +func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c128 = arith.constant 128 : index + gpu.launch_func @payload_kernel::@payload_kernel + blocks in (%c16, %c1, %c1) + threads in (%c128, %c1, %c1) + args(%arg1 : memref<1024x64xf32>, %arg0 : memref<1024x64xf32>) + return +} +``` + +**GPU Kernel:** +```mlir +gpu.module @payload_kernel [#xevm.target] { + gpu.func @payload_kernel(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) kernel + attributes {known_block_size = array<i32: 128, 1, 1>, + known_grid_size = array<i32: 16, 1, 1>} { + %cst = arith.constant dense<0.000000e+00> : vector<64xf32> + %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32> + %c64 = arith.constant 64 : index + %block_id_x = gpu.block_id x + %0 = arith.muli %block_id_x, %c64 overflow<nsw> : index + + // Create XeGPU tensor descriptor for load + %1 = xegpu.create_nd_tdesc %arg0 : memref<1024x64xf32> -> + !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>> + + // XeGPU block load + %2 = xegpu.load_nd %1[%0,
0] : + !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -> + vector<64x64xf32> + + // Same compute operations as before + %3 = vector.multi_reduction <maxnumf>, %2, %cst_0 [1] : + vector<64x64xf32> to vector<64xf32> + %4 = vector.broadcast %3 : vector<64xf32> to vector<64x64xf32> + %5 = vector.transpose %4, [1, 0] : vector<64x64xf32> to vector<64x64xf32> + %6 = arith.subf %2, %5 : vector<64x64xf32> + %7 = math.exp %6 : vector<64x64xf32> + %8 = vector.multi_reduction <add>, %7, %cst [1] : + vector<64x64xf32> to vector<64xf32> + %9 = vector.broadcast %8 : vector<64xf32> to vector<64x64xf32> + %10 = vector.transpose %9, [1, 0] : vector<64x64xf32> to vector<64x64xf32> + %11 = arith.divf %7, %10 : vector<64x64xf32> + + // Create XeGPU tensor descriptor for store + %12 = xegpu.create_nd_tdesc %arg1 : memref<1024x64xf32> -> + !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>> + + // XeGPU block store + xegpu.store_nd %11, %12[%0, 0] : + vector<64x64xf32>, + !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>> + + gpu.return + } +} +``` + +**What Happens:** +- Host function now calls `gpu.launch_func` with 16 blocks, 128 threads per block +- Separate `gpu.module` contains the kernel code +- `gpu.block_id x` replaces the loop iterator +- `xegpu.create_nd_tdesc` creates tensor descriptors for memory regions +- `xegpu.load_nd` performs hardware-optimized block loads +- `xegpu.store_nd` performs hardware-optimized block stores +- XeGPU operations map directly to Intel GPU instructions +- Boundary checking disabled for performance (sizes known at compile time) + +--- + +## Stage 7: XeGPU-WG (Work-Group Optimized) + +**Key Characteristics:** +- Additional layout hints for work-group optimization +- Sub-group layout specification: `sg_layout` and `sg_data` +- Optimized memory access patterns for Intel XeGPU + +**Code (differences from xegpu-initial):** +```mlir +// Store operation now includes layout hints +xegpu.store_nd %11, %12[%0, 0] + <{layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [8, 64]>}> : +
vector<64x64xf32>, + !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>> +``` + +**What Happens:** +- The `layout` attribute provides explicit sub-group (SG) tiling information: + - `sg_layout = [8, 1]`: Data is distributed across 8 sub-groups in the first dimension + - `sg_data = [8, 64]`: Each sub-group handles an 8x64 slice of data +- This layout specification: + - Matches the 64x64 total data size (8 sub-groups × 8 rows = 64 rows, 64 columns) + - Optimizes coalesced memory accesses + - Enables efficient SIMD execution within each sub-group + - Aligns with Intel GPU hardware execution model (128 threads = 8 sub-groups of 16 threads) +- The layout is applied to the store operation to optimize write patterns +- Load operation remains unchanged as reads are typically more flexible + +--- + +## Summary + +The lowering pipeline progressively transforms abstract operations into hardware-specific instructions: + +| Stage | Abstraction Level | Key Operations | +|-------|------------------|----------------| +| **initial** | High-level ML | `linalg.softmax` on full tensor | +| **tiled-softmax** | Tiled high-level | `linalg.softmax` on tiles, `scf.forall` | +| **decomposed** | Tiled computation | `linalg.generic` with reductions | +| **vectorized** | Vector operations | `vector.multi_reduction`, `vector.transfer_read/write` | +| **bufferized** | Memory-based | Direct memref operations with vectors | +| **xegpu-initial** | GPU-specific | `xegpu.load_nd`, `xegpu.store_nd`, `gpu.launch_func` | +| **xegpu-wg** | Hardware-optimized | Layout hints for sub-group optimization | + +Each stage maintains the same computational semantics while providing increasingly detailed control over execution and memory access patterns, ultimately targeting efficient execution on Intel XeGPU hardware.
From bf3a8c66a058cf4f7e8443d7247f2494437672a1 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 22:51:42 +0000 Subject: [PATCH 23/51] save work --- docs/softmax_lowering.md | 158 ++++++++------------------------------- 1 file changed, 31 insertions(+), 127 deletions(-) diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md index f67779ae..c5608e37 100644 --- a/docs/softmax_lowering.md +++ b/docs/softmax_lowering.md @@ -1,7 +1,10 @@ -# Linalg softmax lowering in XeGPU pipeline +# Linalg softmax lowering to XeGPU (Currently supported in lighthouse) ## Overview +**Assumptions:** +Softmax dimension size is small (64 in this example). + The lowering process consists of seven stages: 1. **initial** - High-level tensor operations 2. **tiled-softmax** - Tiled softmax operations @@ -30,7 +33,7 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { ## Stage 2: Tiled Softmax -**Key Characteristics:** +**Notes** - Work distribution via `scf.forall` (16 parallel iterations) - Each tile processes 64x64 elements @@ -61,63 +64,45 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { ## Stage 3: Decomposed -**Key Characteristics:** +**Notes** - Softmax decomposed into 4 constituent `linalg.generic` ops : max, sub+exp, sum, divide - Uses `structured.structured_decompose_interface` implemented by `linalg.softmax` **Code:** ```mlir func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - %cst = arith.constant 0.000000e+00 : f32 - %cst_0 = arith.constant 0xFFC00000 : f32 // -inf for max reduction - %0 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32> - %1 = tensor.empty() : tensor<1024x64xf32> + // ... 
%2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x64xf32>) { %3 = affine.apply #map(%arg2) // %3 = %arg2 * 64 - %extracted_slice = tensor.extract_slice %0[%3, 0] [64, 64] [1, 1] : - tensor<1024x64xf32> to tensor<64x64xf32> + %extracted_slice = tensor.extract_slice ... // Step 1: Find max along dimension 1 %4 = tensor.empty() : tensor<64xf32> %5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32> - %6 = linalg.generic {indexing_maps = [#map1, #map2], - iterator_types = ["parallel", "reduction"]} - ins(%extracted_slice : tensor<64x64xf32>) outs(%5 : tensor<64xf32>) { - ^bb0(%in: f32, %out: f32): + %6 = linalg.generic // ... %11 = arith.maxnumf %in, %out : f32 - linalg.yield %11 : f32 + // ... } -> tensor<64xf32> // Step 2: Subtract max and exponentiate - %7 = linalg.generic {indexing_maps = [#map1, #map2, #map1], - iterator_types = ["parallel", "parallel"]} - ins(%extracted_slice, %6 : tensor<64x64xf32>, tensor<64xf32>) - outs(%extracted_slice_1 : tensor<64x64xf32>) { - ^bb0(%in: f32, %in_2: f32, %out: f32): + %7 = linalg.generic // ... %11 = arith.subf %in, %in_2 : f32 %12 = math.exp %11 : f32 - linalg.yield %12 : f32 + // ... } -> tensor<64x64xf32> // Step 3: Sum exponentials %8 = linalg.fill ins(%cst : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32> - %9 = linalg.generic {indexing_maps = [#map1, #map2], - iterator_types = ["parallel", "reduction"]} - ins(%7 : tensor<64x64xf32>) outs(%8 : tensor<64xf32>) { - ^bb0(%in: f32, %out: f32): + %9 = linalg.generic // ... %11 = arith.addf %in, %out : f32 - linalg.yield %11 : f32 + // ... } -> tensor<64xf32> // Step 4: Normalize by sum - %10 = linalg.generic {indexing_maps = [#map1, #map2, #map1], - iterator_types = ["parallel", "parallel"]} - ins(%7, %9 : tensor<64x64xf32>, tensor<64xf32>) - outs(%extracted_slice_1 : tensor<64x64xf32>) { - ^bb0(%in: f32, %in_2: f32, %out: f32): + %10 = linalg.generic // ... %11 = arith.divf %in, %in_2 : f32 - linalg.yield %11 : f32 + // ... 
} -> tensor<64x64xf32> scf.forall.in_parallel { @@ -129,39 +114,21 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { } ``` -**What Happens:** -- The 1024x64 input is divided into 16 tiles of 64x64 each -- Softmax algorithm made explicit: - 1. **Max reduction**: Find maximum value per row (for numerical stability) - 2. **Exp**: Compute exp(x - max) for each element - 3. **Sum reduction**: Sum exponentials per row - 4. **Normalize**: Divide each element by its row sum -- Each tile is processed independently, enabling parallelization -- Results are inserted back into the output tensor - --- ## Stage 4: Vectorized -**Key Characteristics:** +**Notes** - `linalg.generic` operations replaced with vector operations -- SIMD-friendly representation using `vector<64x64xf32>` -- Explicit vector multi-reductions - Vector transfers for reading/writing data **Code:** ```mlir func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - %cst = arith.constant dense<0.000000e+00> : vector<64xf32> - %0 = ub.poison : f32 - %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32> - %c0 = arith.constant 0 : index - %1 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32> - %2 = tensor.empty() : tensor<1024x64xf32> - + // ... %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) { %4 = affine.apply #map(%arg2) // %4 = %arg2 * 64 - %extracted_slice = tensor.extract_slice %arg3[%4, 0] [64, 64] [1, 1] + %extracted_slice = tensor.extract_slice .. 
// Vector read: Load 64x64 tile %5 = vector.transfer_read %1[%4, %c0], %0 {in_bounds = [true, true]} : @@ -201,32 +168,17 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { return } ``` - -**What Happens:** -- Linalg operations converted to vector dialect operations -- `vector.transfer_read` loads entire 64x64 tile at once -- `vector.multi_reduction` performs SIMD reductions (max and sum) -- `vector.broadcast` and `vector.transpose` handle dimension alignment -- All arithmetic operations work on vectors, enabling SIMD execution -- `vector.transfer_write` stores results back - --- ## Stage 5: Bufferized -**Key Characteristics:** +**Notes** - Tensors eliminated, working directly with memrefs -- Vector operations read/write directly from/to memory -- No more tensor extract/insert operations -- Simplified control flow **Code:** ```mlir func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - %cst = arith.constant dense<0.000000e+00> : vector<64xf32> - %0 = ub.poison : f32 - %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32> - %c0 = arith.constant 0 : index + // ... 
scf.forall (%arg2) in (16) { %1 = affine.apply #map(%arg2) // %1 = %arg2 * 64 @@ -262,31 +214,21 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { } ``` -**What Happens:** -- All tensor operations converted to memref-based operations -- `scf.forall` no longer uses `shared_outs`, simplified to pure side effects -- Vector transfers work directly on input/output memrefs -- Memory layout is now explicit -- This representation is ready for GPU kernel extraction - --- ## Stage 6: XeGPU-Initial -**Key Characteristics:** -- GPU kernel separated from host code +**Notes** +- GPU kernel separated from host code (Gpu Outlining) - `gpu.launch_func` invocation with grid/block dimensions -- XeGPU tensor descriptors for memory access -- Block-based load/store operations +- Use `vector-to-xegpu` **Code:** **Host Side:** ```mlir func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index - %c128 = arith.constant 128 : index + // ... gpu.launch_func @payload_kernel::@payload_kernel blocks in (%c16, %c1, %c1) threads in (%c128, %c1, %c1) @@ -301,9 +243,7 @@ gpu.module @payload_kernel [#xevm.target] { gpu.func @payload_kernel(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) kernel attributes {known_block_size = array, known_grid_size = array} { - %cst = arith.constant dense<0.000000e+00> : vector<64xf32> - %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32> - %c64 = arith.constant 64 : index + // ... 
 %block_id_x = gpu.block_id x %0 = arith.muli %block_id_x, %c64 overflow : index @@ -343,24 +283,14 @@ gpu.module @payload_kernel [#xevm.target] { } ``` -**What Happens:** -- Host function now calls `gpu.launch_func` with 16 blocks, 128 threads per block -- Separate `gpu.module` contains the kernel code -- `gpu.block_id x` replaces the loop iterator -- `xegpu.create_nd_tdesc` creates tensor descriptors for memory regions -- `xegpu.load_nd` performs hardware-optimized block loads -- `xegpu.store_nd` performs hardware-optimized block stores -- XeGPU operations map directly to Intel GPU instructions -- Boundary checking disabled for performance (sizes known at compile time) - --- ## Stage 7: XeGPU-WG (Work-Group Optimized) -**Key Characteristics:** -- Additional layout hints for work-group optimization -- Sub-group layout specification: `sg_layout` and `sg_data` -- Optimized memory access patterns for Intel XeGPU +**Notes** +- Sets the layout for anchor xegpu ops. Each Wg consists of [8, 1] subgroups + doing 8x64 softmax slice. +- Only sets the layout for `store_nd`. Layout propagation does the rest. 
**Code (differences from xegpu-initial):** ```mlir @@ -371,32 +301,6 @@ xegpu.store_nd %11, %12[%0, 0] !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr> ``` -**What Happens:** -- The `layout` attribute provides explicit sub-group (SG) tiling information: - - `sg_layout = [8, 1]`: Data is distributed across 8 sub-groups in the first dimension - - `sg_data = [8, 64]`: Each sub-group handles an 8x64 slice of data -- This layout specification: - - Matches the 64x64 total data size (8 sub-groups × 8 rows = 64 rows, 64 columns) - - Optimizes coalesced memory accesses - - Enables efficient SIMD execution within each sub-group - - Aligns with Intel GPU hardware execution model (128 threads = 8 sub-groups of 16 threads) -- The layout is applied to the store operation to optimize write patterns -- Load operation remains unchanged as reads are typically more flexible - --- -## Summary - -The lowering pipeline progressively transforms abstract operations into hardware-specific instructions: - -| Stage | Abstraction Level | Key Operations | -|-------|------------------|----------------| -| **initial** | High-level ML | `linalg.softmax` on full tensor | -| **tiled-softmax** | Tiled high-level | `linalg.softmax` on tiles, `scf.forall` | -| **decomposed** | Tiled computation | `linalg.generic` with reductions | -| **vectorized** | Vector operations | `vector.multi_reduction`, `vector.transfer_read/write` | -| **bufferized** | Memory-based | Direct memref operations with vectors | -| **xegpu-initial** | GPU-specific | `xegpu.load_nd`, `xegpu.store_nd`, `gpu.launch_func` | -| **xegpu-wg** | Hardware-optimized | Layout hints for sub-group optimization | - -Each stage maintains the same computational semantics while providing increasingly detailed control over execution and memory access patterns, ultimately targeting efficient execution on Intel XeGPU hardware. +# Supporting larger Softmax dimension sizes. 
From b083887154b083d184430c0c87baf887d155dcca Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 23:03:15 +0000 Subject: [PATCH 24/51] save work --- examples/xegpu/softmax.py | 8 -- lighthouse/schedule/xegpu/softmax_schedule.py | 90 +------------------ 2 files changed, 3 insertions(+), 95 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 8049f0f4..afca86e3 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -181,12 +181,6 @@ def parse_cli(): default=16, help="Subgroup size.", ) - parser.add_argument( - "--reduction-step-size", - type=int, - default=16, - help="Step size for reduction loop tiling (optional).", - ) parser.add_argument( "--nruns", type=int, @@ -212,7 +206,6 @@ def parse_cli(): "tiled", "vectorized", "bufferized", - "gpu-outlining", "xegpu-initial", "xegpu-wg", "final", @@ -244,7 +237,6 @@ def parse_cli(): "wg_rows": args.wg_rows, "sg_rows": args.sg_rows, "subgroup_size": args.subgroup_size, - "reduction_step_size": args.reduction_step_size, } M, N = args.sizes diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index bda5f819..0907bc5a 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -39,7 +39,6 @@ def get_softmax_schedule_module( - sg_rows: Number of rows per subgroup - subgroup_size: Size of subgroup - sizes: Tuple with the sizes of the input tensors (e.g. 
(M, N)) - - reduction_step_size: Optional step size for tiling reduction loops Returns: MLIR module containing the transform schedule @@ -140,72 +139,7 @@ def bundle_xegpu_softmax_schedule( transform.AnyOpType.get(), func, ops=["linalg.softmax"] ) structured.structured_decompose_interface(anytype, softmax_ops) - # transform.print_(target=func, name="After structured_decompose_interface") - linalg_ops = match_and_split( - func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 - ) - max_reduction = linalg_ops[1] - max_center_and_exp_op = linalg_ops[2] - sum_reduction = linalg_ops[4] - div_op = linalg_ops[5] - - reduction_step_size = parameters["reduction_step_size"] - - # Tile the division op and fuse the sub+exp producer into it - _, div_loop = structured.TileUsingForOp( - div_op, sizes=[0, reduction_step_size] - ).results - - # Fuse max_center_and_exp_op into the div loop - _, fused_loop = structured.structured_fuse_into_containing_op( - anytype, - anytype, - producer_op=max_center_and_exp_op, - containing_op=div_loop, - ) - - # Tile the sum reduction and fuse the sub+exp producer into it - _, _, _, sum_loop = structured.structured_tile_reduction_using_for( - [anytype], - anytype, - anytype, - anytype, - target=sum_reduction, - tile_sizes=[0, reduction_step_size], - ) - - func = transform.get_parent_op( - anytype, - fused_loop, - op_name="func.func", - deduplicate=True, - ) - - # Re-match and split linalg generic ops, there are 5 at this point - linalg_ops = match_and_split(func, ops={"linalg.generic"}, nhandles=5) - max_center_and_exp_op = linalg_ops[1] - - # Fuse max_center_and_exp_op into the sum reduction loop - _, fused_sum_loop = structured.structured_fuse_into_containing_op( - anytype, - anytype, - producer_op=max_center_and_exp_op, - containing_op=sum_loop, - ) - - # Tile the max reduction. 
- max_reduction = linalg_ops[0] - structured.structured_tile_reduction_using_for( - [anytype], - anytype, - anytype, - anytype, - target=max_reduction, - tile_sizes=[0, reduction_step_size], - ) - - # Cleanup after tiling and fusion transform.apply_cse(func) canonicalize(func) @@ -237,17 +171,6 @@ def bundle_xegpu_softmax_schedule( transform.apply_cse(mod) canonicalize(mod) - # promote memref.alloc to memref.alloca in payload function - func = match(mod, ops={"func.func"}) - func = apply_registered_pass( - func, - "promote-buffers-to-stack", - options={ - "max-alloc-size-in-bytes": "8192", - "max-rank-of-allocated-memref": "2", - }, - ) - if stop_at_stage == "bufferized": raise PipelineInterrupt() @@ -277,9 +200,6 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) - if stop_at_stage == "gpu-outlining": - raise PipelineInterrupt() - # set xevm target mod = apply_registered_pass( mod, @@ -294,19 +214,15 @@ def bundle_xegpu_softmax_schedule( gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") transform.apply_cse(gpu_func) - # Cleanup. - transform.apply_cse(mod) - canonicalize(mod) - if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() # Set layout attributes for xegpu.store_nd operations. # FIXME: currently ecah subgroup is handling the entire row. 
- store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=5) + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) sg_layout = [parameters["sg_rows"], 1] - sg_data = [parameters["sg_rows"], parameters["reduction_step_size"]] - xegpu.set_anchor_layout(store_ops[-1], sg_layout=sg_layout, sg_data=sg_data) + sg_data = [parameters["sg_rows"], parameters["sizes"][1]] + xegpu.set_anchor_layout(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() From c02b66b6b95e34e1dfdda31508e97ec50d507a03 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 23:16:12 +0000 Subject: [PATCH 25/51] fused version --- examples/xegpu/softmax.py | 8 ++ lighthouse/schedule/xegpu/softmax_schedule.py | 90 ++++++++++++++++++- 2 files changed, 95 insertions(+), 3 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index afca86e3..8049f0f4 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -181,6 +181,12 @@ def parse_cli(): default=16, help="Subgroup size.", ) + parser.add_argument( + "--reduction-step-size", + type=int, + default=16, + help="Step size for reduction loop tiling (optional).", + ) parser.add_argument( "--nruns", type=int, @@ -206,6 +212,7 @@ def parse_cli(): "tiled", "vectorized", "bufferized", + "gpu-outlining", "xegpu-initial", "xegpu-wg", "final", @@ -237,6 +244,7 @@ def parse_cli(): "wg_rows": args.wg_rows, "sg_rows": args.sg_rows, "subgroup_size": args.subgroup_size, + "reduction_step_size": args.reduction_step_size, } M, N = args.sizes diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 0907bc5a..bda5f819 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -39,6 +39,7 @@ def get_softmax_schedule_module( - sg_rows: Number of rows per subgroup - subgroup_size: Size of subgroup - sizes: Tuple with the 
sizes of the input tensors (e.g. (M, N)) + - reduction_step_size: Optional step size for tiling reduction loops Returns: MLIR module containing the transform schedule @@ -139,7 +140,72 @@ def bundle_xegpu_softmax_schedule( transform.AnyOpType.get(), func, ops=["linalg.softmax"] ) structured.structured_decompose_interface(anytype, softmax_ops) + # transform.print_(target=func, name="After structured_decompose_interface") + linalg_ops = match_and_split( + func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 + ) + max_reduction = linalg_ops[1] + max_center_and_exp_op = linalg_ops[2] + sum_reduction = linalg_ops[4] + div_op = linalg_ops[5] + + reduction_step_size = parameters["reduction_step_size"] + + # Tile the division op and fuse the sub+exp producer into it + _, div_loop = structured.TileUsingForOp( + div_op, sizes=[0, reduction_step_size] + ).results + + # Fuse max_center_and_exp_op into the div loop + _, fused_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=max_center_and_exp_op, + containing_op=div_loop, + ) + + # Tile the sum reduction and fuse the sub+exp producer into it + _, _, _, sum_loop = structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=sum_reduction, + tile_sizes=[0, reduction_step_size], + ) + + func = transform.get_parent_op( + anytype, + fused_loop, + op_name="func.func", + deduplicate=True, + ) + + # Re-match and split linalg generic ops, there are 5 at this point + linalg_ops = match_and_split(func, ops={"linalg.generic"}, nhandles=5) + max_center_and_exp_op = linalg_ops[1] + + # Fuse max_center_and_exp_op into the sum reduction loop + _, fused_sum_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=max_center_and_exp_op, + containing_op=sum_loop, + ) + + # Tile the max reduction. 
+ max_reduction = linalg_ops[0] + structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=max_reduction, + tile_sizes=[0, reduction_step_size], + ) + + # Cleanup after tiling and fusion transform.apply_cse(func) canonicalize(func) @@ -171,6 +237,17 @@ def bundle_xegpu_softmax_schedule( transform.apply_cse(mod) canonicalize(mod) + # promote memref.alloc to memref.alloca in payload function + func = match(mod, ops={"func.func"}) + func = apply_registered_pass( + func, + "promote-buffers-to-stack", + options={ + "max-alloc-size-in-bytes": "8192", + "max-rank-of-allocated-memref": "2", + }, + ) + if stop_at_stage == "bufferized": raise PipelineInterrupt() @@ -200,6 +277,9 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) + if stop_at_stage == "gpu-outlining": + raise PipelineInterrupt() + # set xevm target mod = apply_registered_pass( mod, @@ -214,15 +294,19 @@ def bundle_xegpu_softmax_schedule( gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") transform.apply_cse(gpu_func) + # Cleanup. + transform.apply_cse(mod) + canonicalize(mod) + if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() # Set layout attributes for xegpu.store_nd operations. # FIXME: currently ecah subgroup is handling the entire row. 
- store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=5) sg_layout = [parameters["sg_rows"], 1] - sg_data = [parameters["sg_rows"], parameters["sizes"][1]] - xegpu.set_anchor_layout(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) + sg_data = [parameters["sg_rows"], parameters["reduction_step_size"]] + xegpu.set_anchor_layout(store_ops[-1], sg_layout=sg_layout, sg_data=sg_data) if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() From bce6260e57741e5439efa90c9640aff19f90c8f0 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 23:35:41 +0000 Subject: [PATCH 26/51] tiled reduction doc --- docs/softmax_lowering.md | 331 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 330 insertions(+), 1 deletion(-) diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md index c5608e37..a31e6f4c 100644 --- a/docs/softmax_lowering.md +++ b/docs/softmax_lowering.md @@ -303,4 +303,333 @@ xegpu.store_nd %11, %12[%0, 0] --- -# Supporting larger Softmax dimension sizes. +# Supporting larger Softmax dimension sizes + +When the softmax dimension is larger than what can fit efficiently in registers, additional tiling and fusion transformations are applied to the reduction dimension. This section shows the intermediate stages between "decomposed" and "vectorized". + +**Approach:** Tile reductions along dimension 1 (step size = 16) and fuse producers into consumers to enable streaming computation. + +--- + +## Decomposed → Tiled: Stage A - Tile div op + +**Notes:** +- Tile the division operation with step size 16 along dimension 1 +- Creates `scf.for` loop iterating over 64 elements in chunks of 16 + +**Key Changes:** +```mlir +// Before: Single division linalg.generic over 64x64 +%11 = linalg.generic {...} ins(%8, %10 : tensor<64x64xf32>, tensor<64xf32>) + outs(%extracted_slice_0 : tensor<64x64xf32>) { ... 
} -> tensor<64x64xf32> + +// After: Division tiled into 64x16 chunks +%11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) { + %extracted_slice_3 = tensor.extract_slice %8[0, %arg4] [64, 16] [1, 1] + %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) + outs(%extracted_slice_5 : tensor<64x16xf32>) { ... } -> tensor<64x16xf32> + %inserted_slice = tensor.insert_slice %12 into %arg5[0, %arg4] [64, 16] [1, 1] + scf.yield %inserted_slice +} +``` + +--- + +## Stage B - Fuse sub+exp into div loop + +**Notes:** +- Fuse the `sub+exp` producer (max_center_and_exp_op) into the div loop +- Recomputes exp values on-the-fly instead of materializing full 64x64 tensor + +**Key Changes:** +```mlir +%11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) { + %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1] + + // Fused: sub+exp computed per 16-element chunk + %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) + outs(%extracted_slice_5 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_8: f32, %out: f32): + %14 = arith.subf %in, %in_8 : f32 + %15 = math.exp %14 : f32 + linalg.yield %15 : f32 + } -> tensor<64x16xf32> + + // Division operation + %13 = linalg.generic {...} ins(%12, %extracted_slice_6 : tensor<64x16xf32>, tensor<64xf32>) + outs(%extracted_slice_7 : tensor<64x16xf32>) { ... } -> tensor<64x16xf32> + // ... 
+} +``` + +--- + +## Stage C - Tile sum reduction + +**Notes:** +- Tile the sum reduction using `structured_tile_reduction_using_for` +- Creates intermediate accumulator tensor (64x16) +- Final reduction via `linalg.reduce` over dimension 1 + +**Key Changes:** +```mlir +// Tiled sum reduction with intermediate accumulator +%10 = tensor.empty() : tensor<64x16xf32> +%11 = linalg.fill ins(%cst_2 : f32) outs(%10 : tensor<64x16xf32>) -> tensor<64x16xf32> + +%12 = scf.for %arg4 = %c0_3 to %c64 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) { + %extracted_slice_7 = tensor.extract_slice %8[0, %arg4] [64, 16] [1, 1] + %14 = linalg.generic {...} ins(%extracted_slice_7 : tensor<64x16xf32>) + outs(%extracted_slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %15 = arith.addf %in, %out : f32 + linalg.yield %15 : f32 + } -> tensor<64x16xf32> + // ... +} + +// Final reduction to 64xf32 +%reduced = linalg.reduce ins(%12 : tensor<64x16xf32>) outs(%9 : tensor<64xf32>) dimensions = [1] + (%in: f32, %init: f32) { + %14 = arith.addf %in, %init : f32 + linalg.yield %14 : f32 + } +``` + +--- + +## Stage D - Fuse sub+exp into sum loop + +**Notes:** +- Fuse `sub+exp` into the sum reduction loop +- Stream computation: compute exp and accumulate in same loop + +**Key Changes:** +```mlir +%12 = scf.for %arg4 = %c0_3 to %c64 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) { + %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1] + + // Fused: sub+exp + %14 = linalg.generic {...} ins(%extracted_slice_7, %extracted_slice_8 : tensor<64x16xf32>, tensor<64xf32>) + outs(%extracted_slice_9 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_11: f32, %out: f32): + %16 = arith.subf %in, %in_11 : f32 + %17 = math.exp %16 : f32 + linalg.yield %17 : f32 + } -> tensor<64x16xf32> + + // Accumulate sum + %15 = linalg.generic {...} ins(%14 : tensor<64x16xf32>) + outs(%extracted_slice_10 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %16 = arith.addf 
%in, %out : f32 + linalg.yield %16 : f32 + } -> tensor<64x16xf32> + // ... +} +``` + +--- + +## Stage E - Tile max reduction + +**Notes:** +- Tile max reduction similar to sum reduction +- Creates 64x16 intermediate accumulator +- Final reduction via `linalg.reduce` with maxnumf + +**Key Changes:** +```mlir +// Tiled max reduction +%7 = tensor.empty() : tensor<64x16xf32> +%8 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<64x16xf32>) -> tensor<64x16xf32> + +%9 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %8) -> (tensor<64x16xf32>) { + %extracted_slice_12 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1] + %16 = linalg.generic {...} ins(%extracted_slice_12 : tensor<64x16xf32>) + outs(%extracted_slice_13 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %17 = arith.maxnumf %in, %out : f32 + linalg.yield %17 : f32 + } -> tensor<64x16xf32> + // ... +} + +// Final max reduction +%reduced = linalg.reduce ins(%9 : tensor<64x16xf32>) outs(%6 : tensor<64xf32>) dimensions = [1] + (%in: f32, %init: f32) { + %16 = arith.maxnumf %in, %init : f32 + linalg.yield %16 : f32 + } +``` + +**Result:** Now all three major computations (max, sum, div) are tiled and operate on 64x16 chunks, with exp computation fused into both sum and div loops. + +--- + +## Stage F - Vectorization + +**Notes:** +- Convert tiled linalg operations to vector operations +- `scf.for` loops remain but operate on vectors +- Vector size: 64x16 for tiled operations + +**Code:** +```mlir +func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { + // ... + %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) { + // ... 
+ + // Vectorized max reduction loop + %6 = vector.transfer_write %cst_1, %5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32> + %7 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %6) -> (tensor<64x16xf32>) { + %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32> + %16 = vector.transfer_read %arg5[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32> + %17 = arith.maxnumf %15, %16 : vector<64x16xf32> + %18 = vector.transfer_write %17, %arg5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32> + scf.yield %18 : tensor<64x16xf32> + } + %8 = vector.transfer_read %7[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32> + %9 = vector.multi_reduction , %8, %cst_2 [1] : vector<64x16xf32> to vector<64xf32> + + // Vectorized sum reduction loop with fused sub+exp + %11 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %10) -> (tensor<64x16xf32>) { + %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32> + %16 = vector.broadcast %9 : vector<64xf32> to vector<16x64xf32> + %17 = vector.transpose %16, [1, 0] : vector<16x64xf32> to vector<64x16xf32> + %18 = arith.subf %15, %17 : vector<64x16xf32> + %19 = math.exp %18 : vector<64x16xf32> + %20 = vector.transfer_read %arg5[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32> + %21 = arith.addf %19, %20 : vector<64x16xf32> + %22 = vector.transfer_write %21, %arg5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32> + scf.yield %22 : tensor<64x16xf32> + } + %12 = vector.transfer_read %11[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32> + %13 = vector.multi_reduction , %12, %cst_0 [1] : vector<64x16xf32> to vector<64xf32> + + // Vectorized div loop with fused sub+exp + %14 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %extracted_slice) -> (tensor<64x64xf32>) { + %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32> + %16 = vector.broadcast %9 : vector<64xf32> to vector<16x64xf32> + %17 = vector.transpose %16, [1, 0] : 
 vector<16x64xf32> to vector<64x16xf32> + %18 = arith.subf %15, %17 : vector<64x16xf32> + %19 = math.exp %18 : vector<64x16xf32> + %20 = vector.broadcast %13 : vector<64xf32> to vector<16x64xf32> + %21 = vector.transpose %20, [1, 0] : vector<16x64xf32> to vector<64x16xf32> + %22 = arith.divf %19, %21 : vector<64x16xf32> + %23 = vector.transfer_write %22, %arg5[%c0, %arg4] : vector<64x16xf32>, tensor<64x64xf32> + scf.yield %23 : tensor<64x64xf32> + } + } + // ... +} +``` + +--- + +## Stage G - Bufferization + +**Notes:** +- Convert tensors to memrefs +- Allocate heap buffer for 64x16 accumulator: `memref.alloc()` + +**Code:** +```mlir +func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { + // ... + scf.forall (%arg2) in (16) { + %1 = affine.apply #map(%arg2) + %subview = memref.subview %arg0[%1, 0] [64, 64] [1, 1] + + // Allocate accumulator buffer + %alloc = memref.alloc() {alignment = 64 : i64} : memref<64x16xf32> + + // Max reduction loop + vector.transfer_write %cst_1, %alloc[%c0, %c0] : vector<64x16xf32>, memref<64x16xf32> + scf.for %arg3 = %c0 to %c64 step %c16 { + %6 = vector.transfer_read %arg1[%1, %arg3], %0 : memref<1024x64xf32>, vector<64x16xf32> + %7 = vector.transfer_read %alloc[%c0, %c0], %0 : memref<64x16xf32>, vector<64x16xf32> + %8 = arith.maxnumf %6, %7 : vector<64x16xf32> + vector.transfer_write %8, %alloc[%c0, %c0] : vector<64x16xf32>, memref<64x16xf32> + } + %2 = vector.transfer_read %alloc[%c0, %c0], %0 : memref<64x16xf32>, vector<64x16xf32> + %3 = vector.multi_reduction , %2, %cst_2 [1] : vector<64x16xf32> to vector<64xf32> + + // Sum reduction loop (reuses %alloc) + // ... + + // Div loop (writes to %subview) + // ... 
+ } +} +``` + +--- + +## Stage H - Promote buffers to stack + +**Notes:** +- Convert `memref.alloc()` to `memref.alloca()` for stack allocation +- Reduces memory allocation overhead + +**Code:** +```mlir +scf.forall (%arg2) in (16) { + %1 = affine.apply #map(%arg2) + %subview = memref.subview %arg0[%1, 0] [64, 64] [1, 1] + + // Stack allocation instead of heap + %alloca = memref.alloca() {alignment = 64 : i64} : memref<64x16xf32> + + // ... same operations using %alloca ... +} +``` + +--- + +## Stage I - GPU outlining + +**Notes:** +- Convert `scf.forall` to `scf.parallel`, then to `gpu.launch` +- Extract GPU kernel into separate `gpu.module` +- Set thread count: 128 threads = (64 rows / 8 sg_rows) × 16 subgroup_size + +**Host Side:** +```mlir +func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + gpu.launch_func @payload_kernel::@payload_kernel + blocks in (%c16, %c1, %c1) + threads in (%c128, %c1, %c1) + args(%arg0 : memref<1024x64xf32>, %arg1 : memref<1024x64xf32>) + return +} +``` + +**GPU Kernel:** +```mlir +gpu.module @payload_kernel { + gpu.func @payload_kernel(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) kernel + attributes {known_block_size = array, + known_grid_size = array} { + %block_id_x = gpu.block_id x + %1 = arith.muli %block_id_x, %c64 overflow : index + %subview = memref.subview %arg0[%1, 0] [64, 64] [1, 1] + %alloca = memref.alloca() {alignment = 64 : i64} : memref<64x16xf32> + + // Three reduction loops (max, sum, div) with same structure + scf.for %arg2 = %c0 to %c64 step %c16 { + // Max: accumulate max values + // Sum: compute & accumulate exp(x - max) + // Div: compute exp(x - max) / sum + } + + gpu.return + } +} +``` + +**Summary:** At this stage, the kernel processes 64x16 chunks in streaming fashion through three sequential loops, minimizing memory footprint. 
From 2df0777727704d1e3e14f2b6d66911908fedab27 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 23:36:42 +0000 Subject: [PATCH 27/51] tiled reduction doc --- lighthouse/schedule/xegpu/softmax_schedule.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index bda5f819..0248919f 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -140,7 +140,7 @@ def bundle_xegpu_softmax_schedule( transform.AnyOpType.get(), func, ops=["linalg.softmax"] ) structured.structured_decompose_interface(anytype, softmax_ops) - # transform.print_(target=func, name="After structured_decompose_interface") + transform.print_(target=func, name="Aftemr structured_decompose_interface") linalg_ops = match_and_split( func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 @@ -156,6 +156,8 @@ def bundle_xegpu_softmax_schedule( _, div_loop = structured.TileUsingForOp( div_op, sizes=[0, reduction_step_size] ).results + + transform.print_(target=func, name="After tiling div op") # Fuse max_center_and_exp_op into the div loop _, fused_loop = structured.structured_fuse_into_containing_op( @@ -164,6 +166,8 @@ def bundle_xegpu_softmax_schedule( producer_op=max_center_and_exp_op, containing_op=div_loop, ) + transform.print_(target=func, name="After fusing max_center_and_exp_op into div loop") + # Tile the sum reduction and fuse the sub+exp producer into it _, _, _, sum_loop = structured.structured_tile_reduction_using_for( @@ -174,6 +178,8 @@ def bundle_xegpu_softmax_schedule( target=sum_reduction, tile_sizes=[0, reduction_step_size], ) + + transform.print_(target=func, name="After tiling sum reduction") func = transform.get_parent_op( anytype, @@ -193,6 +199,7 @@ def bundle_xegpu_softmax_schedule( producer_op=max_center_and_exp_op, containing_op=sum_loop, ) + transform.print_(target=func, name="After 
fusing max_center_and_exp_op into sum loop") # Tile the max reduction. max_reduction = linalg_ops[0] @@ -204,6 +211,8 @@ def bundle_xegpu_softmax_schedule( target=max_reduction, tile_sizes=[0, reduction_step_size], ) + transform.print_(target=func, name="After tiling max reduction") + # Cleanup after tiling and fusion transform.apply_cse(func) @@ -219,6 +228,8 @@ def bundle_xegpu_softmax_schedule( ).result transform.apply_cse(func) canonicalize(func) + + transform.print_(target=func, name="After vectorization") if stop_at_stage == "vectorized": raise PipelineInterrupt() @@ -236,6 +247,8 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "fold-memref-alias-ops") transform.apply_cse(mod) canonicalize(mod) + + transform.print_(target=mod, name="After bufferization") # promote memref.alloc to memref.alloca in payload function func = match(mod, ops={"func.func"}) @@ -247,6 +260,8 @@ def bundle_xegpu_softmax_schedule( "max-rank-of-allocated-memref": "2", }, ) + + transform.print_(target=func, name="After promoting buffers to stack") if stop_at_stage == "bufferized": raise PipelineInterrupt() @@ -276,6 +291,8 @@ def bundle_xegpu_softmax_schedule( func = apply_registered_pass(func, "gpu-launch-sink-index-computations") mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) + + transform.print_(target=mod, name="After GPU outlining") if stop_at_stage == "gpu-outlining": raise PipelineInterrupt() From 56687b7571d3160474dea5dfd4503c307a258dad Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 23:37:03 +0000 Subject: [PATCH 28/51] tiled reduction doc --- lighthouse/schedule/xegpu/softmax_schedule.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 0248919f..7912a45b 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py 
@@ -156,7 +156,7 @@ def bundle_xegpu_softmax_schedule( _, div_loop = structured.TileUsingForOp( div_op, sizes=[0, reduction_step_size] ).results - + transform.print_(target=func, name="After tiling div op") # Fuse max_center_and_exp_op into the div loop @@ -166,8 +166,9 @@ def bundle_xegpu_softmax_schedule( producer_op=max_center_and_exp_op, containing_op=div_loop, ) - transform.print_(target=func, name="After fusing max_center_and_exp_op into div loop") - + transform.print_( + target=func, name="After fusing max_center_and_exp_op into div loop" + ) # Tile the sum reduction and fuse the sub+exp producer into it _, _, _, sum_loop = structured.structured_tile_reduction_using_for( @@ -178,7 +179,7 @@ def bundle_xegpu_softmax_schedule( target=sum_reduction, tile_sizes=[0, reduction_step_size], ) - + transform.print_(target=func, name="After tiling sum reduction") func = transform.get_parent_op( @@ -199,7 +200,9 @@ def bundle_xegpu_softmax_schedule( producer_op=max_center_and_exp_op, containing_op=sum_loop, ) - transform.print_(target=func, name="After fusing max_center_and_exp_op into sum loop") + transform.print_( + target=func, name="After fusing max_center_and_exp_op into sum loop" + ) # Tile the max reduction. 
max_reduction = linalg_ops[0] @@ -213,7 +216,6 @@ def bundle_xegpu_softmax_schedule( ) transform.print_(target=func, name="After tiling max reduction") - # Cleanup after tiling and fusion transform.apply_cse(func) canonicalize(func) @@ -228,7 +230,7 @@ def bundle_xegpu_softmax_schedule( ).result transform.apply_cse(func) canonicalize(func) - + transform.print_(target=func, name="After vectorization") if stop_at_stage == "vectorized": @@ -247,7 +249,7 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "fold-memref-alias-ops") transform.apply_cse(mod) canonicalize(mod) - + transform.print_(target=mod, name="After bufferization") # promote memref.alloc to memref.alloca in payload function @@ -260,7 +262,7 @@ def bundle_xegpu_softmax_schedule( "max-rank-of-allocated-memref": "2", }, ) - + transform.print_(target=func, name="After promoting buffers to stack") if stop_at_stage == "bufferized": @@ -291,7 +293,7 @@ def bundle_xegpu_softmax_schedule( func = apply_registered_pass(func, "gpu-launch-sink-index-computations") mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) - + transform.print_(target=mod, name="After GPU outlining") if stop_at_stage == "gpu-outlining": From d2d4c49f70c10328a3e57da2b6b359f2539ad5df Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 15 Apr 2026 23:50:39 +0000 Subject: [PATCH 29/51] save work --- .../transform/transform_ext/__init__.py | 2 + .../transform_ext/ops/update_address_space.py | 102 ++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 lighthouse/dialects/transform/transform_ext/ops/update_address_space.py diff --git a/lighthouse/dialects/transform/transform_ext/__init__.py b/lighthouse/dialects/transform/transform_ext/__init__.py index aba08bf0..fb705805 100644 --- a/lighthouse/dialects/transform/transform_ext/__init__.py +++ b/lighthouse/dialects/transform/transform_ext/__init__.py @@ -9,6 +9,7 @@ from .ops.extract_handle import extract_handle from 
.ops.get_tileable_consumers import get_tileable_consumers from .ops.get_tiling_sizes import get_tiling_sizes +from .ops.update_address_space import update_address_space __all__ = [ "TransformExtensionDialect", @@ -22,4 +23,5 @@ "register_and_load", "replace", "wrap_in_benching_func", + "update_address_space", ] diff --git a/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py b/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py new file mode 100644 index 00000000..e0752f34 --- /dev/null +++ b/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py @@ -0,0 +1,102 @@ +from mlir import ir +from mlir.dialects import ext, transform, memref +from mlir.dialects.transform import DiagnosedSilenceableFailure + +from lighthouse.dialects.transform.transform_ext import TransformExtensionDialect + + +class UpdateAddressSpace(TransformExtensionDialect.Operation, name="update_address_space"): + """Update the address space of a memref allocation operation. + + Takes a target memref allocation operation and updates its address space + to the provided value. 
+ """ + + target: ext.Operand[transform.AnyOpType] + address_space: ir.IntegerAttr + updated_op: ext.Result[transform.AnyOpType[()]] = ext.result(infer_type=True) + + @classmethod + def attach_interface_impls(cls, ctx=None): + cls.TransformOpInterfaceModel.attach(cls.OPERATION_NAME, context=ctx) + cls.MemoryEffectsOpInterfaceModel.attach(cls.OPERATION_NAME, context=ctx) + + class TransformOpInterfaceModel(transform.TransformOpInterface): + @staticmethod + def apply( + op: "UpdateAddressSpace", + rewriter: transform.TransformRewriter, + results: transform.TransformResults, + state: transform.TransformState, + ) -> DiagnosedSilenceableFailure: + # Get the target operations to transform + target_ops = state.get_payload_ops(op.target) + + # Get the address space value from the attribute + address_space_value = ir.IntegerAttr(op.address_space).value + + new_ops = [] + + for target_op in target_ops: + # Verify this is a memref.alloca operation + if target_op.OPERATION_NAME != "memref.alloca": + return DiagnosedSilenceableFailure.emit_silenceable_error( + f"Expected memref.alloca operation, got {target_op.OPERATION_NAME}" + ) + + # Get the current result type (should be a MemRefType) + old_result_type = target_op.results[0].type + + memref_type = ir.MemRefType(old_result_type) + + # Create a new memref type with the specified address space + new_memref_type = ir.MemRefType.get( + memref_type.shape, + memref_type.element_type, + layout=memref_type.layout, + memory_space=ir.Attribute.parse(f"{address_space_value}") + ) + print(new_memref_type) + + # Replace the operation with a new one that has the updated type + with ir.InsertionPoint(target_op): + + # Get the operands from the original alloca (dynamic sizes and symbols) + dynamic_sizes = list(target_op.operands[:target_op.attributes["operandSegmentSizes"][0]]) + symbol_operands = list(target_op.operands[target_op.attributes["operandSegmentSizes"][0]:]) + + # Create a new alloca with the updated type + new_alloca = 
memref.alloca(new_memref_type, dynamic_sizes, symbol_operands) + print(new_alloca) + + # Replace all uses of the old operation with the new one + # rewriter.replace_all_uses_with(target_op.results[0], new_alloca.results[0]) + + # Erase the old operation + rewriter.replace_op(target_op, [new_alloca]) + + new_ops.append(new_alloca.owner) + + # Set the results to the new operations + results.set_ops(op.updated_op, new_ops) + return DiagnosedSilenceableFailure.Success + + @staticmethod + def allow_repeated_handle_operands(_op: "UpdateAddressSpace") -> bool: + return False + + class MemoryEffectsOpInterfaceModel(ir.MemoryEffectsOpInterface): + @staticmethod + def get_effects(op: ir.Operation, effects): + transform.consumes_handle(op.op_operands[:1], effects) + transform.produces_handle(op.results, effects) + transform.modifies_payload(effects) + + +def update_address_space( + target: ir.Value, + address_space: int | ir.IntegerAttr, +) -> ir.Value: + if not isinstance(address_space, ir.IntegerAttr): + address_space = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), address_space) + return UpdateAddressSpace(target, address_space=address_space).updated_op From 32a345a87913e8f6a4ad982752dbd1fa234a03e2 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 16 Apr 2026 23:16:02 +0000 Subject: [PATCH 30/51] save work --- .../transform/transform_ext/__init__.py | 2 +- .../transform_ext/ops/update_address_space.py | 41 ++++++++++--------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/lighthouse/dialects/transform/transform_ext/__init__.py b/lighthouse/dialects/transform/transform_ext/__init__.py index fb705805..997522a2 100644 --- a/lighthouse/dialects/transform/transform_ext/__init__.py +++ b/lighthouse/dialects/transform/transform_ext/__init__.py @@ -22,6 +22,6 @@ "param_cmp_eq", "register_and_load", "replace", - "wrap_in_benching_func", "update_address_space", + "wrap_in_benching_func", ] diff --git 
a/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py b/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py index e0752f34..09313647 100644 --- a/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py +++ b/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py @@ -5,7 +5,9 @@ from lighthouse.dialects.transform.transform_ext import TransformExtensionDialect -class UpdateAddressSpace(TransformExtensionDialect.Operation, name="update_address_space"): +class UpdateAddressSpace( + TransformExtensionDialect.Operation, name="update_address_space" +): """Update the address space of a memref allocation operation. Takes a target memref allocation operation and updates its address space @@ -31,10 +33,8 @@ def apply( ) -> DiagnosedSilenceableFailure: # Get the target operations to transform target_ops = state.get_payload_ops(op.target) - # Get the address space value from the attribute address_space_value = ir.IntegerAttr(op.address_space).value - new_ops = [] for target_op in target_ops: @@ -46,35 +46,36 @@ def apply( # Get the current result type (should be a MemRefType) old_result_type = target_op.results[0].type - memref_type = ir.MemRefType(old_result_type) - # Create a new memref type with the specified address space new_memref_type = ir.MemRefType.get( memref_type.shape, memref_type.element_type, layout=memref_type.layout, - memory_space=ir.Attribute.parse(f"{address_space_value}") + memory_space=ir.Attribute.parse(f"{address_space_value}"), ) - print(new_memref_type) # Replace the operation with a new one that has the updated type with ir.InsertionPoint(target_op): - # Get the operands from the original alloca (dynamic sizes and symbols) - dynamic_sizes = list(target_op.operands[:target_op.attributes["operandSegmentSizes"][0]]) - symbol_operands = list(target_op.operands[target_op.attributes["operandSegmentSizes"][0]:]) - + dynamic_sizes = list( + target_op.operands[ + : 
target_op.attributes["operandSegmentSizes"][0] + ] + ) + symbol_operands = list( + target_op.operands[ + target_op.attributes["operandSegmentSizes"][0] : + ] + ) # Create a new alloca with the updated type - new_alloca = memref.alloca(new_memref_type, dynamic_sizes, symbol_operands) - print(new_alloca) - + new_alloca = memref.alloca( + new_memref_type, dynamic_sizes, symbol_operands + ) # Replace all uses of the old operation with the new one - # rewriter.replace_all_uses_with(target_op.results[0], new_alloca.results[0]) - - # Erase the old operation + # FIXME: This won't handle operations that consume the memref type and + # return a new memref (such as subview). rewriter.replace_op(target_op, [new_alloca]) - new_ops.append(new_alloca.owner) # Set the results to the new operations @@ -98,5 +99,7 @@ def update_address_space( address_space: int | ir.IntegerAttr, ) -> ir.Value: if not isinstance(address_space, ir.IntegerAttr): - address_space = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), address_space) + address_space = ir.IntegerAttr.get( + ir.IntegerType.get_signless(64), address_space + ) return UpdateAddressSpace(target, address_space=address_space).updated_op From eacc9d876840c7c2d27597fbfd97d605e6ac4319 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 17 Apr 2026 00:10:42 +0000 Subject: [PATCH 31/51] save work --- .../transform_ext/ops/update_address_space.py | 73 +++++++++---------- lighthouse/schedule/xegpu/softmax_schedule.py | 31 ++------ 2 files changed, 42 insertions(+), 62 deletions(-) diff --git a/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py b/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py index 09313647..2c40bfce 100644 --- a/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py +++ b/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py @@ -32,51 +32,46 @@ def apply( state: transform.TransformState, ) -> DiagnosedSilenceableFailure: # Get the 
target operations to transform - target_ops = state.get_payload_ops(op.target) + target_op = state.get_payload_ops(op.target)[0] # Get the address space value from the attribute address_space_value = ir.IntegerAttr(op.address_space).value new_ops = [] - for target_op in target_ops: - # Verify this is a memref.alloca operation - if target_op.OPERATION_NAME != "memref.alloca": - return DiagnosedSilenceableFailure.emit_silenceable_error( - f"Expected memref.alloca operation, got {target_op.OPERATION_NAME}" - ) - - # Get the current result type (should be a MemRefType) - old_result_type = target_op.results[0].type - memref_type = ir.MemRefType(old_result_type) - # Create a new memref type with the specified address space - new_memref_type = ir.MemRefType.get( - memref_type.shape, - memref_type.element_type, - layout=memref_type.layout, - memory_space=ir.Attribute.parse(f"{address_space_value}"), + # Verify this is a memref.alloca operation + if target_op.OPERATION_NAME != "memref.alloca": + return DiagnosedSilenceableFailure.emit_silenceable_error( + f"Expected memref.alloca operation, got {target_op.OPERATION_NAME}" ) - # Replace the operation with a new one that has the updated type - with ir.InsertionPoint(target_op): - # Get the operands from the original alloca (dynamic sizes and symbols) - dynamic_sizes = list( - target_op.operands[ - : target_op.attributes["operandSegmentSizes"][0] - ] - ) - symbol_operands = list( - target_op.operands[ - target_op.attributes["operandSegmentSizes"][0] : - ] - ) - # Create a new alloca with the updated type - new_alloca = memref.alloca( - new_memref_type, dynamic_sizes, symbol_operands - ) - # Replace all uses of the old operation with the new one - # FIXME: This won't handle operations that consume the memref type and - # return a new memref (such as subview). 
- rewriter.replace_op(target_op, [new_alloca]) - new_ops.append(new_alloca.owner) + # Get the current result type (should be a MemRefType) + old_result_type = target_op.results[0].type + memref_type = ir.MemRefType(old_result_type) + # Create a new memref type with the specified address space + new_memref_type = ir.MemRefType.get( + memref_type.shape, + memref_type.element_type, + layout=memref_type.layout, + memory_space=ir.Attribute.parse(f"{address_space_value}"), + ) + + # Replace the operation with a new one that has the updated type + with ir.InsertionPoint(target_op): + # Get the operands from the original alloca (dynamic sizes and symbols) + dynamic_sizes = list( + target_op.operands[: target_op.attributes["operandSegmentSizes"][0]] + ) + symbol_operands = list( + target_op.operands[target_op.attributes["operandSegmentSizes"][0] :] + ) + # Create a new alloca with the updated type + new_alloca = memref.alloca( + new_memref_type, dynamic_sizes, symbol_operands + ) + # Replace all uses of the old operation with the new one + # FIXME: This won't handle operations that consume the memref type and + # return a new memref (such as subview). 
+ rewriter.replace_op(target_op, [new_alloca]) + new_ops.append(new_alloca.owner) # Set the results to the new operations results.set_ops(op.updated_op, new_ops) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 7912a45b..8876d052 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -16,6 +16,7 @@ PipelineInterrupt, ) from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary +from lighthouse.dialects.transform import transform_ext def get_softmax_schedule_module( @@ -140,7 +141,6 @@ def bundle_xegpu_softmax_schedule( transform.AnyOpType.get(), func, ops=["linalg.softmax"] ) structured.structured_decompose_interface(anytype, softmax_ops) - transform.print_(target=func, name="Aftemr structured_decompose_interface") linalg_ops = match_and_split( func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 @@ -157,8 +157,6 @@ def bundle_xegpu_softmax_schedule( div_op, sizes=[0, reduction_step_size] ).results - transform.print_(target=func, name="After tiling div op") - # Fuse max_center_and_exp_op into the div loop _, fused_loop = structured.structured_fuse_into_containing_op( anytype, @@ -166,9 +164,6 @@ def bundle_xegpu_softmax_schedule( producer_op=max_center_and_exp_op, containing_op=div_loop, ) - transform.print_( - target=func, name="After fusing max_center_and_exp_op into div loop" - ) # Tile the sum reduction and fuse the sub+exp producer into it _, _, _, sum_loop = structured.structured_tile_reduction_using_for( @@ -180,8 +175,6 @@ def bundle_xegpu_softmax_schedule( tile_sizes=[0, reduction_step_size], ) - transform.print_(target=func, name="After tiling sum reduction") - func = transform.get_parent_op( anytype, fused_loop, @@ -200,9 +193,6 @@ def bundle_xegpu_softmax_schedule( producer_op=max_center_and_exp_op, containing_op=sum_loop, ) - transform.print_( - target=func, name="After fusing max_center_and_exp_op into sum loop" - ) # 
Tile the max reduction. max_reduction = linalg_ops[0] @@ -214,7 +204,6 @@ def bundle_xegpu_softmax_schedule( target=max_reduction, tile_sizes=[0, reduction_step_size], ) - transform.print_(target=func, name="After tiling max reduction") # Cleanup after tiling and fusion transform.apply_cse(func) @@ -231,8 +220,6 @@ def bundle_xegpu_softmax_schedule( transform.apply_cse(func) canonicalize(func) - transform.print_(target=func, name="After vectorization") - if stop_at_stage == "vectorized": raise PipelineInterrupt() @@ -250,8 +237,6 @@ def bundle_xegpu_softmax_schedule( transform.apply_cse(mod) canonicalize(mod) - transform.print_(target=mod, name="After bufferization") - # promote memref.alloc to memref.alloca in payload function func = match(mod, ops={"func.func"}) func = apply_registered_pass( @@ -263,8 +248,6 @@ def bundle_xegpu_softmax_schedule( }, ) - transform.print_(target=func, name="After promoting buffers to stack") - if stop_at_stage == "bufferized": raise PipelineInterrupt() @@ -294,8 +277,6 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) - transform.print_(target=mod, name="After GPU outlining") - if stop_at_stage == "gpu-outlining": raise PipelineInterrupt() @@ -306,12 +287,16 @@ def bundle_xegpu_softmax_schedule( options={"O": "3", "chip": "bmg"}, ) - # convert vector to xegpu + # for each gpu function in the gpu module, change memref.alloca address + # space to 3 (SLM) and convert vector to xegpu. 
gpu_mod_ops = match_and_split(mod, ops={"gpu.module"}) for gpu_mod in gpu_mod_ops: gpu_func = match(gpu_mod, ops={"gpu.func"}) - gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - transform.apply_cse(gpu_func) + allocas = match_and_split(gpu_func, ops={"memref.alloca"}) + for alloca in allocas: + transform_ext.update_address_space(alloca, address_space=3) + # gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + # transform.apply_cse(gpu_func) # Cleanup. transform.apply_cse(mod) From 1313477172c7442e322c6bbb1cbd614c415e8172 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 20 Apr 2026 20:34:32 +0000 Subject: [PATCH 32/51] working version --- examples/xegpu/softmax.py | 4 +++- .../transform_ext/ops/update_address_space.py | 2 +- lighthouse/schedule/xegpu/softmax_schedule.py | 15 +++++++++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index cc130297..f75613d0 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -155,7 +155,7 @@ def parse_cli(): "--sizes", type=int, nargs=2, - default=[1024, 64], + default=[1024, 512], help="M,N matrix sizes (MxN)", ) parser.add_argument( @@ -290,6 +290,8 @@ def parse_cli(): ) if not success: raise ValueError("Result mismatch!") + else: + print("Result is correct. 
Proceeding to benchmark...") times = runner.benchmark( host_input_buffers=wload._initial_host_arrays, diff --git a/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py b/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py index 2c40bfce..8d0b6041 100644 --- a/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py +++ b/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py @@ -16,7 +16,7 @@ class UpdateAddressSpace( target: ext.Operand[transform.AnyOpType] address_space: ir.IntegerAttr - updated_op: ext.Result[transform.AnyOpType[()]] = ext.result(infer_type=True) + updated_op: ext.Result[transform.AnyOpType[()]] = ext.infer_result() @classmethod def attach_interface_impls(cls, ctx=None): diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 8876d052..862677df 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -295,8 +295,8 @@ def bundle_xegpu_softmax_schedule( allocas = match_and_split(gpu_func, ops={"memref.alloca"}) for alloca in allocas: transform_ext.update_address_space(alloca, address_space=3) - # gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - # transform.apply_cse(gpu_func) + gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) # Cleanup. transform.apply_cse(mod) @@ -305,12 +305,15 @@ def bundle_xegpu_softmax_schedule( if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() - # Set layout attributes for xegpu.store_nd operations. - # FIXME: currently ecah subgroup is handling the entire row. - store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=5) + # Set layout attributes for xegpu.store_nd and xegpu.store_matrix ops. 
+ store_nd_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + store_matrix_ops = match_and_split(gpu_func, ops={"xegpu.store_matrix"}, nhandles=4) sg_layout = [parameters["sg_rows"], 1] sg_data = [parameters["sg_rows"], parameters["reduction_step_size"]] - xegpu.set_anchor_layout(store_ops[-1], sg_layout=sg_layout, sg_data=sg_data) + for store_op in store_nd_ops: + xegpu.set_anchor_layout(store_op, sg_layout=sg_layout, sg_data=sg_data) + for store_op in store_matrix_ops: + xegpu.set_anchor_layout(store_op, sg_layout=sg_layout, sg_data=sg_data) if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() From f1857aabfd159b7ab30d9a3b5fb3217efff781c9 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 20 Apr 2026 20:36:54 +0000 Subject: [PATCH 33/51] working version --- docs/softmax_lowering.md | 635 --------------------------------------- 1 file changed, 635 deletions(-) delete mode 100644 docs/softmax_lowering.md diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md deleted file mode 100644 index a31e6f4c..00000000 --- a/docs/softmax_lowering.md +++ /dev/null @@ -1,635 +0,0 @@ -# Linalg softmax lowering to XeGPU (Currently supported in lighthouse) - -## Overview - -**Assumptions:** -Softmax dimension size is small (64 in this example). - -The lowering process consists of seven stages: -1. **initial** - High-level tensor operations -2. **tiled-softmax** - Tiled softmax operations -3. **decomposed** - Decomposition into constituent operations -4. **vectorized** - Vector operations -5. **bufferized** - Memory-based representation -6. **xegpu-initial** - GPU kernel with XeGPU operations -7. **xegpu-wg** - Work-group optimized XeGPU - ---- - -## Stage 1: Initial - -**Code:** -```mlir -func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - // ... 
- %2 = tensor.empty() : tensor<1024x64xf32> - %3 = linalg.softmax dimension(1) ins(%1 : tensor<1024x64xf32>) - outs(%2 : tensor<1024x64xf32>) -> tensor<1024x64xf32> - // ... - return -} -``` ---- - -## Stage 2: Tiled Softmax - -**Notes** -- Work distribution via `scf.forall` (16 parallel iterations) -- Each tile processes 64x64 elements - -**Code:** -```mlir -func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - // ... - %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) { - %4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2) - // Extract 64x64 input slice - %extracted_slice = tensor.extract_slice ... - // Extract 64x64 output slice - %extracted_slice_0 = tensor.extract_slice ... - // Apply softmax to the tile - %5 = linalg.softmax dimension(1) ins(%extracted_slice : tensor<64x64xf32>) - outs(%extracted_slice_0 : tensor<64x64xf32>) -> tensor<64x64xf32> - scf.forall.in_parallel { - tensor.parallel_insert_slice %5 into %arg3[%4, %c0] [64, 64] [1, 1] : - tensor<64x64xf32> into tensor<1024x64xf32> - } - } - // ... - return -} -``` - ---- - -## Stage 3: Decomposed - -**Notes** -- Softmax decomposed into 4 constituent `linalg.generic` ops : max, sub+exp, sum, divide -- Uses `structured.structured_decompose_interface` implemented by `linalg.softmax` - -**Code:** -```mlir -func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - // ... - - %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x64xf32>) { - %3 = affine.apply #map(%arg2) // %3 = %arg2 * 64 - %extracted_slice = tensor.extract_slice ... - - // Step 1: Find max along dimension 1 - %4 = tensor.empty() : tensor<64xf32> - %5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32> - %6 = linalg.generic // ... - %11 = arith.maxnumf %in, %out : f32 - // ... - } -> tensor<64xf32> - - // Step 2: Subtract max and exponentiate - %7 = linalg.generic // ... 
- %11 = arith.subf %in, %in_2 : f32 - %12 = math.exp %11 : f32 - // ... - } -> tensor<64x64xf32> - - // Step 3: Sum exponentials - %8 = linalg.fill ins(%cst : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32> - %9 = linalg.generic // ... - %11 = arith.addf %in, %out : f32 - // ... - } -> tensor<64xf32> - - // Step 4: Normalize by sum - %10 = linalg.generic // ... - %11 = arith.divf %in, %in_2 : f32 - // ... - } -> tensor<64x64xf32> - - scf.forall.in_parallel { - tensor.parallel_insert_slice %10 into %arg3[%3, 0] [64, 64] [1, 1] : - tensor<64x64xf32> into tensor<1024x64xf32> - } - } - return -} -``` - ---- - -## Stage 4: Vectorized - -**Notes** -- `linalg.generic` operations replaced with vector operations -- Vector transfers for reading/writing data - -**Code:** -```mlir -func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - // ... - %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) { - %4 = affine.apply #map(%arg2) // %4 = %arg2 * 64 - %extracted_slice = tensor.extract_slice .. 
- - // Vector read: Load 64x64 tile - %5 = vector.transfer_read %1[%4, %c0], %0 {in_bounds = [true, true]} : - tensor<1024x64xf32>, vector<64x64xf32> - - // Max reduction: Reduce dimension 1 -> vector<64xf32> - %6 = vector.multi_reduction , %5, %cst_0 [1] : - vector<64x64xf32> to vector<64xf32> - - // Broadcast max values back to 64x64 and transpose - %7 = vector.broadcast %6 : vector<64xf32> to vector<64x64xf32> - %8 = vector.transpose %7, [1, 0] : vector<64x64xf32> to vector<64x64xf32> - - // Subtract max and exponentiate - %9 = arith.subf %5, %8 : vector<64x64xf32> - %10 = math.exp %9 : vector<64x64xf32> - - // Sum reduction: Reduce dimension 1 -> vector<64xf32> - %11 = vector.multi_reduction , %10, %cst [1] : - vector<64x64xf32> to vector<64xf32> - - // Broadcast sums back to 64x64 and transpose - %12 = vector.broadcast %11 : vector<64xf32> to vector<64x64xf32> - %13 = vector.transpose %12, [1, 0] : vector<64x64xf32> to vector<64x64xf32> - - // Normalize - %14 = arith.divf %10, %13 : vector<64x64xf32> - - // Vector write - %15 = vector.transfer_write %14, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : - vector<64x64xf32>, tensor<64x64xf32> - - scf.forall.in_parallel { - tensor.parallel_insert_slice %15 into %arg3[%4, 0] [64, 64] [1, 1] - } - } - return -} -``` ---- - -## Stage 5: Bufferized - -**Notes** -- Tensors eliminated, working directly with memrefs - -**Code:** -```mlir -func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - // ... 
- - scf.forall (%arg2) in (16) { - %1 = affine.apply #map(%arg2) // %1 = %arg2 * 64 - - // Direct memref read - %2 = vector.transfer_read %arg1[%1, %c0], %0 {in_bounds = [true, true]} : - memref<1024x64xf32>, vector<64x64xf32> - - // Max reduction - %3 = vector.multi_reduction , %2, %cst_0 [1] : - vector<64x64xf32> to vector<64xf32> - %4 = vector.broadcast %3 : vector<64xf32> to vector<64x64xf32> - %5 = vector.transpose %4, [1, 0] : vector<64x64xf32> to vector<64x64xf32> - - // Subtract and exp - %6 = arith.subf %2, %5 : vector<64x64xf32> - %7 = math.exp %6 : vector<64x64xf32> - - // Sum reduction - %8 = vector.multi_reduction , %7, %cst [1] : - vector<64x64xf32> to vector<64xf32> - %9 = vector.broadcast %8 : vector<64xf32> to vector<64x64xf32> - %10 = vector.transpose %9, [1, 0] : vector<64x64xf32> to vector<64x64xf32> - - // Normalize - %11 = arith.divf %7, %10 : vector<64x64xf32> - - // Direct memref write - vector.transfer_write %11, %arg0[%1, %c0] {in_bounds = [true, true]} : - vector<64x64xf32>, memref<1024x64xf32> - } - return -} -``` - ---- - -## Stage 6: XeGPU-Initial - -**Notes** -- GPU kernel separated from host code (Gpu Outlining) -- `gpu.launch_func` invocation with grid/block dimensions -- Use `vector-to-xegpu` - -**Code:** - -**Host Side:** -```mlir -func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - // ... - gpu.launch_func @payload_kernel::@payload_kernel - blocks in (%c16, %c1, %c1) - threads in (%c128, %c1, %c1) - args(%arg1 : memref<1024x64xf32>, %arg0 : memref<1024x64xf32>) - return -} -``` - -**GPU Kernel:** -```mlir -gpu.module @payload_kernel [#xevm.target] { - gpu.func @payload_kernel(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) kernel - attributes {known_block_size = array, - known_grid_size = array} { - // ... 
- %block_id_x = gpu.block_id x - %0 = arith.muli %block_id_x, %c64 overflow : index - - // Create XeGPU tensor descriptor for load - %1 = xegpu.create_nd_tdesc %arg0 : memref<1024x64xf32> -> - !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr> - - // XeGPU block load - %2 = xegpu.load_nd %1[%0, 0] : - !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr> -> - vector<64x64xf32> - - // Same compute operations as before - %3 = vector.multi_reduction , %2, %cst_0 [1] : - vector<64x64xf32> to vector<64xf32> - %4 = vector.broadcast %3 : vector<64xf32> to vector<64x64xf32> - %5 = vector.transpose %4, [1, 0] : vector<64x64xf32> to vector<64x64xf32> - %6 = arith.subf %2, %5 : vector<64x64xf32> - %7 = math.exp %6 : vector<64x64xf32> - %8 = vector.multi_reduction , %7, %cst [1] : - vector<64x64xf32> to vector<64xf32> - %9 = vector.broadcast %8 : vector<64xf32> to vector<64x64xf32> - %10 = vector.transpose %9, [1, 0] : vector<64x64xf32> to vector<64x64xf32> - %11 = arith.divf %7, %10 : vector<64x64xf32> - - // Create XeGPU tensor descriptor for store - %12 = xegpu.create_nd_tdesc %arg1 : memref<1024x64xf32> -> - !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr> - - // XeGPU block store - xegpu.store_nd %11, %12[%0, 0] : - vector<64x64xf32>, - !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr> - - gpu.return - } -} -``` - ---- - -## Stage 7: XeGPU-WG (Work-Group Optimized) - -**Notes** -- Sets the layout for anchor xegpu ops. Each Wg consistes of [8, 1] subgroups - doing 8x64 softmax slice. -- Only sets the layotu for `store_nd`. Layout propagation does the rest. 
- -**Code (differences from xegpu-initial):** -```mlir -// Store operation now includes layout hints -xegpu.store_nd %11, %12[%0, 0] - <{layout = #xegpu.layout}> : - vector<64x64xf32>, - !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr> -``` - ---- - -# Supporting larger Softmax dimension sizes - -When the softmax dimension is larger than what can fit efficiently in registers, additional tiling and fusion transformations are applied to the reduction dimension. This section shows the intermediate stages between "decomposed" and "vectorized". - -**Approach:** Tile reductions along dimension 1 (step size = 16) and fuse producers into consumers to enable streaming computation. - ---- - -## Decomposed → Tiled: Stage A - Tile div op - -**Notes:** -- Tile the division operation with step size 16 along dimension 1 -- Creates `scf.for` loop iterating over 64 elements in chunks of 16 - -**Key Changes:** -```mlir -// Before: Single division linalg.generic over 64x64 -%11 = linalg.generic {...} ins(%8, %10 : tensor<64x64xf32>, tensor<64xf32>) - outs(%extracted_slice_0 : tensor<64x64xf32>) { ... } -> tensor<64x64xf32> - -// After: Division tiled into 64x16 chunks -%11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) { - %extracted_slice_3 = tensor.extract_slice %8[0, %arg4] [64, 16] [1, 1] - %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) - outs(%extracted_slice_5 : tensor<64x16xf32>) { ... 
} -> tensor<64x16xf32> - %inserted_slice = tensor.insert_slice %12 into %arg5[0, %arg4] [64, 16] [1, 1] - scf.yield %inserted_slice -} -``` - ---- - -## Stage B - Fuse sub+exp into div loop - -**Notes:** -- Fuse the `sub+exp` producer (max_center_and_exp_op) into the div loop -- Recomputes exp values on-the-fly instead of materializing full 64x64 tensor - -**Key Changes:** -```mlir -%11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) { - %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1] - - // Fused: sub+exp computed per 16-element chunk - %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) - outs(%extracted_slice_5 : tensor<64x16xf32>) { - ^bb0(%in: f32, %in_8: f32, %out: f32): - %14 = arith.subf %in, %in_8 : f32 - %15 = math.exp %14 : f32 - linalg.yield %15 : f32 - } -> tensor<64x16xf32> - - // Division operation - %13 = linalg.generic {...} ins(%12, %extracted_slice_6 : tensor<64x16xf32>, tensor<64xf32>) - outs(%extracted_slice_7 : tensor<64x16xf32>) { ... } -> tensor<64x16xf32> - // ... 
-} -``` - ---- - -## Stage C - Tile sum reduction - -**Notes:** -- Tile the sum reduction using `structured_tile_reduction_using_for` -- Creates intermediate accumulator tensor (64x16) -- Final reduction via `linalg.reduce` over dimension 1 - -**Key Changes:** -```mlir -// Tiled sum reduction with intermediate accumulator -%10 = tensor.empty() : tensor<64x16xf32> -%11 = linalg.fill ins(%cst_2 : f32) outs(%10 : tensor<64x16xf32>) -> tensor<64x16xf32> - -%12 = scf.for %arg4 = %c0_3 to %c64 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) { - %extracted_slice_7 = tensor.extract_slice %8[0, %arg4] [64, 16] [1, 1] - %14 = linalg.generic {...} ins(%extracted_slice_7 : tensor<64x16xf32>) - outs(%extracted_slice_8 : tensor<64x16xf32>) { - ^bb0(%in: f32, %out: f32): - %15 = arith.addf %in, %out : f32 - linalg.yield %15 : f32 - } -> tensor<64x16xf32> - // ... -} - -// Final reduction to 64xf32 -%reduced = linalg.reduce ins(%12 : tensor<64x16xf32>) outs(%9 : tensor<64xf32>) dimensions = [1] - (%in: f32, %init: f32) { - %14 = arith.addf %in, %init : f32 - linalg.yield %14 : f32 - } -``` - ---- - -## Stage D - Fuse sub+exp into sum loop - -**Notes:** -- Fuse `sub+exp` into the sum reduction loop -- Stream computation: compute exp and accumulate in same loop - -**Key Changes:** -```mlir -%12 = scf.for %arg4 = %c0_3 to %c64 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) { - %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1] - - // Fused: sub+exp - %14 = linalg.generic {...} ins(%extracted_slice_7, %extracted_slice_8 : tensor<64x16xf32>, tensor<64xf32>) - outs(%extracted_slice_9 : tensor<64x16xf32>) { - ^bb0(%in: f32, %in_11: f32, %out: f32): - %16 = arith.subf %in, %in_11 : f32 - %17 = math.exp %16 : f32 - linalg.yield %17 : f32 - } -> tensor<64x16xf32> - - // Accumulate sum - %15 = linalg.generic {...} ins(%14 : tensor<64x16xf32>) - outs(%extracted_slice_10 : tensor<64x16xf32>) { - ^bb0(%in: f32, %out: f32): - %16 = arith.addf 
%in, %out : f32 - linalg.yield %16 : f32 - } -> tensor<64x16xf32> - // ... -} -``` - ---- - -## Stage E - Tile max reduction - -**Notes:** -- Tile max reduction similar to sum reduction -- Creates 64x16 intermediate accumulator -- Final reduction via `linalg.reduce` with maxnumf - -**Key Changes:** -```mlir -// Tiled max reduction -%7 = tensor.empty() : tensor<64x16xf32> -%8 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<64x16xf32>) -> tensor<64x16xf32> - -%9 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %8) -> (tensor<64x16xf32>) { - %extracted_slice_12 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1] - %16 = linalg.generic {...} ins(%extracted_slice_12 : tensor<64x16xf32>) - outs(%extracted_slice_13 : tensor<64x16xf32>) { - ^bb0(%in: f32, %out: f32): - %17 = arith.maxnumf %in, %out : f32 - linalg.yield %17 : f32 - } -> tensor<64x16xf32> - // ... -} - -// Final max reduction -%reduced = linalg.reduce ins(%9 : tensor<64x16xf32>) outs(%6 : tensor<64xf32>) dimensions = [1] - (%in: f32, %init: f32) { - %16 = arith.maxnumf %in, %init : f32 - linalg.yield %16 : f32 - } -``` - -**Result:** Now all three major computations (max, sum, div) are tiled and operate on 64x16 chunks, with exp computation fused into both sum and div loops. - ---- - -## Stage F - Vectorization - -**Notes:** -- Convert tiled linalg operations to vector operations -- `scf.for` loops remain but operate on vectors -- Vector size: 64x16 for tiled operations - -**Code:** -```mlir -func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - // ... - %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) { - // ... 
- - // Vectorized max reduction loop - %6 = vector.transfer_write %cst_1, %5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32> - %7 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %6) -> (tensor<64x16xf32>) { - %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32> - %16 = vector.transfer_read %arg5[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32> - %17 = arith.maxnumf %15, %16 : vector<64x16xf32> - %18 = vector.transfer_write %17, %arg5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32> - scf.yield %18 : tensor<64x16xf32> - } - %8 = vector.transfer_read %7[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32> - %9 = vector.multi_reduction <maxnumf>, %8, %cst_2 [1] : vector<64x16xf32> to vector<64xf32> - - // Vectorized sum reduction loop with fused sub+exp - %11 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %10) -> (tensor<64x16xf32>) { - %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32> - %16 = vector.broadcast %9 : vector<64xf32> to vector<16x64xf32> - %17 = vector.transpose %16, [1, 0] : vector<16x64xf32> to vector<64x16xf32> - %18 = arith.subf %15, %17 : vector<64x16xf32> - %19 = math.exp %18 : vector<64x16xf32> - %20 = vector.transfer_read %arg5[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32> - %21 = arith.addf %19, %20 : vector<64x16xf32> - %22 = vector.transfer_write %21, %arg5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32> - scf.yield %22 : tensor<64x16xf32> - } - %12 = vector.transfer_read %11[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32> - %13 = vector.multi_reduction <add>, %12, %cst_0 [1] : vector<64x16xf32> to vector<64xf32> - - // Vectorized div loop with fused sub+exp - %14 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %extracted_slice) -> (tensor<64x64xf32>) { - %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32> - %16 = vector.broadcast %9 : vector<64xf32> to vector<16x64xf32> - %17 = vector.transpose %16, [1, 0] : 
vector<16x64xf32> to vector<64x16xf32> - %18 = arith.subf %15, %17 : vector<64x16xf32> - %19 = math.exp %18 : vector<64x16xf32> - %20 = vector.broadcast %13 : vector<64xf32> to vector<16x64xf32> - %21 = vector.transpose %20, [1, 0] : vector<16x64xf32> to vector<64x16xf32> - %22 = arith.divf %19, %21 : vector<64x16xf32> - %23 = vector.transfer_write %22, %arg5[%c0, %arg4] : vector<64x16xf32>, tensor<64x64xf32> - scf.yield %23 : tensor<64x64xf32> - } - } - // ... -} -``` - ---- - -## Stage G - Bufferization - -**Notes:** -- Convert tensors to memrefs -- Allocate temporary buffer for 64x16 accumulator: `memref.alloc()` - -**Code:** -```mlir -func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - // ... - scf.forall (%arg2) in (16) { - %1 = affine.apply #map(%arg2) - %subview = memref.subview %arg0[%1, 0] [64, 64] [1, 1] - - // Allocate accumulator buffer - %alloc = memref.alloc() {alignment = 64 : i64} : memref<64x16xf32> - - // Max reduction loop - vector.transfer_write %cst_1, %alloc[%c0, %c0] : vector<64x16xf32>, memref<64x16xf32> - scf.for %arg3 = %c0 to %c64 step %c16 { - %6 = vector.transfer_read %arg1[%1, %arg3], %0 : memref<1024x64xf32>, vector<64x16xf32> - %7 = vector.transfer_read %alloc[%c0, %c0], %0 : memref<64x16xf32>, vector<64x16xf32> - %8 = arith.maxnumf %6, %7 : vector<64x16xf32> - vector.transfer_write %8, %alloc[%c0, %c0] : vector<64x16xf32>, memref<64x16xf32> - } - %2 = vector.transfer_read %alloc[%c0, %c0], %0 : memref<64x16xf32>, vector<64x16xf32> - %3 = vector.multi_reduction <maxnumf>, %2, %cst_2 [1] : vector<64x16xf32> to vector<64xf32> - - // Sum reduction loop (reuses %alloc) - // ... - - // Div loop (writes to %subview) - // ... 
- } -} -``` - ---- - -## Stage H - Promote buffers to stack - -**Notes:** -- Convert `memref.alloc()` to `memref.alloca()` for stack allocation -- Reduces memory allocation overhead - -**Code:** -```mlir -scf.forall (%arg2) in (16) { - %1 = affine.apply #map(%arg2) - %subview = memref.subview %arg0[%1, 0] [64, 64] [1, 1] - - // Stack allocation instead of heap - %alloca = memref.alloca() {alignment = 64 : i64} : memref<64x16xf32> - - // ... same operations using %alloca ... -} -``` - ---- - -## Stage I - GPU outlining - -**Notes:** -- Convert `scf.forall` to `scf.parallel`, then to `gpu.launch` -- Extract GPU kernel into separate `gpu.module` -- Set thread count: 128 threads = (64 rows / 8 sg_rows) × 16 subgroup_size - -**Host Side:** -```mlir -func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - %c16 = arith.constant 16 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - gpu.launch_func @payload_kernel::@payload_kernel - blocks in (%c16, %c1, %c1) - threads in (%c128, %c1, %c1) - args(%arg0 : memref<1024x64xf32>, %arg1 : memref<1024x64xf32>) - return -} -``` - -**GPU Kernel:** -```mlir -gpu.module @payload_kernel { - gpu.func @payload_kernel(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) kernel - attributes {known_block_size = array<i32: 128, 1, 1>, - known_grid_size = array<i32: 16, 1, 1>} { - %block_id_x = gpu.block_id x - %1 = arith.muli %block_id_x, %c64 overflow : index - %subview = memref.subview %arg0[%1, 0] [64, 64] [1, 1] - %alloca = memref.alloca() {alignment = 64 : i64} : memref<64x16xf32> - - // Three reduction loops (max, sum, div) with same structure - scf.for %arg2 = %c0 to %c64 step %c16 { - // Max: accumulate max values - // Sum: compute & accumulate exp(x - max) - // Div: compute exp(x - max) / sum - } - - gpu.return - } -} -``` - -**Summary:** At this stage, the kernel processes 64x16 chunks in streaming fashion through three sequential loops, minimizing memory footprint. 
From 240cf084338baf9f30ebe2315d09a489170530d9 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 20 Apr 2026 22:27:53 +0000 Subject: [PATCH 34/51] add initial version --- examples/xegpu/fused_attention.py | 339 ++++++++++++++++++ .../mlir_gen/gpu_fused_attention_payload.py | 69 ++++ .../xegpu/fused_attention_schedule.py | 164 +++++++++ 3 files changed, 572 insertions(+) create mode 100644 examples/xegpu/fused_attention.py create mode 100644 lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py create mode 100644 lighthouse/schedule/xegpu/fused_attention_schedule.py diff --git a/examples/xegpu/fused_attention.py b/examples/xegpu/fused_attention.py new file mode 100644 index 00000000..5aa84e2a --- /dev/null +++ b/examples/xegpu/fused_attention.py @@ -0,0 +1,339 @@ +# RUN: %PYTHON %s --dump-kernel=xegpu-wg | FileCheck %s +# CHECK: module attributes {gpu.container_module} { + +""" +XeGPU fused attention benchmark. +""" + +import argparse +from typing import Optional +from functools import cached_property + +import numpy as np +from mlir import ir + +from lighthouse import dialects as lh_dialects +from lighthouse.execution.runner import Runner +from lighthouse.pipeline.driver import TransformDriver +from lighthouse.execution import GPUMemoryManager +from lighthouse.utils.numpy import mlir_to_numpy_dtype +from lighthouse.ingress.mlir_gen import get_mlir_elem_type +from lighthouse.ingress.mlir_gen.gpu_fused_attention_payload import generate_gpu_fused_attention_payload +from lighthouse.schedule.xegpu.fused_attention_schedule import get_fused_attention_schedule_module + + +def fused_attention_complexity(Z: int, H: int, n_ctx: int, n_head: int, nbytes: int): + """ + Complexity of fused attention operation. 
+ + For each batch and head: + - Q @ K^T: O(n_ctx^2 * n_head) operations + - Softmax: O(n_ctx^2) operations + - Attention @ V: O(n_ctx^2 * n_head) operations + Total: approximately 2*n_ctx^2*n_head FLOPs per batch and head + """ + # Approximation: 2 * n_ctx^2 * n_head FLOPs per batch and head + flop_count = Z * H * 2 * n_ctx * n_ctx * n_head + # Memory: read Q, K, V and write output + memory_reads = 3 * Z * H * n_ctx * n_head * nbytes + memory_writes = Z * H * n_ctx * n_head * nbytes + return flop_count, memory_reads, memory_writes + + +def check_correctness( + Q: np.ndarray, K: np.ndarray, V: np.ndarray, output_arr: np.ndarray, verbose: int = 0 +) -> bool: + """ + Check correctness of fused attention output. + + Reference implementation: + - scores = Q @ K^T / sqrt(n_head) + - attention_weights = softmax(scores, dim=-1) + - output = attention_weights @ V + """ + # Use float32 for computation + Q_f32 = Q.astype(np.float32) + K_f32 = K.astype(np.float32) + V_f32 = V.astype(np.float32) + + Z, H, n_ctx, n_head = Q.shape + scale = 1.0 / np.sqrt(n_head) + + output_ref = np.zeros_like(Q_f32) + + # Compute reference for each batch and head + for z in range(Z): + for h in range(H): + # scores = Q @ K^T / sqrt(n_head) + scores = Q_f32[z, h] @ K_f32[z, h].T * scale + + # softmax along last dimension + max_vals = np.max(scores, axis=1, keepdims=True) + exp_vals = np.exp(scores - max_vals) + sum_vals = np.sum(exp_vals, axis=1, keepdims=True) + attention_weights = exp_vals / sum_vals + + # output = attention_weights @ V + output_ref[z, h] = attention_weights @ V_f32[z, h] + + output = output_arr.astype(np.float32) + + if verbose > 1: + print("Reference solution (first batch, first head, first 5 rows):") + print(output_ref[0, 0, :5]) + print("Computed solution (first batch, first head, first 5 rows):") + print(output[0, 0, :5]) + + # Check values match reference + values_ok = np.allclose(output, output_ref, rtol=1e-3, atol=1e-4) + + success = values_ok + + if verbose: + if 
success: + print("PASSED") + else: + print("FAILED!") + if not values_ok: + max_diff = np.abs(output - output_ref).max() + print(f" Values mismatch. Max abs diff: {max_diff:.6e}") + return success + + +class XeGPUFusedAttention: + """ + Fused attention workload on XeGPU. + + Computes fused attention: + output = softmax(Q @ K^T / sqrt(n_head)) @ V + + All Q, K, V matrices have shape (Z, H, n_ctx, n_head) where: + - Z: batch size + - H: number of heads + - n_ctx: context length + - n_head: head dimension + """ + + def __init__( + self, + Z: int, + H: int, + n_ctx: int, + n_head: int, + dtype: str = "f32", + ): + self.Z = Z + self.H = H + self.n_ctx = n_ctx + self.n_head = n_head + self.shape = (Z, H, n_ctx, n_head) + assert dtype == "f32", "Only f32 type is supported for fused attention" + self.elem_type = get_mlir_elem_type(dtype) + self.dtype = mlir_to_numpy_dtype(self.elem_type) + self.memory_manager_class = GPUMemoryManager + self.payload_function_name = "payload" + + @cached_property + def _initial_host_arrays(self) -> tuple[np.ndarray]: + """Generate initial values on host with numpy.""" + np.random.seed(42) + # Initialize Q, K, V with small random values + Q = np.random.uniform(-0.5, 0.5, self.shape).astype(self.dtype) + K = np.random.uniform(-0.5, 0.5, self.shape).astype(self.dtype) + V = np.random.uniform(-0.5, 0.5, self.shape).astype(self.dtype) + output_arr = np.zeros(self.shape, dtype=self.dtype) + return (output_arr, Q, K, V) + + def get_complexity(self) -> tuple[int, int, int]: + nbytes = np.dtype(self.dtype).itemsize + return fused_attention_complexity(self.Z, self.H, self.n_ctx, self.n_head, nbytes) + + def payload_module(self) -> ir.Module: + """Generate MLIR module for fused attention payload.""" + return generate_gpu_fused_attention_payload( + func_name=self.payload_function_name, + Z=self.Z, + H=self.H, + n_ctx=self.n_ctx, + n_head=self.n_head, + dtype=self.elem_type, + ) + + def schedule_modules( + self, stop_at_stage: Optional[str] = None, 
parameters: Optional[dict] = None + ) -> list[ir.Module]: + """Generate transform schedule for fused attention.""" + return [ + Runner.get_bench_wrapper_schedule(self.payload_function_name), + get_fused_attention_schedule_module( + stop_at_stage=stop_at_stage, + parameters=parameters, + ), + ] + + def shared_libs(self) -> list[str]: + return ["libmlir_levelzero_runtime.so"] + + +def parse_cli(): + parser = argparse.ArgumentParser( + description="Fused Attention using MLIR XeGPU", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--batch-size", + type=int, + default=2, + help="Batch size (Z)", + ) + parser.add_argument( + "--num-heads", + type=int, + default=8, + help="Number of attention heads (H)", + ) + parser.add_argument( + "--n-ctx", + type=int, + default=512, + help="Context length (sequence length)", + ) + parser.add_argument( + "--n-head", + type=int, + default=64, + help="Head dimension", + ) + parser.add_argument( + "--nruns", + type=int, + default=1000, + help="Number of runs to average the execution time.", + ) + parser.add_argument( + "--nwarmup", + type=int, + default=20, + help="Number of warm-up iterations before benchmarking.", + ) + parser.add_argument( + "--check-result", + action="store_true", + help="Check the result of the fused attention computation.", + ) + parser.add_argument( + "--dump-kernel", + type=str, + choices=[ + "initial", + "tiled", + "vectorized", + "bufferized", + "gpu-outlining", + "xegpu-initial", + "xegpu-wg", + "final", + ], + help="Dump kernel IR at different stages of lowering and exit without " + "executing the kernel.", + ) + parser.add_argument( + "--dump-schedule", + action="store_true", + help="Dump transform schedule.", + ) + parser.add_argument( + "--verbose", + "-v", + action="count", + default=0, + help="Increase output verbosity (e.g. 
print reference and computed solutions).", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = parse_cli() + + params = { + "batch_size": args.batch_size, + "num_heads": args.num_heads, + "n_ctx": args.n_ctx, + "n_head": args.n_head, + } + + Z = args.batch_size + H = args.num_heads + n_ctx = args.n_ctx + n_head = args.n_head + dtype = "f32" + + with ir.Context(), ir.Location.unknown(): + lh_dialects.register_and_load() + wload = XeGPUFusedAttention(Z=Z, H=H, n_ctx=n_ctx, n_head=n_head, dtype=dtype) + + if args.dump_kernel or args.dump_schedule: + pipeline = TransformDriver( + wload.schedule_modules( + stop_at_stage=args.dump_kernel, parameters=params + ) + ) + payload = pipeline.apply(wload.payload_module()) + if args.dump_kernel: + print(payload) + if args.dump_schedule: + for schedule_module in wload.schedule_modules(parameters=params): + print(schedule_module) + else: + pipeline = TransformDriver(wload.schedule_modules(parameters=params)) + payload = pipeline.apply(wload.payload_module()) + runner = Runner( + payload, + mem_manager_cls=wload.memory_manager_class, + shared_libs=wload.shared_libs(), + ) + if args.check_result: + # Setup callback function to copy result from device to host. + result_host_copy, argument_access_callback = ( + Runner.get_gpu_argument_access_callback(wload.shape, wload.dtype) + ) + + # Execute kernel once. + runner.execute( + host_input_buffers=wload._initial_host_arrays, + payload_function_name=wload.payload_function_name, + argument_access_callback=argument_access_callback, + ) + + # Compute reference solution on host. + Q, K, V = wload._initial_host_arrays[1:4] + success = check_correctness( + Q, K, V, + result_host_copy, + verbose=args.verbose, + ) + if not success: + raise ValueError("Result mismatch!") + else: + print("Result is correct. 
Proceeding to benchmark...") + + times = runner.benchmark( + host_input_buffers=wload._initial_host_arrays, + nruns=args.nruns, + nwarmup=args.nwarmup, + ) + times *= 1e6 # convert to microseconds + elapsed = np.mean(times) + flop_count = wload.get_complexity()[0] + gflops = flop_count / (elapsed * 1e-6) / 1e9 + + print( + f"batch-size={Z} " + f"num-heads={H} " + f"n-ctx={n_ctx} " + f"n-head={n_head} " + f"dt={dtype} " + f"time(us): {elapsed:.2f} " + f"GFLOPS: {gflops:.2f} " + ) diff --git a/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py b/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py new file mode 100644 index 00000000..788aa175 --- /dev/null +++ b/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py @@ -0,0 +1,69 @@ +"""Generate MLIR payload for GPU fused attention operation.""" + +from mlir import ir +from mlir.dialects import linalg, bufferization, tensor, arith + +from lighthouse.utils.mlir import func_cif +from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs +from lighthouse.ingress.mlir_gen.utils import emit_buf_to_tensor + + +def generate_gpu_fused_attention_payload( + func_name: str, + Z: int, + H: int, + n_ctx: int, + n_head: int, + dtype: ir.Type, +) -> ir.Module: + """ + Generate MLIR module for fused attention payload. 
+ + Computes fused attention: + output = softmax(Q @ K^T / sqrt(n_head)) @ V + + Args: + func_name: Name of the payload function + Z: Batch size + H: Number of attention heads + n_ctx: Context length (sequence length) + n_head: Head dimension + dtype: MLIR element type (e.g., F32Type) + + Returns: + MLIR module containing the fused attention payload function + """ + mod = ir.Module.create() + shape = (Z, H, n_ctx, n_head) + memref_t = ir.MemRefType.get(shape, dtype) + + with ir.InsertionPoint(mod.body): + # Function signature: payload(output, Q, K, V) + @func_cif(memref_t, memref_t, memref_t, memref_t, name=func_name) + def payload(output, Q_arg, K_arg, V_arg): + # Convert memrefs to tensors + emit_buf_to_tensor(output, restrict=True, writable=True) + Q_tensor = emit_buf_to_tensor(Q_arg, restrict=True) + K_tensor = emit_buf_to_tensor(K_arg, restrict=True) + V_tensor = emit_buf_to_tensor(V_arg, restrict=True) + + # TODO: Implement fused attention computation + # This will involve: + # 1. Q @ K^T (batch matmul with transpose) + # 2. Scale by 1/sqrt(n_head) + # 3. Softmax along last dimension + # 4. 
Result @ V (batch matmul) + + # Placeholder: create empty output tensor + output_init = tensor.empty(shape, dtype) + result = output_init + + # Materialize result back to output memref + bufferization.materialize_in_destination( + None, result, output, restrict=True, writable=True + ) + + # Emit utility functions for GPU memory management + emit_gpu_util_funcs(dtype, rank=4) + + return mod diff --git a/lighthouse/schedule/xegpu/fused_attention_schedule.py b/lighthouse/schedule/xegpu/fused_attention_schedule.py new file mode 100644 index 00000000..a05e2ca9 --- /dev/null +++ b/lighthouse/schedule/xegpu/fused_attention_schedule.py @@ -0,0 +1,164 @@ +"""Generate MLIR transform schedule for XeGPU fused attention operation.""" + +from typing import Optional + +from mlir import ir +from mlir.dialects import transform +from mlir.dialects.transform import structured, loop, xegpu +from mlir.dialects.transform import bufferization as transform_bufferization +from mlir.dialects.bufferization import LayoutMapOption + +from lighthouse.pipeline.helper import ( + apply_registered_pass, + canonicalize, + match, + match_and_split, + PipelineInterrupt, +) +from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary +from lighthouse.dialects.transform import transform_ext + + +def get_fused_attention_schedule_module( + stop_at_stage: Optional[str] = None, + parameters: Optional[dict] = None, +) -> ir.Module: + """ + Generate transform schedule for fused attention operation. + + The schedule performs the following transformations: + 1. Tile the fused attention operation + 2. Vectorize operations + 3. Bufferize tensors + 4. Convert to GPU dialect + 5. 
Lower to XeGPU operations + + Args: + stop_at_stage: Optional stage name to stop early (for debugging) + parameters: Dictionary with scheduling parameters: + - batch_size: Batch size (Z) + - num_heads: Number of attention heads (H) + - n_ctx: Context length + - n_head: Head dimension + + Returns: + MLIR module containing the transform schedule + """ + assert parameters is not None, "Schedule parameters must be provided" + + mod = ir.Module.create() + mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() + + with ir.InsertionPoint(mod.body): + # Create a transform sequence with proper signature + named_sequence = transform.named_sequence( + "__transform_main", + [transform.AnyOpType.get()], # input: module + [], # no outputs + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}], + ) + + with ir.InsertionPoint(named_sequence.body): + # match the payload module + anytype = transform.AnyOpType.get() + func = match(named_sequence.bodyTarget, ops={"func.func"}) + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + + xegpu_fused_attention_transform_schedule( + payload_mod, + parameters=parameters, + stop_at_stage=stop_at_stage or "", + ) + + return mod + + +def xegpu_fused_attention_transform_schedule( + mod: ir.Value[transform.AnyOpType], + parameters: dict, + stop_at_stage: str = "", +): + """Transform schedule for fused attention payload.""" + try: + mod = bundle_xegpu_fused_attention_schedule( + mod, + parameters=parameters, + stop_at_stage=stop_at_stage, + ) + + mod = bundle_xegpu_to_binary( + mod, + stop_at_stage=stop_at_stage, + ) + except PipelineInterrupt: + pass + finally: + transform.yield_() + + +def bundle_xegpu_fused_attention_schedule( + mod: ir.Value[transform.AnyOpType], + parameters: dict, + stop_at_stage: str = "", +) -> ir.Value[transform.AnyOpType]: + """Schedule for lowering fused attention payload to xegpu wg level.""" + + if stop_at_stage == "initial": + raise 
PipelineInterrupt() + + anytype = transform.AnyOpType.get() + + # TODO: Implement tiling, fusion, and lowering for fused attention + # This will involve: + # 1. Matching and tiling matmul operations (Q @ K^T) + # 2. Fusing softmax operation + # 3. Tiling second matmul (attention @ V) + # 4. Vectorization + # 5. Bufferization + # 6. GPU outlining + # 7. XeGPU lowering + + func = match(mod, ops={"func.func"}) + + if stop_at_stage == "tiled": + raise PipelineInterrupt() + + # vectorize (placeholder) + # func = structured.VectorizeChildrenAndApplyPatternsOp( + # func, + # fold_type_extensions_into_contract=True, + # ).result + transform.apply_cse(func) + canonicalize(func) + + if stop_at_stage == "vectorized": + raise PipelineInterrupt() + + # bufferize (placeholder) + # mod = apply_registered_pass(mod, "eliminate-empty-tensors") + # identity_layout = LayoutMapOption.IdentityLayoutMap + # mod = transform_bufferization.OneShotBufferizeOp( + # mod, + # allow_return_allocs_from_loops=True, + # bufferize_function_boundaries=True, + # function_boundary_type_conversion=identity_layout, + # ).result + + if stop_at_stage == "bufferized": + raise PipelineInterrupt() + + if stop_at_stage == "gpu-outlining": + raise PipelineInterrupt() + + if stop_at_stage == "xegpu-initial": + raise PipelineInterrupt() + + if stop_at_stage == "xegpu-wg": + raise PipelineInterrupt() + + return mod From 3262e4a58ee3e7756b3098033815f3a719041407 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 20 Apr 2026 22:28:04 +0000 Subject: [PATCH 35/51] add initial version --- examples/xegpu/fused_attention.py | 22 ++++++++++++++----- .../mlir_gen/gpu_fused_attention_payload.py | 2 +- .../xegpu/fused_attention_schedule.py | 6 ----- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/examples/xegpu/fused_attention.py b/examples/xegpu/fused_attention.py index 5aa84e2a..c42faf0e 100644 --- a/examples/xegpu/fused_attention.py +++ b/examples/xegpu/fused_attention.py @@ -18,8 +18,12 @@ from 
lighthouse.execution import GPUMemoryManager from lighthouse.utils.numpy import mlir_to_numpy_dtype from lighthouse.ingress.mlir_gen import get_mlir_elem_type -from lighthouse.ingress.mlir_gen.gpu_fused_attention_payload import generate_gpu_fused_attention_payload -from lighthouse.schedule.xegpu.fused_attention_schedule import get_fused_attention_schedule_module +from lighthouse.ingress.mlir_gen.gpu_fused_attention_payload import ( + generate_gpu_fused_attention_payload, +) +from lighthouse.schedule.xegpu.fused_attention_schedule import ( + get_fused_attention_schedule_module, +) def fused_attention_complexity(Z: int, H: int, n_ctx: int, n_head: int, nbytes: int): @@ -41,7 +45,11 @@ def fused_attention_complexity(Z: int, H: int, n_ctx: int, n_head: int, nbytes: def check_correctness( - Q: np.ndarray, K: np.ndarray, V: np.ndarray, output_arr: np.ndarray, verbose: int = 0 + Q: np.ndarray, + K: np.ndarray, + V: np.ndarray, + output_arr: np.ndarray, + verbose: int = 0, ) -> bool: """ Check correctness of fused attention output. @@ -146,7 +154,9 @@ def _initial_host_arrays(self) -> tuple[np.ndarray]: def get_complexity(self) -> tuple[int, int, int]: nbytes = np.dtype(self.dtype).itemsize - return fused_attention_complexity(self.Z, self.H, self.n_ctx, self.n_head, nbytes) + return fused_attention_complexity( + self.Z, self.H, self.n_ctx, self.n_head, nbytes + ) def payload_module(self) -> ir.Module: """Generate MLIR module for fused attention payload.""" @@ -309,7 +319,9 @@ def parse_cli(): # Compute reference solution on host. 
Q, K, V = wload._initial_host_arrays[1:4] success = check_correctness( - Q, K, V, + Q, + K, + V, result_host_copy, verbose=args.verbose, ) diff --git a/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py b/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py index 788aa175..73046604 100644 --- a/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py +++ b/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py @@ -1,7 +1,7 @@ """Generate MLIR payload for GPU fused attention operation.""" from mlir import ir -from mlir.dialects import linalg, bufferization, tensor, arith +from mlir.dialects import bufferization, tensor from lighthouse.utils.mlir import func_cif from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs diff --git a/lighthouse/schedule/xegpu/fused_attention_schedule.py b/lighthouse/schedule/xegpu/fused_attention_schedule.py index a05e2ca9..5fa47770 100644 --- a/lighthouse/schedule/xegpu/fused_attention_schedule.py +++ b/lighthouse/schedule/xegpu/fused_attention_schedule.py @@ -4,19 +4,13 @@ from mlir import ir from mlir.dialects import transform -from mlir.dialects.transform import structured, loop, xegpu -from mlir.dialects.transform import bufferization as transform_bufferization -from mlir.dialects.bufferization import LayoutMapOption from lighthouse.pipeline.helper import ( - apply_registered_pass, canonicalize, match, - match_and_split, PipelineInterrupt, ) from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary -from lighthouse.dialects.transform import transform_ext def get_fused_attention_schedule_module( From 2135de3537ed55cca6da480ec38fcfa82cf83dd7 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 21 Apr 2026 20:59:59 +0000 Subject: [PATCH 36/51] payload done --- .../mlir_gen/gpu_fused_attention_payload.py | 90 ++++++++++++++++--- 1 file changed, 80 insertions(+), 10 deletions(-) diff --git a/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py 
b/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py index 73046604..87818d08 100644 --- a/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py +++ b/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py @@ -1,7 +1,9 @@ """Generate MLIR payload for GPU fused attention operation.""" +import math + from mlir import ir -from mlir.dialects import bufferization, tensor +from mlir.dialects import arith, bufferization, linalg, tensor from lighthouse.utils.mlir import func_cif from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs @@ -47,16 +49,84 @@ def payload(output, Q_arg, K_arg, V_arg): K_tensor = emit_buf_to_tensor(K_arg, restrict=True) V_tensor = emit_buf_to_tensor(V_arg, restrict=True) - # TODO: Implement fused attention computation - # This will involve: - # 1. Q @ K^T (batch matmul with transpose) - # 2. Scale by 1/sqrt(n_head) - # 3. Softmax along last dimension - # 4. Result @ V (batch matmul) + # Collapse first 3 dimensions (Z, H, n_ctx) into a single dimension + # From (Z, H, n_ctx, n_head) to (Z*H*n_ctx, n_head) + collapsed_dim = Z * H * n_ctx + collapsed_shape_2d = (collapsed_dim, n_head) + + Q_2d = tensor.collapse_shape( + ir.RankedTensorType.get(collapsed_shape_2d, dtype), + Q_tensor, + reassociation=[[0, 1, 2], [3]], + ) + K_2d = tensor.collapse_shape( + ir.RankedTensorType.get(collapsed_shape_2d, dtype), + K_tensor, + reassociation=[[0, 1, 2], [3]], + ) + V_2d = tensor.collapse_shape( + ir.RankedTensorType.get(collapsed_shape_2d, dtype), + V_tensor, + reassociation=[[0, 1, 2], [3]], + ) + + # Step 1: Transpose K to get K^T + # Permute from (collapsed_dim, n_head) to (n_head, collapsed_dim) + kt_shape_2d = (n_head, collapsed_dim) + kt_init = tensor.empty(kt_shape_2d, dtype) + K_transposed = linalg.transpose(K_2d, outs=[kt_init], permutation=[1, 0]) + + # Step 2: Compute Q @ K^T + # Q: (collapsed_dim, n_head) @ K^T: (n_head, collapsed_dim) + # Result: (collapsed_dim, collapsed_dim) + qkt_shape_2d = (collapsed_dim, 
collapsed_dim) + qkt_init = tensor.empty(qkt_shape_2d, dtype) + # Initialize with zeros for matmul accumulation + zero = arith.constant(dtype, 0.0) + qkt_init_filled = linalg.fill(zero, outs=[qkt_init]) + + # Matmul: Q @ K^T + qkt = linalg.matmul(Q_2d, K_transposed, outs=[qkt_init_filled]) + + # Step 3: Scale by 1/sqrt(n_head) + scale_factor = 1.0 / math.sqrt(n_head) + scale_const = arith.constant(dtype, scale_factor) - # Placeholder: create empty output tensor - output_init = tensor.empty(shape, dtype) - result = output_init + # Create a tensor filled with the scale factor + scale_tensor_init = tensor.empty(qkt_shape_2d, dtype) + scale_tensor = linalg.fill(scale_const, outs=[scale_tensor_init]) + + # Elementwise multiply qkt with scale tensor + scaled_qkt_init = tensor.empty(qkt_shape_2d, dtype) + scaled_qkt = linalg.mul(qkt, scale_tensor, outs=[scaled_qkt_init]) + + # Step 4: Apply softmax along the last dimension (dim=1 in 2D) + softmax_init = tensor.empty(qkt_shape_2d, dtype) + attention_weights = linalg.softmax( + result=[ir.RankedTensorType.get(qkt_shape_2d, dtype)], + input=scaled_qkt, + output=softmax_init, + dimension=1, + ) + + # Step 5: Multiply attention weights by V + # attention_weights: (collapsed_dim, collapsed_dim) @ V: (collapsed_dim, n_head) + # Result: (collapsed_dim, n_head) + output_2d_init = tensor.empty(collapsed_shape_2d, dtype) + output_2d_init_filled = linalg.fill(zero, outs=[output_2d_init]) + + result_2d = linalg.matmul( + attention_weights, V_2d, outs=[output_2d_init_filled] + ) + + # Expand back to 4D: (Z*H*n_ctx, n_head) -> (Z, H, n_ctx, n_head) + result = tensor.expand_shape( + ir.RankedTensorType.get(shape, dtype), + result_2d, + reassociation=[[0, 1, 2], [3]], + output_shape=[], + static_output_shape=shape, + ) # Materialize result back to output memref bufferization.materialize_in_destination( From 361e069c7f079dd781c96b7331d97f465bd62446 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 21 Apr 2026 21:51:32 +0000 
Subject: [PATCH 37/51] tiled last matmul --- examples/xegpu/fused_attention.py | 7 ++++ .../xegpu/fused_attention_schedule.py | 42 +++++++++++++------ 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/examples/xegpu/fused_attention.py b/examples/xegpu/fused_attention.py index c42faf0e..213915f0 100644 --- a/examples/xegpu/fused_attention.py +++ b/examples/xegpu/fused_attention.py @@ -214,6 +214,12 @@ def parse_cli(): default=64, help="Head dimension", ) + parser.add_argument( + "--wg-tile-size", + type=int, + default=64, + help="Workgroup tile size for the collapsed batch dimension (Z*H*n_ctx)", + ) parser.add_argument( "--nruns", type=int, @@ -271,6 +277,7 @@ def parse_cli(): "num_heads": args.num_heads, "n_ctx": args.n_ctx, "n_head": args.n_head, + "wg_tile_size": args.wg_tile_size, } Z = args.batch_size diff --git a/lighthouse/schedule/xegpu/fused_attention_schedule.py b/lighthouse/schedule/xegpu/fused_attention_schedule.py index 5fa47770..d66efa24 100644 --- a/lighthouse/schedule/xegpu/fused_attention_schedule.py +++ b/lighthouse/schedule/xegpu/fused_attention_schedule.py @@ -4,10 +4,12 @@ from mlir import ir from mlir.dialects import transform +from mlir.dialects.transform import structured from lighthouse.pipeline.helper import ( canonicalize, match, + match_and_split, PipelineInterrupt, ) from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary @@ -34,6 +36,7 @@ def get_fused_attention_schedule_module( - num_heads: Number of attention heads (H) - n_ctx: Context length - n_head: Head dimension + - wg_tile_size: Workgroup tile size for the collapsed batch dimension (Z*H*n_ctx) Returns: MLIR module containing the transform schedule @@ -106,18 +109,33 @@ def bundle_xegpu_fused_attention_schedule( raise PipelineInterrupt() anytype = transform.AnyOpType.get() - - # TODO: Implement tiling, fusion, and lowering for fused attention - # This will involve: - # 1. Matching and tiling matmul operations (Q @ K^T) - # 2. 
Fusing softmax operation - # 3. Tiling second matmul (attention @ V) - # 4. Vectorization - # 5. Bufferization - # 6. GPU outlining - # 7. XeGPU lowering - - func = match(mod, ops={"func.func"}) + # Match all matmul operations - there should be 2: + # 1. Q @ K^T + # 2. attention_weights @ V + matmul_ops = match_and_split(mod, ops={"linalg.matmul"}, nhandles=2) + + # Get the last matmul (attention_weights @ V) + last_matmul = matmul_ops[1] + func = transform.get_parent_op( + anytype, + last_matmul, + op_name="func.func", + deduplicate=True, + ) + + # Tile the last matmul in the batch dimension using tile_using_forall + # Batch dimension is the first dimension (collapsed_dim = Z * H * n_ctx) + # Extract workgroup tile size from parameters + wg_tile_size = parameters["wg_tile_size"] + + tiled_matmul, forall_loop = structured.structured_tile_using_forall( + anytype, + anytype, + last_matmul, + num_threads=[], + tile_sizes=[], + static_tile_sizes=(wg_tile_size, 0), + ) if stop_at_stage == "tiled": raise PipelineInterrupt() From 4d0827ec0b4e533a07ffe13fff42f7c9447722e8 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 21 Apr 2026 23:59:37 +0000 Subject: [PATCH 38/51] change to batch matmul --- .../mlir_gen/gpu_fused_attention_payload.py | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py b/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py index 87818d08..c2f3d4ec 100644 --- a/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py +++ b/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py @@ -49,81 +49,81 @@ def payload(output, Q_arg, K_arg, V_arg): K_tensor = emit_buf_to_tensor(K_arg, restrict=True) V_tensor = emit_buf_to_tensor(V_arg, restrict=True) - # Collapse first 3 dimensions (Z, H, n_ctx) into a single dimension - # From (Z, H, n_ctx, n_head) to (Z*H*n_ctx, n_head) - collapsed_dim = Z * H * n_ctx - collapsed_shape_2d = (collapsed_dim, n_head) + 
# Collapse first 2 dimensions (Z, H) into a batch dimension + # From (Z, H, n_ctx, n_head) to (Z*H, n_ctx, n_head) + batch_dim = Z * H + collapsed_shape_3d = (batch_dim, n_ctx, n_head) - Q_2d = tensor.collapse_shape( - ir.RankedTensorType.get(collapsed_shape_2d, dtype), + Q_3d = tensor.collapse_shape( + ir.RankedTensorType.get(collapsed_shape_3d, dtype), Q_tensor, - reassociation=[[0, 1, 2], [3]], + reassociation=[[0, 1], [2], [3]], ) - K_2d = tensor.collapse_shape( - ir.RankedTensorType.get(collapsed_shape_2d, dtype), + K_3d = tensor.collapse_shape( + ir.RankedTensorType.get(collapsed_shape_3d, dtype), K_tensor, - reassociation=[[0, 1, 2], [3]], + reassociation=[[0, 1], [2], [3]], ) - V_2d = tensor.collapse_shape( - ir.RankedTensorType.get(collapsed_shape_2d, dtype), + V_3d = tensor.collapse_shape( + ir.RankedTensorType.get(collapsed_shape_3d, dtype), V_tensor, - reassociation=[[0, 1, 2], [3]], + reassociation=[[0, 1], [2], [3]], ) # Step 1: Transpose K to get K^T - # Permute from (collapsed_dim, n_head) to (n_head, collapsed_dim) - kt_shape_2d = (n_head, collapsed_dim) - kt_init = tensor.empty(kt_shape_2d, dtype) - K_transposed = linalg.transpose(K_2d, outs=[kt_init], permutation=[1, 0]) - - # Step 2: Compute Q @ K^T - # Q: (collapsed_dim, n_head) @ K^T: (n_head, collapsed_dim) - # Result: (collapsed_dim, collapsed_dim) - qkt_shape_2d = (collapsed_dim, collapsed_dim) - qkt_init = tensor.empty(qkt_shape_2d, dtype) + # Permute from (batch_dim, n_ctx, n_head) to (batch_dim, n_head, n_ctx) + kt_shape_3d = (batch_dim, n_head, n_ctx) + kt_init = tensor.empty(kt_shape_3d, dtype) + K_transposed = linalg.transpose(K_3d, outs=[kt_init], permutation=[0, 2, 1]) + + # Step 2: Compute Q @ K^T using batch_matmul + # Q: (batch_dim, n_ctx, n_head) @ K^T: (batch_dim, n_head, n_ctx) + # Result: (batch_dim, n_ctx, n_ctx) + qkt_shape_3d = (batch_dim, n_ctx, n_ctx) + qkt_init = tensor.empty(qkt_shape_3d, dtype) # Initialize with zeros for matmul accumulation zero = 
arith.constant(dtype, 0.0) qkt_init_filled = linalg.fill(zero, outs=[qkt_init]) - # Matmul: Q @ K^T - qkt = linalg.matmul(Q_2d, K_transposed, outs=[qkt_init_filled]) + # Batch matmul: Q @ K^T + qkt = linalg.batch_matmul(Q_3d, K_transposed, outs=[qkt_init_filled]) # Step 3: Scale by 1/sqrt(n_head) scale_factor = 1.0 / math.sqrt(n_head) scale_const = arith.constant(dtype, scale_factor) # Create a tensor filled with the scale factor - scale_tensor_init = tensor.empty(qkt_shape_2d, dtype) + scale_tensor_init = tensor.empty(qkt_shape_3d, dtype) scale_tensor = linalg.fill(scale_const, outs=[scale_tensor_init]) # Elementwise multiply qkt with scale tensor - scaled_qkt_init = tensor.empty(qkt_shape_2d, dtype) + scaled_qkt_init = tensor.empty(qkt_shape_3d, dtype) scaled_qkt = linalg.mul(qkt, scale_tensor, outs=[scaled_qkt_init]) - # Step 4: Apply softmax along the last dimension (dim=1 in 2D) - softmax_init = tensor.empty(qkt_shape_2d, dtype) + # Step 4: Apply softmax along the last dimension (dim=2 in 3D) + softmax_init = tensor.empty(qkt_shape_3d, dtype) attention_weights = linalg.softmax( - result=[ir.RankedTensorType.get(qkt_shape_2d, dtype)], + result=[ir.RankedTensorType.get(qkt_shape_3d, dtype)], input=scaled_qkt, output=softmax_init, - dimension=1, + dimension=2, ) - # Step 5: Multiply attention weights by V - # attention_weights: (collapsed_dim, collapsed_dim) @ V: (collapsed_dim, n_head) - # Result: (collapsed_dim, n_head) - output_2d_init = tensor.empty(collapsed_shape_2d, dtype) - output_2d_init_filled = linalg.fill(zero, outs=[output_2d_init]) + # Step 5: Multiply attention weights by V using batch_matmul + # attention_weights: (batch_dim, n_ctx, n_ctx) @ V: (batch_dim, n_ctx, n_head) + # Result: (batch_dim, n_ctx, n_head) + output_3d_init = tensor.empty(collapsed_shape_3d, dtype) + output_3d_init_filled = linalg.fill(zero, outs=[output_3d_init]) - result_2d = linalg.matmul( - attention_weights, V_2d, outs=[output_2d_init_filled] + result_3d = 
linalg.batch_matmul( + attention_weights, V_3d, outs=[output_3d_init_filled] ) - # Expand back to 4D: (Z*H*n_ctx, n_head) -> (Z, H, n_ctx, n_head) + # Expand back to 4D: (Z*H, n_ctx, n_head) -> (Z, H, n_ctx, n_head) result = tensor.expand_shape( ir.RankedTensorType.get(shape, dtype), - result_2d, - reassociation=[[0, 1, 2], [3]], + result_3d, + reassociation=[[0, 1], [2], [3]], output_shape=[], static_output_shape=shape, ) From e379b683225f6943ae0b9645c0b3a5aed4bec420 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 22 Apr 2026 16:26:23 +0000 Subject: [PATCH 39/51] save work --- examples/xegpu/fused_attention.py | 3 +- .../xegpu/fused_attention_schedule.py | 109 ++++++++++++++++-- 2 files changed, 104 insertions(+), 8 deletions(-) diff --git a/examples/xegpu/fused_attention.py b/examples/xegpu/fused_attention.py index 213915f0..da61b3f7 100644 --- a/examples/xegpu/fused_attention.py +++ b/examples/xegpu/fused_attention.py @@ -242,7 +242,8 @@ def parse_cli(): type=str, choices=[ "initial", - "tiled", + "outer-tiled", + "inner-tiled", "vectorized", "bufferized", "gpu-outlining", diff --git a/lighthouse/schedule/xegpu/fused_attention_schedule.py b/lighthouse/schedule/xegpu/fused_attention_schedule.py index d66efa24..6ea15081 100644 --- a/lighthouse/schedule/xegpu/fused_attention_schedule.py +++ b/lighthouse/schedule/xegpu/fused_attention_schedule.py @@ -109,6 +109,7 @@ def bundle_xegpu_fused_attention_schedule( raise PipelineInterrupt() anytype = transform.AnyOpType.get() + anyvalue = transform.AnyValueType.get() # Match all matmul operations - there should be 2: # 1. Q @ K^T # 2. 
attention_weights @ V @@ -137,17 +138,111 @@ def bundle_xegpu_fused_attention_schedule( static_tile_sizes=(wg_tile_size, 0), ) - if stop_at_stage == "tiled": - raise PipelineInterrupt() + # Fuse the softmax producer into forall + softmax_ops = match_and_split(func, ops={"linalg.softmax"}, nhandles=1) + softmax_op = softmax_ops[0] + fused_softmax_op, forall_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=softmax_op, + containing_op=forall_loop, + ) + transform.apply_cse(func) + canonicalize(func) - # vectorize (placeholder) - # func = structured.VectorizeChildrenAndApplyPatternsOp( - # func, - # fold_type_extensions_into_contract=True, - # ).result + # Fuse linalg.mul (scaling) into forall + mul_ops = match_and_split(func, ops={"linalg.mul"}, nhandles=1) + mul_op = mul_ops[0] + _, forall_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=mul_op, + containing_op=forall_loop, + ) + transform.apply_cse(func) + canonicalize(func) + + # Fuse the first matmul (Q @ K^T) into forall + matmul_ops = match_and_split( + func, ops={"linalg.matmul"}, nhandles=2 + ) # Two matmuls are present. + first_matmul = matmul_ops[0] + _, forall_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=first_matmul, + containing_op=forall_loop, + ) + transform.apply_cse(func) + canonicalize(func) + + # Fuse linalg.transpose (K transpose) into forall + transpose_ops = match_and_split(func, ops={"linalg.transpose"}, nhandles=1) + transpose_op = transpose_ops[0] + _, forall_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=transpose_op, + containing_op=forall_loop, + ) transform.apply_cse(func) canonicalize(func) + # At this point all of the key operations are fused into the forall loop. + # Remaining linalg.fill ops can be fused trivially. 
+ fill_ops = match_and_split(func, ops={"linalg.fill"}, nhandles=3) + for fill_op in fill_ops: + _, forall_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=fill_op, + containing_op=forall_loop, + ) + transform.apply_cse(func) + canonicalize(func) + + # tensor.empty() holding the result of transpose can be fused. + transpose_op = match_and_split(func, ops={"linalg.transpose"}, nhandles=1)[0] + transpose_init = transform.get_producer_of_operand( + anytype, transpose_op, operand_number=1 + ) + _, forall_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=transpose_init, + containing_op=forall_loop, + ) + transform.apply_cse(func) + canonicalize(func) + + # tensor.empty() ops holding the result of the softmax can also be fused. + softmax_op = match_and_split(func, ops={"linalg.softmax"}, nhandles=1)[0] + softmax_init = transform.get_producer_of_operand( + anytype, softmax_op, operand_number=1 + ) + _, forall_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=softmax_init, + containing_op=forall_loop, + ) + transform.apply_cse(func) + canonicalize(func) + + if stop_at_stage == "outer-tiled": + raise PipelineInterrupt() + + # # vectorize (placeholder) + # # func = structured.VectorizeChildrenAndApplyPatternsOp( + # # func, + # # fold_type_extensions_into_contract=True, + # # ).result + # transform.apply_cse(func) + # canonicalize(func) + + if stop_at_stage == "inner-tiled": + raise PipelineInterrupt() + if stop_at_stage == "vectorized": raise PipelineInterrupt() From 06e7edefa4213bd656a5d881cb8b105459b027b9 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 22 Apr 2026 20:12:39 +0000 Subject: [PATCH 40/51] save initial softmax doc --- .../softmax_lowering_flow.md | 608 ++++++++++++++++++ 1 file changed, 608 insertions(+) create mode 100644 reduction_tiling_docs/softmax_lowering_flow.md diff --git a/reduction_tiling_docs/softmax_lowering_flow.md 
b/reduction_tiling_docs/softmax_lowering_flow.md new file mode 100644 index 00000000..bccf3aa0 --- /dev/null +++ b/reduction_tiling_docs/softmax_lowering_flow.md @@ -0,0 +1,608 @@ +# Softmax Lowering Flow: IR Transformation Stages + +**Input Shape**: `1024x512xf32` (1024 rows, 512 columns) +**Softmax Dimension**: dim=1 (along the 512-element rows) + +--- + +## Stage 1: Initial IR + +Single high-level `linalg.softmax` operation on the full tensor. + +```mlir +func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { + %1 = bufferization.to_tensor %arg1 : tensor<1024x512xf32> + + // Single softmax op over entire tensor + %3 = linalg.softmax dimension(1) ins(%1 : tensor<1024x512xf32>) + outs(%2 : tensor<1024x512xf32>) -> tensor<1024x512xf32> + + bufferization.materialize_in_destination %3 in %arg0 +} +``` + +--- + +## Stage 2: After Tiling Parallel Dim + +Parallel dimension (rows) tiled into 16 chunks of 64 rows each. Introduces `scf.forall` for parallel execution. + +```mlir +func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { + // Parallel loop over 16 tiles (1024 / 64 = 16) + %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x512xf32>) { + %4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2) + %slice = tensor.extract_slice %1[%4, 0] [64, 512] [1, 1] + + // Softmax on 64x512 slice + %5 = linalg.softmax dimension(1) ins(%slice : tensor<64x512xf32>) + outs(%slice_0 : tensor<64x512xf32>) -> tensor<64x512xf32> + + scf.forall.in_parallel { + tensor.parallel_insert_slice %5 into %arg3[%4, 0] [64, 512] [1, 1] + } + } +} +``` + +--- + +## Stage 3: After Decomposing Softmax + +Softmax decomposed into 4 operations: max reduction → center+exp → sum reduction → division. 
+ +```mlir +func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { + %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x512xf32>) { + %slice = tensor.extract_slice %1[%4, 0] [64, 512] [1, 1] + + // 1. Max reduction: (64,512) -> (64,) + %7 = linalg.generic {indexing_maps = [map<(d0,d1) -> (d0,d1)>, map<(d0,d1) -> (d0)>], + iterator_types = ["parallel", "reduction"]} + ins(%slice : tensor<64x512xf32>) outs(%6 : tensor<64xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = arith.maxnumf %in, %out : f32 + linalg.yield %12 : f32 + } -> tensor<64xf32> + + // 2. Center and exp: (64,512) -> (64,512) + %8 = linalg.generic {indexing_maps = [map<(d0,d1) -> (d0,d1)>, map<(d0,d1) -> (d0)>, map<(d0,d1) -> (d0,d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%slice, %7 : tensor<64x512xf32>, tensor<64xf32>) outs(%slice_0 : tensor<64x512xf32>) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %12 = arith.subf %in, %in_2 : f32 + %13 = math.exp %12 : f32 + linalg.yield %13 : f32 + } -> tensor<64x512xf32> + + // 3. Sum reduction: (64,512) -> (64,) + %10 = linalg.generic {indexing_maps = [map<(d0,d1) -> (d0,d1)>, map<(d0,d1) -> (d0)>], + iterator_types = ["parallel", "reduction"]} + ins(%8 : tensor<64x512xf32>) outs(%9 : tensor<64xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = arith.addf %in, %out : f32 + linalg.yield %12 : f32 + } -> tensor<64xf32> + + // 4. 
Division: (64,512) -> (64,512) + %11 = linalg.generic {indexing_maps = [map<(d0,d1) -> (d0,d1)>, map<(d0,d1) -> (d0)>, map<(d0,d1) -> (d0,d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%8, %10 : tensor<64x512xf32>, tensor<64xf32>) outs(%slice_0 : tensor<64x512xf32>) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %12 = arith.divf %in, %in_2 : f32 + linalg.yield %12 : f32 + } -> tensor<64x512xf32> + + scf.forall.in_parallel { + tensor.parallel_insert_slice %11 into %arg3[%4, 0] [64, 512] [1, 1] + } + } +} +``` + +**Loop structure:** +- **Outer**: `scf.forall` with 16 parallel iterations +- Each iteration performs 4 sequential linalg ops + +**Key operations:** +1. **Max reduction**: Reduces from `(64, 512)` to `(64,)` using `maxnumf` +2. **Center+exp**: Element-wise subtract max and apply exp +3. **Sum reduction**: Reduces from `(64, 512)` to `(64,)` using `addf` +4. **Division**: Element-wise divide by sum + +--- + +## Stage 4: After Tiling Division + +Division operation tiled along dimension 1 into chunks of 16 columns. 
+ +```mlir +func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { + %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x512xf32>) { + %slice = tensor.extract_slice %0[%3, 0] [64, 512] [1, 1] + + // Max reduction (64,512) -> (64,) + %6 = linalg.generic {iterator_types = ["parallel", "reduction"]} + ins(%slice) outs(%5) { maxnumf } -> tensor<64xf32> + + // Center and exp (64,512) -> (64,512) + %7 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice, %6) outs(%slice_1) { subf, exp } -> tensor<64x512xf32> + + // Sum reduction (64,512) -> (64,) + %9 = linalg.generic {iterator_types = ["parallel", "reduction"]} + ins(%7) outs(%8) { addf } -> tensor<64xf32> + + // Division tiled over columns: loop from 0 to 512 step 16 + %10 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %slice_1) -> (tensor<64x512xf32>) { + %slice_2 = tensor.extract_slice %7[0, %arg4] [64, 16] [1, 1] + %slice_3 = tensor.extract_slice %arg5[0, %arg4] [64, 16] [1, 1] + + // Division on 64x16 tile + %11 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_2, %9 : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_3 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_4: f32, %out: f32): + %12 = arith.divf %in, %in_4 : f32 + linalg.yield %12 : f32 + } -> tensor<64x16xf32> + + %inserted = tensor.insert_slice %11 into %arg5[0, %arg4] [64, 16] [1, 1] + scf.yield %inserted : tensor<64x512xf32> + } + + scf.forall.in_parallel { + tensor.parallel_insert_slice %10 into %arg3[%3, 0] [64, 512] [1, 1] + } + } +} +``` + +**Loop structure:** +- **Outer**: `scf.forall` with 16 parallel iterations (64-row tiles) +- **Inner**: `scf.for` with 32 sequential iterations (512/16 = 32 column tiles) + +**Key change:** Division now operates on `64x16` tiles instead of full `64x512` + +--- + +## Stage 5: After Fusing Max+Center+Exp into Division Loop + +The center-and-exp computation is fused into the division loop to recompute values on-the-fly. 
+ +```mlir +func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { + %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x512xf32>) { + %slice = tensor.extract_slice %0[%3, 0] [64, 512] [1, 1] + + // Max reduction (64,512) -> (64,) + %6 = linalg.generic {iterator_types = ["parallel", "reduction"]} + ins(%slice) outs(%5) { maxnumf } -> tensor<64xf32> + + // Center and exp (still materialized for sum reduction) + %7 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice, %6) outs(%slice_1) { subf, exp } -> tensor<64x512xf32> + + // Sum reduction (64,512) -> (64,) + %9 = linalg.generic {iterator_types = ["parallel", "reduction"]} + ins(%7) outs(%8) { addf } -> tensor<64xf32> + + // Division loop with fused center+exp+div + %10 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %slice_1) -> (tensor<64x512xf32>) { + %slice_2 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] // from original input + %slice_3 = tensor.extract_slice %arg5[0, %arg4] [64, 16] [1, 1] + + // Fused center+exp on 64x16 tile + %11 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_2, %6 : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_3 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_4: f32, %out: f32): + %13 = arith.subf %in, %in_4 : f32 + %14 = math.exp %13 : f32 + linalg.yield %14 : f32 + } -> tensor<64x16xf32> + + // Division on 64x16 tile + %12 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%11, %9 : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_3 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_4: f32, %out: f32): + %13 = arith.divf %in, %in_4 : f32 + linalg.yield %13 : f32 + } -> tensor<64x16xf32> + + %inserted = tensor.insert_slice %12 into %arg5[0, %arg4] [64, 16] [1, 1] + scf.yield %inserted : tensor<64x512xf32> + } + + scf.forall.in_parallel { + tensor.parallel_insert_slice %10 into %arg3[%3, 0] [64, 512] [1, 1] + } + } +} +``` + +**Loop structure:** +- **Outer**: `scf.forall` 
with 16 parallel iterations +- **Inner**: `scf.for` with 32 sequential iterations + +**Key change:** Inside the division loop, center+exp is recomputed on `64x16` tiles from the original input, avoiding the need to store the full `64x512` exp tensor. + +--- + +## Stage 6: After Tiling Sum Reduction + +Sum reduction tiled into chunks of 16 columns, introducing partial sums followed by a final reduction. + +```mlir +func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { + %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x512xf32>) { + %slice = tensor.extract_slice %0[%3, 0] [64, 512] [1, 1] + + // Max reduction (64,512) -> (64,) + %6 = linalg.generic {iterator_types = ["parallel", "reduction"]} + ins(%slice) outs(%5) { maxnumf } -> tensor<64xf32> + + // Center and exp (64,512) -> (64,512) + %7 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice, %6) outs(%slice_1) { subf, exp } -> tensor<64x512xf32> + + // Tiled sum reduction: accumulate into 64x16 buffer + %11 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %10) -> (tensor<64x16xf32>) { + %slice_2 = tensor.extract_slice %7[0, %arg4] [64, 16] [1, 1] + + // Accumulate sums + %13 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_2 : tensor<64x16xf32>) outs(%arg5 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %14 = arith.addf %in, %out : f32 + linalg.yield %14 : f32 + } -> tensor<64x16xf32> + + scf.yield %13 : tensor<64x16xf32> + } + + // Final reduction: (64,16) -> (64,) + %reduced = linalg.reduce ins(%11 : tensor<64x16xf32>) outs(%8 : tensor<64xf32>) dimensions = [1] { + (%in: f32, %init: f32) { + %13 = arith.addf %in, %init : f32 + linalg.yield %13 : f32 + } + } + + // Division loop (same as before) + %12 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %slice_1) -> (tensor<64x512xf32>) { + // ... fused center+exp+div ... 
+ } + + scf.forall.in_parallel { + tensor.parallel_insert_slice %12 into %arg3[%3, 0] [64, 512] [1, 1] + } + } +} +``` + +**Loop structure:** +- **Outer**: `scf.forall` with 16 parallel iterations +- **Sum reduction loop**: `scf.for` with 32 iterations, accumulating into `64x16` +- **Final reduction**: `linalg.reduce` from `(64, 16)` to `(64,)` +- **Division loop**: `scf.for` with 32 iterations + +**Key change:** Sum reduction split into partial accumulation (loop) + final reduction (linalg.reduce) + +--- + +## Stage 7: After Fusing Max+Center+Exp into Sum Reduction Loop + +The center-and-exp computation is now fused into the sum reduction loop as well. + +```mlir +func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { + %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x512xf32>) { + %slice = tensor.extract_slice %0[%3, 0] [64, 512] [1, 1] + + // Max reduction (64,512) -> (64,) + %6 = linalg.generic {iterator_types = ["parallel", "reduction"]} + ins(%slice) outs(%5) { maxnumf } -> tensor<64xf32> + + // Sum reduction loop with fused center+exp + %10 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %9) -> (tensor<64x16xf32>) { + %slice_2 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] + + // Fused center+exp + %12 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_2, %6 : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_3 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_4: f32, %out: f32): + %14 = arith.subf %in, %in_4 : f32 + %15 = math.exp %14 : f32 + linalg.yield %15 : f32 + } -> tensor<64x16xf32> + + // Accumulate into sum buffer + %13 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%12 : tensor<64x16xf32>) outs(%arg5 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %14 = arith.addf %in, %out : f32 + linalg.yield %14 : f32 + } -> tensor<64x16xf32> + + scf.yield %13 : tensor<64x16xf32> + } + + // Final reduction: (64,16) -> (64,) + %reduced = linalg.reduce 
ins(%10 : tensor<64x16xf32>) outs(%7 : tensor<64xf32>) dimensions = [1] { + (%in: f32, %init: f32) { + %12 = arith.addf %in, %init : f32 + linalg.yield %12 : f32 + } + } + + // Division loop with fused center+exp+div + %11 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %slice_1) -> (tensor<64x512xf32>) { + %slice_2 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] + + // Fused center+exp + %12 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_2, %6 : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_3 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_4: f32, %out: f32): + %14 = arith.subf %in, %in_4 : f32 + %15 = math.exp %14 : f32 + linalg.yield %15 : f32 + } -> tensor<64x16xf32> + + // Division + %13 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%12, %reduced : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_3 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_4: f32, %out: f32): + %14 = arith.divf %in, %in_4 : f32 + linalg.yield %14 : f32 + } -> tensor<64x16xf32> + + %inserted = tensor.insert_slice %13 into %arg5[0, %arg4] [64, 16] [1, 1] + scf.yield %inserted : tensor<64x512xf32> + } + + scf.forall.in_parallel { + tensor.parallel_insert_slice %11 into %arg3[%3, 0] [64, 512] [1, 1] + } + } +} +``` + +**Loop structure:** +- **Outer**: `scf.forall` with 16 parallel iterations +- **Sum reduction loop**: `scf.for` with 32 iterations (fused center+exp+accumulate) +- **Final reduction**: `linalg.reduce` +- **Division loop**: `scf.for` with 32 iterations (fused center+exp+div) + +**Key change:** Center+exp is recomputed twice (once for sum, once for division) to avoid storing intermediate `64x512` tensor. + +--- + +## Stage 8: After Tiling Max Reduction + +Max reduction also tiled into 16-column chunks with partial max followed by final reduction. 
+ +```mlir +func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { + %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x512xf32>) { + %slice = tensor.extract_slice %0[%3, 0] [64, 512] [1, 1] + + // Tiled max reduction: accumulate into 64x16 buffer + %8 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %7) -> (tensor<64x16xf32>) { + %slice_7 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] + + // Max accumulation + %14 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_7 : tensor<64x16xf32>) outs(%slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %15 = arith.maxnumf %in, %out : f32 + linalg.yield %15 : f32 + } -> tensor<64x16xf32> + + %inserted = tensor.insert_slice %14 into %arg5[0, 0] [64, 16] [1, 1] + scf.yield %inserted : tensor<64x16xf32> + } + + // Final max reduction: (64,16) -> (64,) + %reduced = linalg.reduce ins(%8 : tensor<64x16xf32>) outs(%5 : tensor<64xf32>) dimensions = [1] { + (%in: f32, %init: f32) { + %14 = arith.maxnumf %in, %init : f32 + linalg.yield %14 : f32 + } + } + + // Sum reduction loop with fused center+exp + %12 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) { + %slice_7 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] + + // Fused center+exp using reduced max + %14 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_7, %reduced : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_9: f32, %out: f32): + %16 = arith.subf %in, %in_9 : f32 + %17 = math.exp %16 : f32 + linalg.yield %17 : f32 + } -> tensor<64x16xf32> + + // Sum accumulation + %15 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%14 : tensor<64x16xf32>) outs(%arg5 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %16 = arith.addf %in, %out : f32 + linalg.yield %16 : f32 + } -> tensor<64x16xf32> + + scf.yield %15 : tensor<64x16xf32> + } + + // Final sum 
reduction: (64,16) -> (64,) + %reduced_6 = linalg.reduce ins(%12 : tensor<64x16xf32>) outs(%9 : tensor<64xf32>) dimensions = [1] { + (%in: f32, %init: f32) { + %14 = arith.addf %in, %init : f32 + linalg.yield %14 : f32 + } + } + + // Division loop with fused center+exp+div + %13 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %slice_1) -> (tensor<64x512xf32>) { + %slice_7 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] + + // Fused center+exp + %14 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_7, %reduced : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_9: f32, %out: f32): + %16 = arith.subf %in, %in_9 : f32 + %17 = math.exp %16 : f32 + linalg.yield %17 : f32 + } -> tensor<64x16xf32> + + // Division + %15 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%14, %reduced_6 : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_9: f32, %out: f32): + %16 = arith.divf %in, %in_9 : f32 + linalg.yield %16 : f32 + } -> tensor<64x16xf32> + + %inserted = tensor.insert_slice %15 into %arg5[0, %arg4] [64, 16] [1, 1] + scf.yield %inserted : tensor<64x512xf32> + } + + scf.forall.in_parallel { + tensor.parallel_insert_slice %13 into %arg3[%3, 0] [64, 512] [1, 1] + } + } +} +``` + +**Loop structure:** +- **Outer**: `scf.forall` with 16 parallel iterations (64-row tiles) +- **Max reduction loop**: `scf.for` with 32 iterations → `linalg.reduce` +- **Sum reduction loop**: `scf.for` with 32 iterations → `linalg.reduce` +- **Division loop**: `scf.for` with 32 iterations + +**Key change:** All three stages (max, sum, div) now use tiled loops operating on `64x16` chunks. + +--- + +## Stage 9: Final Vectorized XeGPU Version + +After vectorization, bufferization, and conversion to XeGPU operations. Uses shared local memory (SLM) for partial reductions. 
+ +```mlir +gpu.module @payload_kernel { + gpu.func @payload_kernel(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) kernel { + %block_id_x = gpu.block_id x + %0 = arith.muli %block_id_x, %c64 : index + %subview = memref.subview %arg0[%0, 0] [64, 512] [1, 1] + + // Allocate SLM buffer for partial reductions + %alloca = memref.alloca() : memref<64x16xf32, 3> + %1 = xegpu.create_mem_desc %alloca : !xegpu.mem_desc<64x16xf32> + + // Max reduction loop + xegpu.store_matrix %cst_2, %1[0, 0] // init with -inf + scf.for %arg2 = %c0 to %c512 step %c16 { + // Load 64x16 tile from global memory + %6 = xegpu.create_nd_tdesc %arg1 : !xegpu.tensor_desc<64x16xf32> + %7 = xegpu.load_nd %6[%0, %arg2] : vector<64x16xf32> + + // Load partial max from SLM, compute max, store back + %8 = xegpu.load_matrix %1[0, 0] : vector<64x16xf32> + %9 = arith.maxnumf %7, %8 : vector<64x16xf32> + xegpu.store_matrix %9, %1[0, 0] + } + + // Final max reduction across 16 columns + %2 = xegpu.load_matrix %1[0, 0] : vector<64x16xf32> + %3 = vector.multi_reduction <maxnumf>, %2, %cst_1 [1] : vector<64x16xf32> to vector<64xf32> + + // Sum reduction loop + xegpu.store_matrix %cst_0, %1[0, 0] // init with 0.0 + scf.for %arg2 = %c0 to %c512 step %c16 { + // Load 64x16 tile + %6 = xegpu.create_nd_tdesc %arg1 : !xegpu.tensor_desc<64x16xf32> + %7 = xegpu.load_nd %6[%0, %arg2] : vector<64x16xf32> + + // Fused center+exp + %8 = vector.broadcast %3 : vector<64xf32> to vector<16x64xf32> + %9 = vector.transpose %8, [1, 0] : vector<64x16xf32> + %10 = arith.subf %7, %9 : vector<64x16xf32> + %11 = math.exp %10 : vector<64x16xf32> + + // Accumulate sum in SLM + %12 = xegpu.load_matrix %1[0, 0] : vector<64x16xf32> + %13 = arith.addf %11, %12 : vector<64x16xf32> + xegpu.store_matrix %13, %1[0, 0] + } + + // Final sum reduction across 16 columns + %4 = xegpu.load_matrix %1[0, 0] : vector<64x16xf32> + %5 = vector.multi_reduction <add>, %4, %cst [1] : vector<64x16xf32> to vector<64xf32> + + // Division loop + scf.for %arg2 = %c0 
to %c512 step %c16 { + // Load 64x16 tile + %6 = xegpu.create_nd_tdesc %arg1 : !xegpu.tensor_desc<64x16xf32> + %7 = xegpu.load_nd %6[%0, %arg2] : vector<64x16xf32> + + // Fused center+exp + %8 = vector.broadcast %3 : vector<64xf32> to vector<16x64xf32> + %9 = vector.transpose %8, [1, 0] : vector<64x16xf32> + %10 = arith.subf %7, %9 : vector<64x16xf32> + %11 = math.exp %10 : vector<64x16xf32> + + // Division + %12 = vector.broadcast %5 : vector<64xf32> to vector<16x64xf32> + %13 = vector.transpose %12, [1, 0] : vector<64x16xf32> + %14 = arith.divf %11, %13 : vector<64x16xf32> + + // Store result to global memory + %18 = xegpu.create_nd_tdesc %intptr : !xegpu.tensor_desc<64x16xf32> + xegpu.store_nd %14, %18[0, %arg2] + } + + gpu.return + } +} +``` + +**Loop structure:** +- **Grid**: 16 blocks (one per 64-row tile) +- **Per block**: + - **Max reduction loop**: 32 iterations (512/16) + - **Final max reduction**: `vector.multi_reduction` + - **Sum reduction loop**: 32 iterations (512/16) + - **Final sum reduction**: `vector.multi_reduction` + - **Division loop**: 32 iterations (512/16) + +**Key transformations:** +- Linalg operations → vectorized operations on `vector<64x16xf32>` +- Tensor buffers → SLM allocation (`memref<64x16xf32, 3>`) +- Memory operations → XeGPU load/store operations (`xegpu.load_nd`, `xegpu.store_nd`, `xegpu.load_matrix`, `xegpu.store_matrix`) +- GPU kernel launch with 16 blocks × 128 threads + +--- + +## Summary of Transformations + +| Stage | Key Transformation | Loop Structure | +|-------|-------------------|----------------| +| 1 | Initial high-level softmax | No loops | +| 2 | Tile parallel dimension | `scf.forall(16)` | +| 3 | Decompose softmax | `scf.forall(16)` + 4 sequential ops | +| 4 | Tile division | `scf.forall(16)` → `scf.for(32)` | +| 5 | Fuse into division loop | Recompute center+exp in div loop | +| 6 | Tile sum reduction | Add sum loop + final reduction | +| 7 | Fuse into sum loop | Recompute center+exp in sum loop | +| 8 | Tile 
max reduction | Add max loop + final reduction | +| 9 | Vectorize + XeGPU | GPU kernel with SLM and vector ops | + +**Final computation pattern per GPU block:** +1. **Max reduction**: 32-iteration loop with SLM accumulation → final reduction +2. **Sum reduction**: 32-iteration loop (fused center+exp) with SLM accumulation → final reduction +3. **Division**: 32-iteration loop (fused center+exp+div) writing to global memory + +This progressive lowering enables efficient GPU execution with: +- Parallelism across 64-row tiles +- SLM for partial reduction storage +- Recomputation of center+exp to reduce memory traffic +- Vectorized 64x16 tile operations From 05e3e07f677e1d5d253e8c94b8d262eca9df2dd0 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 22 Apr 2026 20:14:27 +0000 Subject: [PATCH 41/51] save work --- lighthouse/schedule/xegpu/softmax_schedule.py | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 862677df..b406b5fc 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -102,6 +102,22 @@ def xegpu_softmax_transform_schedule( transform.yield_() +def match_and_print_parent_function(op, msg): + """Get the parent function of an operation and print it. 
+ + Args: + op: The operation whose parent function to find + msg: Name label to use when printing the function + """ + anytype = transform.AnyOpType.get() + func = transform.get_parent_op( + anytype, + op, + op_name="func.func", + deduplicate=True, + ) + transform.print_(target=func, name=msg) + def bundle_xegpu_softmax_schedule( mod: ir.Value[transform.AnyOpType], parameters: dict, @@ -120,6 +136,8 @@ def bundle_xegpu_softmax_schedule( transform.AnyOpType.get(), mod, ops=["linalg.softmax"] ) + match_and_print_parent_function(softmax_op, "initial") + # Tile the softmax operation using tile_using_forall tiled_op, for_op = structured.structured_tile_using_forall( anytype, @@ -129,6 +147,7 @@ def bundle_xegpu_softmax_schedule( tile_sizes=[], static_tile_sizes=(parameters["wg_rows"],), ) + match_and_print_parent_function(for_op, "after tiling parallel dim") func = transform.get_parent_op( anytype, @@ -142,9 +161,12 @@ def bundle_xegpu_softmax_schedule( ) structured.structured_decompose_interface(anytype, softmax_ops) + + linalg_ops = match_and_split( func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 ) + match_and_print_parent_function(linalg_ops[0], "after decomposing softmax") max_reduction = linalg_ops[1] max_center_and_exp_op = linalg_ops[2] sum_reduction = linalg_ops[4] @@ -156,6 +178,10 @@ def bundle_xegpu_softmax_schedule( _, div_loop = structured.TileUsingForOp( div_op, sizes=[0, reduction_step_size] ).results + # Cleanup after tiling and fusion + transform.apply_cse(func) + canonicalize(func) + match_and_print_parent_function(div_loop, "after tiling div") # Fuse max_center_and_exp_op into the div loop _, fused_loop = structured.structured_fuse_into_containing_op( @@ -164,6 +190,10 @@ def bundle_xegpu_softmax_schedule( producer_op=max_center_and_exp_op, containing_op=div_loop, ) + # Cleanup after tiling and fusion + transform.apply_cse(func) + canonicalize(func) + match_and_print_parent_function(fused_loop, "after fusing max_center_and_exp into div 
loop") # Tile the sum reduction and fuse the sub+exp producer into it _, _, _, sum_loop = structured.structured_tile_reduction_using_for( @@ -174,6 +204,10 @@ def bundle_xegpu_softmax_schedule( target=sum_reduction, tile_sizes=[0, reduction_step_size], ) + # Cleanup after tiling and fusion + transform.apply_cse(func) + canonicalize(func) + match_and_print_parent_function(sum_loop, "after tiling sum reduction") func = transform.get_parent_op( anytype, @@ -193,10 +227,14 @@ def bundle_xegpu_softmax_schedule( producer_op=max_center_and_exp_op, containing_op=sum_loop, ) + # Cleanup after tiling and fusion + transform.apply_cse(func) + canonicalize(func) + match_and_print_parent_function(fused_sum_loop, "after fusing max_center_and_exp into sum reduction loop") # Tile the max reduction. max_reduction = linalg_ops[0] - structured.structured_tile_reduction_using_for( + _, _, _, max_loop = structured.structured_tile_reduction_using_for( [anytype], anytype, anytype, @@ -204,6 +242,7 @@ def bundle_xegpu_softmax_schedule( target=max_reduction, tile_sizes=[0, reduction_step_size], ) + match_and_print_parent_function(max_loop, "after tiling max reduction") # Cleanup after tiling and fusion transform.apply_cse(func) From 028b27d36d6da38031652cf6ffff94acae913c66 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 22 Apr 2026 20:23:28 +0000 Subject: [PATCH 42/51] save work --- .../softmax_lowering_flow.md | 68 +------------------ 1 file changed, 1 insertion(+), 67 deletions(-) diff --git a/reduction_tiling_docs/softmax_lowering_flow.md b/reduction_tiling_docs/softmax_lowering_flow.md index bccf3aa0..4996e3d7 100644 --- a/reduction_tiling_docs/softmax_lowering_flow.md +++ b/reduction_tiling_docs/softmax_lowering_flow.md @@ -100,16 +100,6 @@ func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { } ``` -**Loop structure:** -- **Outer**: `scf.forall` with 16 parallel iterations -- Each iteration performs 4 sequential linalg ops - -**Key operations:** -1. 
**Max reduction**: Reduces from `(64, 512)` to `(64,)` using `maxnumf` -2. **Center+exp**: Element-wise subtract max and apply exp -3. **Sum reduction**: Reduces from `(64, 512)` to `(64,)` using `addf` -4. **Division**: Element-wise divide by sum - --- ## Stage 4: After Tiling Division @@ -157,12 +147,6 @@ func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { } ``` -**Loop structure:** -- **Outer**: `scf.forall` with 16 parallel iterations (64-row tiles) -- **Inner**: `scf.for` with 32 sequential iterations (512/16 = 32 column tiles) - -**Key change:** Division now operates on `64x16` tiles instead of full `64x512` - --- ## Stage 5: After Fusing Max+Center+Exp into Division Loop @@ -219,11 +203,6 @@ func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { } ``` -**Loop structure:** -- **Outer**: `scf.forall` with 16 parallel iterations -- **Inner**: `scf.for` with 32 sequential iterations - -**Key change:** Inside the division loop, center+exp is recomputed on `64x16` tiles from the original input, avoiding the need to store the full `64x512` exp tensor. 
--- @@ -279,14 +258,6 @@ func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { } ``` -**Loop structure:** -- **Outer**: `scf.forall` with 16 parallel iterations -- **Sum reduction loop**: `scf.for` with 32 iterations, accumulating into `64x16` -- **Final reduction**: `linalg.reduce` from `(64, 16)` to `(64,)` -- **Division loop**: `scf.for` with 32 iterations - -**Key change:** Sum reduction split into partial accumulation (loop) + final reduction (linalg.reduce) - --- ## Stage 7: After Fusing Max+Center+Exp into Sum Reduction Loop @@ -366,14 +337,6 @@ func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { } ``` -**Loop structure:** -- **Outer**: `scf.forall` with 16 parallel iterations -- **Sum reduction loop**: `scf.for` with 32 iterations (fused center+exp+accumulate) -- **Final reduction**: `linalg.reduce` -- **Division loop**: `scf.for` with 32 iterations (fused center+exp+div) - -**Key change:** Center+exp is recomputed twice (once for sum, once for division) to avoid storing intermediate `64x512` tensor. - --- ## Stage 8: After Tiling Max Reduction @@ -473,14 +436,6 @@ func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { } ``` -**Loop structure:** -- **Outer**: `scf.forall` with 16 parallel iterations (64-row tiles) -- **Max reduction loop**: `scf.for` with 32 iterations → `linalg.reduce` -- **Sum reduction loop**: `scf.for` with 32 iterations → `linalg.reduce` -- **Division loop**: `scf.for` with 32 iterations - -**Key change:** All three stages (max, sum, div) now use tiled loops operating on `64x16` chunks. 
- --- ## Stage 9: Final Vectorized XeGPU Version @@ -565,21 +520,6 @@ gpu.module @payload_kernel { } ``` -**Loop structure:** -- **Grid**: 16 blocks (one per 64-row tile) -- **Per block**: - - **Max reduction loop**: 32 iterations (512/16) - - **Final max reduction**: `vector.multi_reduction` - - **Sum reduction loop**: 32 iterations (512/16) - - **Final sum reduction**: `vector.multi_reduction` - - **Division loop**: 32 iterations (512/16) - -**Key transformations:** -- Linalg operations → vectorized operations on `vector<64x16xf32>` -- Tensor buffers → SLM allocation (`memref<64x16xf32, 3>`) -- Memory operations → XeGPU load/store operations (`xegpu.load_nd`, `xegpu.store_nd`, `xegpu.load_matrix`, `xegpu.store_matrix`) -- GPU kernel launch with 16 blocks × 128 threads - --- ## Summary of Transformations @@ -599,10 +539,4 @@ gpu.module @payload_kernel { **Final computation pattern per GPU block:** 1. **Max reduction**: 32-iteration loop with SLM accumulation → final reduction 2. **Sum reduction**: 32-iteration loop (fused center+exp) with SLM accumulation → final reduction -3. **Division**: 32-iteration loop (fused center+exp+div) writing to global memory - -This progressive lowering enables efficient GPU execution with: -- Parallelism across 64-row tiles -- SLM for partial reduction storage -- Recomputation of center+exp to reduce memory traffic -- Vectorized 64x16 tile operations +3. 
**Division**: 32-iteration loop (fused center+exp+div) writing to global memory \ No newline at end of file From cd93df60b53a0a47307e00e33d23645849ab6eb3 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 23 Apr 2026 02:10:36 +0000 Subject: [PATCH 43/51] add optimization note --- .../softmax_lowering_flow.md | 192 +++++++++++++++++- 1 file changed, 191 insertions(+), 1 deletion(-) diff --git a/reduction_tiling_docs/softmax_lowering_flow.md b/reduction_tiling_docs/softmax_lowering_flow.md index 4996e3d7..c6d8ad50 100644 --- a/reduction_tiling_docs/softmax_lowering_flow.md +++ b/reduction_tiling_docs/softmax_lowering_flow.md @@ -539,4 +539,194 @@ gpu.module @payload_kernel { **Final computation pattern per GPU block:** 1. **Max reduction**: 32-iteration loop with SLM accumulation → final reduction 2. **Sum reduction**: 32-iteration loop (fused center+exp) with SLM accumulation → final reduction -3. **Division**: 32-iteration loop (fused center+exp+div) writing to global memory \ No newline at end of file +3. **Division**: 32-iteration loop (fused center+exp+div) writing to global memory + +--- + +## Optimization: Fusing Max and Sum Reduction Loops + +After Stage 8, we can apply an additional optimization to fuse the max reduction loop and sum reduction loop into a single loop. This reduces the number of loops from 3 to 2. + +### Key Insight + +The optimization leverages the **online softmax algorithm**, which allows us to incrementally update both the global maximum and the global sum as we process each tile of the reduction dimension. For each 16-column tile: + +1. Compute the **local max** for the tile +2. Update the **global max** using the local max +3. Compute the **local centered sum** using exp(x - local_max) +4. **Rescale** the global sum by exp(global_max_old - global_max_new) +5. 
**Add** the rescaled local sum to the global sum + +This maintains numerical stability while processing tiles incrementally, since we adjust previous sums by the correction factor when we discover a new maximum. + +### Before: Separate Max and Sum Loops (3 loops total) + +```mlir +func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { + %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x512xf32>) { + %slice = tensor.extract_slice %0[%3, 0] [64, 512] [1, 1] + + // Loop 1: Max reduction + %8 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %7) -> (tensor<64x16xf32>) { + %slice_7 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] + + %14 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_7 : tensor<64x16xf32>) outs(%slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %15 = arith.maxnumf %in, %out : f32 + linalg.yield %15 : f32 + } -> tensor<64x16xf32> + + scf.yield %14 : tensor<64x16xf32> + } + %reduced = linalg.reduce ins(%8) outs(%5) dimensions = [1] { maxnumf } + + // Loop 2: Sum reduction with center+exp + %12 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) { + %slice_7 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] + + // Center+exp using global max + %14 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_7, %reduced : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_8) { + ^bb0(%in: f32, %in_9: f32, %out: f32): + %16 = arith.subf %in, %in_9 : f32 + %17 = math.exp %16 : f32 + linalg.yield %17 : f32 + } -> tensor<64x16xf32> + + // Accumulate sum + %15 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%14) outs(%arg5) { + ^bb0(%in: f32, %out: f32): + %16 = arith.addf %in, %out : f32 + linalg.yield %16 : f32 + } -> tensor<64x16xf32> + + scf.yield %15 : tensor<64x16xf32> + } + %reduced_6 = linalg.reduce ins(%12) outs(%9) dimensions = [1] { addf } + + // Loop 3: Division with center+exp+div + 
%13 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %slice_1) -> (tensor<64x512xf32>) { + // ... fused center+exp+div ... + } + } +} +``` + +### After: Fused Max+Sum Loop (2 loops total) + +```mlir +func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { + %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x512xf32>) { + %slice = tensor.extract_slice %0[%3, 0] [64, 512] [1, 1] + + // Loop 1: Fused max+sum reduction (online softmax) + %fused = scf.for %arg4 = %c0 to %c512 step %c16 + iter_args(%global_max = %init_max, %global_sum_buffer = %7) + -> (tensor<64xf32>, tensor<64x16xf32>) { + %slice_7 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] + + // Step 1: Compute local max for this tile + %local_max = linalg.reduce ins(%slice_7 : tensor<64x16xf32>) outs(%5 : tensor<64xf32>) dimensions = [1] { + (%in: f32, %init: f32) { + %max = arith.maxnumf %in, %init : f32 + linalg.yield %max : f32 + } + } + + // Step 2: Update global max + %new_global_max = linalg.generic {iterator_types = ["parallel"]} + ins(%global_max, %local_max : tensor<64xf32>, tensor<64xf32>) outs(%out_max : tensor<64xf32>) { + ^bb0(%old_max: f32, %curr_max: f32, %out: f32): + %updated = arith.maxnumf %old_max, %curr_max : f32 + linalg.yield %updated : f32 + } -> tensor<64xf32> + + // Step 3: Compute local centered sum: exp(x - local_max) + %local_exp = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_7, %local_max : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %max: f32, %out: f32): + %centered = arith.subf %in, %max : f32 + %exp_val = math.exp %centered : f32 + linalg.yield %exp_val : f32 + } -> tensor<64x16xf32> + + // Reduce to get tile sum + %local_sum_buffer = linalg.reduce ins(%local_exp : tensor<64x16xf32>) outs(%9 : tensor<64x16xf32>) dimensions = [1] { + (%in: f32, %init: f32) { + %sum = arith.addf %in, %init : f32 + linalg.yield %sum : f32 + } + } + + // Step 4: 
Rescale global sum by exp(old_max - new_max) and add local sum + %updated_global_sum = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%global_sum_buffer, %global_max, %new_global_max, %local_sum_buffer : + tensor<64x16xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x16xf32>) + outs(%out_sum : tensor<64x16xf32>) { + ^bb0(%old_sum: f32, %old_max: f32, %new_max: f32, %local: f32, %out: f32): + // Correction factor: exp(old_max - new_max) + %max_diff = arith.subf %old_max, %new_max : f32 + %scale = math.exp %max_diff : f32 + %rescaled_sum = arith.mulf %old_sum, %scale : f32 + + // Add local sum (already centered on local_max, need to rescale) + %local_scale_diff = arith.subf %local_max, %new_max : f32 + %local_scale = math.exp %local_scale_diff : f32 + %rescaled_local = arith.mulf %local, %local_scale : f32 + + %updated = arith.addf %rescaled_sum, %rescaled_local : f32 + linalg.yield %updated : f32 + } -> tensor<64x16xf32> + + scf.yield %new_global_max, %updated_global_sum : tensor<64xf32>, tensor<64x16xf32> + } + + // Extract final results + %final_max = %fused#0 : tensor<64xf32> + %final_sum_buffer = %fused#1 : tensor<64x16xf32> + %final_sum = linalg.reduce ins(%final_sum_buffer) outs(%9) dimensions = [1] { addf } + + // Loop 2: Division with center+exp+div + %13 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %slice_1) -> (tensor<64x512xf32>) { + %slice_7 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] + + // Center+exp + %14 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_7, %final_max : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_8) { + ^bb0(%in: f32, %max: f32, %out: f32): + %centered = arith.subf %in, %max : f32 + %exp_val = math.exp %centered : f32 + linalg.yield %exp_val : f32 + } -> tensor<64x16xf32> + + // Division + %15 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%14, %final_sum : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_8) { + ^bb0(%exp_val: f32, %sum: f32, %out: 
f32): + %result = arith.divf %exp_val, %sum : f32 + linalg.yield %result : f32 + } -> tensor<64x16xf32> + + %inserted = tensor.insert_slice %15 into %arg5[0, %arg4] [64, 16] [1, 1] + scf.yield %inserted : tensor<64x512xf32> + } + } +} +``` + +### Benefits + +1. **Reduced loop count**: 3 loops → 2 loops (fused max+sum, division) +2. **Better memory locality**: Input data is read only twice instead of three times +3. **Lower latency**: One fewer synchronization point between reduction phases +4. **Same numerical stability**: Uses the online softmax algorithm which maintains stability through incremental rescaling + +### Trade-offs + +- **Increased per-iteration complexity**: Each iteration of the fused loop performs more operations (max update, sum rescaling, correction factors) +- **More register pressure**: Need to carry both `global_max` and `global_sum_buffer` across iterations +- **Additional exp operations**: Computing correction factors requires exp(old_max - new_max) and exp(local_max - new_max) per tile + +This optimization is particularly valuable for GPU implementations where memory bandwidth is the bottleneck, as reducing the number of passes over the input data can significantly improve performance despite the increased computational complexity per iteration. From 9bbb99c982c1e3a59bcfaa4608e1ca7f195b24ab Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 23 Apr 2026 02:19:03 +0000 Subject: [PATCH 44/51] save work --- reduction_tiling_docs/softmax_lowering_flow.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/reduction_tiling_docs/softmax_lowering_flow.md b/reduction_tiling_docs/softmax_lowering_flow.md index c6d8ad50..45dd0249 100644 --- a/reduction_tiling_docs/softmax_lowering_flow.md +++ b/reduction_tiling_docs/softmax_lowering_flow.md @@ -720,13 +720,5 @@ func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { 1. **Reduced loop count**: 3 loops → 2 loops (fused max+sum, division) 2. 
**Better memory locality**: Input data is read only twice instead of three times -3. **Lower latency**: One fewer synchronization point between reduction phases -4. **Same numerical stability**: Uses the online softmax algorithm which maintains stability through incremental rescaling - -### Trade-offs - -- **Increased per-iteration complexity**: Each iteration of the fused loop performs more operations (max update, sum rescaling, correction factors) -- **More register pressure**: Need to carry both `global_max` and `global_sum_buffer` across iterations -- **Additional exp operations**: Computing correction factors requires exp(old_max - new_max) and exp(local_max - new_max) per tile This optimization is particularly valuable for GPU implementations where memory bandwidth is the bottleneck, as reducing the number of passes over the input data can significantly improve performance despite the increased computational complexity per iteration. From 8637269866ad578fb68bb9611b9e6cf80ec836af Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 23 Apr 2026 04:54:26 +0000 Subject: [PATCH 45/51] add attention doc --- .../fused_attention_tiling.md | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 reduction_tiling_docs/fused_attention_tiling.md diff --git a/reduction_tiling_docs/fused_attention_tiling.md b/reduction_tiling_docs/fused_attention_tiling.md new file mode 100644 index 00000000..445b692a --- /dev/null +++ b/reduction_tiling_docs/fused_attention_tiling.md @@ -0,0 +1,53 @@ +## Attention Tiling + +# Linalg level implementation + +Input sizes: +- Q: 4096x64 +- K: 4096x64 +- V: 4096x64 + +``` +func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, +%V: memref<4096x64xf32>, %out: memref<4096x64xf32>) { + ... + // Transpose K + %k_transpose = linalg.transpose ... 
-> tensor<64x4096xf32> + + // QK^T + %QKT = linalg.matmul ins(%q, %k_transpose : tensor<4096x64xf32>, tensor<64x4096xf32>) + outs(%empty_NxN : tensor<4096x4096xf32>) -> tensor<4096x4096xf32> + + // Fill with -inf + %t_minf = linalg.fill ins(%cst_minus_inf : f32) outs(%empty_N : tensor<4096xf32>) -> tensor<4096xf32> + + // Max reduce along rows + %max = linalg.reduce ins(%QKT : tensor<4096x4096xf32>) ... %m = arith.maximumf %in, %init : f32 -> tensor<4096xf32> + + // Broadcast max + %maxb = linalg.broadcast ins(%max: tensor<4096xf32>) outs(%empty_NxN : tensor<4096x4096xf32>) dimensions = [1] -> tensor<4096x4096xf32> + + // Subtract + %sub = linalg.elemwise_binary {fun = #linalg.binary_fn<sub>} ... -> tensor<4096x4096xf32> + + // Exp + %exp = linalg.elemwise_unary {fun = #linalg.unary_fn<exp>} ... -> tensor<4096x4096xf32> + + // Fill with zeros + %t_zeros = linalg.fill ins(%c0f : f32) outs(%empty_N : tensor<4096xf32>) -> tensor<4096xf32> + + // Sum reduce along rows + %sum = linalg.reduce ... %s = arith.addf %in, %init : f32 ... -> tensor<4096xf32> + + // Broadcast sum and div + %sums = linalg.broadcast ... -> tensor<4096x4096xf32> + %p = linalg.elemwise_binary {fun = #linalg.binary_fn<div>
} + ins(%exp, %sums : tensor<4096x4096xf32>, tensor<4096x4096xf32>) ... -> tensor<4096x4096xf32> + + // Final matmul + %o = linalg.matmul ins(%p, %v : tensor<4096x4096xf32>, tensor<4096x64xf32>) ... -> tensor<4096x64xf32> + ... +} +``` + +# Stage 1: Tile the last matmul in K dim. From 037193b2ebf52dc28a0fd4f173d97116c26a7ad5 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 23 Apr 2026 05:13:46 +0000 Subject: [PATCH 46/51] add optimization note --- .../fused_attention_tiling.md | 193 +++++++++++++++++- 1 file changed, 190 insertions(+), 3 deletions(-) diff --git a/reduction_tiling_docs/fused_attention_tiling.md b/reduction_tiling_docs/fused_attention_tiling.md index 445b692a..8ac7657e 100644 --- a/reduction_tiling_docs/fused_attention_tiling.md +++ b/reduction_tiling_docs/fused_attention_tiling.md @@ -1,6 +1,6 @@ -## Attention Tiling +# Attention Tiling -# Linalg level implementation +## Linalg level implementation Input sizes: - Q: 4096x64 @@ -50,4 +50,191 @@ func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, } ``` -# Stage 1: Tile the last matmul in K dim. +--- + +## Stage 1: Tile the last matmul in K dim (tile size = 16) + +After tiling the final matmul `%o = linalg.matmul ins(%p, %v)` along the K dimension with tile size 16: + +``` +func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, +%V: memref<4096x64xf32>, %out: memref<4096x64xf32>) { + ... + // Compute p = Softmax(Q @ K^T) + // ... 
+ + // Final matmul TILED in K dimension (4096 / 16 = 256 tiles) + // Loop over K dimension: k = 0 to 4096 step 16 + %c0 = arith.constant 0 : index + %c4096 = arith.constant 4096 : index + %c16 = arith.constant 16 : index + + // Initialize output with zeros: 4096x64 + %o_init = linalg.fill ins(%c0f : f32) outs(%empty_out : tensor<4096x64xf32>) -> tensor<4096x64xf32> + + %o = scf.for %k = %c0 to %c4096 step %c16 iter_args(%o_acc = %o_init) -> (tensor<4096x64xf32>) { + // Extract slice from %p: 4096x16 (from columns [k:k+16]) + %p_slice = tensor.extract_slice %p[0, %k][4096, 16][1, 1] -> tensor<4096x16xf32> + + // Extract slice from %v: 16x64 (from rows [k:k+16]) + %v_slice = tensor.extract_slice %v[%k, 0][16, 64][1, 1] -> tensor<16x64xf32> + + // Partial matmul: (4096x16) @ (16x64) -> 4096x64 + %partial = linalg.matmul ins(%p_slice, %v_slice : tensor<4096x16xf32>, tensor<16x64xf32>) + outs(%empty_partial : tensor<4096x64xf32>) -> tensor<4096x64xf32> + + // Accumulate: 4096x64 + 4096x64 -> 4096x64 + %o_new = linalg.elemwise_binary {fun = #linalg.binary_fn<add>} + ins(%o_acc, %partial : tensor<4096x64xf32>, tensor<4096x64xf32>) ... -> tensor<4096x64xf32> + + scf.yield %o_new : tensor<4096x64xf32> + } + ... +} +``` + +## Stage 2: Tile and fuse the softmax computation (tile size = 16) + +After tiling the softmax computation in the reduction dimension with tile size 16 and fusing operations, following the pattern from the softmax lowering flow: + +``` +func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, +%V: memref<4096x64xf32>, %out: memref<4096x64xf32>) { + ... + // === First matmul: Q @ K^T === + // Transpose K: 4096x64 -> 64x4096 + %k_transpose = linalg.transpose ... 
-> tensor<64x4096xf32> + + // QK^T: (4096x64) @ (64x4096) -> 4096x4096 + %QKT = linalg.matmul ins(%q, %k_transpose : tensor<4096x64xf32>, tensor<64x4096xf32>) + outs(%empty_NxN : tensor<4096x4096xf32>) -> tensor<4096x4096xf32> + + + // === Tiled and fused softmax computation === + // Tile size = 16, number of tiles = 4096 / 16 = 256 + %c0 = arith.constant 0 : index + %c4096 = arith.constant 4096 : index + %c16 = arith.constant 16 : index + + // Initialize max buffer with -inf: 4096x16 + %max_buffer_init = linalg.fill ins(%cst_minus_inf : f32) outs(%empty_max_buf : tensor<4096x16xf32>) -> tensor<4096x16xf32> + + // Loop 1: Max reduction (4096 / 16 = 256 iterations) + %max_buffer = scf.for %k = %c0 to %c4096 step %c16 iter_args(%max_acc = %max_buffer_init) -> (tensor<4096x16xf32>) { + // Extract slice from QKT: 4096x16 + %QKT_slice = tensor.extract_slice %QKT[0, %k][4096, 16][1, 1] -> tensor<4096x16xf32> + + // Max accumulation: 4096x16 + %max_new = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%QKT_slice : tensor<4096x16xf32>) outs(%max_acc : tensor<4096x16xf32>) { + ^bb0(%in: f32, %out: f32): + %max_val = arith.maxnumf %in, %out : f32 + linalg.yield %max_val : f32 + } -> tensor<4096x16xf32> + + scf.yield %max_new : tensor<4096x16xf32> + } + + // Final max reduction: 4096x16 -> 4096 + %max = linalg.reduce ins(%max_buffer : tensor<4096x16xf32>) outs(%empty_N : tensor<4096xf32>) dimensions = [1] { + (%in: f32, %init: f32) { + %m = arith.maxnumf %in, %init : f32 + linalg.yield %m : f32 + } + } -> tensor<4096xf32> + + + // Initialize sum buffer with zeros: 4096x16 + %sum_buffer_init = linalg.fill ins(%c0f : f32) outs(%empty_sum_buf : tensor<4096x16xf32>) -> tensor<4096x16xf32> + + // Loop 2: Sum reduction with fused center+exp (256 iterations) + %sum_buffer = scf.for %k = %c0 to %c4096 step %c16 iter_args(%sum_acc = %sum_buffer_init) -> (tensor<4096x16xf32>) { + // Extract slice from QKT: 4096x16 + %QKT_slice = tensor.extract_slice %QKT[0, %k][4096, 
16][1, 1] -> tensor<4096x16xf32> + + // Fused center+exp: 4096x16 + %exp_slice = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%QKT_slice, %max : tensor<4096x16xf32>, tensor<4096xf32>) outs(%empty_slice : tensor<4096x16xf32>) { + ^bb0(%in: f32, %max_val: f32, %out: f32): + %centered = arith.subf %in, %max_val : f32 + %exp_val = math.exp %centered : f32 + linalg.yield %exp_val : f32 + } -> tensor<4096x16xf32> + + // Sum accumulation: 4096x16 + %sum_new = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%exp_slice : tensor<4096x16xf32>) outs(%sum_acc : tensor<4096x16xf32>) { + ^bb0(%in: f32, %out: f32): + %sum_val = arith.addf %in, %out : f32 + linalg.yield %sum_val : f32 + } -> tensor<4096x16xf32> + + scf.yield %sum_new : tensor<4096x16xf32> + } + + // Final sum reduction: 4096x16 -> 4096 + %sum = linalg.reduce ins(%sum_buffer : tensor<4096x16xf32>) outs(%empty_N : tensor<4096xf32>) dimensions = [1] { + (%in: f32, %init: f32) { + %s = arith.addf %in, %init : f32 + linalg.yield %s : f32 + } + } -> tensor<4096xf32> + + + // Initialize output buffer for softmax: 4096x4096 + %p_init = linalg.fill ins(%c0f : f32) outs(%empty_NxN : tensor<4096x4096xf32>) -> tensor<4096x4096xf32> + + // Loop 3: Division with fused center+exp+div (256 iterations) + %p = scf.for %k = %c0 to %c4096 step %c16 iter_args(%p_acc = %p_init) -> (tensor<4096x4096xf32>) { + // Extract slice from QKT: 4096x16 + %QKT_slice = tensor.extract_slice %QKT[0, %k][4096, 16][1, 1] -> tensor<4096x16xf32> + + // Fused center+exp: 4096x16 + %exp_slice = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%QKT_slice, %max : tensor<4096x16xf32>, tensor<4096xf32>) outs(%empty_slice : tensor<4096x16xf32>) { + ^bb0(%in: f32, %max_val: f32, %out: f32): + %centered = arith.subf %in, %max_val : f32 + %exp_val = math.exp %centered : f32 + linalg.yield %exp_val : f32 + } -> tensor<4096x16xf32> + + // Division: 4096x16 + %p_slice = linalg.generic {iterator_types = ["parallel", 
"parallel"]} + ins(%exp_slice, %sum : tensor<4096x16xf32>, tensor<4096xf32>) outs(%empty_slice : tensor<4096x16xf32>) { + ^bb0(%exp_val: f32, %sum_val: f32, %out: f32): + %result = arith.divf %exp_val, %sum_val : f32 + linalg.yield %result : f32 + } -> tensor<4096x16xf32> + + // Insert slice back: 4096x16 -> 4096x4096 + %p_new = tensor.insert_slice %p_slice into %p_acc[0, %k][4096, 16][1, 1] -> tensor<4096x4096xf32> + + scf.yield %p_new : tensor<4096x4096xf32> + } + // Result: %p contains softmax(Q @ K^T) with shape 4096x4096 + + + // === Final matmul TILED in K dimension (256 iterations) === + // Initialize output with zeros: 4096x64 + %o_init = linalg.fill ins(%c0f : f32) outs(%empty_out : tensor<4096x64xf32>) -> tensor<4096x64xf32> + + %o = scf.for %k = %c0 to %c4096 step %c16 iter_args(%o_acc = %o_init) -> (tensor<4096x64xf32>) { + // Extract slice from %p: 4096x16 + %p_slice = tensor.extract_slice %p[0, %k][4096, 16][1, 1] -> tensor<4096x16xf32> + + // Extract slice from %v: 16x64 + %v_slice = tensor.extract_slice %v[%k, 0][16, 64][1, 1] -> tensor<16x64xf32> + + // Partial matmul: (4096x16) @ (16x64) -> 4096x64 + %partial = linalg.matmul ins(%p_slice, %v_slice : tensor<4096x16xf32>, tensor<16x64xf32>) + outs(%empty_partial : tensor<4096x64xf32>) -> tensor<4096x64xf32> + + // Accumulate: 4096x64 + 4096x64 -> 4096x64 + %o_new = linalg.elemwise_binary {fun = #linalg.binary_fn} + ins(%o_acc, %partial : tensor<4096x64xf32>, tensor<4096x64xf32>) ... -> tensor<4096x64xf32> + + scf.yield %o_new : tensor<4096x64xf32> + } + ... 
+} +``` From 32133cf387feab4bf23d62781a3f23354900bf53 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 23 Apr 2026 05:16:10 +0000 Subject: [PATCH 47/51] add optimization note --- reduction_tiling_docs/fused_attention_tiling.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/reduction_tiling_docs/fused_attention_tiling.md b/reduction_tiling_docs/fused_attention_tiling.md index 8ac7657e..f392f6fe 100644 --- a/reduction_tiling_docs/fused_attention_tiling.md +++ b/reduction_tiling_docs/fused_attention_tiling.md @@ -7,7 +7,7 @@ Input sizes: - K: 4096x64 - V: 4096x64 -``` +```mlir func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, %V: memref<4096x64xf32>, %out: memref<4096x64xf32>) { ... @@ -56,7 +56,7 @@ func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, After tiling the final matmul `%o = linalg.matmul ins(%p, %v)` along the K dimension with tile size 16: -``` +```mlir func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, %V: memref<4096x64xf32>, %out: memref<4096x64xf32>) { ... @@ -97,7 +97,7 @@ func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, After tiling the softmax computation in the reduction dimension with tile size 16 and fusing operations, following the pattern from the softmax lowering flow: -``` +```mlir func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, %V: memref<4096x64xf32>, %out: memref<4096x64xf32>) { ... 
From 3583e8133e5e2eca216434ded2d21f0608aced14 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 23 Apr 2026 14:00:39 +0000 Subject: [PATCH 48/51] add attention doc --- .../fused_attention_tiling.md | 172 +++++++++--------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/reduction_tiling_docs/fused_attention_tiling.md b/reduction_tiling_docs/fused_attention_tiling.md index f392f6fe..24375a16 100644 --- a/reduction_tiling_docs/fused_attention_tiling.md +++ b/reduction_tiling_docs/fused_attention_tiling.md @@ -3,49 +3,49 @@ ## Linalg level implementation Input sizes: -- Q: 4096x64 +- Q: 64x64 - K: 4096x64 - V: 4096x64 ```mlir -func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, -%V: memref<4096x64xf32>, %out: memref<4096x64xf32>) { +func.func @attention(%Q : memref<64x64xf32>, %K: memref<4096x64xf32>, +%V: memref<4096x64xf32>, %out: memref<64x64xf32>) { ... // Transpose K %k_transpose = linalg.transpose ... -> tensor<64x4096xf32> // QK^T - %QKT = linalg.matmul ins(%q, %k_transpose : tensor<4096x64xf32>, tensor<64x4096xf32>) - outs(%empty_NxN : tensor<4096x4096xf32>) -> tensor<4096x4096xf32> + %QKT = linalg.matmul ins(%q, %k_transpose : tensor<64x64xf32>, tensor<64x4096xf32>) + outs(%empty_NxN : tensor<64x4096xf32>) -> tensor<64x4096xf32> // Fill with -inf - %t_minf = linalg.fill ins(%cst_minus_inf : f32) outs(%empty_N : tensor<4096xf32>) -> tensor<4096xf32> + %t_minf = linalg.fill ins(%cst_minus_inf : f32) outs(%empty_N : tensor<64xf32>) -> tensor<64xf32> // Max reduce along rows - %max = linalg.reduce ins(%QKT : tensor<4096x4096xf32>) ... %m = arith.maximumf %in, %init : f32 -> tensor<4096xf32> + %max = linalg.reduce ins(%QKT : tensor<64x4096xf32>) ... 
%m = arith.maximumf %in, %init : f32 -> tensor<64xf32> // Broadcast max - %maxb = linalg.broadcast ins(%max: tensor<4096xf32>) outs(%empty_NxN : tensor<4096x4096xf32>) dimensions = [1] -> tensor<4096x4096xf32> + %maxb = linalg.broadcast ins(%max: tensor<64xf32>) outs(%empty_NxN : tensor<64x4096xf32>) dimensions = [1] -> tensor<64x4096xf32> // Subtract - %sub = linalg.elemwise_binary {fun = #linalg.binary_fn} ... -> tensor<4096x4096xf32> + %sub = linalg.elemwise_binary {fun = #linalg.binary_fn} ... -> tensor<64x4096xf32> // Exp - %exp = linalg.elemwise_unary {fun = #linalg.unary_fn} ... -> tensor<4096x4096xf32> + %exp = linalg.elemwise_unary {fun = #linalg.unary_fn} ... -> tensor<64x4096xf32> // Fill with zeros - %t_zeros = linalg.fill ins(%c0f : f32) outs(%empty_N : tensor<4096xf32>) -> tensor<4096xf32> + %t_zeros = linalg.fill ins(%c0f : f32) outs(%empty_N : tensor<64xf32>) -> tensor<64xf32> // Sum reduce along rows - %sum = linalg.reduce ... %s = arith.addf %in, %init : f32 ... -> tensor<4096xf32> + %sum = linalg.reduce ... %s = arith.addf %in, %init : f32 ... -> tensor<64xf32> // Broadcast sum and div - %sums = linalg.broadcast ... -> tensor<4096x4096xf32> + %sums = linalg.broadcast ... -> tensor<64x4096xf32> %p = linalg.elemwise_binary {fun = #linalg.binary_fn
} - ins(%exp, %sums : tensor<4096x4096xf32>, tensor<4096x4096xf32>) ... -> tensor<4096x4096xf32> + ins(%exp, %sums : tensor<64x4096xf32>, tensor<64x4096xf32>) ... -> tensor<64x4096xf32> // Final matmul - %o = linalg.matmul ins(%p, %v : tensor<4096x4096xf32>, tensor<4096x64xf32>) ... -> tensor<4096x64xf32> + %o = linalg.matmul ins(%p, %v : tensor<64x4096xf32>, tensor<4096x64xf32>) ... -> tensor<64x64xf32> ... } ``` @@ -57,8 +57,8 @@ func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, After tiling the final matmul `%o = linalg.matmul ins(%p, %v)` along the K dimension with tile size 16: ```mlir -func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, -%V: memref<4096x64xf32>, %out: memref<4096x64xf32>) { +func.func @attention(%Q : memref<64x64xf32>, %K: memref<4096x64xf32>, +%V: memref<4096x64xf32>, %out: memref<64x64xf32>) { ... // Compute p = Softmax(Q @ K^T) // ... @@ -69,25 +69,25 @@ func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, %c4096 = arith.constant 4096 : index %c16 = arith.constant 16 : index - // Initialize output with zeros: 4096x64 - %o_init = linalg.fill ins(%c0f : f32) outs(%empty_out : tensor<4096x64xf32>) -> tensor<4096x64xf32> + // Initialize output with zeros: 64x64 + %o_init = linalg.fill ins(%c0f : f32) outs(%empty_out : tensor<64x64xf32>) -> tensor<64x64xf32> - %o = scf.for %k = %c0 to %c4096 step %c16 iter_args(%o_acc = %o_init) -> (tensor<4096x64xf32>) { - // Extract slice from %p: 4096x16 (from columns [k:k+16]) - %p_slice = tensor.extract_slice %p[0, %k][4096, 16][1, 1] -> tensor<4096x16xf32> + %o = scf.for %k = %c0 to %c4096 step %c16 iter_args(%o_acc = %o_init) -> (tensor<64x64xf32>) { + // Extract slice from %p: 64x16 (from columns [k:k+16]) + %p_slice = tensor.extract_slice %p[0, %k][64, 16][1, 1] -> tensor<64x16xf32> // Extract slice from %v: 16x64 (from rows [k:k+16]) %v_slice = tensor.extract_slice %v[%k, 0][16, 64][1, 1] -> tensor<16x64xf32> - // Partial matmul: (4096x16) @ 
(16x64) -> 4096x64 - %partial = linalg.matmul ins(%p_slice, %v_slice : tensor<4096x16xf32>, tensor<16x64xf32>) - outs(%empty_partial : tensor<4096x64xf32>) -> tensor<4096x64xf32> + // Partial matmul: (64x16) @ (16x64) -> 64x64 + %partial = linalg.matmul ins(%p_slice, %v_slice : tensor<64x16xf32>, tensor<16x64xf32>) + outs(%empty_partial : tensor<64x64xf32>) -> tensor<64x64xf32> - // Accumulate: 4096x64 + 4096x64 -> 4096x64 + // Accumulate: 64x64 + 64x64 -> 64x64 %o_new = linalg.elemwise_binary {fun = #linalg.binary_fn} - ins(%o_acc, %partial : tensor<4096x64xf32>, tensor<4096x64xf32>) ... -> tensor<4096x64xf32> + ins(%o_acc, %partial : tensor<64x64xf32>, tensor<64x64xf32>) ... -> tensor<64x64xf32> - scf.yield %o_new : tensor<4096x64xf32> + scf.yield %o_new : tensor<64x64xf32> } ... } @@ -98,16 +98,16 @@ func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, After tiling the softmax computation in the reduction dimension with tile size 16 and fusing operations, following the pattern from the softmax lowering flow: ```mlir -func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, -%V: memref<4096x64xf32>, %out: memref<4096x64xf32>) { +func.func @attention(%Q : memref<64x64xf32>, %K: memref<4096x64xf32>, +%V: memref<4096x64xf32>, %out: memref<64x64xf32>) { ... // === First matmul: Q @ K^T === // Transpose K: 4096x64 -> 64x4096 %k_transpose = linalg.transpose ... 
-> tensor<64x4096xf32> - // QK^T: (4096x64) @ (64x4096) -> 4096x4096 - %QKT = linalg.matmul ins(%q, %k_transpose : tensor<4096x64xf32>, tensor<64x4096xf32>) - outs(%empty_NxN : tensor<4096x4096xf32>) -> tensor<4096x4096xf32> + // QK^T: (64x64) @ (64x4096) -> 64x4096 + %QKT = linalg.matmul ins(%q, %k_transpose : tensor<64x64xf32>, tensor<64x4096xf32>) + outs(%empty_NxN : tensor<64x4096xf32>) -> tensor<64x4096xf32> // === Tiled and fused softmax computation === @@ -116,124 +116,124 @@ func.func @attention(%Q : memref<4096x64xf32>, %K: memref<4096x64xf32>, %c4096 = arith.constant 4096 : index %c16 = arith.constant 16 : index - // Initialize max buffer with -inf: 4096x16 - %max_buffer_init = linalg.fill ins(%cst_minus_inf : f32) outs(%empty_max_buf : tensor<4096x16xf32>) -> tensor<4096x16xf32> + // Initialize max buffer with -inf: 64x16 + %max_buffer_init = linalg.fill ins(%cst_minus_inf : f32) outs(%empty_max_buf : tensor<64x16xf32>) -> tensor<64x16xf32> // Loop 1: Max reduction (4096 / 16 = 256 iterations) - %max_buffer = scf.for %k = %c0 to %c4096 step %c16 iter_args(%max_acc = %max_buffer_init) -> (tensor<4096x16xf32>) { - // Extract slice from QKT: 4096x16 - %QKT_slice = tensor.extract_slice %QKT[0, %k][4096, 16][1, 1] -> tensor<4096x16xf32> + %max_buffer = scf.for %k = %c0 to %c4096 step %c16 iter_args(%max_acc = %max_buffer_init) -> (tensor<64x16xf32>) { + // Extract slice from QKT: 64x16 + %QKT_slice = tensor.extract_slice %QKT[0, %k][64, 16][1, 1] -> tensor<64x16xf32> - // Max accumulation: 4096x16 + // Max accumulation: 64x16 %max_new = linalg.generic {iterator_types = ["parallel", "parallel"]} - ins(%QKT_slice : tensor<4096x16xf32>) outs(%max_acc : tensor<4096x16xf32>) { + ins(%QKT_slice : tensor<64x16xf32>) outs(%max_acc : tensor<64x16xf32>) { ^bb0(%in: f32, %out: f32): %max_val = arith.maxnumf %in, %out : f32 linalg.yield %max_val : f32 - } -> tensor<4096x16xf32> + } -> tensor<64x16xf32> - scf.yield %max_new : tensor<4096x16xf32> + scf.yield %max_new : 
tensor<64x16xf32> } - // Final max reduction: 4096x16 -> 4096 - %max = linalg.reduce ins(%max_buffer : tensor<4096x16xf32>) outs(%empty_N : tensor<4096xf32>) dimensions = [1] { + // Final max reduction: 64x16 -> 64 + %max = linalg.reduce ins(%max_buffer : tensor<64x16xf32>) outs(%empty_N : tensor<64xf32>) dimensions = [1] { (%in: f32, %init: f32) { %m = arith.maxnumf %in, %init : f32 linalg.yield %m : f32 } - } -> tensor<4096xf32> + } -> tensor<64xf32> - // Initialize sum buffer with zeros: 4096x16 - %sum_buffer_init = linalg.fill ins(%c0f : f32) outs(%empty_sum_buf : tensor<4096x16xf32>) -> tensor<4096x16xf32> + // Initialize sum buffer with zeros: 64x16 + %sum_buffer_init = linalg.fill ins(%c0f : f32) outs(%empty_sum_buf : tensor<64x16xf32>) -> tensor<64x16xf32> // Loop 2: Sum reduction with fused center+exp (256 iterations) - %sum_buffer = scf.for %k = %c0 to %c4096 step %c16 iter_args(%sum_acc = %sum_buffer_init) -> (tensor<4096x16xf32>) { - // Extract slice from QKT: 4096x16 - %QKT_slice = tensor.extract_slice %QKT[0, %k][4096, 16][1, 1] -> tensor<4096x16xf32> + %sum_buffer = scf.for %k = %c0 to %c4096 step %c16 iter_args(%sum_acc = %sum_buffer_init) -> (tensor<64x16xf32>) { + // Extract slice from QKT: 64x16 + %QKT_slice = tensor.extract_slice %QKT[0, %k][64, 16][1, 1] -> tensor<64x16xf32> - // Fused center+exp: 4096x16 + // Fused center+exp: 64x16 %exp_slice = linalg.generic {iterator_types = ["parallel", "parallel"]} - ins(%QKT_slice, %max : tensor<4096x16xf32>, tensor<4096xf32>) outs(%empty_slice : tensor<4096x16xf32>) { + ins(%QKT_slice, %max : tensor<64x16xf32>, tensor<64xf32>) outs(%empty_slice : tensor<64x16xf32>) { ^bb0(%in: f32, %max_val: f32, %out: f32): %centered = arith.subf %in, %max_val : f32 %exp_val = math.exp %centered : f32 linalg.yield %exp_val : f32 - } -> tensor<4096x16xf32> + } -> tensor<64x16xf32> - // Sum accumulation: 4096x16 + // Sum accumulation: 64x16 %sum_new = linalg.generic {iterator_types = ["parallel", "parallel"]} - 
ins(%exp_slice : tensor<4096x16xf32>) outs(%sum_acc : tensor<4096x16xf32>) { + ins(%exp_slice : tensor<64x16xf32>) outs(%sum_acc : tensor<64x16xf32>) { ^bb0(%in: f32, %out: f32): %sum_val = arith.addf %in, %out : f32 linalg.yield %sum_val : f32 - } -> tensor<4096x16xf32> + } -> tensor<64x16xf32> - scf.yield %sum_new : tensor<4096x16xf32> + scf.yield %sum_new : tensor<64x16xf32> } - // Final sum reduction: 4096x16 -> 4096 - %sum = linalg.reduce ins(%sum_buffer : tensor<4096x16xf32>) outs(%empty_N : tensor<4096xf32>) dimensions = [1] { + // Final sum reduction: 64x16 -> 64 + %sum = linalg.reduce ins(%sum_buffer : tensor<64x16xf32>) outs(%empty_N : tensor<64xf32>) dimensions = [1] { (%in: f32, %init: f32) { %s = arith.addf %in, %init : f32 linalg.yield %s : f32 } - } -> tensor<4096xf32> + } -> tensor<64xf32> - // Initialize output buffer for softmax: 4096x4096 - %p_init = linalg.fill ins(%c0f : f32) outs(%empty_NxN : tensor<4096x4096xf32>) -> tensor<4096x4096xf32> + // Initialize output buffer for softmax: 64x4096 + %p_init = linalg.fill ins(%c0f : f32) outs(%empty_NxN : tensor<64x4096xf32>) -> tensor<64x4096xf32> // Loop 3: Division with fused center+exp+div (256 iterations) - %p = scf.for %k = %c0 to %c4096 step %c16 iter_args(%p_acc = %p_init) -> (tensor<4096x4096xf32>) { - // Extract slice from QKT: 4096x16 - %QKT_slice = tensor.extract_slice %QKT[0, %k][4096, 16][1, 1] -> tensor<4096x16xf32> + %p = scf.for %k = %c0 to %c4096 step %c16 iter_args(%p_acc = %p_init) -> (tensor<64x4096xf32>) { + // Extract slice from QKT: 64x16 + %QKT_slice = tensor.extract_slice %QKT[0, %k][64, 16][1, 1] -> tensor<64x16xf32> - // Fused center+exp: 4096x16 + // Fused center+exp: 64x16 %exp_slice = linalg.generic {iterator_types = ["parallel", "parallel"]} - ins(%QKT_slice, %max : tensor<4096x16xf32>, tensor<4096xf32>) outs(%empty_slice : tensor<4096x16xf32>) { + ins(%QKT_slice, %max : tensor<64x16xf32>, tensor<64xf32>) outs(%empty_slice : tensor<64x16xf32>) { ^bb0(%in: f32, %max_val: 
f32, %out: f32): %centered = arith.subf %in, %max_val : f32 %exp_val = math.exp %centered : f32 linalg.yield %exp_val : f32 - } -> tensor<4096x16xf32> + } -> tensor<64x16xf32> - // Division: 4096x16 + // Division: 64x16 %p_slice = linalg.generic {iterator_types = ["parallel", "parallel"]} - ins(%exp_slice, %sum : tensor<4096x16xf32>, tensor<4096xf32>) outs(%empty_slice : tensor<4096x16xf32>) { + ins(%exp_slice, %sum : tensor<64x16xf32>, tensor<64xf32>) outs(%empty_slice : tensor<64x16xf32>) { ^bb0(%exp_val: f32, %sum_val: f32, %out: f32): %result = arith.divf %exp_val, %sum_val : f32 linalg.yield %result : f32 - } -> tensor<4096x16xf32> + } -> tensor<64x16xf32> - // Insert slice back: 4096x16 -> 4096x4096 - %p_new = tensor.insert_slice %p_slice into %p_acc[0, %k][4096, 16][1, 1] -> tensor<4096x4096xf32> + // Insert slice back: 64x16 -> 64x4096 + %p_new = tensor.insert_slice %p_slice into %p_acc[0, %k][64, 16][1, 1] -> tensor<64x4096xf32> - scf.yield %p_new : tensor<4096x4096xf32> + scf.yield %p_new : tensor<64x4096xf32> } - // Result: %p contains softmax(Q @ K^T) with shape 4096x4096 + // Result: %p contains softmax(Q @ K^T) with shape 64x4096 // === Final matmul TILED in K dimension (256 iterations) === - // Initialize output with zeros: 4096x64 - %o_init = linalg.fill ins(%c0f : f32) outs(%empty_out : tensor<4096x64xf32>) -> tensor<4096x64xf32> + // Initialize output with zeros: 64x64 + %o_init = linalg.fill ins(%c0f : f32) outs(%empty_out : tensor<64x64xf32>) -> tensor<64x64xf32> - %o = scf.for %k = %c0 to %c4096 step %c16 iter_args(%o_acc = %o_init) -> (tensor<4096x64xf32>) { - // Extract slice from %p: 4096x16 - %p_slice = tensor.extract_slice %p[0, %k][4096, 16][1, 1] -> tensor<4096x16xf32> + %o = scf.for %k = %c0 to %c4096 step %c16 iter_args(%o_acc = %o_init) -> (tensor<64x64xf32>) { + // Extract slice from %p: 64x16 + %p_slice = tensor.extract_slice %p[0, %k][64, 16][1, 1] -> tensor<64x16xf32> // Extract slice from %v: 16x64 %v_slice = tensor.extract_slice 
%v[%k, 0][16, 64][1, 1] -> tensor<16x64xf32> - // Partial matmul: (4096x16) @ (16x64) -> 4096x64 - %partial = linalg.matmul ins(%p_slice, %v_slice : tensor<4096x16xf32>, tensor<16x64xf32>) - outs(%empty_partial : tensor<4096x64xf32>) -> tensor<4096x64xf32> + // Partial matmul: (64x16) @ (16x64) -> 64x64 + %partial = linalg.matmul ins(%p_slice, %v_slice : tensor<64x16xf32>, tensor<16x64xf32>) + outs(%empty_partial : tensor<64x64xf32>) -> tensor<64x64xf32> - // Accumulate: 4096x64 + 4096x64 -> 4096x64 + // Accumulate: 64x64 + 64x64 -> 64x64 %o_new = linalg.elemwise_binary {fun = #linalg.binary_fn} - ins(%o_acc, %partial : tensor<4096x64xf32>, tensor<4096x64xf32>) ... -> tensor<4096x64xf32> + ins(%o_acc, %partial : tensor<64x64xf32>, tensor<64x64xf32>) ... -> tensor<64x64xf32> - scf.yield %o_new : tensor<4096x64xf32> + scf.yield %o_new : tensor<64x64xf32> } ... } From 06cf4f2ee2958a457332c06bec75a87613bbfcc8 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 23 Apr 2026 21:26:59 +0000 Subject: [PATCH 49/51] save work --- examples/xegpu/fused_attention.py | 4 +- .../mlir_gen/gpu_fused_attention_payload.py | 98 +++---- .../xegpu/fused_attention_schedule.py | 243 +++++++++--------- 3 files changed, 179 insertions(+), 166 deletions(-) diff --git a/examples/xegpu/fused_attention.py b/examples/xegpu/fused_attention.py index da61b3f7..2bf59882 100644 --- a/examples/xegpu/fused_attention.py +++ b/examples/xegpu/fused_attention.py @@ -193,13 +193,13 @@ def parse_cli(): parser.add_argument( "--batch-size", type=int, - default=2, + default=1, help="Batch size (Z)", ) parser.add_argument( "--num-heads", type=int, - default=8, + default=1, help="Number of attention heads (H)", ) parser.add_argument( diff --git a/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py b/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py index c2f3d4ec..07bc3665 100644 --- a/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py +++ 
b/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py @@ -49,81 +49,81 @@ def payload(output, Q_arg, K_arg, V_arg): K_tensor = emit_buf_to_tensor(K_arg, restrict=True) V_tensor = emit_buf_to_tensor(V_arg, restrict=True) - # Collapse first 2 dimensions (Z, H) into a batch dimension - # From (Z, H, n_ctx, n_head) to (Z*H, n_ctx, n_head) - batch_dim = Z * H - collapsed_shape_3d = (batch_dim, n_ctx, n_head) + # Collapse first 3 dimensions (Z, H, n_ctx) into a single batch dimension + # From (Z, H, n_ctx, n_head) to (Z*H*n_ctx, n_head) + batch_dim = Z * H * n_ctx + collapsed_shape_2d = (batch_dim, n_head) - Q_3d = tensor.collapse_shape( - ir.RankedTensorType.get(collapsed_shape_3d, dtype), + Q_2d = tensor.collapse_shape( + ir.RankedTensorType.get(collapsed_shape_2d, dtype), Q_tensor, - reassociation=[[0, 1], [2], [3]], + reassociation=[[0, 1, 2], [3]], ) - K_3d = tensor.collapse_shape( - ir.RankedTensorType.get(collapsed_shape_3d, dtype), + K_2d = tensor.collapse_shape( + ir.RankedTensorType.get(collapsed_shape_2d, dtype), K_tensor, - reassociation=[[0, 1], [2], [3]], + reassociation=[[0, 1, 2], [3]], ) - V_3d = tensor.collapse_shape( - ir.RankedTensorType.get(collapsed_shape_3d, dtype), + V_2d = tensor.collapse_shape( + ir.RankedTensorType.get(collapsed_shape_2d, dtype), V_tensor, - reassociation=[[0, 1], [2], [3]], + reassociation=[[0, 1, 2], [3]], ) # Step 1: Transpose K to get K^T - # Permute from (batch_dim, n_ctx, n_head) to (batch_dim, n_head, n_ctx) - kt_shape_3d = (batch_dim, n_head, n_ctx) - kt_init = tensor.empty(kt_shape_3d, dtype) - K_transposed = linalg.transpose(K_3d, outs=[kt_init], permutation=[0, 2, 1]) - - # Step 2: Compute Q @ K^T using batch_matmul - # Q: (batch_dim, n_ctx, n_head) @ K^T: (batch_dim, n_head, n_ctx) - # Result: (batch_dim, n_ctx, n_ctx) - qkt_shape_3d = (batch_dim, n_ctx, n_ctx) - qkt_init = tensor.empty(qkt_shape_3d, dtype) + # Transpose from (batch_dim, n_head) to (n_head, batch_dim) + kt_shape_2d = (n_head, batch_dim) + 
kt_init = tensor.empty(kt_shape_2d, dtype) + K_transposed = linalg.transpose(K_2d, outs=[kt_init], permutation=[1, 0]) + + # Step 2: Compute Q @ K^T using matmul + # Q: (batch_dim, n_head) @ K^T: (n_head, batch_dim) + # Result: (batch_dim, batch_dim) + qkt_shape_2d = (batch_dim, batch_dim) + qkt_init = tensor.empty(qkt_shape_2d, dtype) # Initialize with zeros for matmul accumulation zero = arith.constant(dtype, 0.0) qkt_init_filled = linalg.fill(zero, outs=[qkt_init]) - # Batch matmul: Q @ K^T - qkt = linalg.batch_matmul(Q_3d, K_transposed, outs=[qkt_init_filled]) + # Matmul: Q @ K^T + qkt = linalg.matmul(Q_2d, K_transposed, outs=[qkt_init_filled]) - # Step 3: Scale by 1/sqrt(n_head) - scale_factor = 1.0 / math.sqrt(n_head) - scale_const = arith.constant(dtype, scale_factor) + # # Step 3: Scale by 1/sqrt(n_head) + # scale_factor = 1.0 / math.sqrt(n_head) + # scale_const = arith.constant(dtype, scale_factor) - # Create a tensor filled with the scale factor - scale_tensor_init = tensor.empty(qkt_shape_3d, dtype) - scale_tensor = linalg.fill(scale_const, outs=[scale_tensor_init]) + # # Create a tensor filled with the scale factor + # scale_tensor_init = tensor.empty(qkt_shape_2d, dtype) + # scale_tensor = linalg.fill(scale_const, outs=[scale_tensor_init]) - # Elementwise multiply qkt with scale tensor - scaled_qkt_init = tensor.empty(qkt_shape_3d, dtype) - scaled_qkt = linalg.mul(qkt, scale_tensor, outs=[scaled_qkt_init]) + # # Elementwise multiply qkt with scale tensor + # scaled_qkt_init = tensor.empty(qkt_shape_2d, dtype) + # scaled_qkt = linalg.mul(qkt, scale_tensor, outs=[scaled_qkt_init]) - # Step 4: Apply softmax along the last dimension (dim=2 in 3D) - softmax_init = tensor.empty(qkt_shape_3d, dtype) + # Step 4: Apply softmax along the last dimension (dim=1 in 2D) + softmax_init = tensor.empty(qkt_shape_2d, dtype) attention_weights = linalg.softmax( - result=[ir.RankedTensorType.get(qkt_shape_3d, dtype)], - input=scaled_qkt, + 
result=[ir.RankedTensorType.get(qkt_shape_2d, dtype)], + input=qkt, output=softmax_init, - dimension=2, + dimension=1, ) - # Step 5: Multiply attention weights by V using batch_matmul - # attention_weights: (batch_dim, n_ctx, n_ctx) @ V: (batch_dim, n_ctx, n_head) - # Result: (batch_dim, n_ctx, n_head) - output_3d_init = tensor.empty(collapsed_shape_3d, dtype) - output_3d_init_filled = linalg.fill(zero, outs=[output_3d_init]) + # Step 5: Multiply attention weights by V using matmul + # attention_weights: (batch_dim, batch_dim) @ V: (batch_dim, n_head) + # Result: (batch_dim, n_head) + output_2d_init = tensor.empty(collapsed_shape_2d, dtype) + output_2d_init_filled = linalg.fill(zero, outs=[output_2d_init]) - result_3d = linalg.batch_matmul( - attention_weights, V_3d, outs=[output_3d_init_filled] + result_2d = linalg.matmul( + attention_weights, V_2d, outs=[output_2d_init_filled] ) - # Expand back to 4D: (Z*H, n_ctx, n_head) -> (Z, H, n_ctx, n_head) + # Expand back to 4D: (Z*H*n_ctx, n_head) -> (Z, H, n_ctx, n_head) result = tensor.expand_shape( ir.RankedTensorType.get(shape, dtype), - result_3d, - reassociation=[[0, 1], [2], [3]], + result_2d, + reassociation=[[0, 1, 2], [3]], output_shape=[], static_output_shape=shape, ) diff --git a/lighthouse/schedule/xegpu/fused_attention_schedule.py b/lighthouse/schedule/xegpu/fused_attention_schedule.py index 6ea15081..b629cdfb 100644 --- a/lighthouse/schedule/xegpu/fused_attention_schedule.py +++ b/lighthouse/schedule/xegpu/fused_attention_schedule.py @@ -110,133 +110,146 @@ def bundle_xegpu_fused_attention_schedule( anytype = transform.AnyOpType.get() anyvalue = transform.AnyValueType.get() - # Match all matmul operations - there should be 2: - # 1. Q @ K^T - # 2. attention_weights @ V - matmul_ops = match_and_split(mod, ops={"linalg.matmul"}, nhandles=2) + # # Match all matmul operations - there should be 2: + # # 1. Q @ K^T + # # 2. 
attention_weights @ V + # matmul_ops = match_and_split(mod, ops={"linalg.batch_matmul"}, nhandles=2) + + # # Get the last matmul (attention_weights @ V) + # last_matmul = matmul_ops[1] + # func = transform.get_parent_op( + # anytype, + # last_matmul, + # op_name="func.func", + # deduplicate=True, + # ) + + # # Tile the last matmul in the batch dimension using tile_using_forall + # # Batch dimension is the first dimension (collapsed_dim = Z * H * n_ctx) + # # Extract workgroup tile size from parameters + # wg_tile_size = parameters["wg_tile_size"] + + # tiled_matmul, forall_loop = structured.structured_tile_using_forall( + # anytype, + # anytype, + # last_matmul, + # num_threads=[], + # tile_sizes=[], + # static_tile_sizes=(1, wg_tile_size, 0), + # ) + + # # Fuse the softmax producer into forall + # softmax_ops = match_and_split(func, ops={"linalg.softmax"}, nhandles=1) + # softmax_op = softmax_ops[0] + # fused_softmax_op, forall_loop = structured.structured_fuse_into_containing_op( + # anytype, + # anytype, + # producer_op=softmax_op, + # containing_op=forall_loop, + # ) + # transform.apply_cse(func) + # canonicalize(func) - # Get the last matmul (attention_weights @ V) - last_matmul = matmul_ops[1] - func = transform.get_parent_op( - anytype, - last_matmul, - op_name="func.func", - deduplicate=True, - ) + # # Fuse linalg.mul (scaling) into forall + # mul_ops = match_and_split(func, ops={"linalg.mul"}, nhandles=1) + # mul_op = mul_ops[0] + # _, forall_loop = structured.structured_fuse_into_containing_op( + # anytype, + # anytype, + # producer_op=mul_op, + # containing_op=forall_loop, + # ) + # transform.apply_cse(func) + # canonicalize(func) - # Tile the last matmul in the batch dimension using tile_using_forall - # Batch dimension is the first dimension (collapsed_dim = Z * H * n_ctx) - # Extract workgroup tile size from parameters - wg_tile_size = parameters["wg_tile_size"] + # # Fuse the first matmul (Q @ K^T) into forall + # matmul_ops = match_and_split( + # 
func, ops={"linalg.batch_matmul"}, nhandles=2 + # ) # Two matmuls are present. + # first_matmul = matmul_ops[0] + # _, forall_loop = structured.structured_fuse_into_containing_op( + # anytype, + # anytype, + # producer_op=first_matmul, + # containing_op=forall_loop, + # ) + # transform.apply_cse(func) + # canonicalize(func) - tiled_matmul, forall_loop = structured.structured_tile_using_forall( - anytype, - anytype, - last_matmul, - num_threads=[], - tile_sizes=[], - static_tile_sizes=(wg_tile_size, 0), - ) + # # Fuse linalg.transpose (K transpose) into forall + # transpose_ops = match_and_split(func, ops={"linalg.transpose"}, nhandles=1) + # transpose_op = transpose_ops[0] + # _, forall_loop = structured.structured_fuse_into_containing_op( + # anytype, + # anytype, + # producer_op=transpose_op, + # containing_op=forall_loop, + # ) + # transform.apply_cse(func) + # canonicalize(func) - # Fuse the softmax producer into forall - softmax_ops = match_and_split(func, ops={"linalg.softmax"}, nhandles=1) - softmax_op = softmax_ops[0] - fused_softmax_op, forall_loop = structured.structured_fuse_into_containing_op( - anytype, - anytype, - producer_op=softmax_op, - containing_op=forall_loop, - ) - transform.apply_cse(func) - canonicalize(func) + # # At this point all of the key operations are fused into the forall loop. + # # Remaining linalg.fill ops can be fused trivially. + # fill_ops = match_and_split(func, ops={"linalg.fill"}, nhandles=3) + # for fill_op in fill_ops: + # _, forall_loop = structured.structured_fuse_into_containing_op( + # anytype, + # anytype, + # producer_op=fill_op, + # containing_op=forall_loop, + # ) + # transform.apply_cse(func) + # canonicalize(func) + + # # tensor.empty() holding the result of transpose can be fused. 
+ # transpose_op = match_and_split(func, ops={"linalg.transpose"}, nhandles=1)[0] + # transpose_init = transform.get_producer_of_operand( + # anytype, transpose_op, operand_number=1 + # ) + # _, forall_loop = structured.structured_fuse_into_containing_op( + # anytype, + # anytype, + # producer_op=transpose_init, + # containing_op=forall_loop, + # ) + # transform.apply_cse(func) + # canonicalize(func) - # Fuse linalg.mul (scaling) into forall - mul_ops = match_and_split(func, ops={"linalg.mul"}, nhandles=1) - mul_op = mul_ops[0] - _, forall_loop = structured.structured_fuse_into_containing_op( - anytype, - anytype, - producer_op=mul_op, - containing_op=forall_loop, - ) - transform.apply_cse(func) - canonicalize(func) - - # Fuse the first matmul (Q @ K^T) into forall - matmul_ops = match_and_split( - func, ops={"linalg.matmul"}, nhandles=2 - ) # Two matmuls are present. - first_matmul = matmul_ops[0] - _, forall_loop = structured.structured_fuse_into_containing_op( - anytype, - anytype, - producer_op=first_matmul, - containing_op=forall_loop, - ) - transform.apply_cse(func) - canonicalize(func) + # # tensor.empty() ops holding the result of the softmax can also be fused. 
+ # softmax_op = match_and_split(func, ops={"linalg.softmax"}, nhandles=1)[0] + # softmax_init = transform.get_producer_of_operand( + # anytype, softmax_op, operand_number=1 + # ) + # _, forall_loop = structured.structured_fuse_into_containing_op( + # anytype, + # anytype, + # producer_op=softmax_init, + # containing_op=forall_loop, + # ) + # transform.apply_cse(func) + # canonicalize(func) - # Fuse linalg.transpose (K transpose) into forall - transpose_ops = match_and_split(func, ops={"linalg.transpose"}, nhandles=1) - transpose_op = transpose_ops[0] - _, forall_loop = structured.structured_fuse_into_containing_op( - anytype, - anytype, - producer_op=transpose_op, - containing_op=forall_loop, - ) - transform.apply_cse(func) - canonicalize(func) - - # At this point all of the key operations are fused into the forall loop. - # Remaining linalg.fill ops can be fused trivially. - fill_ops = match_and_split(func, ops={"linalg.fill"}, nhandles=3) - for fill_op in fill_ops: - _, forall_loop = structured.structured_fuse_into_containing_op( - anytype, - anytype, - producer_op=fill_op, - containing_op=forall_loop, - ) - transform.apply_cse(func) - canonicalize(func) - # tensor.empty() holding the result of transpose can be fused. - transpose_op = match_and_split(func, ops={"linalg.transpose"}, nhandles=1)[0] - transpose_init = transform.get_producer_of_operand( - anytype, transpose_op, operand_number=1 - ) - _, forall_loop = structured.structured_fuse_into_containing_op( - anytype, - anytype, - producer_op=transpose_init, - containing_op=forall_loop, - ) - transform.apply_cse(func) - canonicalize(func) + if stop_at_stage == "outer-tiled": + raise PipelineInterrupt() - # tensor.empty() ops holding the result of the softmax can also be fused. 
- softmax_op = match_and_split(func, ops={"linalg.softmax"}, nhandles=1)[0] - softmax_init = transform.get_producer_of_operand( - anytype, softmax_op, operand_number=1 - ) - _, forall_loop = structured.structured_fuse_into_containing_op( + # Match the last matmul (attention_weights @ V) + # There should be 2 matmuls: Q @ K^T and attention_weights @ V + matmul_ops = match_and_split(mod, ops={"linalg.matmul"}, nhandles=2) + last_matmul = matmul_ops[1] + + # Tile the last matmul in the K dimension only (reduction dimension) + # Matmul shape: (512, 512) @ (512, 64) + # Tile sizes: [M, N, K] = [0, 0, 64] - only tile the K dimension + _, _, _, sum_loop = structured.structured_tile_reduction_using_for( + [anytype], + anytype, anytype, anytype, - producer_op=softmax_init, - containing_op=forall_loop, + target=last_matmul, + tile_sizes=[0, 0, 32], ) - transform.apply_cse(func) - canonicalize(func) - - if stop_at_stage == "outer-tiled": - raise PipelineInterrupt() - # # vectorize (placeholder) - # # func = structured.VectorizeChildrenAndApplyPatternsOp( - # # func, - # # fold_type_extensions_into_contract=True, - # # ).result # transform.apply_cse(func) # canonicalize(func) From 1caf9ce26b085531a625ca8d8c2dce7b47cfeb91 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 23 Apr 2026 21:27:08 +0000 Subject: [PATCH 50/51] save work --- lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py | 2 -- lighthouse/schedule/xegpu/fused_attention_schedule.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py b/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py index 07bc3665..d4a80856 100644 --- a/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py +++ b/lighthouse/ingress/mlir_gen/gpu_fused_attention_payload.py @@ -1,7 +1,5 @@ """Generate MLIR payload for GPU fused attention operation.""" -import math - from mlir import ir from mlir.dialects import arith, bufferization, linalg, tensor diff --git 
a/lighthouse/schedule/xegpu/fused_attention_schedule.py b/lighthouse/schedule/xegpu/fused_attention_schedule.py index b629cdfb..6217d835 100644 --- a/lighthouse/schedule/xegpu/fused_attention_schedule.py +++ b/lighthouse/schedule/xegpu/fused_attention_schedule.py @@ -7,7 +7,6 @@ from mlir.dialects.transform import structured from lighthouse.pipeline.helper import ( - canonicalize, match, match_and_split, PipelineInterrupt, @@ -229,7 +228,6 @@ def bundle_xegpu_fused_attention_schedule( # transform.apply_cse(func) # canonicalize(func) - if stop_at_stage == "outer-tiled": raise PipelineInterrupt() From d1e3c3f831a4cb7fb363914fbfb25c3fa92e038a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 24 Apr 2026 22:03:58 +0000 Subject: [PATCH 51/51] add pdf slices --- .../softmax_lowering_flow.pdf | Bin 0 -> 211002 bytes .../softmax_lowering_flow.tex | 567 ++++++++++++++++++ 2 files changed, 567 insertions(+) create mode 100644 reduction_tiling_docs/softmax_lowering_flow.pdf create mode 100644 reduction_tiling_docs/softmax_lowering_flow.tex diff --git a/reduction_tiling_docs/softmax_lowering_flow.pdf b/reduction_tiling_docs/softmax_lowering_flow.pdf new file mode 100644 index 0000000000000000000000000000000000000000..ed96b0bbc56ed9e1bb11a065f78d7899a59dd377 GIT binary patch literal 211002 zcmb4}Quwyn#nY9raH>hk=RD1ZRo1vR1iF-#1ZSs88@x#Q-H@n3!7V!-F{|uOV1<2!cbTediVJC z8M{E>-;rhr!b?vUm^f~Ui-T%QFq_qa!pn&F-Uv%x z0R&ZX$&n{AohDm&XGw&nDrdA<$I!=RNk*j;EOtUFxq!aCms_0`X>%NSO;ZY{CeN?L z9#P4NvbVYqA*IK5*R64#>dRF*V`SktBYl&g&8~c>g1qkVAmOtE7znPy+hg~<$>l_le7bB%`stK->bKI;{?JB(X z5G1Mdd!20>ckMnKWj?-uq3Isz`^G*$-my0OFs#rx`cS92Y_2lqQL&FI8jumBxYMV2 zRm_o8#tkV}6YJUUC=85sz?A=zd93#ZikJRVHN{q05qv1Lz&G+$X}m@^>Uq!D{(%=d zSRG#I`X1gGL@0ABW=>4ZS^Ew<$O{_iio*{)Q4_DJ61CFy8q{EFa4%p3pl;Uh@XPU)`5Dvi|22cxD1d-6JMCq`laJL$!^ ze4BeJret?58n^}O9G(4b9GxC0Y9w{FUio9$|7J9v#0M^^k5|1B53VGi~< zO@Hr39Nn7Orx>PM&ACyN-gAJG1UbOS!0w<>FJ2R&=t*9Dk;|@O(9fs^Api8PU5Ia~XKVDBC 
z6AJY^N7Ocv4Jss~h@P6K7l@D6(2{^KucXh?*e?^1C^8;iF(1AB9U$x_@c|ja#Jxf- z>rFjQr&!7OZw)+T$)2oR{t}7U&=kgmff2rUsaCI4$>*vu<|a!^^(Yq!9wC>aHyA}n z@WXYe132l!KJqb+YKTNK{gP~WI~mR}*BW^SKF`X)B7fjSq_9Tu@NuNgL2vkQ?w@a3 zz^7fxgm1<7?WciOEB#|3@@n6Ka0)1$YZP~$p@O^kbg1z?fx-e)AoIz#*S%&DzZFf} zb1zV?fQAueiFS8n7-OBNDvDqux(S74KSN3mHVvz!*9s4LvgPT51937+d-X@?eQN*b zeY0Boge)ch1&k`84f2WmpL0TdUn#+F6n?P4hCT9r2-zSXEqhZ?X|}d|Sn2U{k6o-Hn}f33Iq21 zgv_qAY7Q*T9g)=ZK>Pm5=KcM#Ba=4jY1-lQ#*NX}*Wo2g))S#GW`AMSgrEgDD{$Fh zU;fp4EW~Gfi;Jf($~N=(_zb2;snJBU7FG%!%G2xMwV+|`;&nh{Gj4%}anohWiiw2? z<1+O?BYL!0v_DU&0({!~9~;iq{8zVc zK^>!5MDONqpG%f!y^@__94$1X0;ZXpP$wEqHJe$yjzw9Fjog}!$3I^do#_r#EoOZ0 zV>#N+wq%;7c(S#vnUR;vgU1~mt!jm0bj2o&rGNBUBW61`v9wNAT29Agx4F}>ELt3s z2DG2uyg81Y>=rOEv%n(js*iX&_$Hv!yBu;K9iSE68&3u6D_)b-y9O-Dxg`}hkbxtQ z)~B49k70B4A~wB`CHe^m=DN}wDuFY2tI~Y1sz|9j`>vb8=eeJ=q)yBku;(N9a_yTA zb72UTw;HR^>GNYex$HP#yl_dp`3--y;LOe$8HRb5U6C~`WX^w02GhO28o6+4Q8A%W zZ+&D7NYiDo{}wR?wAzBWq3miO0 zB-C=iB&mWkqv`v0CS)@vQH`ylUZrTKpl_OTY>OhyB9tW2=0P`?HWM{5JeTyJbv+0n z&C&58+tX__XHH?09xE(6q$Si7j3y-6mElG215>$ zUWC?rfcG?PGJX@Bpq&Qkdm)z|BRqA5a6@OzmYI2pPMXMKX!Op5x1t{B@ByZhc?iNO zKJqJj$n%tPHEDIySywMXvB>j=Q;}~`>OG^?tr3)sFvVs`4li7Nek~9(pbY?c3^|ru zLl5}o4?rBo5f(<3e((?ENAM7v3vo!DW}-dyT)4^_-BUqUidC^s0hKUuAk;D?Q<+Xh z*G?XvKrAt$qfCBy14xfwx!;lGaD2~ThEPdh~2?rbLL-VSJ(reoc=zI zi|Jh#&eaL-nKL-vv$;PB(%wQ@-%j8ACiH)K%G`a=`?Pl6$cEHq$vd3l!57>ZR5xp> zNf`KV@*1U25XO=UM@W$yn1pdFlU#P}Ih|+s?_7JjM#GLXZAGR9vPZH8Yj$KDbLM^&$>pgR zrLDP+yS3)}vEKNA&8lJOKgY;Q+@&6W+-apwm21Rlx6sn#cVv}N2S*DcJfqN@D}rsC z1ScJnZ>jG=@01f^S6`74?;%HRgM;MoX^i)%7$=`n9wgGcRHiuT>-46O7S8ud@>gQz z@UUdk+E$~G>C|^9UG`?-;pCp0c@o!E5TQTNHi#8{;Ali!f z(Jyx{dnrX)FyhfJ40SHfm)<|ej<**s--h2g;n?+qT)}L5Qmb2&%ayy_zgo29Z6{aM zoHSl|(0@91Mc)ZAzr}k#< z#c|Bv*%lOu>DCyW8Dw_zVs!-0 zyHF(I{XCy$f7kXagwo0j@A3f2G&RjLcYvsXs9LMx34u=r;t0?lTA=Y2Rp8*{9X#jb zNE{rHbqJczYytQV4?mcMzRSIbyyL`kwk#sbVlV%u>nR8x(mT%yBg*~M?+LDC&`m7^ zl`;JO27?#Ng0JGKtcBS!_PD$vd5~j>=eSL$pYN9*bI9CWkF@@*1<|%%aj_b{JhVBC 
z8i>G|q7WPIa!g^gef7{jtZ~=~jRl(9<7kRpOZ3`;2XxYklvZzU2=&Nt2ecd-N*17)o?#gBHmv5MN#B5K1zfMfDmyXSQ z*HW2QvsIF1<5kB=Rp#1`m136S^6b*t=?;U#*f~YZ(C-T458Tlws0tA4ir6i?w*!<3 zuxpAV8)G)hCtHpK=Icezs?z327~ae-YQK~p7$u`q3t8TmuLn^m9QrL7do%y0lyE{EsB0Lh>+px~u8P-X+9z3b zsY#*TzZ)lXB{PZhh%WO;KGX{3+JwLzg>(OX+}+ZqFZ~a3o00xM#BFvurvEx_tN)8a z-+w67q7r5o4w${k43VgtsKr(W9^G;rl|nLxXz;A}ddhBGTfA~@p+X#L$* zWjapM-er)@({X?`2k58i@(rf5R4{4Kn@f=gnMNXD<$A51KMdv+(^a90V79dBXI3=U z=&UqqxZ!bDo6pR_nDyvMoFf{QcN;EmkE6LXET=dF1W16|t`QE>=%&#Y!*KC+L&Xs0 z-O2dt=BMw0eR|CB)Fr~LEn2EX<{e(HFwt=iED?wDHYtX}b9DkUEscF-{b8w#xKYZ( zVAPkauzrJ(#hgD+!))d(dF8(9#%Pw}Nh(m9;I9t_w#LYUJO$i2=2G=YEBTciQVNin zAd=j`!Us+Txn3Ngf17ORD&PCM;Gj{l_C`aVAe+zq3RSj}kGRIx!2q*R*0ZSbotE74 zse(Dc!e+Y9J%2LmL7RVkJ za4waSoh*u3Uo&{T(@5Kz??&vWUBdBi$@gu8{ah)1b5Zf~UPHuo+NS0dsK`vUJr=#g zA_cpj=v^Qn7VYy-^aiu2te&_=c^P!9mRc!F6crG=HjksQ6gop$B_Ql;L4lTHijLo@ ztoA z@&MD-9Y|UgREKe>yz^nK)2p;<^~X&kZrUc-5kOw%Fm7O>GS5Dk(DP1`QB9@z^=QX@ z0HTXtoXu*}%wb?0Jre``4Wvn;rFx=TXcHbUyWZU*9MJKjk&6T7TU$zF`eI$NncOW@ z!+(U5Ws3A^mqtdXYvt&l_>E?W-?pD24$9LQpsm$7dEn*j%!mn{+!V}(0}w^-gHtO4 zrPO|4z>qys;&l6$T#*IZ_g2F_3O$V9e=+d* zRwDoc42V$%4(ySw%oa?!qirT|ppt^eVGzyH>Y**DAuL)Y@$8g?C2dOOUJa2&MWq$4 zkvCPI6>qe3x!#5zei-}8s|qWsreZVUQgSz}mS&HUo>T_E5>Ure*CINn^jZ|wT`a@R zX!pZIitemC7_{EX3+mzy$#hljk)}j)EM@H`W*HAyOa@>lM+$&P@|!wnE!URN?P#(Q z9!MzS%x}(2#Ln9!-yU=4@+$lfWzVrfFpV=R!C=v&f7B`P-;Dk=?a9V&DwvVK>q}|& z9G$9ewq-Hu%aqMF>fIEx%+$5Yc;83M907qIddXn(d_CdovleTffx|`&CWtdaeMI~E zq7Dotsc3=uhxp{K$4*;$PG@V!bD*&4=HL$E4nlYSN6h1|J#lt-WEK5$mZuSS8AoQM z&`t;wD5aJ#9gB+7zLk=NZ}Yr@4<_F|Vx8l?CM)?{*ZY}Lt6wwh=oAUsHHq3B4>7dC zWzP$qdL;qVPUv>AlE%tX;*7<2Wh+7|`$cwGr332Iit2qnXFzwzM+VV;OMJ+bd#PGiC?SVUHJ!-UDaitaMU^vR=2ItuPW#(MRR zJ#OJ}(pf39drVB95V#`0b^SNq%u_60%#ic(B(b7)sJ5{zSS44fIg^ULKN8cOL4M)H zERQRk>{TjSz7SO-F~!Y#5~aF|k&&q?;r_06BO<7^G*NPtJ^n>eOGoK2l`rrFOV6kHZp=qYDnnYNm=IEgPcH7`!J(svQMLSeP~3ltL+dGc6_e^b2^*ct z^EHmm8xROPdOmZGnwLaXT)xP0H#c9OxrRrE%fb|2FG(RQE+eKwMBl(AOYS2s?j_?O z#ei${x6DiUj#6QLE*UtfXi#PbpJ@zYJc5W%vL@_aWoQw0*)s@uXD$!09FBrxiEG?b 
zUvkv*FA|K1?ann!bLR6!3eLp)&)8br|1CKzwodVeA4Ue}?jo41DWsr(Jo?X(Q z;)qYt-HD0{UMs_E!UQ2j>$5)(`pV_TFYGG-*n4iA6f%%Od4ad_NRE7oU|tODEGd>%hb4W(!@2RaB2PzO2~GZ^MUGy|OCG9k?X2 zKZ)xW;gUV>=~hfx^EW3{=S@DccU_s!XtmQDS|OMN>g-_aSO)by{2rSSUCxcwH>@l9 zWW_axY&t@f2R;`LSl895ZYXfMZ2^)v(Sway|1;c9=*^(jPC<`fItwpo*8xu6 zXm3tUYW&WA16^LF!iEIkX;|C+G>g$Jwoul#;pyGP%C>A>cNDgr=Sz9^W8+5%^dgQuhk6V#@FKUw{L z{XPv_OcsL#-U0;C4}{u2iQ{v>XLvHWU=+=K1e2Jrhs`NC-*yT^jKqdHK|;c!Bt_w- zC#}x{JHUc^h<{~eg$=S+h_mh0G6I?h(3kHExzh%c74QTNLOz#lKo>>QB9p{9yUy1M zMb1({fUq&o{L2zOC0PjUsSXqtEo{InR?>+MH+JoSBtHa)F1nj`1ek%{!fu@1h)|9= z5h94@hKlVF=Mu__3#|#5p|1x%Dz=VRh_8=FUOulWR4gB&%?X4XRT~hq*iwc>fQbM- zejYR$+;9x^3b4SAKqL%^DZ(rO5pG@~gQ#F9JxuRCYLDME3LvPu9)OuM(Tbd;Q5;r& zc84egB~H2rB(YE&VO}sb$-vxU#BSY;L5K<{AkBE7uWxh;eUAYCriBiFPK~Q$z}|nN z4pUH@D$4IFn2i-QR!hK;AB_nyKpV&pgwX(fOoB8`7}X3&7=0_jbDf}iPJk$Y3QpW4 z4sHoPB;9IkAw=CMaq2f>7ItwcNFlO2W-(fTSen{t@?31uHWbe|?F;_Byb%Ifg1W@| zdWcbuLO)r;l#*#=(%gQQlxKQ%Od(gagXwWk@5OcF7DPjGWmLOxgM=1>Cw<~AR5?2v z-FEny&a{kaiKltaheKmqhSX?Fap|A+#y{hlG6=m7D;k~z!vYcU$k#>alD8`!l+Gy! 
zp^qtons)v?hzgZXInSK*;?ioVdEPHh<6#E!C^NCogV~ zjUK#QEN6^p@BBCMJ;lMPL$133cf6RoS(NYV%>ZAp%hZ}7?b)(t@(!h3sOdu+REle} zmLr-RH0o7!D)W*(4v~k+BNUNU{%=dV|C8wUz{0XUKAju+^>s4-;JwJ5J54)!zvJk+ z^SRW0rjym&|KNLqSREP6Za7+9MYoAsC1qk^oj|F&L=rUXcVChOoGCfaI zi!Y2dJn-`{I@cKdwPr77Bwux@XzcLdC0TXqxE4$}b!r`wAP|s&ySZ2q>id=rG&%Lj z@|=(mi0!4eR6P1d5W13Jf~8_!wh)ne*$kziLgvwSGn62F{PLI)NRjWRt)A$54GspN zeWf;0DT_^)*<}i~cr_8na>7CV@?Vo>a83Uu3#CLYk9f)_J}Zm%|t*ywmwM!0ri4P7AJ$7eH^Z&xPW#qN}> z@27XXF>}o)y_+$lAXaJHnwptb{ESMo7pW!u=iZEIS)M4G5s^isqN~VZHy-8;E?3?i z8l7w^k;wY_qK>l|KD4)WClG}*<*M1|lg7wIdI6BARM8SnoL!o~Bx|nVq7AMPqEnls z*gEV1sxk0MgdonOWuN@na1#gt4ncYR&=X7n1t7xV5r`tIIpw|sR|soi%xF!p2^ zU$r#aBVV#C+Bbi71~&JRn6zXdr#=3yh&!k-cBh>|kUQ5hqu4M)VhPajma}|Sg=^}} zQkko+J$0@vFQI^3OeP_pR6r&yA5nlTOdg?tEX;%T7e?M5SU&xY`!|J+ZW!b>l>EN; zZweoL3SfDRun9gTumT47oqx-NG0R!oS`qjtveCK3OBg?Gf`gdh;TaZ z5uu0E#gtgHbqTPsl?;n3i|OX4X4vMK<{Hzkl+dnoa@vZ~uCko_0*|@Kb8*@V(eARG z#{!SF$aC|bp6;iBtQ$U=5`Eb*h-)bJ+jjw(kDjZ{T$1Ea3l;-~llvc`?92$0@g*WcD234*r>{EN**u=j7FatGOEO}} zYBDxwr+F?u*Ql&BW7M!nVA!s^F}mKW5VR2UgftebSVVO*3PlYw5%}~VOqK6n*f{T z3@$!WL<7c42)$I(imGRO~Ja;{|urX}JF%ks%Z$biF97Nge069&$ zg{Z+V0!a=e`^&@=f?p%+P3{vAh@o>$g?AYmww*5}%Q60i;${@Bx- zc!{5S3-tQ5=BmEnf-%2{n!mY}6Zw@eKLb485&!rmoxWV3&4J)`NEm2fbo3I=Av%{c z^U8?;QSvPRVJMQfZ=5B4dpyo6eITI8b@q*hap_Ad_T_Yimjcym3K2;uLG?)}P}EMR zVmZ;IJA}PF0~bRD2m|B1!HAx^W3yo~!_Qw{jC$q{`c$U`naoyy9|nHQ?qih97Ny2d zED2s`#SR)UVmoxAoIv1^kcMy&Pf_`2H7IeJ6VS8w>NvwSfnbS2E6_YJU7%TZ+6j-l zR|0zunjNy%oqngcMz5yu6NSq_IOr2ww|F?0^n2LVo&I?nXtvV30ubF{!8gI&%jiYh zL@xIsjyFY3K8J}D61%@qV(O(nJrPo7V6B#qZq&WnO}COM;t{33AN^fhQY>xo`EJMT zD;GkTlDEhAueH=pF>+shfqF^IXKtyetxuAf*KcN98Cv{4lw=q*J;v#b#rr>ko(`^0 z?%isTHJbKa0O-qKhX*#^7wRRS`rREnTU04_p52{WKCr;DHMDm{AV26V%*a>2e$lEnpm=En%CTu`H5bv_PG zqi2}}u`>eOxb|Y8NWxF*;HN|g&d~Pi{Ovb@ zDu8lKVa4|J+=D>M^W2w*NWS^&{U)B}gXa`Io? 
z4pMN28X2&RVT@;ouX2P6psH8qEA7X{6~d;<>!ckQ@L&~v1QUgTpX5rm)cLM<8J1E$ zr)NbVYWz_Nzk7G7QbK@+C9+ZyqZmdkd8ncbuV^EQ=WrI77d(qj#A;6j1WuM}P$en7 zrNKO6oVX!?71QXhZuR=yg3k|j`8vUdYbm}MMVvooLo#B%^G%9xbq#Zo8-LNsv8#l+ z^F+DBZbFz~(9Z;?g$nP4KS7sS*)DdQ)y6pnKVQTk1x!jFdm?qY?*>!a(&D>k!Kvo8 z2C@wSt{Jec=l~suO(LYW=Ei}2wdcmb#f5F&+yI`)d&znv{azOP7DVAR?L1$70+u+~z{b_raMaVwged627b5q>>W zc7@h4!TaonO1d=;v$DGV6xw?XQT_b(H0QlJ(j}lPtEm}c9<+4r!8KtMmxrrP-LC@9Z z=A50RKDwkmw6-MyQZ?q_R9>As#w856?GeBqX%5=2jF=1 z$&@l9Wi=d)`6ZPza*YcWiiWvL@U8fczVj#4v~620`f+4qZw&5C9C2&C1cs$P$%>;$_F3 ztpqxP-fTGMsy>ZaL8+z@qmbjkR8U&<<7e2g+IrcD1z_|tY|D<(f()_AbFW53u2jzULrm3Tt7MW$|I$1t?<_1@?rGuMh-vZ+SXfI-N*IX#+5-8#eqMI{q+ za|qEM;1F3*nngMTiAMSjFM2L{%W%ZXCl#q$`})enu7FrdH5j);b+768Ts^>5q_>-_tBs~lchRXnp~ z*N!f4z97$t7f=+qsLsDED49fW)5vla$7yWeeGk@}wQL8vv^V~S=~S}tT(|22(uoo^ zhrS~4jZl2R005X-Q1}4d0QsHx0Lsw_eo@}Io0NUh&)eR9f42PxfcmMy{@Yhr3{3Pa z|Fy%f@dHpTi2ndo;Z|a@doQ|~Kkh>l-;8A93pEV1VYdw0cBEmgqr;cCX$S%Fm4v0_ z7Ryg@Hm5k`h;+t|rbtZ1i%RWw5o7cH_-1nd8Ocs9zp1JzA$YFiU<(B<^5E~~G=FLB zlcv4)k`RfG8U&Io_q^KzQ1FP^D4fVz7F7F3>b)n z=l7bzm(w;ehu8;-Pca_W+Z&0i&C%J$^#==dW0(d304B?el$@}i4$>1l9}Fdn3Wd$y z65kK)CnrkX1_78)TB4hIiF)Of$_vS0ngp@I8|!>z*kRu8`1k0xdcly2t4`TZ^jG1_ z>Vs)BCLd1CBK;yFj49bA!MOgO$*(oNg2t^U80d+eRlGwp|1!`{oU(7m&#=lh07%ao zP<0TD&Dxoy9#6#0NF`RmmOaP$h24)8-!|L=LkgM|RFCJM7b{vWq*<{P(F$tMkMUla zvPV~{J6X7Jvm27P%+=#v!@_YbCBDa|KD7|>xvOf5#!I2lqa=c?>Z-SwQ7#}hm%cx| zY<=6~)!O;m#+ECtFX8MCOV{!7Wz*`$mqJ~eQhEHxuG)9?&+nEqPuMm2DPcshPwFtrLrZkd;t&f8{S~hS-$pu{x86b9$kPxE0inr z?+4RRa9HVmVQ}0mzGpr%c`|G;zu0A=Mt<9Jr2>g;&`sHUU|>gFq=HosGY{C( z;(l~o>1d1)My|oad=C*66ARe}4;38>jjT$l&!Tk>{{V(MifA)FUr7IHGO@}X@W5N4 zJ_k{9Ix@9^HI6#%tXV2xiFpd)4Y1-^)vjFi*@@g)Q$Dp+i8_{R6z z7tBMPqR5$sRnX0-@@GgMc)RV0Go2=y{t5ljKwLx+hEx1U1DS*$4c7iggZ<`uq26)m zI2e(qt)Kbjy#LJagh!OuyhrAr`E>&#rlM($Lu$x-X|<++>6Up(KQP*77zYfASyV$H zEY33w$c-&DeK9wouApoC(}X!nA2TQ0mVJO8CPm7_yWD|S+=B9@3Y`c60=Og~&q2dc zf>k_&k2V+UI~f?fyxDopclUD7y;O_wHCAN%EL%+$=46Boc~w3>X3F90sNyX8RHf!%BmI*hWYYQd%VQMJH{uWG>4CW 
zu|A|*=T72%<7&|28;bq)Q*s-Z%bp+hIrx-2<9F;#)!KY>3SnZ)Msy^bR(fx(C*Gsb zheP$Tyrdui;AuKMJ{RaGa8N;|;11P?yl{sCf)?EP-)R+TuI?BG{bMumh4~>@^_1&N(D6Uo zv;Iqa=zc5XEEz0Q@P1k3q*dRsftz~e&c9U9#K+L0ul@1?KpsD8&tnx(761@}yqlK0 z9TXR!pgoqJ3b6ZHePWGqIg% z*OSP2#Mm52W%IEw=I4aVhEBoR3J(3*@ib_cqfdU z9sRcERI%R#Jl-rkF^{;)F6|aVr*HeDII+cmbCMMR`O6jm)A8V5s>#|znI|3`27Yp<+r(7B8 zODfDnmzK%Pf-d`99N9ssu&=1AlM9YU2rl?TweXlNJc4qPvSliKe37ELO>991B5b;q zElL|*nwTO(3X?5^$)yDt0c$+&4b84}kTJnpqqWCC&gpBSB&fvDWH$!QBHvNRlvxpf z>RANyys?85($xpZL^|~HHNynj#DVbq9>eC!V7_$Pg|8M38S&9 zZrPgUGt(BzH7!n!n6zrw+D$q`nhTH07KHv7TNa1Tnx9v9r{s#rHXEqtH+KwL`9zn(KGGSpXcZ7Bc~GlfuA;29 z*nb)?a43qzqy(ipcs1pW_AL|T-`O1-m^b~^h>L>X8m9+mhV|neTg($}1;1xr`Eh25 zD*%U2f{`nLG{{XO&kimf?OT>x*8st()H$_3O8b82La9P1He%*%Zu~F7IsHR$wGpku z>zdeSO7n3Vuj@Yq$N3Mz`I41ZTx_5~@d?I+t%k1sQbDLeWP1i>Md7v{wHL+Cfridr z@YnYVZEh{f;SUW^RnRaqA)8ZbPG+|7yHd!h#+F(Hl4#*wW z3>kEHJfRpgEBCq<)&t2F@LfPl+kizKozY3&7u8sC)wsZGTp+P6?zgtIgxOkU8pSjj z$v_F)fT{ZSRc<+G6UhST;hWSm!l?cPNe4C~aD3+kga|1mbK4}K`=*OBdAKU6yP7U2!y6VKA&bI z1OYFLVqn;j<;RO5FDE2b!Ug=j6^fTG0$ZD96d?*Crr8`49W7rqD$uUQ}! 
zDT`2wat-$6AkhO(!2O z&q{TBdER{Pk3Ehxd#8G*a!)FYub*l628W?y)q5-1?PO z9+YrzKkx%Ac(B6~izzB(Dd1k9 z^1RI~Qn7>_g;ZI9w!GSJaOav4CpF9MRz6LrM7@XElKzLqGws8DIZl(~R13id6ZIGr zYATf!p8)wT$F%o?HNL5)Q&Qfrpq=3o@c4Md5*Jjv-C&~is@`4ZCi*}RA7Fv3>XPV)dwJ*O!u>YiQtFuAve>O!LUqu#TNFfy&}D0kz!`~`2gi$6im$CU+ZK*9eA@{5`nSu zrN8D6kGUlD%YDi$BwJM_9-We6K0zQe;a2vttkiw3qysh#kLkSs^jq#GTU~CPoRZe8 z{?l)1>gNPBG*_pbEbXU7)ZDLXu?znxkkuj+hb~dg%wxn+K?preB@J4aZ=h!A2FgJ= z1JjJQ)KmPV3avyF#J^@vST=$Oz(LAE3<&@P2^(b=@}A(Iv-%IunL7m(@M^5-I@|t* zW5Pv&I8AA0_B-?N&(jYx;9B9qy)beaUAkOQu9}G&rxt0^g$+4>o~$u4{A3X2olpIN z&0DokdiAu3%Bi$11Ebncy|S<7YY7&YE5`?<)e8{Fi9QZ^nU+~Pd=txr?iwq$8*Hqy zJb>;31E=hz!5$VLbT5y7Z=+V^^>U?}@+z=5At&6!o?_?&_LdLZg?pQq{BUuXd>cbF zk0eB|g>rgbz&?Yi`e`KMQI^+oqSse%Amhf27?-?mE}kz(WqZ>*1HQ<39AGfP$5FqZ zSsL=i`22%d{D<{#7YsJ~z?9vw@Aql7MnbWpE5>pZ(_<~Yh{5(+cXHO`Ev~H4tVmU= zTXhL>l=p=fw56B``9CUr8e^~M+7KDGMLWXR5zG$rj!sA_ z=O7apaLg{PM*zXbHYxZq-vaU8xP@I-5pO4JDN1{l(Lmp2+CkO@j}-Z_gF0WhD0tl1z=1oW$zq)CCEl}4Ie^I-;+*h z*FH>T*`GT1{zGprP9zK^2_&>g&G7B~=5z|Sa_14u3}nZrBti*DWTe0r_s1_Yck^t@ z=zC`SgL2>bpzLyFzf_fgc;wzgHg;6637L_71oqoR)Utb|S|1p;=OuYvq6IIQ-U zP$6=WcIDm$2YDY>chiz!#cOvmS?>u!+9OqrHs zOygycCzPbZKS$jzikhN){hYVxb1XkB#7AD!vM{ip_}X6ohI&cN=N=m2BKrZPhRc@Z zbe%#7^9b(XXg7<0G-Ip60JO(&z?ZH{EBcFYoVjI#^iHTuh~;hn4ab8lk7y@Jtxn)Q z)JF2YbZAQvH7>LZYL##po3WraA;NC%P$vchh;b zpS^WXk&+Kqw>Ae{d05nEyY@u)4N~Z3w=zrE`S~oQ|)#N`9zq1>L|cwA)0s zo{{+Wm)eCc4OpA5n?7%_p#;M0Dc12lo49iT`Q-?&NTzIz*s#PJ-c@Fgi=f-D#y69D z&rot^#QzjxBmODGE>Biv=yrK7pJjd`mQ=LJX(`jVynMn4o@$m=|DLkMrVh~i=j8AjIk>k zFxnw`-qH9pfhcWX;dP;m50Wn6nS@rhfs5!E-Ay(q$gyzEw7P3j;=C>uxXvs=p-w@WgN=)IX#zoFr8^@ zF_hlG#`aALaT!}m^qsr0JB{iiiH53@wD3|1 zrdlOc`NN@x*I-AVEcMK+;t_I_b}%I<8j#B~OorT3A502hGd)|2E4765aPgcTWDmzJ0 z9ZFG|D3?zA1YQmhRUQ^*BDafn5m9A9#PJIIS17n>0Bs?*@4RDIdlot*aYyl~7H5)o z)A{ERQ`6VUF~|gS3L{mk8jry%7b@aGn1=^@Jz_`49m3=~KD3kmz>;b=*t@r4<+n;8 z?TPW*u0Ha`w^^C~=e1)>Hz&U6Vwmtj7-5ac=tgqAd zEfDnWxNg^|5Atn2TYCod>Rv%lLp3nMKG8nz=;DXNj~z$H;3l}y zXd%b7AAc`Ve;4y6%Q6R`ITQh~1<5{t@sv?^gNS*c$mFIf?vAs@uX4U_0 
zrH+x#tYUy9#Wt_j=rg$lv&vY*6q9GO>hHDfTk!?0RjxTJeXy?#sOZl!p4vX4xX|L z=bEx%J752WOPYkUH&bb1_S^Re$t$RL`ciZEtxA~BMr?$VY%(@>S=UG2JpS3l&P&G> z%haDuY;pQ$6Ja)skAf~(OrCFFRGS87OE-wqCZMSgC{yNQAyBKN7Xs(uwWP9oHtcEq z#XqaB;9RD2gv-pRvOmo)F5zg9Q=lbdj@crQGSGF}6kO1UQQ0vq)(GCSjx~(Jy$VpW z`T1Ve2IUbG+lt9D-eC*f{{Gm~v);LlTkW**vRSo^;AH?8VS(=1${@DYajtGf48b*% zORcA&Xs4m6YPtHK_FfSSdm4V3s?bRyq(q=0&J++AMu;n~ICLZCFeKFZUruz|mX5KRn+!{a2K;sDXyQ_a1O@$Rl zf|yqsi}c^NnzvC=>m7U(6v#F7&aryX`PiwQ&iEv~RiRb$9qEXKR9-u}nwDR8IV^eK zY!!^ylrwN%m)PP@s<`^`&8n{&lgpck8z4Mf!1j^(u5wr}m-VgnF;avkT;{K22rqjU z!{Q>{L_l>&B)+gwILn?n#x<<4t0ZPgp>t%-xN_%cx*>@#Dg0*YW{@}pJ4W6@Q+%Xa zqlu$Eb&L60Il~mTaif{aHoaRIoI8bB_gX3-%fO}cE9XTe}R!l8jW&G zCfIpsCG;6j^TqxSX>(|2jjmZM1t^fFO6=NpG{D2 z{W>fN{v9w3dEu(&D|YHwH9NQ;G#u(^<8uMbPaQ@led{m8e`oS5lJP3WI1T<$7vBZS zHqLCzz|5ZxDN`4kyee)GhM8&WZWKI>c_>~TZ#38<7Ys>~By z|7hs>vQC8!G2M=8OJan|^bAyf|=FY>c;mCkv|tc0gJ2i!LJ7 zuWv3FsnY1jj+g=gCmq&A(3^1lbxb1ze=LECeHP?))evjkAo|SinESQMIcVwZ6Vel&QjVjt+>2v`MS)@8K52HA zv`JFhb;L3uMQUUaG3S~B!JC*glF_6`_e`Wh`nDPLOi993L(CKZqmSAnlp!Ot99Re8 z(+dNF=&O(3`1)_39%ya!g<-#m&x=Es=~92G-${{g7>Z-pOnH?pXRDq&R^4yXhbmZ+ z7nU8)>hv#Ak)x;A7dr!wW@Eptk18L3>& z4Xca$%aw+zbsE+o?1Z>$@7{uL)o_4UuD=j&Ihdjns}6O4nhn9xb2LXFS! 
zirKZ}2E!p3+Cn4L^;0LMPa5od;g{6+<0F4zYU;_P^9JZu^`zs!diCKSz4}3NSmxU8 zKn4wEUN)c8+j{RfujYgRCq>sJ@x&4rR;*Me5)ltr`hL10ZxAu~4MU?TG}6TvrWUxLh!As$5F_nd?!zc7-8SNq2$TlronHDLYx5#_Bn#}EBIb{2@W{uA zzTr!V$aehJ{SQBnTt2S_?DrjDpRaINz$tp5+RJ)v++!g=(aCzwZGRZE`i+v+Zyq~r0zeBX@4E+N)b?IIP zNEe)M9MI&ZF0IEXt&rk#XrCg^gP|?p!AatWoGu}+{3Yjfkqr^%E*@xBh*xvWrmG_d zAA<5j$*~E4j8E&%)e>| zbN(b-^{EY?lf8W67p7SC1<+y-J=2=ypUWeZC+FB;DlzEJIZrM*xL3qP{q+}8(7!T+ z2H~YNfRypo_=|Y-eYTVW9bXh*L~*{USGC`c;$y|$0VJ!@U&)$j4ICnSnZN%x6;OW5 z2>ZCDn|sTjn0{bV`&|5nYg8(kt2ah^W>E~%5o8~P+R%xGbN?k zBgQBBW10w%&vsP7fXxIUaw>Ze`U)qqWC|aLQ5{q=lo<;}Np z=jgB!6&J-}x4G4R?SU2e=y~$AZ=b>u4L4 zgCYL$vNhfu0rzg~A%CEZJprJa#paGG7bGnA6Vcw_Kxl?>VxrN>wT|1{t{5whd|4N+ zH=ek@WxwIP6ukhn&RE^!cZu-;cyj2YsvsTXx<2*Fp5a>-rFKq1cXi7s1p0YHy!=@+ zYB+y3h=H*Vb*du?kJtzBC4T*nUv4Vw?Sd;FUJ3_#KnJ{lS;l{8){r>hTyDhSq&|cD zEW;5dyqI>kC&)3gpPa0})^K|5L4_4&H9Y+)1NqffkL4m(L&(?azjvH7R$Mr;F4X>b zc@zO*Mz!?sDPSjc&HB>^Nq#+m_p@!fY2=(`3!pDukA z4qjEh!hy0!Y(T*Ih~+jYlp<@{?Z25#gsj!_uQ4+ zCIRb4;RkC!uD@nD56!7!HS`R8_6pOn8ZS1T*4UBeKkU!*c9rC4f9afmkFp%Z2ZT{8 zAdJvk0AaK`MU;nAxyib#tGc8H(n^6s;;?W)KW_eHg1tvz6cYI!NBu@a?XGes+l;&F zsgZny4}XZyc8K33pD9qD$C~E0fSNOlJ;%1R-E?pC{gSG#eTu^ksR@S<$3mg+%^LmG zE!w(3Dn}5IKsc?6?Yfu{In*cGtozapAJExe$s*r5nZ&+u#bT24)Oi~KN1;4nRg0l3 zPK+gLhKECDwPHu;z_eogvyc6JQZH1RD|9f5{I@lnbo9ZEby3~#7kN&asN$e9wiLj! 
z_|Tl$uNe(XV^p?xxPI!;H7nZw+o9LkT&)&DxxDUO1jB5Tf_C1J?89bSPGPo0IBUf#0zpS8hL%#S2rr0f6yHx5>*(|l#2SC zsLP>I3Q$pOEFErSV~W{V=S8nHq=xdiy}@dGU%e{7GG?!8eN(HduN|i*TlgC10-^Rk zCBoDKkuA@>ogfvh$^bf;u4!5=zkF$gLTd#(SQ*Y`=S-_#R%u?l7QUr8C}m?JeWeBdZaoo(5dL>kRWPDJr3 zbi&{y!I=D8#rJw%aG~M>Fnj5`Y)~y*VXKdLbqCB|gy9dhOW>Xp`4OOp2vB6wDN1e| z7U2rlbO6aW>(YRSA*eiFUg-`n)5Ej6t^@*s>@RwA(6C<`0MXx}X_(w7zyzhIu1qQ= zB$pAyZ2YCO3yAhYK(t#o0-`;-S&VOV1`zFzl^^#VuT~%J=0v!8l$yxdmqWA^f7bFk+Ps==@^*;D1)8XTOQXR~l4OS$-XbeSzY_ z&4>SYUCYGA^`CU@|E1)Ui;3eum3*@N&yB@0W$kdq;X2C!zqg72nDY6)pm ziHiM7YQR^gnT)|zRoq1@Lyw}2a_ALDG3-;#D%eb>%(DN^5~@r=KPrcASoJIsHv@)g zs3H`wwII>@M+Je5385u{Y_Ob`jC|)2Pf}$q*dR^?HT$7P$kW$+-Rmo)5wQ5NvJn4!Bs+%p4kf#AwmLPD&6FwTK$3L&RMP%dZ> z(b3u{F&1Wsq{uV^wLDkvJ!*&R1LEfyLqs_f>Ef zxsD|!CQ5$YZnvk`iSma~MJ_&IpKx%+YJ_-HHfVQ`$fy7+KE zx(k{O30H*I5rri+g-_wc6!81N`7=gNu8hC%6P(wRHQbAL0z~uJW%9 zPkz;*d$~17HIyHTgSVE=c%8`fueq;BAMI?ve39`Tc`2?x zXrg`+*u6jO?H&Zan%!^Soop}Hl>9my9`6AU&ASE==9MHo|&OP#H-@(!|BCLWQf)Fm39y|Bvn zN$x|M?HXd7ymnjoSvi|*{$S$k5q4&D1njLm?v>fgV}(wg}x>YJsw>>xe8tSe!}=PWzZcpRtLQi@>TlsXv?azXkm^dexg}3 zl8sVZIxzISDNm^U=m#8bMipKz?fdLKE3iX8UTR^UfMU^Nrkb%l!(;Gp7k?g{51hI# z`ee#lz6ZU6~B5^*KGizAP;mj=F_Y7fir(<*a znzqiv+cm8mAL16m*7NXIWbP`KgNF# zy0JOy^y)8QC_8k$1UOeQB@AD|Bno(d;#RIsA%OHDL0pM;&TVegx~jIm>xYAea_WbV zqoz{Q%j@K{@;G?x-u^xiO~rh1_ZJZ-YP-rXwIiA}0#5Lx6;O%=mso=$BaMp^Unofyt;=6Lx04x>y=E$f+X|MpWr}^L(-BQ#<%CC#IHT z%yoUr+#h0Dx*gE%(d|Frcsu9BmdQQp4pHV)&ymD+_dlZN zjQMLP=NMuTh?{hldFtZwh-2pYgN^LCwOtpnGKP_D?>fvI;3rpe2?XzD+)aOnwIbR| zbKF)HY;xb2P08~RKJ}ZmjGL+Is`mcU!Z;xxO@Pc zn5URv(0ITfX$8$u9kUa|j!O=6GTSU=UTtvJt(6pn#1Me>Sw*lFvmIBHTMFXT*mCF{4Q63tGn%XVw}Dw zCb*q87y)6)Y8FBFOTHM7B8Jz~7gfN=P163C_FKnl|C6%;bphFOj}fv5ebirDB0*Yv zTzmUdrMQ`E{W@WZXve>{L~ws?iT)y+Bdzp}MCRDwxXXEI(>8I&npO;K;C?m6+L`+p zXa4bD61*Pi(1t&gH6NJv8qd3cJ$g=8aBLF{Hee(BI!UDeV92tx%uh`_BHD}bhF04! zZgo!S+wN>a<? 
z^Yw6h{<57c?g9dizIb3m&Z~^mC?Kq5E9By#{A004$B(y1L2B{rWpe)9p{{*c`-dI_ zq1sK~o*`FH#N^<|9t|JHqhYfa-{$5pWJ9d{)0b~Jpd@iP)?udf%<1F|FiLr16K)o# zZdRv^FSa>9VB*E^Z0}9Ba6aYg9DAibJ>H_*yAxkiEz)w;B>kE9K7L&%&+4eB4BUjo zC}!M|0~-sHGsdIU&PW5Z%Na(IYrpajaKYLJ@9-S|h3o!E<4I&MzR5{a65^K6q7=T$nvNlxtmLRWIhze!?eF;Go&D}Pn0f>yRAi`i;!o_$Y zlmFr_tKOF=muvmvDFnrfG+V8#EkL`BX5Pu|T_tJ>zo zw5xH%P@O%DXiVU*KU7FmIY2*9vz5Q#o3@dEAo~=2id*vQr!|RJBFQXjLiQJ5-$nhV zlt`K^Aa>N7k!UosAlVm$ZI0oh9HVU=q-x#)U>=N(BkKxqz|y1kCN!8zpbIBMlY1WI zP!>>?{cH&#_Y2%Au%@j5eu?#wmzlLTvLrrge3XGMnphD=p++uXY3|G^x1s!1ha|T$ zh!P1g2eG1|#JBU~|0+ zr>iA&4f2J5+ zbM&x2vWC8?QScpR?+9zM89uIrGYf>0kLOE5I%p5yfPj^!DvhDB15VM})hIr+s(N@y zrSMyvY;&UHADK{`N=w1!ArOo-Y1ghns;x&rAG~cHS)UF8`oL}(&RN``vN^N z;MY3ws2uwnv<)_@dJe zij5yO$7M6C0^pPpiq%QzU>0o4b%-EiUEl0Fg>`>NL zdQR_q&hGt4U&~oi5u!;jOyTz7m{h*iEz>twF zBp7BB`3w`ozJ4xcXw-mD?qUhSVhL>N_zJzqBE8o&Kk9IP7@psjAMQEiA4$8;19w~X z-NDkU9_1la^rx|+IMpYB%W2Iq#K~6or^1#3onDuQgV{h!!BKEeFImkIq7Mx6xsxh%+2H8etRODEqg|t9bS1L`P^`k z_K7Tuk}v4rjW^tDNFc4rTloU->~LN9@7kN?-`AKh|IgP-0XAWHZO8yK`g?K#*wy?O zU)o(f5M$8i!zSCvIwY|K8-ax7geaHV5kGm=x>3_T`~+~+rQy0$FuH+|C?Hx;lf+K1gXO&6y- zv^2NjL!JPxKZj;2ppgvU$=QMuEUwxgK2}RFt~f}UqM6%jXoYGO-eOZg17NV}#rZ-hNIWRaB z@+Z2GUgG3*N2)=Jg?;GS z*v4SaFoH&}Cy|w(Q7S6ZEtFw^&y&`)tLE5#?CqWp%42Nm)XuqLw)_CX1l5(XOiM|h z`Jv;M`k^3D#5RWqf&^$@G@`U(1jfBxYu zUo#0_7bD~89M9L;&#xN>MO)}4WDJsqUzj?ywB_2xtX7zjLC{#(4tnp6a3?-8HtCpO zDV1OOCF%#pN9@f}$|+iMLVKdvMJ{GYbFsmge>1s^RPZlc?_MP3sMEfh(6!lb(@D*l z33QF{(fBacvivw5AcVXjfm*fflAQxIGi`+zlGo?owFdDS6G$QRz4LA($27idGf$YA zv!A}G&v!$suZ~}FBu_>*c+$1Q583TG=C%o^e}pq){5Idc44<9Y98|d5$d*zY-^BPZ z?#>nve9%~zX-dJ4hzVqwAc`pe5`Ap-1t**T*nV=i9T?SwFeXnzRN+HDDG>JAFm6(5 z{=((0SFzM@Jvi>^5`hthM(o70v@KUrA)Qtyx}M4H;yKLV+Byyrk1{1-2+cUUQK{D= zXgQ{Ix&ni#7+&MD<$MI;sbR6+fUFzO-JJXqm4p^RRpMpKDpI~xEi6Denc zL|;>CRYkL9`%cl?oqro|ne!6|u4L*`UIo2rXMqI#hLVO~>4g*bH|WD}MO<+ksw7A$ zPtf8Hg-<1(u0rN!t05qJ6IXp6o+idWUkP$aVL{=3{V0-pCuGWywP0H4q?RTB z#a0Zsxv#nZ+}uzjG+2bH1eObxj5GYA$R%2;S+Qi~!gLxyi^`dkD^qCbUPlMKK)@G` 
zCgn@f!uN1Ce}~WVJOcha>`SA0u>!bxv4EbZnS{yc*v zDHH27<4@8^=EdbX{5K`WjU8)8I#RI)D@{1IE3J}*#r70HtKIDXX^9z;63e(xos#e` zRKbJ;$@Y1%s$;9}6H}hzyvFX8z+4(=lagakcMna5+tlAfbWgfo(kC9aqp!ZT-|6bEZ zyDG>Z5nV{aG&)MHL(oH6^d&N)_EExq=Y}phQYz!|mb#C4Vh|`MrgzT6A8Un<3tX7x z?Vc6fsdI>4a9UR^g8t{?QcKD6~dp(xLl1Pc6s+Po0TafSHgtM&`z(XN4{($ z>=DE(JkxS<-!zJp7^EuNA5pC1bLASzv^KM{1xwIL%J5AC5s))6r=4bU;}=d^uN_(Q z3HK^^UUK7??9e=w3NBjX)Jx5oaX?zDJxFFLZ^Z)16E(|{BvuVd6#)oW`2OhT<~zM6 zUQh5-CN^oy7}6iev`vAMn}?<1;c6xHVi^agDwR^r&Bqk8=U`*yp&7gr`fPVn1RY{E zpo#er#Mt@CMA8AKFHG^}2?X$-lEQ1%X<9|A2dJd!ZkHOiRKrP@PK2Z)-?%YIC+Cc4 z)X4d@oD)MI{`ps%t$l>st{}|Bths(VP^^BhbxGxq=SG0Kc8fvNK^0O}+r{3az$P1R zwz!(kd~fp4*xw(^+d8ve?CPfcC{*CmR+)A0E-x4nSALmtOnKftog5C>OS3rNs$334 zZalfta8vW0O0)>Xy*GN$i(h*#eFP&63@%~e?`nqa_NhvWZ8u2dkS@4?!^S^aBc&Ru zULh`@nQqM4sL8P{#Gr|=l;eG7l42TqVG9!Iiuu1xBMu9GU-JyqfCL53~{UO3f! zug2)a!$c>uVG5Anzkyp}*(7RKgf`2bnh>HcOLAa1F26?P2x`bjg!@u!rdNw~a0{K+ z-2Aie;21JX_bIfj?u;)_*&{u^Kw<|yAGq7ci&J8=ZcIeVnz<*G=V3)_Qo%Ol*<}oB zsU|}=H}wOXn)TDK8Qbt}%4jR%@`lx&!k%_sH)2{GL`^P3qPYu$^K$>$@a}db8;_8K zVNb1?ik;y=fyGvLh%{PC_Q|!E*)JEOfz+>0=M%CX=9j|1QKT&Y&bEe)>;IwZS=(-t z{jY5e5)4{g9oxVg`O3ur$U2CJ^j-@q{M?>2{BK)BO7owhUtWKGYqCkRokVElTow7F zs0aYw+t2V}9CwH9e!K-;} zyHfzR7V&lKa{yD>qOSdhhPsjWLyvVjGx11c{;UL5aCj-&G0!Eq96OAE0bo$??F4I{ zMtXN=Ll4>oB@fP}9$g#ddvOLW&t<5rqb`QmihxjUID9%J4DlsAef& zosE;#>y;clQ12s00ARt7P+%?~mgHet2TF!z;ju$2IZYyAnpOzg44etDIe+zNTZan5 zivBG64dyW);mlwV)B3k~Y&32$kC#Xe%b9U4| zEL5PJQ+H+9KAXza^2I#Kw<=Otv)w$r4HXC1BsOpG3k#u^G_TdmD-dzZWNe@M>*@*< zRF$XhR+6XBOknoi=+{v9(*FHr4u;cyQ;=Qs@X8Rc!u`7vt;{GOp;$&}iM3dk1*$pJ zHZ5%0$Avh!Xgb+GtnA=??V$lQW^WK!Xj-(L;)3y#~cw*7U*ZyL$Ke`%KnBOCZ(;XQ2A3|E|2@{CPY;%5_tkJl@}m#n1V4Za z1PrYy68ygXx8~`epiu;nUSWxL2+~P2@Qm{2j2_I9ePMA-mBOv>F+xY;FxgB+x=OEx z)M-adaEU|QwySzz+xBXb$FJpjVp7N_Kjur^0(J)a4Iou zTn}z};2I+3zK4kC$iE4q6XDzZED2K~vr5)RZMNg0UY0wQy+oD0Wap1$9y$bH`4~7y z@UvJh{Pqlqyyj!rsUwRO3^$>5rh+b?Jo`9Rs^0w2JAr3Vi*h5}7jSvk< zDyUm6de$6^%MI61x(gnuEQ2oR>>3vCU1R{TW-$i1IM`yxT08stM|Ric<;jLMVYb`s 
z5q^)jcB2q}U=7H34zQ@EY&?({MGi!WGYy80+ls?);@L;LmotC*g3UGHx0M`@kIUuH z_Rb;V?`MH6%WKTjC)fEASvEWmT}%Ydm?}Wm0QZeOmhBt7IosnXo^~I=xv--WjWJ{i zr}BafbYo;37skSkjF9p6`BB`K^9G8;C8~%EUz8aN5)htookdy8Nv@@4wciWTTbV7w ztUWO}HYickVu{aBau_YaA*ju_O>C{;A_p>-von7Ls7{M==Y|({?I@1xZF#z%u~e^R|`@hi&(_dglE{!QH~>& z^d*WA_MhD&?j0k2Y!^N25s+82tZYzZ)Z{x()@sNT`C)&~R1vfX=nq6ZD>WQEtMkMU zLvYY!ql?!RSa&2ig&geJ!J*vibD>6l)VqTH*#WW7e2bF<^MJ6k*iv^g!}y|6W@buLkPyW-P3JLZhkUk3sCpb>0R1)iD^O4XAJ4 zJwt~!epd5$|0~2G#(NgY{Ti3wPDHdjSZiRw%uzbMa&Jg#+0Ma1ovDF;s)A{@p0E}> z_2Ag1-|9?@I|06jho>+QG{YH3yfW3aPaU0ZVIFpLwzOWG*?z<23-o&i6J`#KF?YG! z&|mmc{0ElYc#p&OX~JX;fbCUc6(|1yCZu=-R%&B^aNdl!01q@yr45Q4LCT(sGcj3% zn3x*FR#eQBjDFsy5o3-Xf^FE&Denwtc8z_j#9C>5zxw@n-JYGjOnCdBmasCDOT!|~ z-Izt1eVb?-lts0_E`jHpG<{qRc|?l$(^y^8=3pPL({WbMQ%cOgS$^&o7Znv3VbPM4 z(~*nzP&v)e0Do(p>_sUlECG*IfZ|(qo zrPxM{ZV|-CN8B5XHewLd=qswmz48p#dSwvHaPhCx3ZYjp%KW{F9tCh8bR+GzJWSsB zL7CpeAoAX$8$J^-T07?w9@VsE5`GZJ;>$nR@JjNVMos#%qhwWzDOE3kF~Q@g`!_Dq z7*p{ni8`VpZW}-^GrL!+8SGco$?h(}3{i(>^t|SA!79mYd;71T$GYerys$~ee+H4; zQu*~AWr*h<;PBq?e?N3!<$3pk_ z+*_<13*1xpOr#2n4kYuOCgE%;Np0IynMTWjB$7!20i2;|p9!NUPUKds5z^RAH>zAj z?-GQK)XyM3_kKO11p(ZDBRKzUxdONUwp9~5CrRiz3;R*L>YtQ z$#HAXo5;-Y!CtP)Y1miK(@)YZ9GB$$k9!YmM`J5@)m6P4-QAXNF73>`_x=%_;8O0T ze+8%9U%|OK&~f>A{-9P!_-NekENobGLU7qWS)k|IRNsw@a%EH~Im%E5fu`pyWeGt& zzvXiF*1)j2s`i(FqWuQ}?RNf$fIj_4_JL=JWgzatS>~mS+ar#)=O(=5jDvF`3KqVB z<g} zc*eMB=9>02c@WR^2V92+X#|n1Jm4cqQfP@mKtlRzBR@oE=sx#Xbl`e;8nQ}gmb&G^ zB})P3%6|bUS*@dr$*D2ztDMt^k?C6VA5jd z;-c;HL>GGGiEq`SV*a)0aB!>-@xC>8R7#QrrhzLcCYxQ;oQ3Xo0^2n(+5?jRQk-ta z0L5u>=$-N6LJvGz_It(&ZYXp5zXH%BhAsdAt;X2Lx@bTo8nZZ8(s-pV(*pp|ccXJ! 
z%;KB2z7Yu*osb*ND$W|j_=0*2B1y<5}ar^ z@G`lG=DU4qli4G1k=X@0x;K{)TmJ%}E?0HFBH+ZZGl~8ec-SjI?i=tgDnQAZd*G}S z0(YFrny`SC=kMq+Q_Bxc~sh3U#1u)eq+*NM4&K1>% z$O&ouCCdRJyCz-o#aCK7jwd`^OXnBF56L3NqU-*j`THqKqU5*U>aUBtgnW@OD@sMV?_>Yq`qc)~Qt1XjiNEh*K)qgngd87BGE z4GRm=Zr#WE0_{ z>FfpoHkVJUOj9J`7e|v*4Y^jvY{;r?!nDAn3IJM4#iINko&W;GftcL5-b}c;o*PS)QefnhxID5SNJxac^FO*|`dGBUY z{FAOixbCT`5W_zWT)y0F-I0$hfb(9Dw6)2~C{uoo+Mb`q*umWX&gT-gLKPMc;@xq2 ztz*mTG8pt9TqHmu_XCi!jugQ7UTp#(r&boT@~)7S3JSbS?=Zg|OR^)o_g1;E$9%`X zjc+Rvyd3F#IZn^5m!Fch97A(HjF;=l?fqdTAi%uaZyr-dY@F+JWg7!8&r{ju>!aVK zeD9{DhT#Qjzp!rhnpCyINHS+xMlM{{0d!908GyJt+04qb*8w_*z-nD5+f>Qi|8^pr zc)^cC$glW5Vh;KyAnbAY4Ds_`iNf=zvLo={#G*%+b3xCDBeMp2R=UTUd;5s>guSXz^^_((J547G93 z5&l209dApJTj9o_3P(`&QhnD;O%au(Vl0|~b|t#dXwqqF-zP%<0~{}|AAniGjFU4o zFYo4PJHF$WH@S5UhIOqA*Oht|yuV!}W0WRiAl+tQc`FyS+`K}sPK^@dN>Uh>gzCHD zg0TA_`#jJ=+n+y_?gTij9~S|5_10fx4+t%-t-r|r;U8Y@?E;jN2X86PY_;#!+P=+L zM|b&3+KLMxl#?d0NL3R6lqkIcK#84yQR3QPl*r~~#=vQY0mgGnw!L?rMiCQ*RE4Jr zAdc~X)9@^2WiwV#sY-DyA1epF4jXw=NaX68=Rd(_q+&xj8}_NdD~iOrMIkXf$+k5JHqQuVBzjt3VP7iOr9`_oX-*RK;FJ91s zXFPmIO{dA03(`#}G*<_3wV0+bQ>C^kiNox5K52!pX%u}>=ig6S4HxnGEpKuObwOqa zF9V>(Zb_%^;n$8lR&fas{a!qsPB(?>aw}rwS!_*gxXXORo}v4uTkw3}2<>|XmNnU& ztD~Q0X3hA}-SiL#H<#HC2&w$UJ-!DmADrywC+9fIlsA+2Ld{zK{26Z=?Lebe%m$#W z*t7q-6N!ZYsV5S&QPQab>(Bh+&g_OZhs|&^YrRhMed?7yg;5nKL z%<-)R!{SqqgePfb_ge^;RUE37Jr>D*J_Bk_vzk|IlJ8dC+BZ2gS?j&{#Q?cw;xB<0 ztFfA&w=GG2MepKcepIJ%T?an_j_%gXzHgm!uv!MkOQ3P3aw-aUuzPhA_B%!&n&K^RWf$|#|!QPpRw zOV#y@jj{DnRFKrBYt`R{aI)fRM2I#euK24kvH~-M>qyCD0TW4vOE*bNZD0+cPC+on zfmuTkU5nD-k!7jW%|3(bws4JO#r4%e=uC6Y_lndfgj$=Clm`bgM5qYrI#NU*vS7|l zxRxhi+=T&aWjcd2GeL!74plO#z?9So4r}}l-;s#eQDf{RiwwC?D$z(B{H0*MNUthI z%igId6yD3Fpac|1)(?fqrX&@40$bUqT9Opf*P}oK6}2G4r7;1VjxDbA&7=t$6gYr3 zyHS8kbz>vV5&7E65E?{Zd8~^3CtLs^eMhlw6y{dNuvR)ms{Xi9id@@St5B)U83!ae zg*Hb9sva3tnyVnpT_rOrRDvxq4dj<8sEEZvAQaBH)>s-s1&{~^V<&Z7dO2}%QJPl+ zjG{=L=(VbGlSv4J+(M*Dq@o_YZ&FpSP@`2@Z7)<6vlQ(L-3VLvU{N+nFdhm>5tTf` 
zAQbE_-l+y^JQkyXpl6z5p7G$I^@+;HINoY?KV+#n)pg&h290-~HLj`tHjYMNP{|kjEjDx5b&N}b0Nhpi5D43 zlRZUhZTytrri~Bw&-2cfIMkKqLWp0o+AO9ZB`LoY4<0Z`-(SE4>q|#F`B^pZ#l%G$ zkFwK#8J^T#h8G4!QZFqlDxeXoM2ZgF$6QaY^xAo)>~xG1U`HFJO+ojVpHf&;EY=Yb@IJ1GDt5TieUyeQ_sk= zzQ2#xm}hxK8%@W$l+NvLW^$yuwWfxaj&qDJuCA{xW;wI$A6R$g)L@Wh4qR>3WZ4*mjLf#}Mb7ToO6B2)IzR;PIdTb71M=!Y=N;bFxYee{}a` z)&9;p#9&r)h+jsxbT{~lA$td99nkfoTl>ePoj4m~&Z~p34~tj5x4aR}%+8}xM^3`z zLp|uOWpnH1QloNOl|L*tIy;t;lwIElSmF(9S7Y5Nd%LV{!}@M>yA7^(p!V>$m?LE7 z=*GGjb-Wr*U5CzHcb6bHkJbat#xR1FOVlxq%Dv_4>8~#t0a|^tjf<7YLaZlmim6mJ zjIfOYG;Ni&_=s}E`l}`Tok_<8;zYP~;}J4p3T)=lILS~M=6$~LIGVTdIPPujk1c2O z(c1`_mpmE1Sz5OwYMfJklSo@6L^|b)@sLx;{C1r(dd!UR*u&exws&)S{(l{E={j{D zy4wVq)qH19*^Jb7!QCu*BUSFMwdNzrCP|b_cOENJrpRVKNs=s;`ED{tjVM;SG9CQ5 zNbt7xmb|t%<0;JVMQ6xg&(*nAOGp;|!Rxg>z4g@a^pUk;+A?V4a`s(FM?8El;=jC3 zcs=QEL)1YKCG3uiI4HW9PNF64WIT=La1*!E?uQ0uaofK-VtlT;%sfebK7C1jKHlIb zb5GzOy7juQ)UVza-#(AOleEWN6Xvh>)LB=ag#&*4uIt=+>^^&wutz>D9g~X3z@^(R zf5l-uhYK5%xYc^)xo1c9^}Fof;0sQ+|5O$D9~(U~v;IF+{b@}EK2^j&#YaOOuUXRsW>s~GCKw!x*TdZG}wIdxvEEb@M*Bju& z$O;~%y*1YvfDN5r8$C9%c!v`H7iDi571y_I`{EWn5Zob1aEIU!+}+*X-95OwLvVN3 z0KpxCySqEQ#ed)P-ns4UciX-NAL>K3)oPA{IaiI|$FFO%Wx{aITNn5(p%fb9dYcv@ z5 z|C*hcR;HJox{18;NwQ(zgy(F2NmLo%tlLkgL|s9 zHI#Xq@u4o0_p(1!+=N(5C&letwy+1<7tS7p6rU%lIO?go$by3(4kA|NB;ZpT1}&Z-XH>;ciIb2ZBbOavsxMzoW_^vswjw|TPmb)*L)K?rw$d7Ur#?(|@H_JUU% z9$ON-5S}?D&@Ix|=miv>VSr*!hGAByf=glPOudo5omrWpne*MW9M01b0Xos3pFZ}x z9+&|L8PVqhOfsZUHh=a4e)UD~g;p*xn#JLB&CWYYY%xlq5~dULf7 z&8Du6u)kGeY@qI4NKxc;>)B0~-uYlZcWqH>Q$X;q#d>l~zB}ajFigIu?=15#deqDN zdf^mFK1Y~iuX={igtARZ_DDU?NmEF56w+*dD~-*)_45*;M#9JKNGj2ZKKS0+zHgcg1pUFVRS6BhjVQtx4ravSZ=?i zdELp?mgue`V-wA{H7e^Vt+)zmc0mhdR$ixJf->^mJ7ZI`4>N}`_ZD*w)(YrXqz@(~ zeyzuTH;k&-07N@4Ww`E1><)yP#mQgwIO1kDqH6|wm#6D3o4@m+Q&dP1jws^!Ih%8z zX@l?2c-0#L75%L?LfyZmtzzUqTOBaVK;!qqs^XEEwB_8`5waW ztXflu8ZFx##*)FdsH;Zx@f4Tlm}|5~r&&x>v?fo*oKH|zd1>iX2tUy~qetb8_?C5a{nQ#CKlfRY?ZI6{?}aKMg- z!!vd7i=87x7t?1EQSKuMwZZN+-=X}2M7kdTo3{JF4nw!(6O0`_PDoZCW)gD@TE#m& 
ziCRg`m~fBoqp70%l7G2)6nMnaPBX4K57=N5`qx3K_nl_d9{yd`=bBwG+EOsIz!VjR z`ZrKRg}m3)6b!Ox6xJ{#DFr%_&s-TkT!FmweBiX|L7EV-nrM8t{O*cT_9X)XggP)+-Q!8a|Vl}e1Yx@3VmZaGnn~H zyjq)o008MAA%xkpd;Tp3RsH-4|1J;`Nqy}CQN(-ga}OTbm-V;WuGt>567UCaUQt2b zc%e6S5iaOnu&CT0$b@|q5GRiCdQ(Gu?86@5jucwt(uSuDr9h&PdDgwbh`5{1cJ}BT zIxe7nPI!!@I-6(TpRu1;SlRsaq=ObL@fr}!p3pbJ9pvpodDBSnrkcHDKatt220UqU zgPZvIX?;Mmj)SYQJfV-}ZKz$Nho42YjWk1-<;*DWnVTu^cnpJ1c%&I~%+8JdeqY7$ zy>)pT)LfnyVFm5C0*2T#z5GL3R>sOZl_{di;`<%u+=><;Q?1q*{aUHGl-XN`Lj+{1 z6%z*3OaztX+pexb($3tdn`dzDs!b23VdN&-Fq4Eh;{<*B`~>EFwAZCcIedwpivm9M zXpacJ0!KB;-t?#;;0eO#MbYCDlB&n@V0a)+sZ=D z$JW8uiO>edjAE;~><|~YfLu7*)PlbHG+vD1s=tIys+JuTmF;?bWo(!mPsm}#*_3 zs7zp~I_N+ds^X-@aB!u7u*OuhyYE`uL#f7^hiMCO9F4AYj#QDao_ec0;-4{ALe{jxJAD5OHUbE9RncrTax zh`-lbz*`umxF8!&*Cj$i>LIs{5o>kX9X6e!W(){hbyyQs66@P|dab#KIp z67+2>$hdWb&~={8Itdi>ICLPJzWu(PES$cc-}N3MFDS!Ph;qDgruHOKeMWIHu$r@` zo{?9v9{3vi)bTe+6uMDA#2EejchYm=I#-1=ARxZ5mK$h>(ivbRnQ5?Pjl;~HXu&Yc ztmR7mTA{g@*wbh*Fprh)c*C&zOX=}H@9UTu{(Ix#{|)eF`_I9<>JmWyDRL*!7$Eb7 zpQT=p&O6QJ(;#dj?ZgsBw%lU&J!O{mZKQZw|IUOaW=6KymX}t~=austWq*700w=1t zLDwc#m-qa4k5aEv?=83}1e3CZ1+&7uJtcUXPhX|W#8+{hli^=RUIvLq_cn{U2ag#^ zpUQ0nM8!#l9N@uQ=~9)01xKVVEPdLZyPe}}ETNEvIH5+o4T`V)AS*S|P_mv6Ai<6{ zSbn1ii|H{`(*0#>^axJtsqXywRgEpq*D27f*_Zcb<~JsI8zgf=v3&=nfiC)HDZl+# z!S+r4(GVXhhwNxTaM)t!V7`k8i35D%j=D#)6->_d)Nv>ZvO~``mv1Vl5L6|q=P>G+ z(z$Pf>$kZKn8N-_KBvWdFc<$Ms>seUe_RDjnM<(Ju~1S(>DQ) zR{8gcPqCH}x~T^$IYMmVZ?C!KXMt;(@ZRC0$iE|ovrinL@E$bxheUILn>p(r$cK9K zUv4h?F{HKGG!F9b;P z1pO4Dw)gxM;BcA8*|nZLLPd~;mx+gL@Ue*QuSuVYW4TQA4EcR)(P=Q#_W!(k)euXB zrDExC4QIW|+m?FJ5w6KVw*-e9w!p-hSL-HF_Uf4@J-1&y6N65O>L03yUv4@fwaUSN z16XG;Asn4_{W^)aBNOHfgH4c1WnPQmq4rQk2~n+adx=>ul$o4rC`KM(&aCVKD#-3)o2;h-&v{euu2r}7)Q zJJUuUuVrS6tgiA~v_?C=2QPWVd18xdXJ_`^t$4XOJ6$>^e;s5LW0S>OB;%SdAQBZs zE+tf)>L|MFRFT#LK{rzzoQe}oa-F^`*oN(oC(E++y0^c$30^8_nAi@eK1hwuEST3X zvs0uG#rs2U+Jk?N`-A`a41MUV#7m;$N4}?voAa3?c}~9OJ^BG=Te-o7-*M%iA<1+C 
zI;fEf>V+>~1bRwDp8(h*Jja0QSm&H9lQLvwjZ~dmyXANK?WSPJ$Ii_S5`k5|;)HS9?EI&!toX#X~GbBYCTqOry!F%|zZacq99g zwO~$U602@=Q7vKNuFwpt71S;%Iy4h4`8t=ZbcPT@^%+8gi=jd7J$FOAeV*SlN4r z-oWPdox|*KQ*1;o*4Z)7eznTc+x^G={cPcLy8X1hVxyRURgn)*=W$DSfZ56|l`zw_ zOt8A~+uyo|QJbGKddEJ<$(SXSH6qwY4_vNsdB178srD5@|C$2{$D}lzAg1u<~NX1~<>mEpZ z!0eZmht4y}Xnvt@>o;Ef*nRm)Rwt#HV<((g=BZi+6Z5rYWIRDAI1r}X^ILYJY26j@ zA_O7KuID&xd0|tMiSfBfC5(5LG)n4Q3P~>iTvxtdl;ZsYI|r{xnm`JjjN`3yUMTVZ z@3oLQV6Dc@pG7RGsw34LmmG?uq5@IUyj=@?+%@3(uQ*kzjp$Gv?e*s~a&84-#B?N4 z-2HL=VnmsOC~j13eyB|<2u?lq-K{qzM6bE#0<;94NYrQPgb0?*LX_!d_*1SY19@GaLfO#X0GP2H?tPBphnznihd{qs?-2ATZ-k@&IZjp3}t&yC8Q?wXm;t zE>8aN@ok>lLPq;FNrmOozz}^KKD2R2N}TM&{%*oJ$1*kOn1gHU+Ap5!FkAHQ7ORx6 z56sa9UpUH{=$(%cBd$HPJnsK89eo6RgVqZ8rLmJ_EP$;3;u55m{8zxYBoOdz1F1V2 zA91e#52Um2A4q4jV(vX7W1e&_@6jhoE9mgYasI?hveeti`KRY{Aa>QvR>LGB*dxTj zk)CO(l5g7D@bytCwZEo<9mVsICLqtq@cAdB$Yj*$BR2|cu#4P_6_D_)hbwit+3(`C zCGe4L6o7WuWkSlEbeH~5Jmppts^EK+SV7IC3Xt%<{*mzQ0wjFH{wv{I+vX$To9`px z+Z*cXpM-CQkA!cYPQ#@CO8B*lB@cH2eezzuWa$$9 zV9FyEBDllNt+9N@Q}Iq>C&aQx)4n!tcgyTi6>0nK`a29Jorgyvomb`bLDe=L}?>qX%T{~d9j&Oy0B`cJk|Msk?TFYyjl6=a-*n$!k} z5DstJ^TfUPbmaQ0(?<)L{A+;G83iypjZS>JxZJM8g?2xcxH-6ef~qXGYzE0Ro7Y@p zJ}1XGK9;$j`CxQTNP`MXy*amKRdJ0-(qg!TKg(=hc2(+tU3+?8b7|1wpMk%-lpfd= zb$8v3nd3FX|2?_s+g)Dxy(~MBt`m5x4dFfXlAt$KzB2|K@K!1IDWS;mN+!et((~bM zk5aBzPBm}1#Ud$*L25LW=K@vH6I}{l_sWX01w$6iy*m&EdNW}%&+w&-2uJ(26+XRI zcXpCcNK=1)w}Va856jf8f)fajc1I52e^wQg5B;GTRPR;iAHjav2@dsEZi#X73ca+_ zUDrfpEb!-sJ6TEt#q|^94&IF4{W%VS@XYDueEKNpHePes5-4E@{CL6y-xDRsjVaj< zRcF35=#k+#YKf#KPH5=-qt*;1OFgrO#B>j4mQ*-aAyaefcCPa%74=N8;b|T&mAYmr z)$8wF&R~D4>d-6G$v5o0?4;^@&asAQQe1mt*r{myc6|yr2Tm3=^#}z@T$2Q?_Py6x zcFxCBN4V3^jSiOCH61;7deo$b`0K<4*)MDn1twOp{0Na?OCCQplR?!d%^`dO6r3^;+hBiwA3WzuU~Ex+0ONAEQh${@>tC6R*MbCXR^x~}J{O~@^XJv#0C_{yT$8}H$H6?)?Uek4Lg*umYd4P4|)hP3#rSeCGFeTbG{m5%x7w4<{2u2XunDk#Y2TpKuK8il_aI zH0|>))qO~W+$kvlqRcFyQ!9y|Js`)r`&d1TrZa_j{iKrxkY>H`LHu00C;O)LjQv`0 
z%jpk%tlC3dO%f_a;6;;iY_>|>B%8AFv@Gc6WpxTX8Bk?3OlI9BK^&o|Jl3Grq`HCT&ljLfxyNXK>e+Zb)0mbT8TA_q14I~U?oHX zRcyG#ekE<=^7kLEX{5oP{w4-_114(Z|ATHiwrF5G4YOhaZ+i?&fceNRY9;ur%qdlW zEv|u=N0OhzQB=ETS!OHva(0ovntC8^UT2trdcKjUKS0^QgKt%wqo&;6;1A}UG!sxr zv!}R3PAQ;}xdb1D?2{H`(R{3##+x5H${f`7?-vr0f2YSg>mu|Dx)~hDnxdI*$~*$o z?rwo~i?(`Yd2wVIuIsSDOQ7HoFRxq{WE84k1Be8U!JVo>owE7qrkx_Pz<1=c`s}s= zKVH%JSHi6Gk87QF#l}xO1x0rc98~M&y*jfVJ`f3HvYI%s%^#KVgyBi!AYW|Q{F49 z1jKdfMdbjI?785EMn7x0Ue&XX;*G@kA^moEPeo+c_?1O@314@Z3R76mUUrwFcuA4~ zWW7-pz#G*Bc%$kTGKB&L6_XOo)$aWXBzT@w)GYi1?{m4;x`9ir6d6S3W9|TCRO`N< ziunY4Rmg+O-yVRBA~gn*E>#nR^yDK2;OXUE9Y21_eOrFC5vSp7H#~eo>e=NPI;%U)5TwoTXhnXDURE;eFeHzsmM%%1@6mLo{C5j{i-(n z70Fk#7(5r;!2%xYxexrOA~}&_Hg+TgHV^aBu%f%MVr8=9xIzVB3Fhu{K>qyxrIPDg zNP0HCO;4R7!?2Nd-Czl6;Z=M-yqrZWP$#KcI^|rc`Gc=%(MYiN*ps*!n@s>z6L;?<$?onL zR;LVd7VLVLNQyEW?{| zyQO&uix$eM{G+BTWcfIIXUYCGhsKo-hAnScN$!SNPCN@o!6x}PPS}EH^CvNOgqeIi z!%)+*J5w4kh4!N)A;1_Ke2I}AeG;sF%(e1{un;;fSlw6pXP&qkB_RZBkR?fvNzF*C zmLj)~y0iqkiOgrBMYO&6j_x^iQ8025g~I6Bg7f-$pbgWwEH3yLYM=TBdnotCv2(z>43{xocZa!wvbeG|vAb=UPSG^=+x!M%it)EnacO2WZN0|W~ z>n!>R7()pwpOykR@|?2auMG0U0<6t$&T6Wj-0CChQEYh;2YUVx*}uJPundH1xle^QRWOl*wPTJa6Q zsZ3SWW%~BFpQVj@JUL)luBxhl=A{h3J7w#3%-%7YR7Ep_e0$Lf_s!vg26aMs0jY1- zl_&frdgbDtI|r${Sx2}OqV_;bt9|*`TCm37^iR#9yR#5yy4?=`Ab8wVA}Y*?xwKMT z=v6wCsM1Gl-GrJEHvU@LcFqU}=lbIQ5rIra<(#(p;2`$l?6QKj%l^oZj|vdc7FX1P zAZw=Tgv>`Mni!PKVW1lx>3N-0Y!RiC#k73H31&frrw_gw3iQS^VBA_M*amgX=*DO! 
z*VyL>bHqvwie+g^BUhSpSNcvhGTM~@X3>jhA7^=)H!MQwGicZyx;n>u#2<%UN zOqx)r*?#?k3$D00G7W_lv-6^hHipm5lk7Oo?kW*|7^!Omn}uvYU5EE%Kc=`6mpsSU zVd5rqaj1mF&Wo(Jt#6bph4x~*`>%S2cHrfeuD;XxhLiYm&risy!8wQ`AQUEwv^WXR zkulx0^;F(}Ir(9K+vWNq3_Q||h{f+(*INkSd|uif-nl#dC9XZ6xYlAlR5oys(S8@a zl+mut@}1cYScEjY2)!%@_0FII49mEDYh2mkaBqbwyWiklN~QIm+`S#yvT^B4JqjLp z$PSvXvu{GD91S-1W|U%>VL0IoJ2^~+ip+gc-)PMO6&}OH!GC>&XfO((4Ye-uQks&h zui&wk#4;R2XdO5>+4c~$y1P^wL%hADy&Lj&g8;{ix}60=Y1-H|4WxXpeHuWchY(D5 zSY2klMQKwr*%}8CiUybwh%P=WXE>urlTH;xmF=|`bgl`((=C# zKBUFQugN_Vahybn21*l9ZHH5OIr8ayTEN_iCh!YZLwYCBp6WNPe^c9v*C+3)V&Hs! zU6W0J^;W7AeC-VPs;$EB;R`Wnc;@c3r2=naC3y9gE_w+WXnD6zB#v-bZf)90KFq1# zRZW=bIQ9jdnol|-BGp>y-P%E8bP}5B_@=Y_Rj7pciTR2q^p0C%wR4pQqa&lnp;UtH?L1#3?;7<-{=@V4z`5TAo39ZfS{8Tq+JXmZRMc^70h=qgWfkjdeaoWfVX%X>>( zaO2W^adoDn!YhyRE?3bW+?3x*AKbjyHbeg%pHJw=j;2bKO=3x^hm=kc zHg6(M*7Jpmr(3xc_BD^h8J$4r>m7d@GIva^H^iTVNj5mUEA$}I40M_tS;h?mvdZ$+ z&v0<6eBJ11@`MBcZdz(*G7xR3$ajF!&+rQxFR$9*qp5U4Vjm{Rv|G`wI` zt``0Nt%WR^N)BZiA|zACX-MCo5%C{1+;KY+R0qZ?HD#Ua_s1Hw;Vxk>AX97!23WkI^%SFi_PTgm_n8fP_}g zP8!FAs=Q81_5!AgK3}%s`xRf29Edsu2Puru5dU&cp{Wpud`h`f3d*j6-ftTe1Ao!c zM9R{#`C@z}`v)(uVoFCj%~F{5Hkv1#jDMh$Vv7GE;a^FObt^dR%h0!mun24sJ`LwQ zr*|RdyVG74>w>99Vp42BMRyoKYtMe7Lf zI&FzZvwB67l*{GWg4O#=-|3Ol47a+Cby9jV;>f&~)4Iz<>mc2-=TBoWUWoz8G1lUgaqmLW?E?CmN}^nQX%BnzF9Pgz&?6>Crx}v&>s8c&qG$Fm&_bs z>#B|(UzOBVuEYuppOo3wxMP=E6=eiTn-GhpqF`7L=ld0QU=ZP5X`%69L^tAhV4^aV z@0bD{^5;lyA(#RL%jW_{Lm?K9P7RPz2;LKicw6?~Ha8X)l%wlA*8Rs$@~!%Ur=trz zT%G5&94EeX(b)LjIKG5?Q`PT)mVy(l?!AVHt2)}B*oDIVUV?hG0 zjdlWK`P9CSxXa_^?Dg)j-?>H$^&VZyrBmDLnUq7u{pGh-+jW6U`YbOjWq1_r0zijXI@%V@xPwBI)xOi$DdQD-x>#~G2o1St#cD_ z7c}mu*sZ$5z=ZR* zN!z6ZtHx-W|3x8TJ0*+_TrW-*8<&mi+C}@OTiT5W^nEq6QE>xh z?7Hcc#DQ{V<6jKlBHuHoVfu7z;jKJmei0#q%%9+rG-RVkzHb4J%PkV`NDP7}hwIhb zbwj@tYUFrGR`u7|$MLjL4a!s3V{ClMK@7Ad3?c>}}xv`+dz&m?Ac*8g@U{g?TYgNgM&K65m!0FfT` z?Mx>(`l?~%;O#)F*@0p6rTI?~n3_W0R?mh?34~jkVs*nQ!5ru(9*pKvsF!yaLl^`9Ox{@-JikJ>k%iol8L2nB&i_ zQne1N`EYG(^IujoWb?to0ddLN2-=?WFto4*Lh3W4$-_mLA7;mg)!8!yb#VB5`?Zx*qtde~9 
zBwSmye{s?SLWZpU)JNLbdXyTqo?4$Or48nDlZMBF>c4~xKg%F67CT|~D$rIjw_w$S z!XRQ;6)47jRDM@-5Cfi(heBf4imqRs9LHzWQ)nSnv)Tjx#R|pIPhh216#AX^vPPa_ zIyXTBPsb7Ul{jYf|57r%94DEUA^|0X`sui=7GuzPOX%X43Dghco0ktILvrs8P%_f? zDc%}I042i`P%%Fyhd>ui_wydew6wC zw?Sf1P(aDRpr`FvtDnmV@RgUKnSRB+(JMitO8otYg!uRKL!2Ac|B4$k8=*Y-CvL1r z`*+;z7|bPKg4?x5?_vLUC@JIpwV_^vA(m&=Vx?oq&eEkhHe2R#u1BZp}$E&h5>$-~p<&e@Lrvu4xT0kW>* z^NsDm(#UaLne0oIp!HJ!%iCVeva{3U{?pY%qCJDQ=)IS>iF&_( z-eY0@Eve=BTeeWo5n?hCKnJ06U_qNsJAFS2zJsvpcW+}spgt3FT2a(Xvh(K-0wyUmFm=%tkNMl;sOtvqTi1Mk$@Yu3v>0}j3efSH7(mvCsJ3I;mfeMt)b0P zSODYTPv55TBi4Z2yb9yb^C(s!ONt;3e0imgJBok!;h3qVYR?Hy3uaYTu zrm}iJmHsQsKd7Mak4_+!Fn^FUtwGDAXts*O=*3#4buUD z#jZUQo@IPF*YH<+#gxp}5te^wuQ+TzAwg z&Fy%NQJ?OYj{Ep}uU$I#Gz9vQr7ofbVTzxRB%{!@K)^B~f=!p;#032h3EwP^cj6cU z11iDnE~Gd-y~X2IsC*zdcL7T>TF#0dC?A3`g<44upjgCuA^~~{wvII{kX-9zux_9j z{c2czSwkSM%y|ctce?=^Qc3qzb&gBZ-;YU2+j$6fq0t`p>k`3Kt`?aK`Q+-*_-dw} zdo)@2;RwSTe!k#KQ8dR=cH#X*E=kl#Z#Z=%iX*5YMh*MT$d{L^#@pQ5)1sZF*Nhq` zhM{K)Q}H6^Yxy1t>kG27!!%=WcSl_4i0xkg?P`XMU;2-8hPuWmlDVDew4*Fh!2-d7 zf8{nMm>ESj-G%#J-VT1S0NG^2Se1jtclN-dI#y(B<>^%Nvv*SOasWT=m;}~2T`tyV zN@9pRO(Kt^6yE+9oR1ym;13PGkhDgQur<#a{d z#6;-oY$+|RF@vezbsnXEn>=lbzI@fFU`c;`Z-9JHp)_nk$WH~7Yzso6Vgu%KKrY|y=<^3vKl=wTZV9J{m|OP!|1ndTPxC^CkXW2<7O$}Lo3BqfZ^Qjx0~)Chru1#tG3ox)M}*}U3($(Tx9bz6SFQ0}c84#r`mbcI5veqod0U^6vXvb+P;I+^`( z`Tfi1<8^}`WQ+YA_0uBNt_|-etGmZJtitu!sVOYcYVWjln_R*clzG}b(ppj7C+!I<;Y;}tQ+I}*!;XdVwrC`WrJoG>{9uR`rus6LM`0r^r~+Hg<({5 zD*ec7>Sm}a^fM_@K_ps^08=Vx2^9;0^c!L%)y)hv^Xu)IsY_hpmjI~a5K-{cRR|{2uFt1m_N7M* zrDN`lv(0KqWZzhh0nt-oB>)gTg`=5`r%~R^q=qUXEj1fAFJq7xles|%cC;)i8$mP^wN;{pEvR>3=IGCMxKF#;Xl8T zS6j9QnnzyuDi;;sNY$^s877m7onn|lv6m^~*o6uI*b{fh;fhJedwZD#OPn1U11gW0 zA2xk5ZfposACsVcavf4c-M)zD;XeczSker$eDQqr_<;lgS6000^nA8&0iDZrXu(QB zvB6Tj)kaf`t#be9Jc#Ns8M!!ofv(TPoMSH+}Y<&)2`| z{h%LK|7(D1Qpz^Ko7=?Z?-UO1Axl|UPQLRe)X6A%r=d-xQvm-mA?zEBnws6zf>ch- zg)VBHQcd=%9{o9sG>?Ja$sAGg$?ASB#r-RRxE$4P&Qh;mBfK=^4T#P@mx{aWOn_*g z0x&-XC_Pw&5XwQCYeo&NmO_EWc*i@H{?E`bMv7P&QX<;@w0h0_4Df{ 
zzpUC4gR2q7)l8wM8GMb~)%{LZNuMsY)Y7-VIiNnkeBcIIeB#Dn>-8MCWd!^;rO+(r zVYuE>gdLtPU)rE9vpNCCEx~T}2X0Vj$>z(acVjq-@8aUG772e0a;pIN%gS%CCeZKX zvHsuy`~_0L{N@8UNdJKww7%~l$g(i1CC%hQ6)-Ondr~77|0j8-u|k(`idn?`zV3@y z+?*uxx5Asn!3c$P7sNO3#ZVSaQU!ZDQ>48J;vfz9F-@6o>l3Ke5v0Lcx5n)+Cw+gr z)4#j+io*c61rzCAqsPhKWo!;yG=JQiCOCX4itNPoN+?aWH~(a<6qAqS70BRcqDgNw zi1q5NtH*to9#9?UqyL(tRBNVmMwa$5JGH9-%ue-X?@pOLoa;}QrkgB#15(c%9I1AV zH6Zoa+c|oc4n2x%4AENqZUkF=MSmEDbG93!4Zn6f@5`A{n*Op`08U@Xe(Lyx0r%cI z{~Ee)CZy3D zL@ASA{R0M~oi9m&2g2=(Jb0}H5&h|!0%8)CahEDkn8_bVh!kc4@l=tSUDyqlKyRHr zp4M-SI0i2ngkH(=XGl*naZ)pn7+X?Ik;@$W*3LAGT~V5-USF=BIGC2SP#WoTWscX) zPg5-(U@FU6r%IDQYrtaZ*&`mFD)Na7t6Q;p|BC7s*QGT|P;5~TVVD!E?S!)`TqTn> zAzs+62`PdYLE0G~Tk*1d2B8&Gpd@TU5mAhB#3?Av&#?Vl`+giM1qFua51l`&40-mA zDS~EUpmRlv=PWxl6M;ttr4y{ukktaudE*C2dq-4Q?0x=_5g)6~4wl{3Yls+k!o$6qS*<{Z_h9v)e;I)9Ki{t1NF&k64$P z%NnwnXY?GbhV#BN$?h9)H>i$dJlvh47h}14?K#RaiusAvywSeoR_r!LXBpY?$;tUp8egE z_Lg=~g?+Q|^KH;>CBpFj=DC>R%)!_hn{9bqqeP-geQygppT)){cxiO`$H?8T70$b1 zCGc1GY*4XV@#Vl$$3HW+nTa!R78+ed(OF2o$-X+5Y1rI{tEwO%lGJYNJ!81-$4~kr zXs)$xt8G|obk}yjX?ZvFNDu6DQjL5~(*~>c;x$fjIdc-k8^n63<$E#;!oMLXjU5nP z5WVxsInNeyAwyA>s@^tuI<4Pc5YgDzK3T^F6?0cjzuE*;Wmfu!_n1K=5(TWicys0y zczD8n7ceMiAzL8;Ne!myD0ZLS57<_y%aK}Wp7xmVO}4d@LwV>|-!L)?>_zI=0#$jS zdRianJFS$rg~z_REaSh8B^>^W+%$2bYjsOj{Zy@_+FbQP>!@s|T5)^n6Y%Q%@T zdP#gW%I!=4vB|gdU(Sy3_$rreoBUi3Z#5!WIniV}8)dMMc`^h_Ij`7NX8jbc#}=n) z*D6&*h^Q-)7j5FZV{I_(1YcpgJI_TX8f#kRUqM)lkOXp;Sxh+&CTenrA|%5wwjXmC zbB-!Vp60t+6I&L7du|IJ~}#P)9vb7sc>+_9muY`rY}ahO}KQ$Xn7 zjjwh#h~hCzg%8MwgNfmzh1LEH%OjB!TdF!A)pq@CH7J(G%!i;VF%CR9z9*BBB?rb0 znm95uVq!&9@eBM^Bvz6~u2dXOA$s<=9M8(6e$qD>MuX)I`&QdUYB1G5OJybcz%W3E zElL{$6*v*P!5Flzm=>`!w`gJSoG0-W-d|KT|Br^hg!bond#jS5$5CT61h{zE4fI(| zr9a31TqGtFw!dOYUsMk}a74Smn46eyHPAeen3 z14TmBG@bIAW4=NH@Y> ztGJ%>H(}1;8?i=vy)QZ0NC6?ouA*lw4v(PqCf+lx4{2E@SNGrb`u4D-mgCJ>S!iv4 z`6K=Tyruc}Wj(yQica*6cWELCILKkc@yL?iV9A#na)3S@c6!RMr968y*yc&uGP&+% zp|$#{4Db&_@u>%*Cni;pIoahN3eE5PX0bJKRda~M@&*vA5h`2eGnDn~`EUxe)rhas 
zIXgS04KqC&AnP*jEx{oL^5!YWJqgW{x!4MIUQy)PO{jc(q{ER~O!(0pth`` z$2D%$vksuc%&Q9ZKNqXh%tPyQVQp)K%cRKoB#TU6@LvBY^XO={VEuZo2^WLOzjqq> z`n%Pf z)`s*-Whu3D#Jc1Wau+vG=rw|*i%4|5rtYpF-@yEu-pAJ$GBS%45C6+V*kYt&*vq0? zMG8|DSHcU*y-V0Xv}G;Au2Wye;g#){7#cP~@cLCn;HIs{vF#wsZ$Q z$E$iM?zGtjZLg`5Xg7xy48-Sx`-Gy=i<*1!(&Y|}!hLRVIKs`8pjOjzmu$!dpOr+Z zmL-2)0aBIS-8oNY`T@r z_t0or`7*P0IKh@gYLJ6hj8sx&8{{W}(xO~%cH{{;@q@8`<|tFp&kYGAn7fWD>A_jCzLXiei-i9?*vIbT+y$t>~R;?% z*(Yo85HYy=&-C^ojqEX&`@8!+V&Zyk$#LR(ZODvJK38Sq+S035vBnf1+l?cL>k~)@ z`#vs+;qHIww3v-yxBF%n4K?S&r_wdb<4fg5VubBHa>JdYZGEhQVy2|%)C?~DbXG*GGPxrrt8v!fH}!+6ZvLK;nQ3Dk9LJHn(*(-x$_V^ug9o9O z2;~+KYL@sjNU)-(B1lIedNrB!p#A*V;Zx%Ur_I(Wt;AkxSaFx~GxD&Ic}V<6z`Q@1 zAyBFWSTM40k#U3h0cy)m@J6hK{-=+rku`QfxsRz4Ex$;)AZKn8azk5x@(R++!J(X0 ze8~E1MkD@Bf-PWZL_(Pmfs%_iITgAcMg}wiORx39X4zrX9qt%cQgSswtn0Z##bex1 za2$>Zo|-_9Pm7%dSKapxYXa@dY&~Dxg`qE{oKO!5$bjvTD*$p@jN0+tuy;WxaPs5s_6U1NPnc@c|zbHg`-#y6C&NLZhu^ zV0_@m1|J>4>W#WbS|8BTK`!RMtwq^vMK*cCm|6I?yXYdd1--N-{q7Uj^X9>S)n(D8 zM(a6-n#3ti^~x>2Q`h~k4L*N~$blFZ)3kU-R97=^70_A!+2GxD{JE(U1&qYgxZ(oq zWs(z8d*Qwn8>)fT(-iSdIf^VLtAz^z)~mX&fEVv7twJFiknJW{`9<;qjuK z)r(Ul;LFQf8gsbQt0jc~>_vZQ`lfYT@qT{IhWmptBX-@U)nCaYGu4b!aQzSn} zrY&bov)uy3-;357L38HzIBPxMSTlle{~yZUIk>WR+ZT>Iwr$(CjgD=nW7|&09ox3e zPCB-2+sU{3t+Vg9@7JSsj=Zjpd6az9C<6VYdP{>_ZxeCR^aeOOUQMmIv9-vlL1r`&5_&9wxK*o5H8R zuSB|hus^Q%o|3;A{95Cu)(&7%pC>8t|qiS-oF05qTG&r{9uI8&Fvsw zoJ)csxwhu@7X#g#)DSjwcGOrrO%5lH3?&yC!XgInlHL zO-w=ju%ZsC!Q-gL3{7|^&VWDbyJ=VZDsV5pJ~a$|hZm|Vs_hCaR-NJvNhrjEK8tdMQyx8vumx1Rf0oaDP^x=|+q7`UL}z1uuQb7;GzJ1g?_68VEjW z*7XVj4av{LLEN|qDs-(HH*1c;7Q|7qii*P4)Rpt94g+yr3zLPGfHW7KiIUnwXovNDn}^tVlK9V{Ri z<)Hj%$1qg`2H0z3nenNpp!W6?8sJs!H<(fc0(M3!{IV~95FipxSxhhBTzo{JsggPV z3j3KX4t65xVlNg&4^()=Y7~jOSykv_3vk^*5U^*=M5e7RSp(|e9!}`_4%H^jB+Jfa z_Qa?m#~J7Ri$mnK5`DY!lWN!?)>z^NYUhnncVLp=6Gp2#{3isImmb2n?{G$L0;&I5 zO<39g-E`#tiY;UNH*6W_|9nT0K5mC2fhhL$9;rp1kyd?L?F$qLEI@1?j~QAdhAaXj zP7K=zZ>i|K`|e{07Zq(Y0zt$?Xox`AVII2Ca`pY;Oj5Q?IX4O0p)v`e7$WvNi8CKn 
znO6pzO8t#jDLGiV%v2~XLZm4&L?cjEc~C4#mWqWwvZ-n?Vy2Y3j=l6qR0oHDOxgl$ zBU5Ubdm=>CYCw!kDoCDWCX>AESBt+qOyYd-p_rr=RcknETAmKFm{LUnYJrOqvlv>1 z3gw}V3-u89F>54Uh*Wtgh9Z+H=g&TB{#c~si;eTDx$9<&bgMd?byY9uy|F&6Rlf4~dK9 zMoXi>$+nS}?#lhdcE%)5?p32uGCZcNL}h{@LOJc%rTk8thfSRvJxy+cMsE-qA{_%h z28ZFoD9^Utg%)3hgrzo|*vHUbmPSKQy^xO{EZ&qJ3uZ->-^R!$juC0lgqceluOUyVYgVi8<Y&I!}sW*|BRAialXSun! z((aG^da{(h?D_R8$v;*)w$ukc6qfFm5%7&*$;kYInAM;#F!VzyULdP>hxvP)nz!$-6O1z5#`g2J{(Jlz>sXc^N|chr`H1o!d0(7AcDGON)4CoVY$6(Olc z4u?2mf}s&!ztqJnIY)%{Hx`~lvmX9d=jRqCWDyLfE;rH73Tu43<`3yJ>5rDJRWxlx z_K(*J*dMb3yj&+!ujX?CX6lNb?e45y-ab#U#+TBYw5S?c9gbd*-g$d-XDB9x>&&yNlUChh#E(d=M zyV5@3*wC4(z}!%7PM!Sxs4ok)<$+tWUlh=%r=K!|BtGZmD*E((^cBM(Am4qtU$6oB zx?8a6Ztr6kaJHN2(d^XLo3snP8^1n2T3L9wpV{Sud+ArIY50);PDhA=LP=k8mod@n z1~W(I?2rp>9Yb`*VQ*s2b7v{h;gZ%xtEa}W#8s=Z(Qd*pG=5xkhPFggt*%qks^`#k z>bdYxezwrGIrNF{5P>I;o^e6SoO0i|m!NuCrz~2DnrG+s7~YZOn#pwpgM;z5{+rm9 zc_UqF6@0}3bcgTiQlo8_9Y;d}mg_PjrUR?*Fofi4Sn6`Onu4MF%_ScasaY|XxDS@`V||lxjIVgv}0_T;!69c{_5@p5X+VP zcdzanJ*d!difFxbh{aIWDN_ve<2`2|t`QjT1cpD1wxqv278{5KR47twQ6`hL6f zuB1Hf?+6ku2U3dI{lUEIlKbW`5~!yTTzx;X-C!dCPIRlqdIQHR+cSA!knb zElEz2<_%|Ziy-#|lJK83ij9-=Kc`yEY#je{tJZ(Rnw^c)jk_sBT*^2sfe0bA4t6U5 zDDz9MQ~pb?@3SnIOKzM1NtmwNYEG*xbh!ToHQN!z*?gbBK70f1c0jX715oDaI%mY0 z3RFpA427Q$Kl9sopRbLkGj!7!<<$Q=W96hz6vwTJYi>D*JUL9~9vCZ%AnLOMj0(u+ zHrXm3`cefpiIT^vO-1Io#!OJtK;H zZa9lo0L*0Q7OPjnOFh+M+)UPUi`J z9Rk2+9Lp>x|7iVD3k9S!$u|jsdS~3?-Z9D{$YEE5m`Kw_{+0(K5>_3xmAhWGA0M!v z*#MYh(W{ZV8|Zo8z{E(Q=1W~iT}X($#7hqu|mU;}TIJI#dG&x|pL&)3g#F|EoNHPv5w z&~G2{`%^_96E-_b&VPAH)qdlow;IRRbQtV5MszK<$C>GBD%xz?1_)tU}C8?E;Q3iSrbj%<&+EoD&#cqXfs@Y;zzvpBT8ALVNz9u-u zI*q5F-mIqH=f5ZUnJ^juP zM5j1uBQz}(xrlU_J3tELRgXXB&|0E|#u^#BUVH$&w7XXO(KsAkH94+Ig}K*o!no%= zjN#5apQ7mk!R8y5^!M7; z?+MjSH9YfXY(@?IhrYp!S-46A@rz-Lrcmy1vq#S8%%7+BgdDn64c*2Q@c*GtI zQNqF^jnTzI;5B!Y#D3v6vyxc6Ghmv0DuiMmeJR#!eX86ia`6NK%nOK*4u0Ovg2VOc z6?*T#;~{>ho6Mb$dec#lvbJd(eid6n{*7ja_9$NIeSS=hk!rwen@f}Q1d4C;t7C%77 
zO~9U^-OcLe9kTsmW^oPIjyA-1X;IRf1@0L~#hJ0J)+Iv|7O_ouV8~j}IWuAXtMIfut^@{Yp^_<8h1X+wlyf^KHg`dwohsP}|92!R)`<#ZcGxksi zC9#K)NDK>Xd8c@lg>=o-=Uo~+dcq_EpR!^6vxIq@Q5(_dAi{WDp(I$kb&i(_Lz4EL z^=%x{VEu0+UC?kUoYb6^pIoK|GE7oa=#upMG=$^q> zvPBZ8s{!;YCc8R%USQC+{#)*3;T2SK`v#4EvE}KJa>N}$53GLZxS7CaqCo#}FA19ucUCon=D{XL?K%en1jX+yjz89FPPaoI!z<8W~~zRje&~pshxmt~ioh zEU&ia!^bjkuL@D|_RPU+vY#d(2pT=Tug3mtA1Z|dDuqKTg>XWQNf6RuC6XOKyIcb< z(9d!@9o=T|Af?{d#cjT4=TG}5 zNnpg2i-(*77w-Z4%v{4SmfuGn1(E3>woV!6Iygdmk+7_F98v3cW8TXUhNdC!tZEFc%z6@Y9r3^P(i zPRY+0tj9_c@n7Y6Fy}&<{On`)(_*Qxm~0hPY~#bSOaiz=P`Am=$Pw?u0&G~ZC*Ccm z3Q_&YKVL(0NgHAh_hG;m5Xnhmu=^D&rA2a&$?M!8{$@HG*nw#QZQ`P+NVq-}zpaVXgHVB=P&cB5>fMBw zQK}!n4qbL*96ct0b;OXrkZJJ05Q1=AO$RbAl#;*1>f*$kPkL%;Q4{@aP(}$DPE__u zvvN`fI1_k~)*D&H56|5oqi>+{Q}1V8vWK=c+#qz7oK4ugi!hvZ`wrN=D>EVa@u}Ma zg9loHB*u%|8c?bgAt^d1w(96TyeZbNhLjvrFf74B z6B{g$AEW{qCnS2mRd$yD84KF_^kTq&#v1=6>^JHXLZQP)l+~RSdy{g^00Xh&2taOW zPXd0*<9Wvv4L%UyO0|#j%JfA>;_s6QNt{@r4RQzWKN0;9yT4~z)*QX~cp_BZKAivy zd5$5>KM5^Az_|3&j>YmTq0N7}?_fBYmO%Yyf631D?=z79ny9ldGX2kI9Dn=ER-}Kp z434u0Hy>nz0*zu$g7$(LslL*n11-%?5U#c(dV8~P4|womk*m5F)sni7;MjzyVp4IO z1LmOvd7X_f>(hrLEbT5P_9pi4AasrK(uibY0{{x<%Hd6W8<$td-LxAIa&daKq7GYJ zPR2#C+gtW*^9i)|fJjh^+rPLBNdPVb?UK2EdPA012A}`*UoJyUjNiVDG1O8lcdK1y z9=@UmiC~TWKcY-NcgyX3n(2zrvF~=p&;(t z=Bh6n~H& zmb!i(E|U`G=E=W73>PhBG^aZuDq59^aq9>21IFw{DVQFj!S49_^|Ex)8bV|;kcQ7Llu{CVLNQre5b$?iEK`)r*jFuvVxOqwTbE>0@!)sl$G1+1~~; zWW<+wlb?P^Q{oPb*GMeb7DvDMcTUMo)Dpm6Ndl%b=OPh7aO@vtrlcLOG$D^!sjvEZ z)VIX!SiIQnF3CU7UT-OX=?u-^0?h%^Oh?K^9UAoEaAV7{T7i9k2*$mNUcSgVt3K~H z>MCa3F_Z#xRuc4hP^6b(B`@fw|`@U+fpWZN3~E+I)HSA*|DIu@Tz-ydR8hKdW2bczV3G> zm^{Q&bp5`7XlnMO4BpXxQDY1WDF|WfBr4U~RtsZxb`eK$-tdBgzA3VQdwY&xSGT{A zyB69Ge9r>(nzbL{ptsqVm_41`K#2Gy9YBk5eN;g3qa}}ok_;g%Xg)u?QT+Kd5kC4k z=Sr`!#Zs&HuKg41CKW{G%j#xU@RITWi-+(Gq!79~S7DbMn&494SZ-li34mXRT0;OJ z>qrMPx$jJzsC0(hPU+TH3Y@`M$hgodj8{?HioTrsn9yXNf%`k2Nj|m{BOwlrA4e}B zV-&PCb>73b(Bp4qKN-_n#_vLsuS*$fYCG{m0wjJhUR3nKy=|zB_(3OT7`eP(F_pgv 
z01&8Oe-Wtq00im^)?eu(&Z)4>qhV{vLt;bFBLe^2{H|c-=D0HYRkY^HTHlQ(Fvl0Y zmo_AtEAIB&M~LVVk9Oe~&TXR2-c`8V+DCNhzw$>OUIC^SJ-2atP?)>AoG;;1N}clc z35I2LgHYamh3B%{h!1HYt-lxAK4A+|Ugz?}%X-K(i5@EMQBJtS05DP4G6pOr2x8N3 zb@dm28%=9Iw5l6GqlpY?G(qp*oH&DN+}sa<&!#lC+`E;gb|)qP zK0!QyPk`hF@fZ6(JS;2IZ&MTmu=NDw{`~VY0LF^aS6zs1_Q7m*8V&6f|i$mx3J z+Yf_o9{&TsXK@-mL$45LJqA8L8t!wCP&p`z(y|WyVp%-0hzpzc)U&NDxZ$d& zFf`7I>*`$qotQ`QNXSA+Muz9MDM@MtfeoZ9^GbcQ3k}+qX%kStc*kHT;HOnXR`n7H)||Q=4`{_N-Pz7HDXF?swf~wRTeY^(07)shtgK zEj?&`71JrTIg;-Yn`CpMS;zhcBwGWB_vmlDt5eQ!_#wz2cXo0F! zM_cNv@UYh1PB>}GroxI^c}(hjhhI$>0smFGEW$HphM80{kqvV!dvFfAQAMD;WOs#? z9=riU=|&i$-uH^JBbGA%OMS5Dq~)9`p8O3!fG*v!wBKFR zqQ9v;I@RN?#m3k*CiV{SKCw(p!8m_b=CIEj4}i&9mr-fCxDXJ6wd3@zat}KoxAu_T zVjfG7^9`Vq%Hhs@FQxuPEC2N~6_cK(XDKNl!I z#Qp<@{n*=3XDh|o5lYHG9EmtFXvL)zJ1)@&y1g;=SV_v{IBBQRp-!!$Jrhv^)Svhr5#TKoz zi;vV!m;*)mN)=Ak+%)5Q_a|9U*Mv6@dN@l^j~THD!)0u>?Kj!rz{e7AQj!nFZW#j* zzsxiXa!h0advaXsAC<&qK$PN1r;Cg)iyy|X!72r+VwZNcz8M!zo2L4+c+fJ1LkwBY z=-!#1W%!u}$N3Toa^5^E`(NjKh1bS?aCy~^;sCn9%HLY_0jNbQe{0e5-&)j^^U*&} z9zG`Q@?Hx*Rx9Eh@i6Thzq-d4XNduDwcA^OOKCZoWOUQ$wkBl9l+1ahmDa{y5_>qz zwX9udosTFc!p$ppG&*KFEodrJ{MO$TU@~;072^9!S)m7bQ|+MyDT-S0En$>5!|#c4 z?&mDYX6NcX@_>${bSqd6$PJUdMC~o+zcUDZ)%bSM4tkPeo+~)!s~xlXiYbK z6?$Vv1Q!pWW|2xrl?v6gUrre^M_VBSo)yIofU$wR2q-J&|CE&yHUH??g`!#J=+n9! zST_DgK)55_;sTbPW`7xb&(5~*ag}vxpTuLMD5;uzmz3|dNMLQEifmI?w5A3jJq`Ov z%R4z`T(=@JC4{(dMMaT%_kYq#BUf{f-x!7nR(t=OodL^#GkIp>{C}7{Yij>kV@LI! 
zs@cvaH%=6ed;;;(s57;g;bZ-+jg4AePzd2IWovHz^)bnXO0|-bxa_zFr{RC?DCx`X zruSzBZ|^(~rCSN66ns9%zdvoZepq81j zuCrR8o?-en_Je2NW_@By>nxQTh93CkB4sb0VR_&H!B|=6Q>+{?9F=J~g(bWOk`+{v zHo2@&!36hCII$MYUg2Z^=+@k7#>}760cBKWEsICRT&m7we74?>By*T=Hr#|%rQoBe zRP~hBz`?!6lpT`Sn&*x>^rzW0uoNmGD|3?J+#xT84E9a%3BA7o3aun?CpaswZyKxj z4XnXcEsKI%C~^3ldrja8t|TX0HDltq>QN|tCC6BqyahKolulr_zt_D1JAqqvXjdFC ze26_O`5=rfws3^`p@K)hl>SdI`P&}L9!*;B2~>2qq+hQ*za*^?0u=3HX{MQRdqx(8MDetleT`#%zUu zDD4s}BWEBP$s{4Y{j$ftp$B;4YS)E|&w{%JJ{n9?D!2Ac%U)Q#DCPT`h|YZH$ha}% z3v9}6ar;YZW;gauV*g@ZVTfhR;L1iHGiy=c^4F2yp$N&V&b@+`xh<5xs}qfUFI6-tOPa#@`k(lXY~0e_PV2Qx2SrHb~_-r+#~IRi%Syqm^@Ir?h>n#ho%| zfbcJAKt5sjVjD{eU~CIL-Zv1~ zQ37fY6;?X(vUQj8fHI39=wRgs_P-ZY^%fGZjFy znd@Jkkc+`uUa{Q!$m4E>E?)gv$v*BR7BzfYlw(nCOj-I(Ee{E9%r(WzB1sEq24gb` zf+Y&M^=_c#yP?Y{ZAs9&BPEr9v%mAqvZ`Iy*e0waXi*i3JXCwXC=B0~Z^K{gsBHeYj-BxVf<|_>u8Hly zI-VaUf@9XFik>wu=_~>qc9X+IiyI>7{0!B!)e^WHkBB&il;l{&xQYSVe@BrZqG(y# z>C27Z80yt{hJgSle&xQ5W97#CH)!7b8HL*&63S!y%1QU1@ODS5Xp;+A-T8O(=G+P! zDsOzr6^m3h!ekK8nK_J#(Ye_pE{gr!L}HwpLgSa=%%zSxe;PU#eUWtDL7T#~ffQ>URwfTl26mFaH2OqyEx|UzVuSh6~8V4^xs4Y#pxZ5jn5c{;4xG^h$r}x z!*A5;-#2Sif>ZRVD%S8ZdFnkIR6>47f~YrxsK+POqqR~)(Cb0(V7jF)uf%peu;>_^ z2CU9W5YzK6K|+O2A9B`6(A9$K#|8Iq?YYD)6&w-rK6uUtMWJH8l^fmtX5bZ<_9l2* zhk;F^Bzg)_aw+Vr1ZZE;&qrQXl$g&psr%y&?=qmWy6x$GRgTQhv2+cK(BEVmVu=#k z*s!70a0hUtAtC#*kBcdldt&g^*x`#WOc16bcj6Ps>9Y4AY|OkHHusyW7O`NNsJOz# zuJ!L;-2k73(Z7UHH_kdpx83AT(0uB7BF_G-!4*Sds8DtP4)IvL;2e837R6wF9vhl? 
zV+9n$P>7n`mA)ZRX&*U;zEG=1Jn`(sTi)O6-mFtp_hQR;w3lB#qP}OjpaLV5L~^`s z#90Lr)DULbgc;Nu@cY2(*$_7bf#v#7y-o~8;_U&@mP91pfG`dg@RsOK;gsePMXsB^zi+H(D*p!NV7==F3fdzMdm|(AZ z6V`i7PT8PHlXCv8FnxHl-Y<7f&Ct9xt}#6Z?e8-8A@DQV+5`O82|kM(bjM*J~(KDaQE@P`Ccm(nzaeaA92Y@4&}oBr&eWy{+cSNp1z?Ar7O z-Jj_+@x^%Od!HjK<3?f3F1($f_+CkaPH-r3UPFg|tb8eCl>BLOiSe0*nLKqI;{?K| z#Az^FwPQtRK4)plecz<|ut(CRRoHZXq3ByoU*l@hvGN%@Y`hCGP-vk+<>Xwi!nk!} zG?n;W)J0S3Ilnv2fwxn>KPM z3@4)czn5zEEN_?~Y@BvELuenSEMlQ@z zhZ^<$$0IQW11l$*mLTPlD~hxtG+5hqV2B6AqvEdyQpxv>u%+#z&jeaY$x6&aQVaj) zWOAVSt(_xH!5uQ-Vs!^mG7%0T1&JTfkKM-y>8{Gi#$vJv!=on9=-g}r4NuphL)dp&kD#=U9%w<&V9fc_)x(kdlF@E# zKY388;P;@&3=m~vh5B*6H6d<(w2qnMAT(nG1)OaVO9`3*6=()?&!2u-ZpB~ByviHx zm!r!Ieh9>^)9d_41Rco-7*oR`k{tDMKq9d#PqTU9eb2o=IGR3#iDc&E*AC?t>Vve5 znDfjzyNexAW)@3RNxYJ-b?w5#g?m3&oHFOI`@G%ZgX>CkStR>HCtbSMxMrj6QH8hl zNv4T4D-r4}BC;un&`-t@|G}X4={G3WdW+RbxWGk!nS6q%g^9vHSGTjbz;jQ}g@2Ye zZ>XX?^{RKPSZ09yhK!0(MlOFW?m?|5|Davt`>w#0h`6YH!BP;%{Kh|(KDN1R<7B;- z4}b)+5*|8E8UIYHcUs{H(mS|zpnFfDty%NFBmZi}pZ!L#jk`LSc-o(GQH?z-LN#ya z2vRBL;Fb_V6;wSAZxqI7wWF!T-BE3eI5CdAJ1qK?576ep348^W&m~+v^2(}Zzx>j% zP2cc@sn|!YbtfS6BOjCNUN+^QfgoZ&-7RzxYH!kZ!IXm3;i17{v0Y>s%;JrF)6upt zCrMpfkDuYMY(7D9BBjBU`mXuLudz~JXE|LRgId20UOH4z$}lnXWSn^PM!EI*ngQdV z-&H5$=CP<%I8>qsmAM4d!QX|Oc$|vipFCg?M=da?k3{dZS;dy%!Ifl;yK0phTAiD7 z=D+!Z4H|L~BX7h)m36Q*Y0N#E-VUZFcENi#cBDj$XOszv?*vb$>_)NlbH^!4uS&bq#Y`yB)svY;@#bIn2^3T;hwPqbqIRR;XM8o#U3$S6fT2X` zeTZvPW?Rax<09EQf0mqLqs_SAANW zQx?gH=2GuP1c*JB5dyP<9-smKy@vmMU$e3QyZ^=i6><9SYKENubAH4Bre;VMHb4Z0 z34E(m62=Z`MX-v^U)qd_DODt>wtBzy6vL{VP)v#5tPIl!k`gvfw#5q{w$ESXhz_0P zRQ!v&5*Z_ajx^p-DHYjrASq-fdda0HG#!)JPX~SOI9NQ79O@CdmY{e1Gha zfpuglI3SzjO|UNPQn1=_I6TH|dJdgJSbezfg_>?KVCu6Il8AJ0YB#s^Q4+_e1j&VTsDj#OGH)_0M+3H< zc@8u+j-L$OP;O$6(SqnK7Ledsx!`#iEE81{Y48A5Ss7TkyF5r6(b(P)M(Jo+by#@Si9i?vCY)>_F6sn9rVA#u zJMz(kWiWoQ-+zEt3Ly1N#6?kG(76nYEJ6{`??Ri9SB7(cHH{W&@H}suxo0e(53a5_ zMo21y6VT$qZ~T_x$ozV6l^@s}GHSaFZE)+U;#d+`r`2;CNIW?8>~4qDXx?#r_H}3C 
z?K)<${iN^MDS25)r~f(S15`A18H&-cb+jbt+3n5T;pyWxBJ~!Syd5j{dg}oF9=11^G4!yp z0X;Cof9H5EGfFbz)voiwrR}r-hNWwQ1M(p{FxB%O37V^*!On^3Nh_z->2-H_3vX-b z-tOI?lhu`_J3(dJ(V1BV@WKvs_j=iQd$?86BpT)ADcbS=^p#7^kslaesqpyr z81>*?*F7YpTGcW1>H1>l!q&5qshd$waJ0nHKr#N*m`0b0?*kz;xzg{j0_?!1y?!~n zN%cpqw+MF*-I(R9#lYmK{ie~_$GT#T&W~rVg}vRh<#$!A#uvhC;P z#Q61Z_+H+HL3V3UZz<>IpdAr|Z&Sk?;oHakPE)ZFkB{4ou}}mlQsR==(Z{l^k7Y^U zjAmcIKQ$DDOI0HzlBd5ZyO!8ca|Ydx2HS>_DTC-N@x=!Y{FLCo3I=5nl6-VhL5lx{ zzYoQz8XWwTAw|~}tH%{)rtF#MGFb@ax_(QjIeYECd7BN%?q!RRNCwB#g*Qx(ikR>& z$Ze=hn`rbxjVeR8hc-Ui{Ovof)s#YeM{NVumPFvOJdO8H!bi3_jGu%PMS@OL(>ld8bU(e~nyI}b{!RlHL)+BsW zL1u~sovC{1>6VMYI~omA(a&cCb+u~u>i#*%{&8=f+9F)3xT)eRZXvWxApGWM)!OmT zm+f1ShU3P7{Z0h?_pkKUMJA2$gJEvI_#zbI|3F(B>A76}-*l`^!GBNl^kh_bT?g8C(&eNJT?`Nfk)4^^W1&qrhd>9Zoyy+44W@6FUdjf1pj72$@(o z|5r6)CS>DcVfq&v0)QR(*Cio4C*Z^XLmFO;Gx+zUja7E47-vc*f5`#Z4LstFD%b%a zkw3kR`_mi5$kZ`}k}b2d(1HGqg~85}GCs#=UXvbEUo{UtXPRdo**;rKx?cKAX67?& z+bXqU%Yv#%_NWBm1e8#Uv&u~1z`%mJhysX|lB1J$kRn_nU(G1>I3n?vj9@yq0*KO* zl$b8YDi8&B{#uZn+j(IuK%`(m5z>K!qyd8dgouiX1MVpWybwtw4lOP)wU!J3!zVl*kak1Hf7aNucuw#l&Feu#AF&28u~j z0?PK%Jh_V12_PVPdV22$7;e5`vwA5}7~9|?TtN4Emx_fH^XS*sCjRWp;IHPazQV5l zd7ytj7{Y-2;Sa#Yg@HLB%rU{B-Fl#I{MUrz{WDI1KwaegV`)%7F*NQNfEc!?4uJ&U zbN!0mP(EmoU_W^gMoB)l~^AP;_G8H zubqI~b~`ihFAu~A@@Cn|d08it{6#aCwS{;1ZY>_5Z)w-832s*rZ0``zuXGvQ^Qsa` zl=m~k=LzZAsa4)S7z6j?X$&5srRj|=v?{E^9|`yH5Li|HGpLOT>2u8tK@3>bA34w{ zKOd+R2j~>e0r6S2cX|Zvbq(s(v^%Yj;3Cope&)RTCL0MVE?XM}(c7x>{@lO&B zJi9O;$@hh;3Gt7Oh9d%G`%fiEjP0DZMkJ2;5D?C<31^vHNJPgwkdK^~PoDiRi93DG zPxaz2H;Hom$;qCNOh4dH0)b`l!}Ggu4kEe;^D(fEh#W@#UmWvbuc@w0xy_6Ssb7!f z1-%@OivDXnU)Cgsm*tESFcVtNVX zIXJ@K;OP81ayedK%%#vc!3^JIECPx1ff$jY9*H`+4l|M=f#BMe3hCzGIO2gp-tECo z$-(SdvITnK14!RpigyM5{X4><#;;)D!5_pw98o?Czy?7hI%igYV6V@Ad3|UvE0%Jg z3Uji>`b&mzsfDrSs8O#!>}q2rz4Uh*-DXB1@P*jiJWEBU_N&GtF~{+$Nxm!#u{RX< zdyooDct>p7lnzDNco#1@U%K5qmwjGKqwMS>%1s@;%Y~%diGx7!i@d1z$Z!bk8Bm-< z{Amt5=d#1!Eo}ElD$&!m+GN`p(iaemt~st!3%5elMUA^2!Wq(pz=*9sS~ZECsoNC3 zVcK<4IrrGY=i~kqj{kT`f7wyN7yhbp&jV?E2{hqC85 
zF1M@ZvPO7N--o6eaDNP^bL}K?KOIA1e}v(tm)nSR6iP33V-JaIIj)@D9_}mcL!L8p zXF&~5RDE7|y_=f0olK?$g1P&x+}E{tQ?HAm#j|u;MxB)IoIuBBOx~^bj5^L0)JmeU zID+z5%HZwWSwc_-^Q4-wtTsa^Qo@g|MRB|R3f?HXARoH*X7~JS%7?oHCaLc=V(GmX zXlNh6We9JtXE(~f3oH5ZGdF%%v0jc*)xnNRukhG6#UfP%;J8AOZhQXtWxgd7Hab^* zDQ4oR%q*i|EIw4S^{^MQeM>NRaS~UE-1${JmO`~K0q)2&)m$ils;Jwu*p}}Jt1565 z#a!BF_$Qqh-Z+%O?`uDd-k(uo<;8s%26TLihrNo|h4|pn+<1bDU*gHKq3kNj$v{i` zkGl`=73>n-h^vpIeaMN%gw~8TDyqoLNF6t=$x;xYnjE_9~iYqh0gH#Z#89#L*u_qTflw;Nxq5 z;0d>q+V8v^VA^HU#%d?e)#xp!c;9wRLwO!UBOg`b&anraOK;_*Bl>@9g|M1k^epxA zJ}#2y2$^2t=U8$CQ^*%)DkpWD@jK9f$YEbU#U0)7T0;1S!zbzSQZ<%;?Wd{oUj~}{ z)~o!yi|6bLe9fAsmFamI==*j`FIWQ&!vEu`EVDW_n+;gWxe6FDi~f@RpL zQP-O>nd|m*GB_P)71&zvq_~qmUBS9!f z=IX~a-_Cbg@Yea%)S_5h z7mD0|kdC6=+1+Sp;AEHHKRnHi0d?}J9XznV)zKm@fWR4{O885>=kmHMxen7dkb#}W zT0$ti9OcD~)$he$FCH(tO7W~Bt2kCYrqRu=Bg#ki1IiAW+T)IETX&Y3Du zFT!RM*`%nx6@l3RM9#UXTuYkzAds7Xwx(ulSAr2;E%#V;o%AY6BICtPi+>CCGx#A} z7g?ghU(=kTm8`mDQ?2Q<&|suLl@MQH0;HXB?j2i;bJ|oA<4}C}Qxx*xxsVzAV&YKl zTr;r2!E&|8zJ)WJPIN7Kl%criI`dr(+sV{db4RkUO2ZZ~_-z`wx`TH$A)fb;Ck%{@ ze6x>33sOG4d<|X+QI!gGfro)@jO#9~W=5z8-g@NerG#T)-+rh>%sTlT-hy#$J8Bt} zXCvd`y|v94XH+b^@H#C;IjblU{_|ZX+{(kwrvPd**kPvB^rXY4AFuy;(P49p&rA0g zq*SuQygGTK+Aqos6B8}b7a5wGD<@5f6KUunO)A`2wKnFXBkX>q*05+EYUnKH7uBGI z4=wJ3?QA4`qUXD5rVK_h!r9qAcKOSR z&m!ON#lNwuXu3*ThMJP9jdpby@O%uKcj*dLlt3BV9hfyq0YOGA|3cYh!zB*Olw%LF zNXL=$zi&-d7D_(SWOGE=ZL-|T$M9@Xufw}EdnzrNsO1{4a$EAY%AS$m+3FBio|IY2 zKb|u-DE_rQnRzkwy#=hHh{0MpF}F)!KNJj3R6t1|o`Bq=fl&w#qs7Q2$|1eFJ_|$p zVOzBRR_M}XwzUB(!s-+>s29%)b0Dn5&1iE@rnpoojSf#Ql0{6H>26 z4%afHD~`mk=X8;7@bKq`$}Yx2OPkV}YLZ@cDQbC45@uJQ_*SlCNBhwh+1lltL}V;K zw=^}fdRNwJSz?FiVrhidq+ZdTDVm=n={ly&Ec-TdvY9z@h9wFN*(((CKX54|Y)eh4 zOz&HL#8~KRy`n~EpqbF@rhYAjbT-|21oLcNHCC;}Zx{J6JJfz>j}Ds~u?uh<>C?dq z!Ze;@ZzXC%Y8`3sS`|INp`J^5D(kp(j>k*=i1jvsOi7Ooy;6cUoW4DMg8&KYl<@j3 zK_b=p=hfIXLi<6T8ZYJKV5yDqkF?qphq&d3aso{Uy64-gz8WbxRwkp3Hpyh?X34## zPnd^N?3^EpG0;h~j#}Lc|59^S^_d@oIyL%jx$+Fdg%OF1+r4qPdP88?$T!VK!a 
z5uMtF1Ov}6aNn>FsTOR-wXvW6$#savEDWe>_0`MAF-|zr;@-3fvu@K4gM4r$eFy># zydM0y$DlXFK9@3yQmb=Ekc2^t3qCKm2Ez1pURn7mghQ0z0{&wzABobC0~bvPOY`qV z@&R3^Q)3u}{g1PIXL60>`e+>3Q-3(<)@o79(xm&vaD|!UACOv3RP_#mmDc$4&mXBT z!~4<$7rI=4_oa65ppfV4#$0|NYH12o=X)5#=MJDOnf5p85b)l6X8(VTol}!2z?x;t zwsFd~?K)-Kwr$(CZQHhO+qSDuPee~lMBj&b$cOv`8JW5Ew^u*v6ar;GZTw#u&o~=w z6MQUo30o|>Q#{1c#-!r+`Kq)1Q&HZByv80D1(QC~Qs9lUkC|bFri`6pqfAxhK9_e6 z!zps2OW=Ag2b`E8%#k00(ei+x%9$X}<{8$k3X=n1izB|%%9dcN_fGLVu%b7NWfkEC z`o{51;v>A30HD8_pbC8XDJ&_J@ei`xLrz+m^SY=^ap)frhJ( zfW?)i>R9PhZMbE`ftRH#C;8A8A1hKJ6fc6B@EgJCOl0gK`B)#sE5l6vGU?p5k|$^A zr(^iV&Riv0?GN>L7i`E2&c)fT`Q=Xku07)dm)Ds6s($%ad}z4**V&_ zmO8?p_TnV(ipVISh z*tAFQ=YC@55 zSNB9`y~nklgmUZrY44F0VjWj~G#{^jlQ}JgtR9UGnB7C}F)E_|1{#AdY+F?`taS}) z@4#`|AvNDOUw46xW2qtq)rm&74G4<00F|08-0cXYI`^yMVuvEhEOzb|l{<r}Y>bs$^AM!8g&tx%BD55N*nY)x}$NNxx-e~r0 z+PfsLZ4UjUB22oo!xmnKdww;hNq}h`h<&YGp)bJ-cbOHKL;K(F_fqz zp1qy^cntcNb)S?<0B&4}wGTX*h_>2bq^?$q;NHIf;yh-~Mp%A_aRISrlemlds zO#+m;H`ARvz-+cQJC(Jk;|BVf;__q_sKWf`%X*8|BizqMx|Z-EQ;CA_r8As|I_)U< z(`m?=OG(1_i;({34;TNMvWuxwG-d}{H(?)?-Td%gnb7M-Q+ zC{{mHLS-EY^+wvUulafuw>$xP?LFrlZOUZ9PlQ`ys)ZDZd(x`p;#WUxGP-IQ9 zLZMoPXjj2$vmjXg=ZbeLv}BN`ykS5+Q4OvijtcD&N>-3PypNNe(5S#N1#;=Y$hNqd zftA!&LSau{C0UO+x>K7|I1N_^0t68*a!I2-!5RLntA93lP1ltr&dF+X4J#lVj3WfD z5P<5aDLoj^Y?d%YrC8XSyOi!G z7Y(CW@0uHZ;2u9sopV`HkGb`Q+5H3zB$9Tl7OZdS23_Z%ucgA1v`#-E_3tqL6N zqcpL%o(WnSk_-E$O5KNvkf?SU54uxY+56(>6JVeL5yS+)RLnuCc!L(WH@v!y5y`5k z-T;eHk--*5kjJsmBo1(C>VM`9Ig21tKcgAW1e)C}t7yfni2Ju@mj?p^#}qY@^+Ay7 zfi?F;NcwV%d?oiVRa`q8%mtOkk+L|AbZ+DZ7WzVQI7k}>KuH>9BuOGNSls=%o<8@# z>`ZVc%{Agf5jTXsxCHUG?aI6IQtcT&rYKaU1S%s;c5%1hqa6*D>D|#$Oy8l7I3dbF zwY<9JWD8iEwbFQqaG-HMoxej@{ptL4z>1KQ%2Ud*p%=+Pzk1s6?04o{Gn(_UBPfB`qae(d%?3ZDWeLfZ18xkd zgNGD+M8|3E=>4^#Kze{TB%xf+=ko`8l_ZybtqC%(FN{Q)UPzhla`7ny%zD=JUP;;f zl#=7J23F7h#U%-+uFuUNrD54}KDR5?7(uApJPOusEematc%h`DuF|xvYH(4gS1GrS zp$HFE>0E>!(u!>uO-{}+X9jqv{?e@P`+nmCke<*l1)F}wV> zmcF$)90^E;W-RRrs1FJoyatE+YpslioJZ%0`-oJbiRZ)aOI@8cbC3facMmORl^pqG zX*Cl;NWB~N8D!0+VmYq{VF0ePIM 
zw-4``X2aRkf{2iR;dj-1sf`^py4Y}+(49Vb;BjZN5`icyQn$Z3pyyBrj@AY3Ej_b4 z#uhfWg-b40LQZsy-EQwpnmKjrY}=HOv7J~7$ypw~f!$B(oE7YgL~%ZLx62tw*I9IO(Hm#JtT#tFq}g6A+m`00V1P?L=%kH)7KoVTywIiJ{vD6A84hV1SWaKjTRPxF1j z#%60@5r1fbXHFD?e9v}0s1+F+9>V>03=S3G(%Y|o3%DEYy6orkqmDGgeOvWK@#UG7 z+F-UvN!{JIwFBcd2}B;R;!nu)!{_5ImzvtA_7^rwwnMAr7qr*370+^@{Ou6tHLHUs zZ)e5U)K-{J?^Aggsl`XX&l4SEt=b=4bO1zsg z@#C&>suSZc?7@Ym#hV)#LoO)?LK`X&-x_T)PlII!JD|?;HWXzJeF0XcC3OA+R%8F) zo&g4YMg}&v|Dk7$_#8}3O#id{f3X?^Ju4&o|1VZ+1!w+qfyP?(ia^YXNmCHB^Y;m@ z+cmJCoM|Eoi`e5oWY#%a}3P5aF7{aLT$v@Lo9n}yN2xwWro{Jutkq}iJ5kIyG zDBxiC&*U(&Js10yIts+o-Q6A#N4tH$mNkhQ2LLWY8xBA86_|@NxF)aBEMR~DjbFlmKihg@ zo^f0d2M`THNIw%eAaSV+AbxQqKZ<#o$y7s_HzUVj_Fu}xcd6J$xyN}cjPzt=^e8tW zuT*^MVN`2S>uz*=zB8+UP>+5eU(nY0a+aC8!D>WV{vC2*&_GqhpYV_r1D`aFpo4%w zTU%R#f`kBd&;VEF#^7J5f}@j|U#SVFsTk_PwXsZ7n0kTKAXm`!fJD7_UR(S(dH`Ff zfG^MQiubz^v7z94u=N4J)cq|80M-5`TL+ zevglTc`tr56SxQF@972-eKCIVfW{yX&Ys~xIl3U&9T1iD$DI9s=#=rkUOy@VstF)U zf0rry1;&Eo0yYRkGZI5{BQt+(IfkTh4K>200jWXmQjy_N29aIbpQd6=80Yx zu&&-$>4E*UUy#x-c z{=RF3=r|Cc*Er+CJp#UE@t7RHDn!=V@wEU;h*rSAG{^6vAFnSzR3a%SS7COagP08o zd{Bo1CEuDE3jyOoQ!& zT_Ao|kHfY`fA@NGxailAt%BLOU>lL38)~5~_-f+|UiP%`jxQ0Hc)I^`AB;x22z`$P zzvx5{WeX)UbWn0yi2Z5SYKjXD408fUXN5GUhs-cwUxridZp$F= z2b@Wi4c+2Vx|c_(b!;)}`apR-pOl47CovKcYx=7P|7|`S-DBHPW+=aye5!g-VJp)Z z5vL7yRZhkEVIE;14d*oBun8X2GopH9fz_)g&pKec!9i4ggheQ2P+Os1>rL;G6q{reGD!f=TQyiix{}iu)vtX6GAk*Obw#9^3bJ5F)9~WuvlM z<|z6~d%tV;++hFma(GRU0(2E0x&BV_mJ^QUWMAzl6F?Kp5BGoLk2Z(j7Xpl2;>5{n z`y-P)h4w35`JGFdZPDRhof#)=U+2?An;{H;)r|PIkGwONoJkAT7&e%e*EvBN zxiXNGpjy_X*h8C7-X<(gmc&e$gsrPPhqC-V4Dbs#ZThDd*j~|vbdSk8irvf#6bQf8 zmBD=q!0Boyd#}c&ay)>O?SPe3O^9Ku8jrgam)-#Hy)YRgUPgu#={sr=ncYS(AwE|g|#QGx#iHdTOR&B(y^r9Z_GpGwewtVMMCAf1F3rfa_9S-lzlpQ z@YnnVt@VOyRM6rGjqx1zd4+l1x(pq|Z!dOLlLf(z9@R;v_v0E(jxqFx_@Jx^&pDk$ z{nzBBq@uFmc|Bzjfz{q+c&zH>LgO!6uZJT9T{tsrxUg~KyLM%*k05xsvbu9zZ2#K! 
z(Q^3)b2hU4;Z+*{jz_0Df3X@wQ)Q@n6>4{dQOqbM+hY}%I(_;39ld&~w-~$0+Xc-x zCP)S2>P*kqrLRAD4#}!}rY|le(6n#3<<7;8n zqeK*Sl*^}t>Zv-IVB2BWTY(cVm4hfWifw$D1Z0}qg=8gT#bvyefF*_paT=AgQeLIo z?omh8tw=e>;bvmktU+ui7g!~Ha?&*m^Hne6C6f3s39%w7JxNT)zOqt#+~2gUFCQL| zf3K!Tf-qjmgqTyF;4_3UkQA&(T-%;TJHU@fubqjXmh!X}^;CAVR!qVEc|)~wR!T65 z2=>ZFr{}B)4*oD2`Brr>3v;~TPq{j#-Ak}Fglg?o(o2I@)uN2 zn6tU68)9#c;$cMN=M-><;xpFCwEI>LQ`wl(->Q=7#mZDFm6_3hLI0X`>bAw{Isr^d zZ^wJ9BVr}Lh%i+^Z=won@0G|%JSYCknjrX+{?M|QmhLd^{KHLd|t5exUx2hndkmtcu#ilq} zf;ZN(rWkz3`ZzlzFl3%yeOi*B8zJ2inEk4Q*f$A=$*5y=E^MQr+gthouMtl>JmYk~0iLq7G+xuf2M1mTWJ)=9i$1c!Eu zfkK1l%vlB8WJ{KxYLAJXcn6dmV7+85VL81Obh+ab79C|wO7)LGZ!K9Sg#8`yFc_36 zQgY|{vq7tMvy3wKnpWN!B{HhqK<>Zw3DPz@r(Z60mK0icyRhIQAZAN=8{ZmtYr_}MEtkrnrXQpccrP`g;-t>)`?vI|Hejd=Syo(+6%l(j7)gx8TYPd&s35e6I#G6g*(J5mK7$x|kX z)0Q~GYRR8h7`Nr0tf3NpZB~aeh*pT*Lkfj?6_LG*9$({}yW6{tyWBCHJXcts*Cv>az9G;zn{65ktpvjsoPTS!|&aOXCkr?{M5Zf$}wb9OtQ)5QY*aeGxHk?Pwx zHdHR}Y%2ZeK;3!E-m>?p4fZ7NDOBio2`gz^c457@O5ih ztf|ex476g!WHD+u5_3g{^FdOt%+3)4;M_wcYY5{u@5P^3#{nkcA^L zY1OFoN%Ou^N5TUQ3UIS}nwWYFuprn*nhjXWeFRDa zah>=Z=Y2Hu9wbJwg zC9tG$8);8<&=aQUoZhnS z9MAvlv53>N#B2uMST_NB%3|>L_EM;LAhCayw6sP|FoxYAodc?259I(sV`t(0zCizz zoiC85Z<12O8V5?iIs2+-LglYj{Fg=O(0Bln*<+hoxzbH3CE0ml zWR2lTtR^A88AJS(1GANgFBKis97D~iNRe3DyEEHV&d+XOOb4J{0}0CC6RXogU@k4T zR!3s$QN^#kaIaN^3(p>x!jz?hV&=?2&$!MSV(0AhWc2Usa+)&WQ!Gi60T#(tcRMgQ z$c`*#uG&L9x34IQ+BoHSN}~mLX1wx+KFpp}=~!N`PccT3%Lg&Zz^P-4A*pxJXyZmV zsCpHSCz?G+We-@6FPy=*E)IE6+8Xh zW{tT1KB#4oNm4Z(nix}rIo@>>h{+ScRddq{O+y(gV1%xW>gYGsN5L~%>uIJsTcFZ{ z3|Jr<2M4QbT#rn0y4rvabbyO!n{6gQj)fw9v4?7R zkyY2&)bj7nh|J`n8WbA?o~;U9OuK6-r`n@T^tc+Uq$*j>QuI-=gr3gb1COCt4s;=5ysT#$C!DA`fvM2uS(nk zL+jx-Pp>cmJhZ)_UWZVtfbOZpq9idNl{D0G^RAhq?A35DOwHzO`bM5}TB@%okPv)q zGc`z92vj-rotSJXC&toB>erbJM_3CrCc?v2G8SM5I1*Mz5K$iwUuD4=Fiw`MsBE!H zoC(yCTkY(b@-6p!T2E0wuzq@`+>!19{7ff_XL&=8Mm0gkxNgtJ-NWl{{oB#gT$ezm z!s2UXp=GG5(2AV&sIR){DqH&7E4hR6HR56&IRyEM;MW-+=-db^0)9ssen_nA#PqU$ zDTJ^YEfk}%y2h^xA$wQl5&p{iG|YbD_ZGaS~fr0@-6aM8>G5Xe+#q&8dnL*x#3R_13}IYsrh_40m{ 
z{7CPcHm?#~i}dgSwZ?({Zyxv!bp&-kW(t2iAuR-MUN)KjX~e8;xZXv;&6`WfGldEF zi>a+C&-AIF$P1T>FZ-7v49Im>>SVN3SbA8F;k_B;(fP#(bkuPzz;L`n!1P@fdWo zV3*IQ?adg&UfI+o$&e2%vh^Mk)-yeI@4e(eoHM}tK8Zm3YyWumi|qwc-8)UJ56lhl zk$gvIEwd!*A1V9NOHDdOrF6|V;!OyaY?xkBpNh=oJgC^Hp>N*>N9%uSo!k*^BH%-l zY{yF}P5 zx>gMQEVyecbHUxi?W1%&XO36aW<-d!+6%}`-gtm#JamN$6#j4$P+UgSnzDs)teO_@ z+DQ3$hl@*Ayte@YyqGc*5XjD|2_Ie}NPNH&!sW5P=JLak{%BC|@ zYIx!%DWP-w_q<0uAUg)}RxV)wSva{pU8lm@q#hYOX#88*I&_QtsxT@Q^4z@1OC~JS z(CjOfr(gLU$J-doi`MU~njDNKsYJUW%q<%uH#Pgd6-WE21{wMuv2cg_-7`M;G{IsppDYCJjpV=OkBd-JGh?xfwZ-NQ<`8f1)yf_*?;FqgV^pFw zGfFEPYaU^;kW6WiJ5S>yEib`-5^sLaec#R^PtjgSJj0VXE_si;ojHOZ@hRW*r_Ub^ zaT_IDhdA~_guZxfrHVIg=mb|3 zvsd`-bx&|h5_k8ag$eQg_~$(Qu<-U&@BS67ke|qm{vs0@+#b;?b)YBNx}wL|t3Y7F z)yJFsMbs&aWOvmt4}KzI{?My7TI4_x`3alyOuI6)TG^Gn>+2ufH=danmwP4lL`rl* znu2kZ%;QF~*)NY)74wuSre}XcY+z5yNd>X;ygRs2q8zL7j~@wr^mxpL-7C*|AXFE~ z-}%AR;dy}I4?0&}mJIW4FDv+=!&uW3PI=awlP+9^*;7@@Hw@|GGi#$z68pL~Et=4z zpCUtEhzW;!7;+kVP0(_GR*!K5BUa^;1rcK(T<#Hd`i4T4BWE$01ptvUuI8~9)1$0#~{ z8T2(=k2W&xoKhKo*Y`?ItWNv!-)9{Z^&)0*Jkg&nx*E6>*snrj0-c-dB0CCv#H9#_ zT)tam#J;<%hAuu2O;WKGdNIu`00av*Lldk=nQAHg(qr`-%=d2sZP(s6A5i9_4myoW z7TtFW4;n!r3qzm@3lRZ?%TgFPQ2G%a2WejtqJH zIbM=%IO}aSTGGv@R|s6@lhzK4hCRu8{P*u}Qt%`h?`tohU0@j#Nu<@3ZN1ry`1CR3 zSY}2VsW@P*$v9Y@C>pJDB7gcAjT?aulYV0wEt<${pL@pE%gum%HyV8xC&7!lTldN@ zDZ2i74{~^zdL=TJCX#`mdln3D2*wOm+~X<0-X0CuL#yi@I7=HTpspVg{y2UdJV?NR zTEx-*q4DFmB8|!0DmA#8P!q~rBrnA36(F2iO8jwwNa#kwI>mB!qf4~3oRmLw}lI4?S0e;b{m^poTem$tgSw!7Iy9q;*$kmuEDSfh)8w$F1mvkQAkdQnQcSSXHn<7jrfU!bIN!=BV*3$8GUO+{ z4FepAvc3!$GFp;Ev^}7OMBb?(sHkY-<|a??q?MX-JDa*Aq5YBMii{R0h!58}6J$Mi zOY}$Rl9zdKH<35SuC#B^d%f1tM7FUT_C_6}L1y-hTTCu3ve}4FzN3z2uvIj=O1u*P z2R98du$x`eCP0UHNaIHeg)eB&%#O2QUs72h3isk9rhG?OD(F_}n6a_I%f4Z~TGcn} z10xx%zQds&A>^QB1}v2o=E`|WqNDK7^3b&xy6CT$&*?NT?rm9{EC`_K_g;#+liABe zYiF6m4`n8Bh*SHW#>v<$%fj%01=g(2-EQ4%t#zRql!}7OYKnObEr`?QpB@vtbB1^( zKBuFWmySq(na7<%*=1=-wh`Nfrwa+4GXxkdTq~W_gs$NzhhE9yX&e^MHn1Jc*ACU( 
z&u<`UN8vF?qQQdw`qq2;!hY5I?tr|o^Wdc`mYK6d3Rlz2(U6^x4LTRE+eFt$;~qtZz$?xDlhZ>z0U*Q|~vlV#-vTx-5 z;C>f?6>%R5^o9>yiNz?KpU+|yXsxmvx};Gis{Jy#kt!0*_qm{VOE%sL6)G7po<`>+ z8;{#zh4<9q_v zI8Z9&uGyxdKF~@R83wW7m5p6RyjKJF^QaSaA0S)Tcp4#oXsa;I+*|3~(2AqyVs-8o zN6bZZz5RbnChYWDfp)qwD?yn~t2_Z4*9Ab7ZZc=FEH-dv2w`DZb}0E&kTu!5I)Kzy zKj?IHxg`P^mBAl(NuQ1JaLg)Ycv=niFHYDUc9r#vDjYLF&>R+JM#$B-&KhZ)7&2XR+LA)b{;B2_s44?4IVa9JsveCpjyeP=IIph>6 ztvJ-7?&nLZ)9TGDF;ezCcml&NhjBJ-pq1^%Mryau%*DNkJi4t`^qt4Kl&f1RJ3`1a zyo3xl?haXA$}j{{lN4nkv^8%<>T)?FM!^Xq=oXa>O0PzED`_9atI@1Y%~zfyCo~N} z!~d2iAWBi9&@*_1_?>Zi8zR5N-6fX-hG{tz2GWVUZ-tQhox9qm*IugYgq-pcip%aQ zAL*MgH?z*1GaBrP@lI_X?o!gcChMkpFILeP5_zEm+MhE(h@JR;zN@9RcCYu?-O|jn zz%QVcAJva#ch<%u z(HH`wLw=Rqd6$Y=uJN1+An3#(F118%0uv!( z4hjzE;;`Ie61(cX)dm?=mA@fES*2FINQl{Z1zyMQj>vF_JZN3Bp25BJjppqwP&eNFF}7&Gy@ttFBr{$NN@&O(6OSERtd(nS^o{i?W|lo} zM$c4HIk^}Zco6bG0Gdf|VxyLO<(JOmtJ@4%UMs%-T87nEbfld)7sx9!C<^8l9wB3H zq0_(J%9+e!iuYP(j!;I4Gsr=g{z<7F{1e_olT+CCr8uz_Vm^3O1vKz^KS5JX-M$~0 z80oN@Xb8NRf%o?I%>N$TU1uPeDPT2e8)SU5EnNk0wNO00Tt6j6ottZ9_sKTSHE%+q zaynfAc3A*gam`}fv?4h9Rf}D~3ud}}Dm=om8~Qn9WSvr7kC9v+s16m`(7 zWfh*Pz=Egg!tGlmZaQ_{VuOp|8InMo^MV<~*Ug#s8aKB16g+mISfo=3PKBy7e?Nc+ zKS}Nh+g6Op1|I#C4M(h9AtV}P1Wz1QlBrSDkm$O+uH(rP8w#wT(YKVrPNQled1G!P zs+PTGEQlDeR18Lvp~jvZWREzm8FVahn_bO{H@{5)tfC71?9lwMuJ8LP_QJ*5T4-iIEi9@gqAk)=N)}u>MIz> zF$7{l{d6=o$mz5GsUW2D4H;Xb+N89WCE@0Dcz$h%wWXKW5oZI8Q?`d$L#vSh<=S_% zns2OrV#>91aKTUfCb-0yLDrQT``OvFXIyR1X(&7EUc@+1;8viIqOZrxibui(C0hy4 zcw_31UfE;fBzN(E-^+G?<4R#Jo|@ha<;m*uz6Gt~MozpSK^V>ImI@Gt(o2)2}$)A_^#7V-(q95jMdYPb}}7f z&!hweHnL&!t1C#&v{dDVc4IbGl%%5#^^MJ)xRg2G6b9xaNb_C_q5){$+Qqd$+Phg! 
zDQ%+nWsjk=CKuQJJHpUs%p!S2nY9gctUnX&fKhGXY11B40J$}4VP^nS&+5pnpB@62 zdBosaH;E=(t1i(VHG1pSWzp`d_`0b4sx9hE-9U$REcP~dUDk`0wF~k<`$G!h5 zPdlQpa>}G=C$J};Tu6XoL}b7X(E`%g+dzS_ZI2uuTFZ-PNN4AIWM3)o- zh&!HYPQh>1OlSpuezUDiaxr}>sYICMs`cqpu;tQuMh77cInjevQKR=lEySzv)+Uc+ z0k{fe*(Cc+`pNf!yT0-|rjkc}s)=TYEcT6q>9|dO8|(@t_s#x_*$L7htI{smsq zZH(uR78{ua<{)kIJCJy4o{+T^-nH3Fm>t153r6maV9)3QZq?2uyBzL|vV$XnIEyXw znzavVH&agvR2;f*DIY0$eE4I~Xa8hYCFn{Ow>8dqG5`75djx40I-BVF4U^hJTF60i zZJluS5O@h|9X)5k`Wv98&ZvSnDK&JH+e&D`gn4zn>np&&F|cZJc5g4feIcIOb93*! ze}v_o=$`0Y^_yn(;8`BEnaQ^|XF>vAg;D)g=sTPRSG>h2x0&^Q0f!2gv9ji9$6^=a z?`2;)C}VWoXTKp1@&t0iinEY`DXhk+F_3Bk*VGG@WygcTUagl*u6*joFs|lnCLpxc z2ochN#e{k;RI`Pbx|p5dqwd~;x?7yLz6>jJXHv>3 zAi>NvkMfblJoSxZrNJTf)?HXigs-K!R^r&l@UG2F;pWMg9GOeNL{9T5 zV0}577Sg?^I0Oxjn!~1^3#}C>FEQTt071Y4xmU-=;TS(|r>6vu&Fr5}?X*H~qLIUu z-Gf5{7bTtWq9t@SFK(2!(>O!EWr`PcEKDQDRFawH2_~21tDHWX>TrZuIN>#I8Y3dfS+7kjF1FH zz$0;|(h2cgyGRTbp*UgKz*<4gpaGcUATUC*{vw7y z2n}B2)V+EC=m53lZ2{D^z47$uo&&g}V_?Ar;{$9UQ~^(c_7b5S{A*Sb+^4TP<&&@z z=g_Mshs2-T-qr><2ted7xHfKk2e`4T0_MjH@z3AZYw?SW2joORU(1~b8RGy5fwP;s z^#fdhyMz-M1aJ#w`UCY>?~WoCp@p6qpqHzUUmH3vGJwD*u-OOr4Zs%;8~_dAw{k1@ zDpJsHsMZJqCBjKS03pXNh^G(k1_t=7!VFT6r_KuiV&DY`!hQt*-*L_cv4v?62>xY( z10)x}3e10p{F4q3EIcT$76kxy2Q0TfBldnOAR6C!YbdIS08 z*e(N6#5HhfI|&z4*W8IQ_<{&!7Ub8-8&FR5-M^=O^!u_tSOp8_kHCfj5a|Q*c>gim$4BEpkD&)0 zY5>Se@5vtUwYz+8leTS(?fYxi2Hy84s1y$apyTb!(%9cLjRtA|>fQFOmI|zbyxd}E z_4u3Vp*H{uN(|l)y*CcuPgf2E1RyYI7~u5|I@fol9p3-<{E*LK8PP?oPq;qP=N|mo zb=JmLtnXQrV9U=JZBgK$92(f}JM$b~fDoVj2l~tRWUe-Runl~|+&wW_GsXzZO>R=_zMUZcl-rd$0 zPJrHB!r)KG7&LYW`ZC|AoCtq>3;G2z3L+@~oLLwd$Iruk3Pc}t)W4%}>JYsXkRK2P zDN+`2&-4XwjbQ!=9i4^{4&Z>~3aH)g_aQ6*kUxqE`# z^+ul_9wW)i3kEV7#;FR*inlDz)ZtVYD~WJNih~~Y2yd#Dc-VykUUPaB)5t{1ETt^U zH9(mfN7ETa3%=oB>dlizrE%E}>h`{|ylSm+eJCpXRvRn&b;GoW$)($WFI_wVs8R^A zBXW;d5=0q0y{g(f6>!w~wzqLGg8v2Pw;54&M3QyD?@=M^vM3^SjmycyIXN7oBA5@2g^n_#itJ3usoT>Gd($| zHVIqySchwz)pV}wbW-2ma{GW)RdACB{1W6})j8NrTrd`k zaAjP;%=o|COZF6$K2gPOeE9AthHKHl^3GJ(*T6v$B2v*g2g3#fv+IKxllu0Tw<|t+ 
zu1nUF`uRbKEY0X8Ia{aRKuIrULK%9p`#EFVs@D~eL+5%nQ|U`vfQU56^svI83|M2R z(jnl?FRAk2AoA@!3S*H~zh+&}8kDqa_x+$*hgMAu6m~MLNg@P!=J>Yu3yRyOFH_M> zy}eB=GYjJ)!kKMjpV`hW!yUoOQBd{0N2Jv&!UyXcjny+;+4X2++VVwEn+QPBEiLc!6?Xec~l1uPKDrysR zVRdavvubU|dL~|l&6k`W`*ar9neV(;q1D(S^cov@-$2{sBa>rnh4R54N`x(ii=H6Y za8bpsa-K)wba@hyC0&~&cN?!LF}+^fBz!facUdU_2QLY94PBSW9CWjZ*~Bb~zpGmF zPeFtgu48LwCa9I*G>1S8BwvLTyly&ioT{KPS_&(+^R&Ovk2iO7b)Oec@SDTI^MqTCNdUng0;PKYLPQOHgT{~xF7gX5UX6yDo+QL9~2ja1>waZMHXP455>2AKKsY>FHnb{@C`l`LUdVVlS8V^j4+&%L*;)vg++H%l!4>X0W za{S%g82glB8b<$sI*obhU7eX2``ArWt74dX--`UIxy@l^q!Yj$$bhrlA_ljK4`!*Z zqiu{X4okYcEH+4N)6lvs6>eEJ~c3J259b%q;M2l7e zD-oGvkY8>1H%=ZNQFv|9ByXZPmz6qYhC*N3^wnMf?cf5*JSKnf2*ct&`BIq*oLwiy zF8zio=~_I1=Rn_$vuT+vWQovLy&iX*rDa)7O>se~C6>@Wp_^YRp9AXRHMP;$=$UMG zHX|v0cF};!@g6fZzKQ(zD9(u&{xLeF^;&?I~fU&H+Geg^K7&7=vT;$6j_8YP6*F_DiQHO zXgOEovf^pX1NZsL%OHUVNwTt$R=(Ur$v%H-6WK=^b>oq5VTz}dVXQFhcrnPRf;vh`vOYjs*mN1!QBy~sGK z=Q%XMHNq*;UzQV{p}TZb53;GU&d)CpWMW@;N}@lQ#6zrS*8O1Qtzz0*CIc)VvX@Er zq>Q9xKFmOq%V5v|!;mDNAxm`C-Kza0b?7Z`nBHI(NJt#M4am=S*^~MNkHobOujO6^ zC16!2`|KG_;!Y8Cd<>OOLLOdaj?|wySMjRGcbvgz`KB~%2PsyHDfbh|(VYLhLr6L; zI;3P`axok@HWq!f#S5{d?}^!*vrX+dfz0`=u-S7EY@&~|r^ZV6H6+H)9a=-x@b7qA zF-CgR8mnjcs)r^bm)FBf4QJt3$2_b>t@96n6EG3%^Hy`Wl9%nL7dhL7$Q+Q}lf?0F z;dHBG5i*xOR#w)syEHh%4aUIN-LZGmu}R2`;j3kxySJgV+S(r!LyzP4HR7Tr3mIz? 
z|Harj1PP-AYqo9Mwr%&{wr$(CZQHhO+qP}{-POdKh*`|CqN27{6`A>+1M;;R{-(Nm zrBIHFn(UDS>r)?II(AYuclocX6TptfOXoeP)rqH1@F>qktIk zMB0N;^XoVT<(JxgL$7@t>Tw=x1Y^Ff|Cx=CE8R$fz>dQm61EUtbR&o_LM z*INJt7grBP#f^pRWfC~lIsgn9ni4UpSvEwJOP0CI(RXkYa*V%=?6OY%5ewB?B{;fh z((X_Y!8XJQyLW2xRGGTj6eqM=X3J?t0m9XPUfJ3pA@P~pzlLr4y`Hk1yb_tF64*W+ z$8Q7-4IwmM0`sq7U-N9%;!2IpV(I{7Lk8sX%2}93bvpxIu}!Q-UwLv3ral=|R6Zfb zE%M0`Q7&GGNSPN;Q}RypL}~B}FVFP?rO|ty9MoLId#e8GRUJ+FP|}mhf4m|SV=6sZ zP9&up-xDvQ;2Og&7V6!R#6Op?hQM$~mCbKfPT5nluffZPG=Clcjmv@(OGnh($Hq3j zN1z~BC<*}s?@6UlFu)d_f$b?`NiK!b${pcb9K|@G;gvjYOX6A9D)Owd7PBlCg?=qC(&I>^+6dDnn|xc{^IA_gil&ZxfFtrg)E_0d4FXSD>?q^CxVIhyIZHM zo(G>Fk$R=A)tNsLSLFD8v988zr3V;$7@9seAnn8|Iofn3FB(V}8`b&cufIB-)r1S| z<2aGuJ4i)IyQ^`-Z9Ef-L@iSfsQZ%mDR77jy|xr7MUt1=%u>V4@A5~qpBhsW@z5K< zGZTGMq#O9Y6}HA(Cvm6^M}xI$-2f5 zse30gclglI#cPFOWpLj6rCOEXuNt3hucol0USCRY!L?e-ZL{~my+_Pe1y2v6ESV`Y zk{3Z(L$}L*fhQdTwzQ{Dj;m+INuTRU3qsZm1m6hL+6~<7zS0u*)uqS9P7!6=LLI*BYLCa` z>-SUxAB@Ef8v9;wYfnhxjyu1mYOcYXg2O$T#K}{GDDok%^8I-$vV?8Few{ZQHtJe? z$Scth`Q??da6CNTxS@y7)r8eRFj@vWb|zk} z5H6^bT}{0R4gHdu5uZDJhW8-^=n1SiJ@wdR3Ogx;HI(LTK@~JfV~6^(v7`Yt@*8+* zl^E&N8~>ZPC))XuVPeDNUz4hrt}jAZ!|3|^u(3#No~u6Av3boQl?91&(Kc(1sk57tfmXu%Y?M>mYIvy5 z7_(&{#YXaVQH8HZYQm=pVt`@vzbFy-q~c6{jrl7ojkDEEDhk?}L(GAnz@MBghQ;m) zR$rR;3o`=Zdh%6KDh@{9%jFA<_G4D5My_OkY5eI9M2w%>EOUb*0qK+=E*RGJZ`YnK z-LJA?@k7MJXISp#e!CorV^;}nRb&dr@2Ozz`I};{JrUBzgf7EGHGKZ9O3uAeZPSTX z?W*`zweJLMts{O~ulB}YNH-Ct$w{JK%$uk`;BquQJh0(hd1vdoa4NpOGRwba#*=&6 zq~RA~Pz#6G9cq5B)r*ZKk4q6Fw>VGS`878?@c4!SNm8}Xx)0baXGoN-&)cfpfMlgp z@Ck@Vv*Z3K7f=Pf`$+pEq_wIBVAvCSBwfXrg5J%uD{aTO=5$=IbjA=wXO|COvMkd3 z4uPgybGbz;x3zkOcSsd`t=Ft`qMYp+g-Xsuw6d2D*)7hv|8lM-7rFk0^FUD?&@BB7 zwKHLj7hRu8M|Ky1OE4yGl+Z$SWw|vj8A&Q6Nq*5md|WTqEQ@b$nZ%I;eF@92A&fol zXM0#U=R4M(ek^0}F6e7iwx;GYBU3joOK`X}aj&JCGYmkDF>GW&uvtrQvV)1H?&O#(%W29aI}@i$^}a#TDSwVhbh0y?V)C5 z_97XwLU0SV7UPoJUR042B|279ib<+)sky!`<4ng^(%?WDn`t^+n#LYkUk555?wnyf z2Mcxri^yg2=2?ISmZ0DMqa?K`E01TysaNEu5pL zvHzh!K4S<&wjfmdF*19qUSbJoPYIjn1cCAFrIop3+L5g8EQLfL2pZHf;VV+1UvqM! 
zZj2K?ySIUpyciK9YB(4K=H3NOT>y<;d8_NynAiHQvarFle+fl9@G(&%me4F5?xtn1 z{wsqiDF~mtrMQV#-P3`kUOaF3ePA0%*4i_2BC}-;EnfsGT3=uZLEIW$@>gH_!;K5 zdXjZ3YjQh0hH;GTW~FGKb6dvKH6DB=jJS|7@GMcc?!zlRCF>l!+^~sZanatkdt*+)xslIFG|GOa^=^;0D<-Lffns7rO9j+(R zHp7R!LJI4koUV?L1e*x^-C6wan<|nZb>sUYwoGx*{R$O5NvNvJfekVKiNwy$*3#vw z2mk3j|A zss}vO1@13Q+x!%H<(-8O)ihS$QGP`c_sQKy$3ZZ^PV|^*hh)oQBAWCyajqVhFuWu% zbzPfdpbCsnGOaHgG;bCy)4u-cvh98JB(u`A96=MOwmC)gA!(9ZUqyyk?;lqGHxpYB zjhu#a!pH^)E|Cb4w`=~6*@(FTM&{+0TBH`|R6}(xdB>6Vd|37ONATDU3gzxMsyuDw zQ~)dG99nm(+I zb4^f2m>&f6xo5rCu!nca@5gh0a{ysk^PaA>Qp)Njcll@QABH2inmN+FQ9eH1g0SJ@ zbe9XKMpI|Ui}Cb*DuBy!XJ?CG+4f1rcLDN_o zcZf85?DxnMo|0RFTR->D)2?jAh&BV5E5T3nXcU@CWN3%`YVOV}jcNQV-D54V<+y!H zk(1R0QlBqenl#|tKH`S`gJ@m8c)9S?q-j!Hr~i17t5wgJ$Cr^@dyOBT-0IR*h1LmI zM@_VqVS1S(Yq!3l%q!I0xte!|$x?}Co-Sv_G&)^cPa@Ni;4PoGU zbiPP2w)ajeOCRHFmgyN+Ob1$rSrD~GjF1+tSJ<{5Oo2~fRh!J}**LgSY_i>|OBK6q zB;Gx46?SV=*uNZD&(gRs@3_9=)ZjpNx;$)Q+(w%798Pkxyb5P&DVE-X#Th-5zAbE7 zPc4bUbb`Q4g7BcMdnnpqUqYQN;@{@Bv)B~X#{`-uY%Q!@;)r<0`O^FmUijJ=G6NKm z4W9;6Jkx+FTOt~bbYJX>ow5})P0xXRncN_*8M}x)_p1|QeV}jT@x+Al6lRgo|3DvU zNx3dxoNo#?uJF9iCi7=K6Su1Y(u!2#IrZ?lq`Xhh{Mu6*0rereKb6Hn0(JM+aMvjZ zj*~cWK%?-}Kb+8A9KxHkX<~BPpp!(OaO#?6{3yZOmpC-)cMh@)uLzs0(=;8OV+AHb zMkEc4{V~hpgrtUiSly!UV7q}`UVG^CzwS5=N(qRMOgT@u-LFgbD&bqGxb$@{v5>w` z;~_M^D7)99Sb)ViPsR;FMADpx%w>BrlOruKjA^#qApcao_zrs-xARMKKomVe5q-iD z$y+dhUBkDaVC)2y9a-5^JGaoO|79Of?BA{GXrv!f(`z)<&$lRD(K(PH%1>H{!?RO} z&|Rvmu!#zri1j&B?gGT+O~tFE{CUqvYSgJZ&5**=%!aShOkf3}hF{_D8OXEYIMaq79?$UxVh}k~fm7+?;$m3(+S;0Kx@+leZt$(dTBPaV>p|Kb7;v$B* z$Vk6gE9-KPI_J5t^qmynI4DF|2(pl_SbxU#)GYr z%Y5cPqIsp36H7#@mi^8doOurM>Q|Nr7!-azGXtQSF{j&2dOL-%iQ@wRtN`~|_>py3381_;~1fk4{! 
zv1r=C+#qfd+LL_T*f;L}s%~j3^<}p%Zp*sL@(Yxgj1%e|8v!P{0(LYw)Hgr@6(=(@ zajpSeUFlR^U1d?QuxR7EfbaPoQLu3HtFXr*Y2MR9lBwsnafy=H+`=%ZxB~#8@ZbR8 z=m5a$2*C3QxVin4a&-3jLuG3N0W6&4;!y=MQ1kaez&Z=&#W^{@+5@evcL;g@J|Obj zi~;N+AnqCerf>-kAe=!pGJ^moPj6cV#2|`FI)DY1CXhg!-#+pXo(EfRZ)g8I_@5J@ z`ZU(ITDF$05M~;HKE8`q0Qelt`2|E1&^HZ6p3x2Dw~Iv;1&uM*;azX` zG&NN=cJ@>sA7LF#AJ!27NZBMseWzz91MpP8YA^H@&Q=z|&q>ZHKBxn9cwes~3)q-+ z8Spe7>Q5&xtAu)dV<%<>_V~U_v~|ybz$O(?OM;V=3uxfhmZI+|39JK%meA+T;Op&Z zGbpFK-}g^&m7qT5<&LSECwrrLfB+pnzx%n*hJYxUp9wpk+dnKPCnqll8o(J4poi9G z!x!27g9G@leN#b0> zYU+aVr|bBSij*{`8_<`g`@gcLES{^szdwL~oEQMYeV@VdwBX-=;0QI3`W9q?{XAoYZ(-ZuT*E<;}fPer?S=;=*HqidTf!SX_=lGY#=CEye z_Hp@NDbQf+xj)58f7+aC-#Sckc>4bBm6dJq5L_myY#tr}eza9)%qmv zVNd`{dI0^KrWt_jN$)Mn42_ zf0qkF?V7UwUz0@WN*oCjcM){}Sw-{!%%(ejomR z8m(3R1jWPr{fGn}-_5Fjw|4eC(c#DWzQqPXE1({u;Z9@yL;{v=e!zvitN#MG`v}kB ze{H_OAMJlabNv$c-)Y~6#-6seCx%|=*}?;6^bnBtUgaj@M>jt?wtxowGz5(O9-4%? zEdS7hn`aK_AucZTN(AXJw({h85Y!lqaeo(uBV9g+;2c~(@mWc{&iF0G8O+Anan}LO zf6B(TasE;}f90+^2;&Z5I;3U&a-*;nJDP&Rw)7qdNB&l@;KWP(ntA#V=b!3S?**oJ z2*7{hfP;_WNss9oJGMH3{xTuNM{jjy{Wt}<_8-uLS>~STAu9H+>e52p|3pS1A@82h zgP%v=+_`jw*n?=aUyO>-*HwWj|)%376i-*J*sH{-AoNlB>$mt@XF23oaa(+mV zn%r`m%XHK0f15AuT#kajpU+4Lop~mKy9wK(02KP8_@+*era$Xr6(i1NkE}2_k|nE} z6(}l6Ak(=w;peEHdq>NF{9hsB1n$Z$`0ya^oJM?fq0@51{p>6DER$p6_-H8WpsgW$ z0d4o46?RB0A4ZSa z#g!2!d1ZNV+Zw>U=zYunCxJMvL*-%=ia*7WrWpCp-!eVA+E$-~&;u=hcY3pxb2`oz zimo0|=Oplkmp|J|y|H;~y+9eY+QZFB>k#L>J0~I#-kvW(+%%uj2nr7f%E8^>l92U| z6GwyG&!$ORp%4d6{<9(yuno^rQZdx{19|*TeQ~Uw+w>~v9BVLn z=Qi;#s6+%TxIv3}`;pYD~7RcOck&i&!a)`=!EZG?p1KjZ3y7oR;~>u$$s z9!O*nB+1H1@jXIH#l|H5PR7>X?I2!5Q^i!Oa$PRrM8Pj)sb@(M=4g^!fK#2FWQEx5 zURkau6$89J4+_=|E$~-Vl#++33B3^Cdi*Hj51VHgogEQz*YV+>olss0tC(qndE41= zrPeJcI~uHTXQyNrmbF=4`T;WdU_td(j7>Aeu|b-Jsucdc^8Yk}Ijw4aj%xz;slK`{ zp4u!82(GSt$qlFCqT2k|s9`jWX;a0I957pLO@m?_= zms8jUBQRf-WPb`K%js zs3IH_mB&H-zy4L7(wKZ*zhqn{73Yfq!O^2iBWz^f`OFsYmLpinJ%jfJq=2i+`xcR> 
zRW*Sq-wzJoaxO>{r6ukf)yZ@ddDozvE4lGq){$a(T;k{Q)BZIpN}RgwF49a66rnIo|W{vnS}=>Qd49t2--s^+XH{X>08B!+s;c$)5u1cI$#(R$G^X=ZQ28lwl<>nra*WuVAZpQ2i+*2Hk&Pqbv=8(pbP1{4M%EVpW37l)mw|D zLU?yN6yxg03GK({53v46`N+lyGSe9N29z_QcJ;6BUQ(OOtHeA10Qd5PNo?BN>GD&> zz7EY!7C5HvHi&4H4iX3_lC1@4qSKNB`&cgK(8M-YTi&%*7#vX|bwiDU6jMKuWN5AO zKD(RJ%P^fPqsn3wQ63&12V7CF>f8vB*~X(O@Sa#&Rh?dRD$x)Kp3%$ph*X|fkJNFC z?t#fgre!+L&7CTtK1nY{k4JB}_7qY&pn^(Eg70MLzt+bi({a+lYfolEEaLBUr*mK2 z(L-?nYULxx(6RD2N^NFpc+H{G&rKkAl>AzJ8+jrQI^r|t?fdn8Pu4c=t(4uUTQff zM!{{}kE(Mc0(01I%jPgVg2DApr%^-}eFjQD ztELJMi_&J*N{;#o>VHnv7Dt(5g)4BC3(cA=Hk-i#k^%8Q+^84>9dy*~I(CbOd=k%& zqT8h~mpNsdYJCEwRqjdtf~AAvKjy%8_FQ17fnKdUCxXj4aIT8Z;<;|M-d&e9gEWp= zzNAL}>f8^18!>MMbG1^p6e>EXq4L=V{I7h>3=hr96IUH8InTevl+`q90LRCN-J zv;#og|Ktt`K0k}D(T17;p>~+OQ;~JtwIvnlK|o*>@K1>{#w$e!0$Go2-u zWbXKNTeWKl9uH*2JZip~oPM&&zw3voN&ax9^hN(g)adug9fA<@0%;@z`zrf(=!7=qJL??(Ui+HzX7Z zWw^tjiUZG(KfqQBzb#n^_El!gQX)upp8^5Ay5`N=V>3=#U4-CoUOrd5?kyPf7Z6sAsb5gB>IAQ#R&=KkmXu!Pp&h_rh|$xIXGrsd8V?B1^s zE8kxQud?$#uL=%JJLFoMMSI$ot=Kv-RZ2JHx9B{!g^;=U(;6YnEXh2y^q4GB3p78& zLNwhWh#1R97w_e-nLt|Asa$74_4&&jb+={VJsc{G{4#~<`=}>k)A-hSs)Wh$XlHI_ zn(G@#XfBnqFCT|8<4u4P6O*0dcu9MP2 z&Yd%IzXRp&nVZ4ghRIf(a0TLFqp_RBT{ea-vA~KQCIcjf#nwE)bp$(vVJkkV*erY9 zV>r1bS!Z#YlGQmZz>Isumdur#AdW>EM{|_CEMq?S1aq=+t<}fApPXws zn7nR$6d+Zns5`Go@IGO|?9`VnQEBF`CUDTlAjA75;sZZl9YBzrY;Ek#i>x7%_RWZ2 zM`rxseNKloHz2z0njiWWk?_3Tuy#j1W%x}f;74qs-I6B_>8iL!G8Gy=TEb3Ewj|7EJ|QX<0Wpyh(Mhz0&490hJjN7 zAkK?0P#T8jDSA-Ma^47q?PrrjSUg~Szdo=W#Eu#aarkd}Fj)BT(+k+W&%9L_17u*f zs+?*AS)AS(40lRC%FBgP6tMc@;FN+Pm4uR=YNT^Jq2y}qoInU$o|IWpEUd3pQYK@i|R)`WRR|QOj2c{E0Fd)-Mqct zAkpH>Vp-laSfeor_8MZ~8~I)$Cjv)NDzQJ1XN@`X9c_Alm0?LuhBIY30Y5y(bY2|= zcWRTV^TM>lYhZZoNM2vVIyL$?=$(#(a|HA^KT( z5yZ~#&GSzuq%)q&g%??tRTgDkgj)%ET}&a8XCjCdxY>U1_<8P_ERUPnkmGd>%(H27 z|MheSi#m?q)T=^@!Iqd({X5!P#8^C6Tty5zeqNkQG^rX|7Rja%&aV6tXY8TGT z^zcF@`MgR4v6TDu)xbk#?~VcZ_LFn?Ny5(bKt1NE*&&>gEi%*VWQ#`;HK-4wrg2NB zHq!OJ#e&}4WpTZ1nmsr;d0z*UBmHv$`EoToEb!4qY+;x!Qv&wcZ86N`A8CnM%t&Y) 
zNTU#kT8j1MqV$o2{Q^4<(RtWMg!7?qq~b8e$rxv3&N;C067_PPZv-5lG+FgoHtp5-EN3W#lTkxN=Hk3t9T7`mTu26|v9J@N-bZk{HPEg_Q=FIB!%`eV*xa zXo3_%6Sn?UXYL(x=}(l7i348qD;cjpDPKeD_!H4MA7%Zs<`HKV{TemP5DaV?CELe^ zIELSV?L(P^a>wS$p$#iG3omfO;ezdGJhKYC?pX;o$LF@xE@IdSX6bH>nLv6wO9WNx z(kg4BtWS=5nP$Sdq>P8&znn0lb*P9E&&8>xq(@zAZk(7fcO9)Y(evh~o#7GUr^W*+ zt2yhea|O*4u`Fwk(`5AxGB(j1{aKHF=8*Utz~#;B*k!huw)>dweW2Vk=0rzJM`4=$ zs=ePP_qfrV7cKqPz%a`&)XxUry#>zg(~z2<5aALS7$elDw6wvhTnZ8n*B1<9#DMJl7V{f_ITMo^LB zWyU0pJbD`K2L{B#k;KOi1YWZ=j<;1bSiDeOo9k}np34Q#ntIuw@p}Uz^x>HlVVuN! zWwSOuTfpFci?;EH5+Ey74i7eZx0Dux%vBS-QE z0AlYRhWSTi3P^fO+%fC9*8%I9^zud7tHacWHk6(=6|~vZ=>&?WV`j3temrwkgWd`; z6UE?uw;PUSZmAq@aq9-UZUNwS+6t(R#KQY{pR0Nmv(S#VCm6oYurT{jnT`otgZSyK z1Y`w6NsUSg4wwTisOL!r)7HlJ)n#$LE=Hx+@BR(iJ<1Z4v{8k$UWwUVep!FbU0wI% z!L~M!oXaJdfj1P?u_?_NZ@dx6cfiY-+W~X+1#8O&+{q4D3s>)G7x}f67n3A+ZrtI| zr;G2dL9LZm=oQyPm=;Yyo>E#!3oP_%UoKQECZINT*nCeT0!unXca7!D(|mMwb+pM& z>1niO&5@E_sSYH!ha6P@sFZYs8vGz5O8nB3Z_PH&-evoDC49Y=%RHLcqtw5J_`X7A zB?QG0MezBPn7>|l=wGc{I#%4vjv+!9Uf*nN?sv>Ueo49V*ku%Ce6<$YT+Cm{Kl6<& zd5W1NOjX46wBD`Cd&ZpytER&14r-`z` ziUQhm>Q6zQ+!i;x*6q#I>@v0>>vHw~`@QHzB%YZUiBAo)H=4QI-k-BA%CsYr*mZkU*dGqIVBx^&c9 z2@a+N%2b+MQ^7tmn_I<&){&Cpo`lq8&cW=nbzb3*UV1j00HfDJOMo4gT-WTOFJv$c z0cW=(>90ZyF9S8l>u1L#C{T_$BNqD+EE{O9R*Iz+WbTPy{~4oO3@DmjVECfzc9QiU zAu4TpZ6-&yKF!Pj>`>a4TdJwFQCkW+2|uK_l}$-8RJATucEG$$%=9Noi7%!6T5>oF z2Db)fautc~Toz3kEsViY!5nUebSK%At!6g<3nc;7d;!H0qFpk-Z7^_*S+iS(d|rSd z$1XZ$neypOFhSK(%#ausoegvdXPHA8T>Xl;XEW95NK$qVr?o%$>Ki=z%fzf zY=WPxbf(Bmmg+wLep8|ul=-Z#Z*F>~*Xa|$I#96MK5+|fW~VN?M~1_*OOla%*hQXR z>@gw8TOLML;$_fWHFu60c_TdEUq@U-rr4@+2veHUkR#IDUnXw#5vX&x;*n@Qr;|yB z(DGEMdgR1JG{7s+M0OR@J>Ru1+P2)Sx-RXG{(IS!6u3KE$;fA0qc_1i_~M>;7Ehg> zsR;mIZh#1?r=v9@hE}V{>57Swu%pW)UceT+Cx#9f))HDwb0g^sMZuXg*q($EHsRD% z9$o|l>51I^y%%TeB6?@bEHcn*XO>RzcVH>m=7`fd8EoCvW zD~0bQS2w4#8$t2UcmQ?zW(0SO&S;W!kk0i8zp!rcRjrH4_>e$ za>->l-X41>LunEsWsdPDy1A0UdizbKZ{ry$L$W%H3=uSv@5l07aOhlZIm8z-W8L^o zxVX#%S#0df*GZV=YXtA&1@(j!*)$G^mpOf@$qL4eU?QA{Rv~Am4gW7NYN7LLLobhz 
zNbdW0evhrJK`i@hIX)Ro)`A9wuwx(aiA$=<7-=0$6JDR(m2t7&U9;u|7wvltq4g%*gbLT=Mb!@%=~NM zi3d%D#lt4#)$|sPLvO5LR?ZVA4MPKk({Xk?4DE(RY@G49(IN_W+CnIW_GpygsMGwS z$MW0awwc_#cWPEtNrJDG$v1+{Qi<-sYC zdwRK-p>ktQPs}pe@2E~WZxPIU83xE$>=-y{=R$mza~vnV$hm*>ij@!z$1ryse zHB(x!YON^fg}oARo!fSTcW#M(ntwW3(nxf1n?gcZeBG!>r})orI|51C9!`)nR#_U? z0h^f69w5$;#=s&C2ngc62MiHNHOiUtH)zAfte&uuQ2(0`z*SQVz!$#0S3&Ej5d;>J zR$9MWk~K$1?M3-#qPEUMEtCNS6M?9`!;XB;DG3W7c}|Rgk?>rO3}R2#U&y@13Sm(r)fR zWClo*{lbNl_Zj;%^U>@Er4`e7%;O3rUG`O|N=F79tJat$kD=En#|d~M$atw$PpM`1 zJa-v-K-R0DvXbkUX6|ayMYOl-rqX-2RZA_KkBH*3L*2}g;&dLgwAXf8l0npRzQ(pL5UFii-Yp390gm9by8HyB~ z{vJvnq>u+j89LtY-6x(OZE#M`o)}EKzEOlKB z!)PCLsqM(5D@T%*@2Z?Vzr$=TH;LiDdHeLJVkZR)J*Fm?(j$mgzmwNdVTsM&j$JK- zzQ~Ip7GNysHTWPAnTLhkH}Th@g{BGb2f*ApO9&ZK^AFF=-@RTv{%nZ!LBnz>-ec?k ztg{Zg@9?4`+cHU(qDMpxwNSFDiX@gz5!Oq~5&v|C7BZe{J_v&idZK&|6tHCW4eZe_ zw454n5)=@lDnIix99kEdWdyAd57Q+es)5wTl=q?Ymw%-@zV#&PT>Kkw7NaC}TK@r^*?ivO#e6 zvhNy7J`_Sos4Ne=DW8#IUr-ec{A9;e(0AL5gHw*H)YxjJR^)tqG^X;HyKT6Q+3H_Z z5fCGEY6d zGcU!h+i!Qw{N3*UTx*dg>W+EXOHdui2?SC&zX$2rRNgG>pNAd55SY8v6-S-&oD{Cl= zflqu~%?wK$E1*(<952ovNjx;wj$jhC!2|&I*~ilKK`e7nkGZ-BC^;-=h6z~_Q!_;O zI~-HSCc1V?H&!!W(6y)|+RD^Lw#ud?cWLO2x`&fr#jZQm$(m7WGDu&2af@U)x~GXa z$_U?w76e2NY0$;M`W=Z>2uhOL5ZXscFZ;EaZ?MHJUlM@T@kSMANeJ6ZAFD1gACAbXTI9Ick zJa-07NOq_4%qYtgRD7RKz5RT-Ok6i~3|~boZL0ET{Jb~^Q1DAES+649#$>`h*5sns*Fr}7jx0Cd zLEdWuC72Z5ER2s6$0^@l!!I83rcBIyh*FS&mg0Qy5nr-vIr4q#s;?W#lw%LI+TF0( zyYhHb=tVtUE4k0*|GC>=Z;{N}WbUc7(fs1P4az#umRzPS%P4jU?x56HKKwRdC?4QfADte<5e&O2F4I`jI>l z4~Llkc?gi=v?7>KFb+7ftz8FkY5sM-v0Wv7SJl=MN{8HnJz>dI?U?ia`%!AdF3_Gz z3nBB>DQpHg+%;5n*K_;4I%6?<=DOE6dcN6T)7$&`Dza7VCt2g!D=xau8{Ny?13}=J zKBl5Z%Tk8H2$)^E*5zHgJY~`OFGH_2u&%dSkNL=+CHvq}8OW;}J{B2PDyXF|W_ZrM zReicHn>GP|kS^jZW|5m$ZZEd%_WF4*D`uHXIz<;6i_x%`YdQZ50UuZ4-+&B*ukK>X zo`X{k>Bar;R;i#+{DrhNz`(VhTZmc#=Xb2Q9a;+JK;r^@iQOetmO5Uih*xb{iPTXuq@y{%VMA7`wq0`2_r`6JMRz7S2^pqg8Vp~gRec=J;&QWFu&bLGf2$LcP+^o0kQ{B@hVcL}4) z6CkzYA}2PC;!yCu7>w`Hv64c<5VlmRDNdBt 
zl4(A1g5H)ZunCujtgmF-%fBhg*r4|;^bmEOZ)mUvNM%Ztvt8f`_&y-E?5_k(_O6b! zIq#e5L%LmgN}Ta~OwaG{UEY2NuO4u1O(%wlb-hb|?x{ zG2*_C-T)dZL*-s8U+}=@N$>Y~*Ld#9dS#h=6wlUpd2>t8mN(s)a!}72BDeT(HfRlt z$>%`nqW^_9-#gyw+Yh=A*Jf=Juxc^-SZHPVpvp&~M}+I$JOIP3o=BNI?~um0hOY0< zc>Y!^cM^x|Y78N)CQ|5Mc%>zezu7XizwkLWq1Qudm6GB|!8M3{nXIfo)2uW>pTcIC zVZxnO_ldXFFF@1%wu`TG)8NlU-oxoGfgX}I0HC_MTTiULV=FNhy%LowLXewHaLf$d z#eN@O5MApsZf;kiOC4O>KnR9t^Dz>O+0r|(`^Te#@#0`0NLJWoN@f_}VF`~(plY3^aWi=twzhIgNL@F05vQq6!}bCU%yIe;V{t|hyj%J{e# zGA}~b^!p*b>7t085;?y#_oLcXWt674c*`#h`h>~WEwBw-`E1Lx9mdT!?b^BCP2YK8 zL&+0?oNTUgY@hWW$n*fWt#U-nBNA=cHRWVUmV|$=-pc*Zk3>(idDFRtvWf=yARTs_ zkS$KO_>OjdNEr(^1plMEbxW{8RhW6wYzZnzBYTdse@c2uKX18)J|QflH5qOo)a#6X zuxL(A<#Q~v=-`S4?LJnc8f)Luxu$>yH($=T*G`O)x~^w#Q*z7AgKs;?FWp-G#U5jL5{#5K=0z0}O93nf`A~zKTUI ze?sR-1b~_-5S~O-nWe5GKj7mh{P8_#>s^aqpva_fCFb`E5Fnd_7(XSW$asWILl~n~ z?pF!WL69-XlTy^O&vaqD9tBOhHqY{|?QaxJ0eXC(W`Df1bcW-lwmH2&&m z#pctdujPWQ+bt@G%aewyLc?|n&mpS3fCYqyT0}@AXnH}~Em7Dqwh`Uh@)OUj$-D7us z;0XpZ&t6y!9r{R$@r|aTm&H7sxd(L(jesy>v#mxnaxH;Mf-W(HdeP4#Z`&TTRany) zXFH49~_d39d{uTd%yH1HSMS0X8a9!1uIyKeNF*2d8nLvKKGQN`x3n z&UK4lq6rQ?T(OsKFXQgF05w)572$VgG~u|=YmSS#eDtXpo)&XGk>9uCu5+n;_P?s4OwCbMAnyPe+RP<0&THq^q9rzY7iir4eEINXGU9Pb?3{fSx>4l~B zy0-R)VXJzhvu#n+8)}d7d2~iB7=(R&o@iqEB^)gSS;QdR+Kd@`TALQM!bk)zv%&so zZ|(^0Am2eMto#ewr$(C zx!bmF+qP}nz1y~Jn|I%roaDXa4BliVHCwZl|6lc0d80~70tc0fMecf{mMX9@p7fHc9@IQ647&x|yaGiwumJ?%c8>oGGj$}xWVgAgOXLiEMZ>hr zd!=wB6*KIF@2Vv&3qYQ2uAK43Y_~g=!M*veU_v$NZ8-5V`6B|~!EWt~_LzL4gar0Eztr{IFNY&ap8e5nDFJ^1-#k~@Uo&!$8cMF>F zu9K5W6tX8Bher>4f2>MkCeNwi(J&L;rmR3OX|7IbwdroMaQp_ci+IQG{U*b4Dd2O0 zAjdV>_8SbNWi49UQQX4BI9me~1>SQ%d7DR`KEw)GbFH#n)qkD_U`-PF1T3JMTaYex zpt1rG{|a+_XPs)LTO|lQ!bG{ntEM^<>M_0Ou$Z1=jRi$j_=v)Sh1zVfmXYp=1wgY+ zum~2F`p(;X_pbRYt1-@LTF4!sQ*}d$AbDZbw;?hNpxZ<&Cs#i5$({7o{-`nf@!*-* zbOz6eVNT3T^>&mS#FLb4eaY}BjuPlrdL4{C$lA*2Cgrn4VrdBw7SQewDisEE6U_;a znt1$l&L;KK$!%{?mHSFRFDIm~LoNs~Iq9KQ&2>s-q)~RmC(7&2ZZgdJ)H+81ir2Cx z)Wuw^;D62M}G; 
zR+^z84*3+o48fE%7j5e8WtYDy`>>~jMG}R36E_<{&c(JqlRs8}`i$bBdezhVjM#~M zUcu?*uH0mj!i88&?sJc1;J9hz-K>Wxp2pEYcQ|r77qAy_=^b-3h(5P0nz+bQ_-CtD#dA(ic84JIU5>mqj(J0uzXp2u0_OAD7)LcHx@;; zy%936h^P;z^bRkbrOt9gAD*YnauI81=~=7Em#KAu*%+KNVu17pW zN_)w|2^?E5IeIkU*C?>{e;wwvpTSm7fJ0g|`Myzt;_%Es+{@|0Ug^@!^Bz66&Wa`8 z?8!B&2IWyOQr`^){Xv@e@~~xDY(_F4x~VI;;r{L?WGI z%UiuQ>{3w=sh*qm$;4N@ehHa9Z;(6`K>iG3k)Hy(Nlo_GDUwy zZv-C@!8Q0Jqm%eu-3i(*Ca6Y}vOSYMH0>_QkCo1UVE}=Bk<-s}E)(}AT1wPy)@M^H z8_fdQbZ1cQmRqW~6W>`BBp>@1OSgt`xhCB(OCn-QTDN~+Z~tNP4OY?9ibv1l16Eux z(i^iv`8R=BuS{>iWe$FMh)pgEK>p8myVkyJ7fpzk7*Qi8x8K%9D00J|=k|S&;z(Ay z#N({)4VG8Srn!;+!!Sp9CQCqUwdD;l5tiqb*wOI9ffopuh;F18G9Yi&1wLwHZZ}fK zav-s0i?&I=wyua>Q@z80@Sl+ zbQ@<}3uR>epe>tq$mQi1fQtHDhM}!VXPNdt_BFKaLyG=gn_NTw?)2rRY=~jw>#&Z+ zQA;9SR&lvu*JRShMKwKYDEB9C} zVnibfo1&Za{Ot8F=JvyVy?@Jd! zmZ2S=5|6Aw;epY5QDS3zpx zF@lAoWrL`8J@-C`SEy&@o`wj%Z@nlc2027x2`-R8+8Larf1RFq~S)ThWeY zUe~DZ4dIr}6NflalaBU`^6Aw^W>iU^3m)YLt6yAYu2KdUvQ-3wwN_CG-MW|)RD zY+Pm@Edr}k>vlh5ouwngteX#+EJ?A9D-a~)v<#RE=G)nXVsht8Bp8|hLfbUg^})>+ zeJp1*|A8n^Up=3g_hvTct^M!=PxQHo|ExEiiv=c zk&T1%Kl1-!3z;}MnE!v-!WK{kolP{_NH6<9;X{Ni-u5oAFfgpLhS3~=fo(z%_?!4W z!ggu<)xBCkBt85N^3$E)s-3FZjh?Y5#?@@AMa2rM1}P1$j3DEjegri+HaJ56>nT<9-tlU0opk@Kz@zRj;KK5!93TrfK=81BIW+XV`JqcCpEQI)wIBZWS;WZ0f<49 z0hHwv@K64$WQP=wuc`q>fKdPxvKEM)i$)Uw_fJgW=(l~bOr$=yfybzF2AvYY7BVY#s0Y)W(Dv1 z0P^_*uDKpaTl14|aDFpY3*74B08T0KYXgj0@SUm|L;#=%2L}fThX*hY3h>ZaZ}3YT z(Avg)CQo{bZ%6^|r*B4X=8u{THom(81@bNY=mPBN4}hTV7T&f{;5#LD)- zzC7=tKG-il@7(;#z^5Jb*U*uQdk0slFZcmQqg zuSE%Mc1>;i_>xZA;lYzSk9X`mVC_u?6 ze7&Qi=;xu#yF1_&l3)0bkT-zq-`#&iA>A|fKccY2k5~|(Yl=TS54fN16Wal3?bw&t z7NBd;4rDt23tN!6{7Y;u-SU?J&|KzuHypaZ_8nW$$nsl^XYTwrCO_KQm)M+ddlzCt z@D`s0VjAu%CZFqZ7eXs)gdd`l;R;O-Zcg|k?kKLwuT2f}WWVV9 z9Dw9)_G>r}UiiEo{saf|@c$VBw7dc9LH6So(Ej4ad9pMJTWb&eSynJ#MvyQ2ca^Ta zA9@Iz{INcQW|`UDc<77oI%A!=yPe;_xu3T$$X_dT+AZCJZ~re3eIfK`{)Crp&Am}ynqFs=fE9`@@LoTdS z_N!`h8E=s2HrQ_D>v+OnLtdKPE6b>YZh*2BA#>W3DiG;~3(^NQR!XGXhWSD8*pu zF3TBI*&krtA^I8uPA1H`bzDaP4X-TQ1HwbL{Pig>$x=29CsYUZcf6c4_!UMZNx>MG 
z?moM|V5ub;hsb*oX3=CO($8J@HP%vzW;F4f385P|X&%~Br#kb;(bN^F9tXFIi`~@A z7F%J*uiSr*uoc^xTBSMy7#X_K%&O+EY`5zcYEW@~5EqL2o<@P5wfgas7&m7a7;j5jwE{DA;`JVdbLEA&Hp}@uc`cMrNQ1i&h`6x zum>37HuZnl$i?i6aPX0>8*C%j4)w~f24}T=Ru7^tDEBiCk~Lp5_e)KEV#RP~ZAVB$ zIc-Ejr-s2FN7JdrzDc$hQH)VV0?w4`RkmA9{Tv8B4VMRg=w7WV@iuXH`{t+n@1 zC3Rpi#+&gAM-y&p9MN^KE>E(@_0az_H8oYoYZjqkl!>qNXkE;dBC&65H}t`I#?v+O zepeTtrcMn_Qn-=~lFpZPV=C*EuTFnqVE$Rj1e#ryTXXy({I05o;xVyQDR89`^TZFu z<9f4xSk;bIKG`_$$vax!T1e2^h?FuOb&$=)O#0_Ag2adGiB0oDf3QeH&}K>^IUNi# z&>eaJvql`J=qah_!EeLd);mpEt6Xaq>-5o(le!&lhlmvHwd(4bi~J5Ll?WNa5&{jt*IA&MR=sQ8(_YBplCR<6~ zojV99`Jo=m$V}XF(bXLO$$Dc5uPfo0o!z?0ZJg5IJ>g*t_J%}zUI`UgQK-PK- zO%VeQ?8AlWbBfSzoIbF>M??RGRs|ZhNAmt`M*%@!wQ>8W@vpkjWjFT*2c?Vw^k$9R zgLho|pIn-Z2SEb{>llWxNyPA!dYj|Vp@OVm&S4qS$DUH^A@_Hs%qCr#tej&4+> z>F!XWZEEVy-qgW(_(bbExJaO^AX+^PGPTyF#;XX@mc)d}O@8_78`19qnZcGkL<()T+8@qL~nyGQ8{hS%I zDy6Shv*FVB&HK0(ZyXLC$H4mU_8@%YsC6_O(mY@{7CPg~H`}7U4AY==l4K^$13VGDrpIg5>`terJ6%SH_~yG zxu_nAG%WQ#?z`y5ajhNVw9_MUkZ;FL9$9x(C;*@gcFpl4zg;?C)^>+tvWLy1BlTfS zeUW74Zt;f{{HZ4O%_Gp(TtbnCGfJhfip{F>&?T3B10~>0aBTDWn$i*|yNhC!OAg{*3A2 zWI((kV4INq{Bf^Xpf9T2uH`=1(F9rM&t1O7ChC8m=yk&$ush6W{#>jqq-=(>`&?dU;Ywa2^1llZrGgy&^- z7p8=x>`~}GV7IDA9?kaiK=trLqg7E(R;R6M&zUTBlH=0-{Qx?F=;JnSTiV4l4|<}Z z)B8gP`Msf-#qyyP7OUMu+b7)2*wP;K(2DF|=i3A_c6O&28$#cc)Q?nsP`O~hTK3+=B_$1dA5UM8 z;g}g@_S@5QMKQc(miaX;?SiW=T11c0cl=nADtL0hk>?#9>sYS12UlqI60hg)9gt4T zE;$UzT#px-o`5{E1P4ns>|6=R5Y-Z=k4q+m@z4Zwbcy2;KTIv>PNcV|+ksm=jHU+b z>-;!y$uzyx$*s$AYOzG${zx~n$cwfp-k#h=+;z`)pN7OPR@a454ZIMqqzx_=wy4Z6 z;r)8d4S%@ehz+TNYIakRNN(i^O2E&Z-t3g{BZ(dIim9K6qrt06QR{E4*|6H?&bkY{ zaX}t6@IE3FA2KYLV%I(b{sLw#RsyVO zTUKcQjG#a(`zBv!C^-SDoqlfYY`h&|B==sLV|I&BLe-9~mIX^lJqSY68|WT&He-6J zC)Bv2k_q-$$mphN9@qYt;YdV&+Y{d!wvm;s=$G-#ikP{nSc5xLPna7tIFKcl^$-Ch84#7R z5zHU&3G`~@GR%Q_QK_aoZvJJiK-O)2l$)9lR8(TxSw5BZzG@t>Nq0nSS;Gy8=Ne>o zQ?Q|B8cOdr_3yE`Oe0qm?38cO)t%=`W+||mtG{(x3e`2ciR39dnV_vVPZYMuh;hzK zB612Rc5+>_>izo8Ia_jmT-$FW`r)4nYL_oZ_%vhcRzuX$eQP8jtZJtWyvA{K$;Yy3 z=pmQTaTMlB>XFSAaEA115!ox->b*8$=V_d~A)cL}= 
zt3ERFIL0GR4mbp|zufh7UlR7^RK`yr9pQwEtejy!KJ~MF|2?H3)HXhIG(kAMRfWJ! z+Cs@}rpy;WKVrUR8)7&_s-6mU!SynLQ%;$?2wC_yM!Ri4SI4QFc_dSY0(D(Ny;A1B=8S;UNm8?h?rEev{_$LD-*+P) zHR2dTH06mLKVsA40DBF0S42W#K?EIG`xAp~yVSY4$=!@h3-pgGbz8d1r(Gzeu2mB; z+e&!SHrUIRf7GMe+nOjH&t*5R^j3(}{;PSJ&R=y&aR&P_#V2;%GUyyHYu%w+{i6fX z?xY*)3ynlPH`>OV)I%Odu1W1_DElFRchb{o3#PQ)8ueV=?q*psc z8vlH0Dc78Z@{XeNYCs^N77hfhb~6#3HAbCO6ocH;lZx2TVBZ%Yh7cpSc74B1otGIp zv$%L*#Tl!KI{j`qNEiQ97>qpelIMu~|HlI>KE(?#VBmrqyP4?bPw zm#L;l8uJV16deawQ`2+TR+G}r=>j;j8_0*0S!Qq>NvPb?WVmMrw4#|LEQO)9!G!tw zKT+cG!Az?BeniM9KC3u%2}Z$`S_T|XXjZx!f+IRE_>>PT0Grw&e_}Ueu)!mQjRnEH$}FW=RV`BymYIm4)D=EclETnD7%Gu#JUhAK=MH$3}On zp}{=;9M$oHdU42l%7aQv-*)^TF)I3pd4H?^UevI6l-Z&FO$3+3Ka+$`-0z%?U}Sm} z9}ezan3H%(2yjR%LU$o!!w88vKvxZqJ)1PLZsU@qPUhNH2FzunnPA+sIK>Gu(_=R` zKefqG2m>!art-8`T7v%;h_!?D#a2%qGG4Y^hfP$X6gd|90KwkDG!Dg+%suxRyLTcO zaknkNLoM1J6Z(4`#Nw*N7uqcEzFf9unGwHw*d!k&JvL(PT^Lc=A)g+l2$oD(D7|%2 z26M{!#tkgPsE;>ou(CnRXbq;BW5wrV-4aRun%c%F9vOd#9i-p=&6fbT+QK``>dPAY zwZ{4*3a8V^&%L-xAB3ns`^x#rTqPFV(Y`OP>H56ju6?nV;PN5woDKMmc)_{&2ihP06l{q!Fw^q2j4=3g`VQ()UD7j$=6DyIdmP&%WHvN zD5q0sTz)s#R@C6O`!(lwZ{8--*v}okzyr2>P?f#Q+9(CS80wjzPmNl3mV4!`CgE(m z^C6g=Hsg{VI$hz?qBBV}{ws=)?y0a>+pQ>L)5`nt7hISp-OQI`x@D!nq|>2wC;5Ak zXwjFM6w!Kl@57Cp8DE4DXDkBm{l)Uo=aP+jo8%G_SL;igTXx+ng%GogoF>3Wa=$xK z!=)k<>o1HuOYC8EJy%^|0omAc{d_Twfuv;YvC%`72(q}-so`+kZhbz3Ev-I7t;3XC1eqfeaY zfQs^NA}O#eITD-g@P^k`=Ol72(^!-a>qCp7Vn49hXO-eR9k3g^TF3WYnZ>9#APfCBv zYP4xdUjzQ6nn+a*+sQ}Py#dQWpy<3Jt4)g;>z}5k-~cIp)v@9`|C^*=dpdw88lt0$ z4`$z;F;znJLlYW`aNNIW?mAX5c)0`i-7YdV&)B%_$XC?XBd>7yCxxK0$yfD|l$^N1 zo2-~lJFFNo4(rg~6I=HWZ1?(A*tRo$hX63|`e7Zuh4G0uOZ1iQRTOSo%IemN{dEt zFh`|w!NjF}>U%lPH}?67gU;S{>k_(DX#eJ9;9UpQu;)P>thU&^TI|fb&%=>Z_ZdB$ zO2|(y+%#29UW!?L^Mw7iE(&`o*$86zRw4<5IEW@0H4?zPOXbG z5+0j@d7Po=MF6GVr|(R&(8N5IQ^p}tub4((Gc^p}F*R=~<(|yVyNBi#r+C?`Z)8W1 zPIL%diPD|JZ_;{#t*_f9=Au4UI(*j!qiRTv*`)(X$ns=fU%4G+hx84N>p^qd8uTCg zT;n?)j!#xFDboM^o~8&J7EqF5wVP~9+?y0qyZ=nW!U^TI0gJzhhqMh3MQZmDzxw#v 
z&v5Yn^GOw@pP8Q#vcMAS1Z+`j=np@S-8IKrg}-C%-Q;V3KNBNw->x22F^Bqxt7_iS zz8sGzAc}_kgUVA{C1(T z7@erJ*>x`*EeNvG?tSS&WtoQU(VJoSIFwp5L5P5tt&igK-IXf$*m6~x*(qBr>d!b5 zc5cRtpmqAXUM;l-$z2>&Ne!c;ls}z_8!^u}^@2Bb#Nh6ManUBy6lvr2zFT3H&c?Cu zzn;e*FD)=GU?zP0dhbKupv!L6j^sbqsSxACCH7zUc!@8z=@F#?9Q-mFL)*M?=O&Ar zcQq{cpso*~+UYU0Z&k>+l1m9C>ive&j-&%47{nDB?41uuFL@cbDk!rhu>mf={^{SN zR)o6=ay=A^-Cu2wnvMsFEO!COqUcpDwhLVe=>4r%bPZH&ViJuXj%Y{Hw#7)5HgQss(WEbUhhB1s9oVp>KVo6DkD|i8Q?!XQ{58l{y z4=C@$KRanMU;={TR@3pOBDmo^xzOiE zXumOHa`KvTAtOH-+%W^&HWwQf$Dyfbr9h2N^}&tqr7{$zfj+!-KILtn2uDL7WJ-h<@>cUaa~W)_a1s|l>>?pmiZsZQZuLJqSc2Isbl zE6-yD>i3_lQzJ`c&IF23;-nefXvFh^Aw@YKXF)V$^lylTHW#9*oSa@#w=Qhjl-1zZ z)@ms>VS70@(HRQb^h~+9y5UBa>cp==wm>PLI)%Q()e2$j^!BOw=(7zfavJrvpjXi8 z2XV$Y`xpo;m8r3*z306Zee`jucG;3S3*+wARqXj}(jeuFGOas>Cxl;dOb_)Q%V#-Yms45!9cSRX^CX5VQRbj2zCE47wB_%>mHjzt=S&}J60DHWs zXRUBcIQUQV%F(I0FWEjJRb^bB31y_Qg-kvrFg&%jce7KcqegM-(0lsXpjHX+AmXGM z-lbbp=$U7-m|k1Bfrj?yO^UB@7dW@&!7nezkTQhk6AqoY*a zLWISKsoNQuhaDNL>_SeQBzV4;sBNVRtA#jwdkSt#1C*zZSGt(HhHT?`F7a}m&+cU< z4BL{3KsPUQwA0M(zyU`a3H78=qEatDeQ;nAlMEG^7$QXKJDS%`nXkFK43ksby4aju z=^UpQr)rlQyJ42+39(<-Z#l5BDNS8Xv@=IasaAda8J>2&#Z3WD@`=t)LiCFQ55#uE zdka1Z32os*V6$;G%f3Nx;`w$I!Xvw}A`w4Up9LR$vr|cyEt*)EkZodSf!m?UK|X^C z1(c9k%oIc{Xt%<@t62p+=E4ULgHCECg8Aw_$(ysQIwXUYIgzAA>UK9?9k9DEz;ej= zJtJ%Sb{va;Z*{0(6PUwSt?+bjPUhz?&D){Sfz_5WK3F5P)J#Yb0QbSX8LluFtSHGq zC-xVV;%5w>O)f-np_&;!<$JU0Pmh7e!?{I4htISbTg!T_rF@jYvrn7VboY%!H`e31TLReg47zqNwFj}8O!b_X*51Dc;`DNDS5 zE+vBi$DmCicfk-;uK+J{nu{`b%`k#F4(v4A-Y0H8z`W-xOF~s-FIa&;xFbwtTQ${C zQl*>=k@HIXZ3M^dp%_=*3eGDovnIx_YypcR>5wm_t8U?;3+9YuUR(9$baT_ zuMoeJVT$O=NUj8{?OYs>Me%x}4}cLlW1rZBFMPZ+u|`X1ixc)?)RgIxz9bp#{{8G8 zu0*p;vk?$ik@Aj`y}M4xxpK91W~|d>0o&g$)F#&s%oHhK6Srq8O^Hk{WuM=LN`2c9 z@T=x}C)&>2 z(+YYP)w^$G`DWLAF>GX=9u z9yT8zhMD*0T4s_W8q2hF0m-=rFh@4gpuv?X^>ki;qL~x?`M{1a6pFeZRQ8n`p|J@Y zN0lM@Wa&)JDTBBXLm7O)N#%YVzxEL!GTEiCvZ-ciwCpy$D9#IFo`6CDomsfgI85(g zQJI8Li+G=+kub7W?A$OD1BuX1?iDj(y!zKk;uv))UrXq7vVbvN(&5wPwGxvI={%Kq z>-l`H>>%HL7T; 
zvWb$`pAq@|;56>tm8_`*n;4ka-m_;pYDLwOw1CZ+O_WAUoLElvQ^HnrKvmh;j%2|3 zd~ZvCiNr)bViBik51x{9M<S!#hrh3w#6R z38ptK=yecBWHJ4@(JZyC!&wz0#E5B%1D9XiV~PFFyxna*h4FoN@_70QO30kz^CRrW zk%OIVcl330(L5d=&6)Vd?2b~(4GD4>22H8+tIIWhQLWm3O{Kg7PjcKq^sXhplY4?Ir5h}?d?qLypXNCPSdJ<>nFQ0tEta$>^ znle^eT)&2@p8mxy`$54%QVpVBNAflXZ&*iT;FSAkNqy3H4WwoRwM@r+MbFOL6Sz3bWz!_V6vZlG#DVy91;EEo!W&tN? zU<)?e0fv)>4OE?BGqC4@r|R_G=BffX14}tEpp!P(ZP&|IXbVnn`hD$I{*0rkf4u%w zom=6g`%!Ga2iFmNelS`m8#I4tTpqp@lg`Y@oBB56?zSYK`aZvm=&e}TB`ThXd;g2M zGoxSmm8Swv)8u$K)Kez^pj>FaH42oe8YYRK6LUraAdo^y>9zq?8@mR=CLQlK!%6q4 zhY{|*yD*3)(yqt>ycq{sFm(kZaQ68|4%b>op0frdTGi?tIpkh1OX3BKabiQZHPmjq zH@v@1Qcf3lSzk> z@yqPGS8#Xz^fW)3;Isp~p6Hz$Q%QqzhMvR}PlM1)irE+Ng=~&-```L)_OONp&vbLa={iNpnGgTFnjyQvCK(f}I!dY^Wy+UxH0>kg zBMxDP^oxtMYVMvPhR^^t@Le0FY!M{f&O^mP1S7Q>;?|+wklwY|1u?wXS&J)-lS}n} zK78s@HXhN;fY!vG#anY_Q#AAOO6|J2a|gE08n~q$y+O2Bs!^CBXS}`)IbulJF{EtS zuquUk#c%r*3D3oiuL=>upd;VwB!JG@^|vq?4o>8-AV9hfU?UBUY4j{ zso1eGoQQ#jp&v^G21bLqqtiRh!iV@d$O&GqFG-I`67Q&WY%_lSOr-V+jw^l-aS4Sa z6~wm5JR$=Fj2jsn#eLcUkya+g{*mk(A1wqE4YK%Ml`gwFak4z*5ORj zBUN2i?W?h7WQ@$$eCsIR4wDPPV`bHHVTXhjHXrbbQ78K8yfAa3AR4Rj42v3dq9|ew z51a}|<0~0tj#Bo5t3=JV%NQDx0mw;HllDIE-_83j+ z7*mia(&@m8wXEVs3wfWnGl6u~P}|LHn7~L$7eX>WhY`@SdfLVIBoIThGMQBjLp{8d z?P6=~gaNRB;;XlpO2xGawCo@AekSEbtjMYp^GvgXY{a$n+zfm)KAm$TA#gQV1}Vdy0w;6Z4WRdi`PxuH~1ajEbFWa&w>GNy@@W zi$0NJd#GoYX@DwUWAZ60CNeeFJy9zD-3Ov?!rEbH^aNM;ZwhkOS3=AD$d5_$(-=K~ zqAXgPB{;AG>2mkDgL zj|5xWgWVp^4Z3{4?z5Q#E8MhpolQ)mi5OZI8Gygv(}~FSp&tvMuE-WLD5r0`e5u5? zgai)N<|B?e^sZ#?nU%3MwdF+fw3_&Q1TDo^%u*g+jbQ-Ip@B5Sm({67jTA9ji5gz{ z<6EPu%Tz5q_3bYxe9>4gn`T1SP8YV`0XRV!l>w=laVrQW85&VyEEwY*6O;@xfjvAi zG{gyGv1_N=AC?5VB`87z<6u?@YWi9v_6d9&#FV=#z_jG9x}_w*r~}ps+Cq)I#2~hS z9@}j3Od31*eQXpZ2Fp#<41u2@pr11R8T9V;dWY=F?Ogh1<)UT2P0ROg?Aim+1LM0U zKiZZ+HC-9VqohVYm1or1wsI#0O548;4TtYv_)8KlGKhxI1;uWS9@MU zLsKZv9?qACI+Ja%^b=Fxk%Ak(VIRxAqD~ktC@g+#DFRqC4KKbNi5p>(Hqx`yc&w?? 
z83WD|vI`TDf3fhb%G-huhvHC3D1no5o*IH7=f3+K(gmyg*3a`E$oP2_NSE*liP8zr z8IKU>*EutmtAk{DjwfS_v*A)ivW?|~h&yZ8@i`_Bg=_Qp2&An&4NAR;AZ1&VjFT%< zi*odc?Mib#(uJ+TQ5c6N2KX?#$veoRHLNB1)Cl>p$FpHYWKz4d8osL3|8$X}$Q>^C zvp!I!qqmj3BPTLjC!OGPoPJ7cJt7E-It6Uqg?D1PeHViIO>g@_%z|&X&KPH=bUlxY%5^ z#*L7y32pIYWjK6-A;&GZP&Pf<#Qt}yGRslAoVeAbbFyZT{x0Te#O1r2l@mzE;K9;#sKst+f z<9qmiHv=U(Mm;e2tXsAhU6DL14M!SxVPo`S@noOA-WOZ*cBK{2Lu^5=+kIN&YPY3a zI;#>2r2~L}6w6%LpEiO7uA}|c`80e$5(ki)PVydOz%PgJ&-a2mCsTmzsJdtHLIF$P zO%s0Riq8Cj$~=0KL2G0xX1lg6-^WuUG`?~`sFn3`)%hi)jD0yqU# zY2}7KK3g~4Jk^{Yw1(L=5aZFGKRN4-&(T^p&+*k5;TBgU?2 zNO2;sIUxC1>V2tfkm{srFVXWavjT``pNMv$UU`DQp%i9x<@4SYF9&s=k0mKqNYoC) zf$t_uu%Aog#c`;4WuF&UOYj$ZK^UM_mDpa((su6H&vq5fcepJi9^ZjXtVHk}9$ww>(%3OaQN?_&M^Ela2JQ<*YZ$1;MgO)0D;(;PMLa!T)M-@6>89Ef-(d zeTZ&=&j;QxU&DrS{wvKI1_Sc1mvE>#dj$LEV37nzI2ju-u0&LYv@cmqBkltBIha2s!+a7mUpmlT z>mlRJeSH$O_maBD{c-bw5oB+qkgF%%#~sjK=IWZT_oG=VGr{XOIWwM zVOlh69TOLk$a(^BxdwbZVqUT?xAd` z1;3y<^6H>S#cXZOB&ho$$@eoe7`+e9t&Btj2Qdssnx>JH;X840VA$8_X$rI!8loq0 znQ^vMK22J=tI~^4;*0c$**3^8-6H<-SM}WUSe2Bj5KT;qnYcD6IYO@@ z*pN`>Ib)oUR^yD2MJNve0cN4#!!d6mscCw9u>V^5~{itpS1jtH6(e4MzOYNG&| zoQYmWKy;A1n%XR6t)rrf(!J~;z?jy(K>hrrk=Ts4jvh4$KttF)4d?KVIQ(UpH)jweUrC73E#qhHSRao;_?o*#^= zTG!DnD)`ArwPX$PFv?S!0PK9B9L)>zfZCQ>^zR~RgnIe53lB{Zk(DUy3Gn{70K3dc zB_{(A`J%w4|73h{T^vB8r6^LJ^eh#t#qQx}Pb+rU=kyvu7f{!UCpFA_uL&INQ7)Tm+&ja;oVFUiu%&WM3XcS_o%$K1DH|hLAyBGmh z&#NGrcZ4&Ud`?2?yjQsNW7%7xn4T^MRR-{C(XOZVvk+#>&9HM3bdFCX$hG%@!Al-R zO8*%&!B<|=+4JibX7(zVHDCe}g!l`q#v3o2kr8#XcP}!~Sd7zW+#6O<`YP_+){j*0 zT;%{6BC~Ifnxt&XtU)6_%K-*Ed8%=*Ox;~5#c7z zWtv+$cha0t4f=Xlfd#_W3?xT=M`1jK_*+s*ek~SJ2W{sx)Uw>sK)zKe@qjG=BFND; zxJ0MYfnj>+s0(5!eQS}i`NEQ|kH3+Bol@0;sF@s(#ACVjg`Sq7#m!*x-%n0{0ertz z68{@7!}(u$8AcXnj{g8MOayE!3~c{V|2Hnf&dkpFe`py3D0(pqYiAQj0(voP17{Nv z6C*og6DU4DC?{t}69XG4_l+1gaFryhMOxi+Q2{P`3A+Mf(PX`1VS!>WB!Xm+#JOU~ zZ~|RVae-oDCDC)yOlOc{iMmLCBGC=}XYbRm(=N9fFZC6vo%JTOoeZ-V9;=Y7p?(bw z$jIiP3SvYI14cweP65%?wKp;XDk?%EDk{0bX&a!i_Q9V($#z)-3KpD*_)mQhR1}!N 
z0mDiJnAEZ;5rB={+JAf%0U0eh88JB(RCrY6#7|hb$X~!p1bPmXIaI*r1PF}GU~SOz zo2Wo;E`$2I_gUh8vsw7?zY`M>-*Di`TYo6pc|icrgc#%)*srj!32Xfs7x3VQ-hQY- z^8%5<4(a~c^}W2J1h(dgB$A7W`VjVD$2b7R7+7GAL4$z$u)v-OZ0q@RBI50V$>i*x z@V79|{qIBb{{W!bhincZm|F*v13?J`uB8EwYH|7Za>Lw*RX>IifPA^I`KO1k?VkTc z{-i*}zTv_&w260i2qW~`2NUlBxP<_ITyPO#;720>2nzTi64siNu;Cqm26PEs)d%mU z2m`o~3Ihb%+||9zy->7sgV6?tcKui)K2gKIBrhWeO1b%#s;n%#ciWM znKjZu6tfTB(hmW~`g46*0$rTL0TJ)vF@pMglUx1DqJ$ zf_$Xx46ow8Y`$ZUy8f%-iggCZsh8~s+Jj@bf{(z*IsgS`ppe(_-~Dy{+9X0m2F|zl zg82jP6ih7f^EWoc0M5_)>Q2!fzh4NYikDH~Y4!A$oVvO@Un02bmj7w@5fl^a-2XNU z>gW8;F0YYk0sIa*H5Dj$q(lUeQPILdKuik;;>`^U7~-prN#r+TS%`xGaGihIFz%cm z)$5xSptmnI2>;EEj+aEOg~9(8KS4WEDq_Q^Z}_LL)rSB7oL3>_U7Uf9gtc}9>x{WIbn@e;ER13=Ecmy#XzCGY}3!Hg#qD;h=LGD_m@wM zLksr?I0&ns@kP#W=>Z)4cY{ft1IW~0WXMmK+P^-%{LJsYzRKh_u!n0-L-NOUpg!yI zR=tw&Dq&8anv9MVRzQIwoF3A3Ttgb_4teN?2<7Badl*0hBw+ZDFo3lepg%B@=z~>Z zBNG6`1N4VYgh3gQ@59al5!+}QYtlgtN>V{Wgz0Q;! ztij}O-1n!L|A6?(X#Ty;uDFX%d0BwCslo(T5a*!aOa8Qw>yg^t7m z#*+eT=$bzm60GTT9z0yXk*0KD;x%5P9`YP@w?|%G#L=*k#*R3t zl0J<>J)cYQDGMus{ekTB)(}|!7C4!p89+~C=VYByzdkx|fvh&g8z0_z8=iiTyJ({Z>J zlIGw&^m)@8ldm}shT2Wi_tl$c+8;^y zNN>%1VFd}T*W=u-!ELYWurVh`v}@hk>6H#PYI-PFoqPg01F!pt(HyUqBwv314`b&L zoC_0Z+t{{k+c>dpJ6~+uww)8(wr$(?iJiRv4sO*OyullGS6B6{tE+eKwVH=w6qnhl zD6J)u!U8IEanxV?UpfHZ9e$e0T)xrL6C$mC7z|+%Ps94QsW-Bp*%Nhe=1x&#ZOn1< zT2C8NAH1|lFPvXbS=g3rCDLtLM~@-P*j;)j+%VDFFmjoeAv51$1oZd|vz#Hwm6w8T zrZ}{mN7t`@u0^>}x#DONdLgJBgroRvTJ@f!<1WIk9q+ic8vY@M^8E63;Q6LvH1-!U8wP?KY05M{qwWc}pT@yB^%`_fthtwkM25(uV(@k!yGUWMx7+*ARFsPhjsM z$mAj>Wa%4Gdm>f}Xb#|4AvWSf<+{dlh9$knDRBA7yl2yEK-G4xnd5g))c-MUaN9oq z6U3fV>sa>RvpgSiMzh=VLha-|88-Ss8t_|)akvEQ-<*G2 zn4jp4JTk$E-GmnS(z@R;Y0`CQQrkVW^)zg%7;HY-Vj1;vG|S)ZEnL?{!3CE)-O9&J%3Q!9zE)1Pha?6_A|c{~eYNE_)(F{5~yvx_RNVVwkBl9?g&suG)ghd&es zpE~n(XCXR1EqqU3?f;hBXt4@A&9&ep1Yu7Q*_Kh+6$!f`IBCdHP;efEYd0tnOuJDe zNOG`ErP*(*b>fwMNPy)Q{V$n1o(GOE>TlV~UEDSgcCzK3CAz(n55tj}oG0?*Uv&1# zK7wXjX9=ylTPKLZDR&q4_B^2B7Ld8Z6yvP}tQ0R&ib}Id5S`Tz#E!n25a64fnL3qM 
zmqKV_!IC@*0wW+l3TvsKTLc#;wO((8zpN_hN<89Kk7*EnqUkO5kV)(`g1rp<0jULE z|A6!3ADd}@38q(QaWyWUjLlc?^SV!KmP=N0S~H=w@rv`%UiWPk*z%_>``35;;ydKe z%orRPQ1jWn)r3dayj|UCG06MbSl!|bFranvi?j@?SnP#LURG*t^D}fVRu`%zCWlV% z5R|)UUt}y$Canbs4(ij2HEr|$oGP5R+UQC7H)Y-x*X!LGi5uiwc-(W;I}tF`7ViQI z)pF2n8SkBy%PWv^94M7TKF)+zC&Q=fULF2pA3P09g9|Qi)HCDo$?_NuvLdvttDLK2 zXtd0h1y<=K_mXv?9uVXC$B)Ae(-Gvuqc(OXLC^E#n@gYfOa`U}%(ZYEO4EIuQr~=G%o{eMnZ<2qa|c~kx59JU+AG2F z_WVMfXQN$l=O^lXc#|3I^%0T~*LR+2LlguBT^Z|s?d0eVE}Ilu>vwZ>9mkUncIE}Z zI~ELHibBM=qwrA+6bN$hK7z3ehz*cpC$ADc&NVSqlO7)dPkOg-M_a2X(z4u1xB_a9 zUVx}#_P_sx%=+9b!X4y}=&AnGI<76Kbo)>iT#*Pht_AW3u?jg=9s}%{d`Pv^ljY^% zm9x_H2Q=LoUqKUQL*gG`1NE@u+`JUNqqwKF74deZ#+ z(Z14c8m}6999_Q#zOVFw;bw#>q_HJb=rq`5u1a*3>d;qAY6;hE<=~KzT5HoLmzmR# z>_vTI2V6aA)J=^fbZ+3n}PK?8y+&CT2`C*|>G=X>w)swxB>4YD??`xulo8 z0DNT$iAc<0uS=)qU<#sk%XAVU)2)~11CA00NI!f&@{&L9zfPA`dwTxBUX$i~|K!-{ zy-HF$2fcBpmYgl*l@%De#k4*CzVXG(-Lq@sU|Lx8vS8_{uF~qtXzxILm#X0gIB%`? zv5#a918eXW)s5ka62`Kkb13$_Tud)3^I8Vo6t{ctsLn+C`94ic-8Yqli|15MrP$_N z+m4?nRlwnoSx^4EfXFi*)*cQtjsy7(XGwd=ud^(Uvw32d%R2vz6*;?Kn!`l#aN@Iv zdT@+h3SAGuYr1<9=vophnO^5kD2Bb6a!+rIHR2%Z8`WeY!0YM4*Sbm4;5cK>=ut5X zFv&)3lG{uSNSLF1D+GK#JJp%@!01y-4hqNa^Base3Tk5GFk`?xw5;N@C;5Dj{9U|) zk+2I}$ehEJq^N#eAIQnQ_54;RE0$2rb~3#T_ZNxPV@iF>n>uUgz+%bMCxU62VdyjV z^SSavmu2dDg1~gae8Iw}?Bgk;G9jWB_n+drBiBETm&{XD1Gh^QMX?h=$o;s*_DM=x z=5rcJ+`q06D7!(*zJ-ITuya%zkRqFnu}+}XpV6-)Y$q`(bv2#v$i)=}Yzzd>^iTEkPC)IJ%=N>D*VSVkFhpiuMl=N48g1vZ3YEeHFZhTIkw>PsYcxk8 z*!dp1SYN7jg@?fVr$&_z4Qvlj3C6Z#zDZjP#eS^sYM2Z-MO1hS2Zf6|p#%XjhR>UsJoL!IYv_w@MjskT{G zv?PZHcHf5pUyWMsvB+!lb|^MIqpAn+7#%!pD=c!Cd*yHoFGVI18^9{vW@%^2_Su$$ zRQW*7f#~>4R(l2J;F?S$qQ{n8V1IopOMb!F^BA?&Q9(f0SJ{cINroEQ3~pJCZ!^ln z#Xwcl=8CnT4z``5rpk8}Tw)WIe-k3g*ah&}>I8012oB!xrQ$EekR?3|_~Q=G*y0B7 z`4I8*IFDTwBl2$w9#`k1wV{Le0E)Bn(WC@@6{5sjq@_VjaG~gj(j7KKxP2Xqg*S#4 zw$OattV^7>&wM?XOvk{S z#o&m<+h#elyBIU%SIh|6e_?!7<{mVC^n?IiUVgq97ku%1>ugny9zlgN*eY$BLv?AWv!|ldRLTeEuCprTn7W+HEV;{vCjx4>;cI_>I7HmF z)(4SQ#3YaVVkdQH+Z2^KH>qA`p+`;l_uM^Ti3p_mEo4wvjQc^?O^(3$Dt+c=EI6GD 
zcGDP2B(hiTgpp$PkP1Rx{RaMPDOR9oxHXa(tu#hi++v459@+g#`lh{2L-+69cUjDF zlj=SQm1!3YhfuZIN8UQM|dEHr0wAG zhoe^3^eqEaiqhF2O(lIx4&Wp;Q$`pxf5?+!DJLdUlUg&b2w|WBp2pc@cvSd%dUk)Q zi5;HU`8PGPGApF+e3x42ols2TUMdLh@jR~1)G`IN?IBSh=I($U^$lOd8knMfUab$s zb-fDDt%Ia&Nv~xE=jf#GsVdBG=Tr~WnxoRmuA%n?6M{0YHP6x?te1nbFHWKCG9BP_ zxem};KKNGvS*hpyjq%^7S}!|*gIeP=qDtaJs*v;H%EfUbg*DNUg_h1I;ZtBDZ~^

m<%Q0_npw>rFF4fR1r-KP~-(X;+_LMZ;PaWd#`0I_m~FU}5j^RvSpnK$k=a2o`M~j*i!~2-x)@GvC)z zP98dz7%)%jWL*(oF0gth97#5^j0rM#G$jiYGuU7)roF$(U}nXMntv0V@?eC;@W9K*A{rPn1)YhG+B!)eJ_^o z-&5qd&~?vMarjsi>Z68j;Bah8nU z-TY~7hDNB&z6Z5`$I%(3jPYMuHc0H6q@0X+zvb5CT<_+0N}#}G3b3ZcBvFO)66t0r zuJi*Z1QO+=QO+93&4YQ&?u3`@b>p7ueMu4aSwQ(otbf)ag9fglhS9@Oh>ic<0qxo+QCPc?4M|PfT1AB)qtIuH`;J zw5hmdvR^K5iAjw!{^3IIvb3pXkn(H1b|cQi+{Rr)PurEk6kW|q+xqXT|Qdz$sG{=GYvQ&lYVi%(YmelwSCu2I1JPvgzv?EEuOBc)r643amJYBhwp>Ao%P$ zA#wgfF>a(a5g{Bgxj3$5P2QD^Ke%bK+WVJM1`-~rX6zeO&-(thfJbMDWQ@4Npu`qN z7N6+EhI-;KdaiNp0}jOvVBj_+`=0jzsgBA6^@Gy<@Qub6YBRZKEDSe0_HQDM9=iXC%*3c=?7$u1k_i zyyyR69g==H0F>W+52BT+R=#EQbcM zVI*q@CFpw8tg0_M+8@3eWx?Ip{INb5V>m#uj

7(MS6&fVfut_LQcK}In5d5oJpl##Ufva zE$IuQC7QO;u2CHpF0vcB&7ds5?ii{wZaF>+BDPJbk!eZa{Scd0hSs{=xs(dQmUHik zfk6usE8owcpnb`$=u)*>E@e=rYIhDmW`@oj-D^arZc?5_TQY5_4+p>+rH{zN8RsiZbhw~)Q8}PmP-96D~Y6!FDIR<(h><4%+ou^2tPMGMMhw?v%E)+3y53B?845Uer^xsyh3vZ ze(VB|l)C1VH8x|rQg}mxGtB z2pnU`k|vowpiZ#CbAV^-{zMxe+|@L-OETd{lb!;@Zz6_l7`Hgjl};Dgpng}%v;&ay zV}6=h)W1VCme83L%EhWE44abHc7}tQInYQ1RV}2AD_ztSG$dI=Ui5tbu1@7%iB!U% zXYdjGuioCS(syMA44^*);s(;5Q^|3|x$-(e`B;AGZ&z&A#Kb&3)t%kJ+q!_!5B=Bv zV8yBS9mB&CvoDID>B*7sE5$MTYH@R9@*#?fVJidzaPXCC7>g9USen+7sEe&Vc+D~z zgEzT_Mp@Ra!|{3%@|Bg*$$(70ddlss*leaBMU8VuAw>RNp7R*0{W3Nu-)#tLJ-Cv- z(vEyOnWOP=4{)OCEiP*O%&_(~q8j@=`O!L??RaPO6y&dThS*95h)Z+N)}3cy1RW{J z59r25-Z$`;UI4-trFO#6rLCD72nmL4SqYZk16hYe346z?H79VUPqi|UO!4@A^2hwH zS2p`h&p`4V$Q@7cV&E=Aws=1zevudjq;9o_4O;wq{0i*koOX=3kw03NTAO4sFUYsm zS&f-OX|$GJ;`_Lt?4lkunVh#@9fDGJYfirr@ETLf+@ zUl4rIrkUS_MkGvY|Bon@oB96{rLr(FvHfp&`akGFPA<;>9sb{0D)WEnLAL*2^kAy1 zD&9ECfAZb|_BX?UAh=tM?c9AK2rN?v#Dx7@q-}CASd{IZogMae9S>9YJBrud*D+pa z1Xcqz*R{INDjX{(ay4;eR9w=85cuIlaI zKtJz<3idxQ2!^;&cZAILH$q`?a~#YfV`qwxi*e;g!`DacRPj$oC)Em+uB%liEK`sn5=(ji1x2C(da zI#EDj2wrkg)e&w0@$j+6--6WOnMW@XfJl96!Gd$Y!67^_3dtnU&>7Be#r(iJfz8!< z|1nVOhZ^}~ooybXHlFdr>gXm8n4`;|pArQ)8$=dA0Z-0T|GIS)<0Fv!x0zDIh{%gN zW!W5?%oSooUdn{#bwQ;$isL;F)&$ak;Ly;}>`+voM$iD=*qKa!awX>`@b43_9~Qy2 zg9{^T6Ido+2JmBO4sd>dVs9=io?xK37kMIuctQMMx$!?l{a`lNP)s11fyCo|?*1IX zvcIQi4E2y*ATFAD*+@`^@6YeA>ATMPnd^V5xBbF@T^FXcqfiUmq*d} z2gbp{m>Qk_BQVnO@c}W|VFaA{-xZre{YXp@T9*))ascJ^feYp@`-}4aEBq&Hqpy|3)!2 zL+EsTqdsmA$;Zt-64^iUp(p&puYi5?j}~&MQ~X{fyRrG?!5~LuuJlPjyCnd11<9uL z$Bmx)bI$C!JsXe$z{3JkZ1DQ~^00wO6GBh?ede6aZQj39xcJ;p4H)JhvA*4<9U~fI zC+a*+PWHjDHn(s)i3JUwSRH}*0_K7(L7cv;(t)RCk#NV($OFy}`G89~UdH?EQjFOUMTY`>Tr`QTYy8(D&aRehv+w*IyL>bCEY0#mIY zp!u#kzr-h6uY@|^N#>7`0y0hCxu04YHL1x%gg=1j9l|%dl^;Z()n~xJM_=xrz&V!O z|F+4T_1|xjO&5$i*DAgGZ^_aP<5wL5hg)Ntzu&^I7a(uzQ}Ao84-sh`-86=e)7E7A z44`zd=J6;OG9Kz^ok94+b$^Oad*o0nkYtkn);^#eW2JrBeXSAMA^$g|S1dZHd`Ir% zVY*|?oLnmv_Y<8obGQ~eQ)&5W3H`~mdW@ZUgw2ayHO)MTirPNV7c&J#ic0(nmKe2( 
zLCxw3kqsTcsez%+u~*Vz;*FDrtXIE@mKB;wHfn`=ehf)P5x&IM}6>0V~(Pa7RP?$>w5L+~ZvP0%l^obfwWmt$+cp&U@h)Fk}&Ac5J=D9KcW2u05FoUtJ zHVu}CZXJ@nS27G+IbG>xei+&N)r%w788==h&|Cd7bgtDnBV;mJfzrjbFlAak_Ro3j znkpiD%`WgqcR6=0iKZ8F;-y28;94KE6u%icLAWtuHaUPWEO8_83>4*Yx#G?i|d_ zljE%v>mj0jeTP)iO)rT;-iG&5gQWe2EOVmibD54fUj%72?C;Q0^c23DRvNLX3n^fp zYH)=rI2p7OAKPyyqYTqm1IpRJ*yit-6mC9_8Ii)o;p$5W0A>O=l^&G)O-Hj}*`cUQ z5xCFR*a`mAg7Y`ZPsBT5hNgB-(u#U&s_=Ry$}NR!R}^o}6vF`vTw3+_Y<~-PINtT5 zY;0sKAK5-h+NZd@CyA_dt7X;3=eES;yu5=YPMBk2*K>_z0n?w*y1-y|^r@4R?3O(d z-{2uLYz2SHNfQ%`UTzcjFYMkam^FfH87rJ&?V?{}uz;s5W&%oLV7+W9I`rq8NG)ZByL*VqUA_in54u_&&U!=uB;A`nzf@5LZb(xzgb8S%Vb zH+MqPDX?0aWrTYjSPMqa;i+%ki)YrkxAkn=z{x*18@R*78y>*bMdc5*GkmQ3)R7;5 zzPZ8yQxi>$!srle^_xHN7W-p63KaZ8B2r(geYy*{H!A7Vrwh?h84D+OT8Pk~Z%KR( z-c&dW;ksGd#1|i~gd;c(5qRt9Z0FWQ5C{yjq)f=nBLsf>+ZICMlb_s$3UxVp; zNCVhBU8vjayz0;X^Y zw1=4_U>y(c-XDM>?49y@kcG{}j3`YSRfsKvJ|z1G`PiMhv}K#2h%c?|0S)&%tRT9k zq78+y!fXwtTxlJ)krm+<{0daw+I`L{ zhet!az#vlw*z8!fbe?jrIRJs|W3;9v?Tq^SBmt~7v`+9-$v8s6G{OY$rjg83_YTfJ z!=1?tXLOB$N`Rwn2W8LXty`h3Ww2$-rq>T!`eL59C`w_X7+);74t%TsM7n-MV4=`# zUG7UPKNdG|UEf}Rw|C9Cfl+tOgTE$E85_S-u{?kTRF*cbE+!L%#% zLId6Xngz3O!Js^z8-*!m!0Iuf2NaiL?1dFTI*{QJBYf1F{vb3p+pEo98*}FZk2Snk z)o>*6F>s-@otHU9g9FV>qQ_b?F!;Q|19-i{=j^fmyRkO8vL#fRXpx(qo_`BHqtQsC zeLOUj7c`({cD`W+5xXKlN8&P?s|8~&&lWL}GqdO;2B$OBa->k!QuiX5uTsfNG!-SU zZ!hN!8D04}jG&};ypx3@Dte1c?bt#v1#J@%kgx;JkK{k&4vX~boWnK=wC^RO- zuhLx9zl_n_|8`B2IQ^>UX7RE}P7kAP{!6{^z!%~(LVi@t<(YXBY%pU0MIb|mnR!6r zke_qH@^fBbwOm^uHE@Zk;}8;kScSSdoE^2}OH*n#a=zttmA3uC<-Jygzz+-%29N9K zLduX&kAY>RM74vLt%5BZ9^`wbs4&ywiW1qRUVdRYz7=hKgqrMETiTOTCiiD6B4{+8 zI8uBOKlifIVZz>KwXYt?UQ2Jj5<^H9F z56@!n9lx@4+H#QGw9OMQK7}yO#9{DClRDphHB^DR*l(72<$2xjo9b}-;==OQOU|Op z(mIzb7Ab7su!S+bpt2PDe3Od2DFNA;d5+K99ZCMsargKVBY#ELrtx_<#+dv;pW&_n zZS0ay0)}#zKiGivrqM9G82<;(uDY0!iRjn(+RjKc5cz%NYH@iE>An1(M2Nlq`w)+M z?6iTtz(D0?#P_kr{FgjbDzuYyY`H;lwvuw9!Z_qsO@eS!eq^|r zR3~T79(I2KFd7EZbAtuE>+RL=YQl|!>|Yy-h6DFnAH8BgydZh7Xq04-mc}CuYoPaPdyv9Tl%7ExexsF;KAo+`s7m 
zkczmmHp#qiAFLyY#P-eoS*SKCn)p_NDU|i4He377&rX>|-v}7EFCVgoBc3o*eHZVx zBjL>nWhQ^Jv<#=&=x%5jS$~LrapU_^M{}%+^0{d{+#}JzI`^J}o}1s)SgkXvFX&T| zUH2dkTrQ?gnp-s-=VKTFW!j zk{b|+4UpvtJ%1zseZiPvBb*_CYNj?Imb{6^&L23uF_3;VxrdLvI&#HznO;NQM5GKW zENY-6k1AQQKFWEeZ5L0^t0CXT37*B~pPrcCo>tbqg|UxHK{xxt^66DSu9>saPZM@$3+9J^;GmW>+w!%W6tFY@&-WK1Z7zS2E zg;devD)UYxyd&+1c`W&q^UZ4ibLvc|8b+R;4$atx<2kc*4k}+emBvmOP3Q5MoLAN` z57z8Jwnxe)K7`fcQ`)1kmyhg(vi1{EXQK~@A(N_907+opm@Fzc#;>c{K+vkIyT#>A zlfA=9iMm>C;vJ}?PjdDLPZjww=3Nm*RnO~qB4hR;$e~Bls`hpy5x;bd2DCv4?@(p0 zwNYt9g1Rg0>niBdY!W#ms(d+#2kjzycZ9eY@XPlBw|z67wvrv-lnV+2#}3$%{-WzS%#_iA|r#=SgJb;0$)J;T~gH}bpQ^%P80M( z7g|%g8KupZNF-%t)yv|yexHj7@jf#eaqe?7z~%hd3- z=Ye{^S!^Ifz`y*m@qRPo{F{7(WFhqf=Y_%tnxfyWVNSY>Y<5L`$0%!s0$*52>saHJ z|HeLN;mNMgyB~7q&vi9Qjy^c6geh?lJ_j+fvvj@WGTNcrD}&*(Plg_f3l-E(+7fsX z6xazY8BIRwactI=cHBH9^Jr8gNVTya>H{ij^-M|%VOv!18BZUSo4Dei`__tT54;lG zZb&w4R*y2}R!jdq(f0joA7%M1P2pN~{|RE=j1Vt~eVaV;D|bbFb#(9|EHHOdb$Q(h z2^9nKZRt=*HEp~2Q&HmBbl+gk^Fk3F(miuX&ch6p&_w3MERhh=d-6OVT)0rVOyC-u zI)M9uvzcvp5ZEz}2oiIa3B!5YprE{-H3HtS>U`h_k^ia-|yK16z%cs&TSkrVd$(uZTY05Wi1=Lma{rr??sTc}AP;efV^$6QkqbEx z@fEde7c8MB`?|I@hfu!rO-yti1ai?#cyGaX`5mwwnvw|1^AWQ{mSQd`7adhL)9ej# z6UT-&u;dkkuq%XC|340a1AKU7#K(sA=IU>`Ej`x9h1`&yv44xND{Z?vv=^54w=;X$ z9GW z@Hu}Lm*q2chT;XE=-#$OyA@iw&EPs*`qiL36yEx;iNEXgym=T~(RyXe0xPX`!V1>^ zVT^a&#!e*@i6UQ^X{-)1k=4=4$ps;&#yMIyKSg z8x}C3*?GLg6c}{;TQzM?=f{U(aC=do!uUibJ{qW>S~(Rzc;n-lK6+NeJi0Qzv=L#* z>=gA4mvB)z@K#B<;qQ9!pDq`H5M-Qo4?zu)Lv{4U*96W#hj)CZxy#ZSnSgdynoyV+trkDNns$p`l?@O|=-8{80ZFtS*awvRX6D5+_yXVim1jF}Dh7Xm=SCqtEL-g(b1zoC(XdHoDLr~}6&F?g4mL%{vdPb3 zeA9|S44Gw@+a%YhS`B9xF+zi9nFOg`Hp;}X!I4=h=nYikXfHrieXn8?fIjC zqtUchme!lkU3*(R2rw{X^Rs9?}0_HrzL z;wfOB{bfE5%J}a#y5v~D)*Lz+r@9QZ(f@^x?ywY^a{-b#amZZx$x|^eVO6rI`0$`Y zc+c<)$4J9=1SiV@xd0j*)v>~vs2$p$bE5E-js0i|td>wT5{ zO)lH@>dKM(Ba{ueoo}zp`*9K=`TUnf42F&%IIScr&40c+pPpnYOm)2}(yRXMn4jhXwa4RP0!~u!6lLAXW8NtfS zF&@=^`epHZ?6tJb2H~B_J(SH878(JqS&sl26#C6irIzl01Rv#m*QQ^+hb9;9|2lQ$#C&JU_ 
ztQyAMnjul%mD-EvGtl8P5ShLHqJ%^%ji}41K=6wGCv_yNxmem7_F**975#pSYJ>xw zysIHs@^aUt^RN<(qu~h*JAEl#ujkXb0h%vB=Fc9{1)k+1-MOaP6{;u*XEy1HTJJXrM1dN9XpJefRb9+j_`{U9tKEJ9n@d52}%nB#(fAT#wW4 zHYr*B)l{k5RPZAGi{5~}pe$QT^lq&3NX8h$MO(|_qEb#|on%vb9lw&>zNO$QFn}Su z!L1yjCR^&up|soLE=8Qbzjch=n2(l?dUdXPX!aN;F0$GY8!zQnrMt-{oMG6zadU>c zs8<)PWlBBwhgz`UsuLGcM-G4qZWmH_<{Cm>h@z=cXNoC7O+lIsb?~nAE#`d_liDZ|tUumpujSwFqu9RvqFo38`6GCS?m;F;}GI=XR<8-KwBc zw!kHeP^a&AZvs*D>$pMgIxsTBKCHOF*2&~GJ6>Jm3|tS)?2Oj$_!c|T|3QfPXO4zU z@AcaMcY0w;)HIf3S&yDg8?L?Be1 zJ9^QuotW=uT8c(ffwN++`9|lzigOZd31bZ;EXykfbEA*`xynbI1j z#ao5UNSq4D@T0SU%bUyEB;L2_n@{DJlx1%B?jnwCv~=4u${$lj1#T2kG-0d7CPqR% zXAe2MITR`=JNu6NIZD@j=XUXu);(0*m+-AGFLpU~`YF;J_`0r3e>v;PnwraJu4Srb z2u$(wz*e&1sBSPu(z|mO5br1cY&%Y1`k{at`*S3V5%>xIl*F?|Bf60_ znM(iIz}FO$Sqb#rLi&5hypcP?%SIQ|Gs|KJV_auMl1}T$FE+(7|2uUF_x2zDvum(= z5b-Qi4B9K{=C&1KJNSLh{OGGck?lh_SWS7ol{RZ@F85S)P*h(it|5Zr1*5os84&1a4Ybw6E?EY!jo076`?SG|VsCD`*>0NaP4(Z;qtK z(49Ve91@p;b^n_CC$uI`4qh=xP5f;o-7jvyUxdSJ{NBy| z5704ttA^9qqf0lHUpZ2HiWlt_5&$qHD{A!coV2sOYp#f(!Ihpv<{xy2n~z?vcB=5IaXZ+Kk2X`lBds5XeXS9q#oVkYv^-@7IvulTnq)E5l(CGH*4} zvl3qxa!Z{38k*bgjfg{N$$(dGVM3PeBtmbgfsX4jW-wwV)m;^;@*8kxeHBiyeclnF z1c6^r?C}PJAF)>An&RKcDMi-0D1w_B}(?gHIq| z8xLrG9*BxaUCq-h$szpEgQ+VD%604RF^CfLz^lLL`7@T0#Gf-Iw$1)xAVdbr<$^^< zfU3kPgI1D0UcG&=3$*tnCob&Ux9KgiIr*M^Tu=^uFJ>fVF8$$-{WvDl5H=GSEyBcf z|CFNjbftrliZvD|?-$PA7z>UlOxvUkic^GDv9RKdb28QBHF=trZ(WSlK#c z4(wm;Lu5ue_eKSYm^*ldS{}X}gey$eI6-toUVdn=CflJXeL4O#l60nXH(}mf(IQ4L z(rrGy4u#%!UQ{Zeo?*EUJA~Li~X@r9|dOUP?NQOS}tVL^?dH0e80`kFY(xkm{N{egaimMvd{x zv~rwK5S(oMwtEK?0rO}Ns@Hcjt%8n!;|anjbFg{eMfb{)-;JlkSvsotZ2TAU<=DNU zY~}%H=MwJZ1b+x&*QF3^^|t9wdt4@vp-ZnONb&yA~nKTwsc~+z}V%0NnS|f zRjN}xuSsn9lTcOOvmko%Is4_4f*QJl_TjnJIq32rq>4V!^2PvwQS%e_ySutX2k0x* z+rxm6s=qBvTS3QWUEcF}CTX|D**Rk`aP7|UKaMi}&t24d;Ql>`{Ra*$=0aQ)uAe; zN%1rV{x@V9{o<-?P(CGs|BdolrcmHH>>(_Ju#!E>Vm4-L<2EcVVh6^h3$_(_k}m&N z8PTX!g~U1|v$!+sNZ3lV@{t%B%P6e zemOw-EwD=}E9{VkU$b9Mz+)nM9*eJuT%Pp$e@G)VD-Cc-Bic0(<*B{KQ6CzxpdI_Ea0gUO%GcIe=lXm#J6L)pX$ 
zIT+Lex$T2g)DQzLt^wi)_Rifbgw4Z~cp#0^O;5FPU<_aUilOZ}B9)w;&rn1ksI%gq zRerL=Z^B*=dHXA&?YMEUBzro`y)kKhZDlcF1&`87ytZs9Pg*^1*rFZklFEM>2sHGC zYk>KzGV$7Pi_BW`FPd*9o@mUxNQ}EXa?!*@fB! zkVJY=LJt{~l7zh^8Qt@?9Sy^Yn446HFBp24nnt=H5d7Dvn5(gwn~|Af=?zY*bEHg2 zkV%3mI2>qv6U+&c6r4@|@VG0XXuZBuMQM3u%(0B&rTIG7U4!I2ykonzVb*$nuf00x z>p{jdFRrz2iuhZ*_Bvd;;G;UJ)Wrb(<7Sw`?mv{zWT08@*||RL%eo=$N+muW;Cd}C zNDz915WnZ;TP!$)gK%}Gy56&B*(jb8(TnZEnmO9_Ual9kaw&@g4htS|X^}{A0{}l| zCSu)Dq#}FuqjWHuzB|J@ zZUh|TrJ@vi)kZ^`1!s;r^tM3vz^vH-E) z!4AN#vH=-5F_p)J#x;$ia%F1BRo=*4%F30jFg4afH7~J?+vgIRO01|(iNKc8ml$Y~ z5;$8@SZ>8IN^o9Uqz$REVRR!GC$lodA#c53?d9l3T-phq^a6-9EI1{zC1|t6TiU^j z6zoKmS4k9p)P@su9A<)Iy6)mF7oUDX=8_1q0=E0&5pcj@n_z>!2N_tf!a1d>l>&N- zs?}E~V8Co5@8^Wxc4QB?Ut-br&_%JP+#stzpQDYl-6=6HDEY1trD{N9C>$8$P4?)I zK@bI>jjc->#n34#wok@w=dE;{P#l3KXo{3EFsrJh7)x)!3L}9}*y^0mC#A`grmL~N zN(-yxUAF9xcYdwE=?`q26are3XdsOOdUdT6UbF{CDV!9L>xYLci;OEpu`h)zsE9^N zuqU5SPTB&vX&dm1p{;B2`S*z*a;1vyJ=ndrGhP&aKZQb*S2Vlay7PR$(=tigE%ScUS=kM-I%!|iJLn%V1R{y&*#bXi!Sns*lkXw ziv9?~CN$$Tbr(OdZ`dGa7j1*si}sG|=T`XS`|S=-Izh84rF7^vhzg_BFwUbg3nm^i z^uD8DF!Of9OYO`>f6n$6vbe#8BGA4RYUTfWADbbv8lT=RA1)!IXwB@WPpQDJ-_Fdi}%e-Qo&zLyJ2;iTY?y|#n{$nAKmi+Y?>mu{_x!k-{` zCf_)|Syn2Mw&`{f?%W|~W>|9k(05$PwNxS$*XbFVk4btj32}^8#7@OKeL&_|)WJa4 z^H%sa*smE^Y8F#ym~}ENC%&cMgbS%kSMvi*IsPLJhl<{k0g&}Ko=}Ul5&$J|-T9^6 zy%^cWVRmExt~b!8K1?l&^;O zkP^r%QvVMEoU4xd&ah=_az^3!{&%TWH$e)|A5#%D4g6qh^|ABv$tyv$q5W|DOf|z1 z6PfYlly4OVrY&n>E^EIu%OQF5hXa_y>I4F_P%?=)AFB{j#Cf{Jq^S=VWGxzcF=`^6 zlRdRFk2FC*{D;?sR5_iv0Y%jJl<{0u9=Dk?@5QK)cLI`aT2+j7pv>!U0%GDjB$xJt zk35e9J3@Psml;Ul=xf*cJyiY5FVo7!PHh9sc+J2g58-FWT@RW* zcx{4D0uR%#B(220a_9`xM$*4F?NHG#p%7jHYE*YNX>PgnxM_*1S!>RnR76uA;nm6c zo3}D#$7=b5cPX)fO%Fk>A!K4r@`wolK`eJuXhSXYI{^{0uJ8%tHE(F!>q4p$Ahb97Jf|VeSgXZEd-k7zG7l)Pa#4x9uMtnrFy>eGUOhZ|*VWX@W@Q=k zuxv`MeyzPk>&^TrLX)=eon`m|;icufKs9njwW0iE#O!~43F(+y2Co^m#5J6L?i-G< z0+-OPSeS8b3KhsVKBYyIGp5N!7#atNo}=T7(-2&t!|7*><}xf8!h!ABzwC$&Ps_Tb z#{YBk_tHozmF>Ce^K=De$%tlYX!tDN-YUGW6Rp{j^k$5r)ZCEgYy&} 
zB`kT&k#+shOK!k_J}v~0Uw+k5nkNpTGS+`Ijoj*}rm_Y(N&>mI|=PmMA{ zyM9k6b{=WK>`KP+Eul7o)aR3FY<>kpOvRPcuTmgWewsc)Xz2upCY~Fmx}OpP+$y(_ zG;I%%UO6}P=vdlfi?h2#J4;RaX8{e{!~)FA{%_(V#yo)4}X(Cg=K@4NFWNIGN^Cq^_vV*^^>q}ciFzx!P6`56VpkW zROZegf2MwYM=Est3YVM+AkFG0sm4EBhu*?{igN=e%_W-SuriTHV>n`<(fy%x%$6E8 zq@GACz-yi~e%NXYSiB2F()2sQxFlXsV}{aSC@>aur59Seff3%$KbHGD_Rw;eDy{Fa zm-m&MGQkfxo%B#?Y$l@RY#vDVgBi_>CyX(01i~x^oG2L{a?OB`v4*1^+dVO0r<;Cd zi!!^H8iKDVRA%%&O*R>asS1J%s*6vZJO`WXDS>KIq4IE!J#g(~mqrj6qa7%@9s&G& zc|da&WteAk?)sh9%V!}=if<#KI7yF~Qpfy}OKXVW_7SE`2}&M9xzJW$W!BBnq*v4@ zH&CQLk2dha!R6+XBo}*=UT7*spNDbl)R@9xi*g01T`TYCVm{9?<@giR+iJ>=M{>c6 zIJRup^M&V33sM0kSG#)an8p}O{yQxd@%#1gYY&p4A2)R?*GQ)%I8-f{L#5STweDWp zv4aJSm`;z%4}|x^Ntqw^g*6Ui-g>DBVY~69ZW1NeR?esQc%d=6C=lLEeAi48%FP&M z@QUsa3C()66 zlA8U?qyT-t8|6*t@;B5G-+Qd=&!mMk6TKSI;12M23ztA0 z;LEjxztER7fY=9IOE_P`4BaXS9>)N_DUiG0ZzB#Ih{6^He-6vwGt&I}C`M#d%TseO zyXx;;(5}@#_U6dhf8`AA*Ln{30?bvg9lC8W+lQ6%jT+V?HyN=dLc+`p1Lzp;t(lLs z4I3t6#|^)~cjlE5=sfuD6MLf{vBjkqtNz&zkwu_ShgaWH$~R$-tdF0~69b9@3do;i zPJuy<7kUBIJk$o{1im#epC7>IW+jJ#U}75hpCS)-3NHrWD-ri^$ojYcJd$0YC-Ag@IcR`?o?dNl?$kU{qbO09*H8X0 zpGkjc2|OMJ*8R`&mbEVzj<5OdZ)2+Rgq~uP$p6 zs|U#u;)}V)V_@(P!@xG7oo!k*91f`Sn=q^I8my0O7BUXtqF~2A2O-lL0>YWE*BN6V zpO_xWjk(1?1>v01({8~81jWM3Y8zyD1Yq^{R+TrvS)mxQLLNQ_Z)yv}?XDph6cj-m zg%bqAb{_v6+&S2rbB$qvfZ)4c$*zPp5=GfJdjbL&)WvV~91~L054Qh&0ZPy}G!!7{ z^$+NL{>yIwk$_;Rub`8{TW=o7pSg2Agl>paUjohffUtk3I!C<(T#cK(c^H2Xp5al1 zMm>H*e%D#bLNl>H;NEF%o??E2ef88C(%503(y+vZiiL3}g|X$Nk1@O6)WD7Fd`oh$ zpzq2+QWf+ljUhp`G`ipXyTkb(K6fa zGmWC=L3)wJTI>@tv}~giP}7PZwzzE08ARh-_9oXiT+bZic=l;*GiB~Nq|@OM)qGd- zmX{PWa)10X6c+3}CE-=97QgLOp5juNt;RWD&K;f#$55(S*kIF3m<77jAkT%=Y+qqC zxRtDw>pINFE^e|JP|_Qm*b5UNY{K`av<@b#cxJ0v9Qd(OBR;iSaPi4ibhS@ddvrg~ zfJOpl5{IEM@K#nomeR}oG>ctxa9(lD`Mhlh5RG_Z*sETpo*BoR?P5+sojZbBO6@2~K8i%`1k7@FNp~hbN z6~|g>j6%eo19IY*)eu?}6R%;X@m(GG7bo)3C8=8Uq-#pBuqkhNC02H9aw=89ns$t2 zCH2;6jT#2(UR=R+{!@d81`=2c-)R6nx7jm;@&k(Ao<%z1JAZ)eQZ9CSO@ps%XsF7U z!rRzaH&6f@!;rr@@@tp3fhy|SbJ0y}urSKtL6`~D-;Tcg*~NuXsv()#n;`Hoi;k9G 
zI^89ZzB57Jb_U~SU+thSj{hP(8ChoCPmu$MMP=D>nS_z>)>v_>Y?GaBpS9E_mcb;5 z3zwL+c}tZhKf5`ERr#Kd8Uk2w_L5!RO!r5 zj7bUQLUDWh0S21OK8f&5w5kw~t{@07BXdgnJ+HVf((D^2Lc>p$ys5CQOruo!cgW+W z8d&@^RJMxhzM7p2EYGb`YZzs0Fj%qQr(Y-fSJIOqw73cIJVN@Cm5O@{ z?KO9GWan@EY{(b}0FI7&w}=}3Xr-gl*@LBdTUHXN{BT&@1dv9kZHvAg8|dh5P2xG4 z+%^3z>c9ys)A_T=oxqG|TB=*z)&f*NUl0 zU}b<4WN$<7(+r&#dS>)6?XD@OM6n)4R?qi6@+9PQDMEMhm({31%VMI0%YzJ&NqU&J zJ96N{N5r;SqAW&^&?(s%p)?}@6?XC$9K3L>*=zclC_#bLuqofksBNPQ$6)z=$p`b~ zS1#QiOr1%kWSeByxUMg_-n-no7aJ9yUBa7#KaU3!v9LS)n>%~Cd0du{n#zvo1~ief z6m*@+>N!zn8i!cyTS9F>?{B*dS4(|365jIHxMwOEnB=G93W!4O zj7Lo!EAS>ZkOKY)=ua2EPSnF@4EujSUSjGwE3r10^Ay0p?m`GWcGv3F>*5dTsZ+B> zsLNnvK+;u$;KrI5VD1->c6i?x;mKCh&*o+!r zH-50~GG*jX+T!=Rk*t%GleZ7_4HGSm8G;p~!1Cl@k=59IQWM~;_Z0(>p?4+Cqngl0 z6|a7dTA|-qv=tT&YCi1Bf)Lo*8258-D5wz9YcD@GJZi)}9!+z6yzb2aWZ<)EQ`X|6 z$KQb0?{I&ylfmAF<<2n{LlrF0m685p>nX&m%>YcO#f!NW-By-Kst(g_XK-bn&SEx( z5U%llXshi*GlTIMm{u4(`NUfYmy%rMlx~RfQZrQ}H~S_q z6P0u-mi)D}6*uOioo!L<(ea*4-^Rf}Pe_yuj^i;1;A-}+)Ym&d@bSaW?kUs*P1gkv z$BBcdu!UQh6iIolp**cXR-~+&p0$sAa96&@mYnYYt@4tT-=0z+c?ML-1yy zrMVUy|8^IXH%zMvb2fU=RMI1AaCK$bx(7W@4StGu#=qH1I!CLi`jo>(Z$hV*_d6+L z8L7%IiimNjMoE@HW?#zEZj^;u$f16O*=D*n@0%l)AM4 zL9Nkd{Sb2DEPnDDMW&W93v-xlLwrpErT|$E5WEp;?R7BcP zM-N_bF~tJ21l{}jXP(=;t!TU9_r^W@a??8roxEeNOnQ~cYYHjAV#5nK1r=dC19MO> z6@@Y3N{e;KPp;kl4zk!%JqQ%LBtAih4Gi$#w(YK-v{Wvm7dHyag<3_hXtHk)yY>VF z?|t-e@EdXnV3Fz%mFcyt2EwwT2JD~_uUpdU$Q|!(-x1NwdNNZaetQ=cdO4RNN)4Yu z)WWCi7xUE|AwoTzAXcZLh@=~hX`1K|obGR**eS5?FiGZqPr%)7Hlgkd)Ue*Y zyFkoJKG*87a4Gi&4`{hO>)r2+iP?)h^H@_=qHR*P9o~<6A>|XSK4eQat`_o|L{_DQ z?>Z5-rerrBMo`43c4C`Q%urFK<40m-!8M~OWW+(@!dd=0Cm7J+Q#n*ix?SGYtMRS4)xYvxCKpus@Yk#Bn)$&b zsxTu@=6VY7ONl-i_Ec|*Zj+EILNL{xFON3&t4C5f0quYh?h>_(#vY9?*@wK3*^x%R zy(ZRUPt)OLNkvhhuRZ93LRuVFM$-6fr~sw3fexp}9eUronAFV937mWpHYWF-?Z0`C zt&IOnL5wS24x{UWf{n|JxPf`G#Y?5$Gzc<@ffG14Ud<=Yys30mH3oKBtpfP?F5u#v zi~#Dp4ZmHn?~r}NrM`}O>{yeE*OfBo14qAm~rv!m9qS6zClwZIANe+K{ zSx`DAa$6G7ls~!Hl3OYq;YpaGj*M!f1#brO&MBHQoe%mr7K95CO61)AQ%i?)S$D~R 
z@9n+f%D2LX-4%{b>eKlwC8+20S~+XhpnXjLYR)k#Mp)(^y|_1F?^t87c(FLHoDgLS z243y>xk13joX+5kIE)UBP{FvliD}1BBeI=RFVM z_Y8}xWBUd2RYsMcU{c$^C{%&Ba5xLafFQGiTXAZK{_zes?X`QmfeD-(XH zT1EhicRYqLmB06OB$q^>J3YdEWZ+3hhfcM)GSLnLiwc=m zHBFLc0S~0%WEIw`_Lcj1<84RjIhpvbI?={`>`|oHy=wCp(#~TB>b&~A(YG)czrnb+ zF|Q4f#_X%0EpLZ>ANZf~lN`v{`Csd=QI&u>1TLhOnn@7&!t#2oI~`KjSnTfg2%?@26IQ{0N zy{}+KDHBjtl84+fF9xQG?S@g5e1&a~e0DoLVOhz10)zi&3t97bhG=ccMoy0JKr*Xb z?A4;*KNi!jrc1Xc$;5^DtbbT%FznODhuc&BR77s&bW!vIW-78>qG<7kI2Oc*Zh%3^ zCi{wYV`!V(318ca@tsa3y@-C(uu`wFkDhxrw$`O5xnPyCpIqN8USJ1F|J}sVnG=tm z+Dmu^c4}OS>e#K$f)7x!>h;gkT|m+xE&v#Z2+LN@WRsb(5O_G+_bnUUQ2V8!5OXL4 zDhp;qEh}fiftApv@FnmSFo_PO>QvPF@w*L|^yJ!(kHho%`Wm8+#2S9hr=a%r1nD=` zu?xK-?dtK-t%(x2fx=rOnss1+twd}Q-)N9(AB!IUg@ji_ z-77`OILUBhdDms;jMSwn-TlyeYm=Aoa9eIe4bNr$0<$JG6nRrh%fu5!Ea_o*z1##V z8lEm{n8~gqmr;8Cy_Tr(0M#oCQSFK~v>A~5TiPgy-OON!7mHU&;;|D;zu^(!Y;&pT zPA7m3M951q{0hTt3#-eCTv%uc{BAfM8L`#u2+Od*oH`K}<1?*5Ftr9u#__H{LQd__3?z%_xw89o?VnDquV)JA6 z&C~e$D#$0lx)FZGTnM&Cnb^oY-B5`4O|#DARTheCNgR>~8;>`34h%+k5=fHXuIB^G z%Q=q{{dF`gUk)&(x1COGQGpH146lSPUmkInxgO!E2M=8AK9*w_a}?hk3!7dhIi_zk z9W9;U`@!>I|KX~b2)e2&5rwy&aEdh4LG80pk8Rqvr9?ADH_lPV{&vBx?!MO+r5*NF zJB+f-BCANxZhcy$g>UehdFIA-Qf!~oy0=BiM@b+S_Y2&|>ta*-%UP8=rzh25kkpTd z$VJ|ccWpkW^xM#ZCz8P~e5C>CJdPPrIdT7tXX?Fc%rImP(R9=ugTO)1q6!!FlO(j? 
zbaK$LEsJsN*(CaN-VN`uUz?t{ zIjy~Mf{?b`5t)s$Rt;X#^6f~2OcxbKsy~k9hT#ys0~om=9R1zV?egw|%m|~Tw!Pw} zEY={6gw7t&NUl(``i0QxsJ9(N8EuX$_lJXsUbYH>Nuz%+V6qqk%PO}9q=-Lm`ev?q zoysM0`YTzaXXZGBFQ1Lp<>9fWtbMQ#*?4FuZ{HiJ%{>gdb>?&_>ALBMCcacxwX8Z175)`tZb?=O*N(w9+^}DjpNh*LCxcWJt5IuM1Af`F zWJyBrWopRPNnx&U`MBzIGz*`~>hKaRe$f4}3p`PuUq5($1Dwe z3MZ!79tgErZ*hW5d1Ez&x^jccOV>iyH+{BXa;T>yqF(O@c0RsF}vcVCX3}*n^LO zpCV6mjg&;hk+IEa&-mp13avgo7Ugen-Olj75hN%5$P)Zy`im5vW)RakNkz7B2%O>k zA_)czhd$j05ow#5=x(@w!;Ut8o7*#kUll?od7XZ%N{&$Q`0wGnT?QoeOq9CgZ59|$ zhv_tSr!&qN4(kf*_>2iHct+%kXXI09x@+kyTa0Rn`2El`A-6qqbZ@$buQrO&n8Atv z`$W_yW1A^e^|VZxQza+ppE8^+t15~ot>dbNP-j`gILxt^>2n!*vV$<6rt|7sKsRU* z$S$Vy*KC^HeeDTUTtc1q5>r}XxBzpFzWK;wJk2dZRQxf%6PmiD+Uyd{CRlqnXz6tA zE50r!w8CHY6xM%L|yjq}0 zMuEi>TfDd{pf&j2%ot}YQ#0L`tsaE&NwpbzLw)R69kc{Y8_E!)wfR{k!bkw8$pF3w zikO17{_8#W-Bel~LX`CMyqz{9sG@)PEC$_XvSgl><+6#2ZgJ@yd562%bV~`IdvI}y z%lOr`);+UGYOK+t5Rn+>5pN1c?-;+(T{xLM29kjt5;u40fSJM*b7bmiUO*x_1BCYH zG2Z)N6oDbNiK#_EE2NP|jjG7Ke^I1dk7R=ylL4?~l3sScYw>kD0HIR7i_y=;5`t+Z z+Cb*paMPI|4Yd?$-C7A%ciS{Xc4L^6iPRH8zb+)xmZZ^ctxO_EhBZrv5@}0nmKP;< zn^~}zkqPyQ)DS>;UG;u#I)JqXRbQGHl(-eUJ*;CgU=;m6C%ZD3_kS8IYPUp z^vr=%P2u6?gwZ}zLf#-_#`O4v!Sl=aS*Kc~I+P--M-O1l$i>@ikJLyHL;0G{j$P)c zla3`Aj+0YtVUtMaLO_LMc*!!)Sx$A?qC*=(t#KN5IgrNW)vr>HI$IjxpU-LDpNOw|v!yDO zbsN<-<(B#F)CV%Jn3%t1$NQt4dPBu93Dv1P5fVNu!90c;v=l{;&vXr{)e7u(bQZA2 z|1z-XqM7-P0$Ft~sk%4`PS|V~b|sY7BtnbTl(4Lck9R&Nq8r90YMk2^iq9hLC}ejU z50HL_&{s;y80l=3$3PUnGXV!}nk|=bkT$K|T8Z92{Y5*Co|`{G0n)agMrUlOfWhhm zcHA+C1KzJGGUZj}7N;RIr~g!800Begxcp+mW839UV6}$;2D0fHevM4f9mPNiuLxXe zeI*q<5#jYwLwOh3m_P091S;aF3HrL2Gn&$O9_vMZjV>_Rk3#}a)#&f4afZ2 zdWe`zXNVLmqSu{WRmX}cX|M{T92&m@=aXq<@<@cw9Y z|I)x=D5y{G5tGH)z@b`U#h2QJ7YH_(S4I%4lNaE*yRJ7#uZxmr9y>n^6CNwn8oL}5 z;qpb$^PGuB#yroWU_)CMWJ<$Av(qaOaIv zjL)W>Y5#OEp2Ni5tf*-3rxFQnhFg+m!V_(Y$jrMm608b0man*>_mD-^GdR#|^zvYt zd>x&sD?jFUHL8_EutdBXA1is)Zdv`ajy91mMcoiK-tEP_#~hiX3j;au5K&k&$gsq^ 
zlMQq|3-ZyU5P82X1jgchYqKT&WiIz+i%Zm+T;}>`WmmGf21sN*T}wn5MHs95{wy&u{7D25ATR(#kO+&*-bV1Z5Ba?&H)anh+(x7*TKW~DDA-4sN|uKV zXa};)kMrvYWd=Y4>oZi;x2HsaK={Xs{DKwCDf%G|Ap-(8fdDw8Al=A~)r2`b3ld-9 zGN^g~_<*+MZ2_n&DrVifauM7Di4*4`kmP~(bM)!da0wC4_yIyi_!)Wqs03Wr0D~P; z{Xgyoz}_BRAphE=?Hzy~WCU<7v~zGi#}4Dp%0vMB0O+?d1AvfQ|IE+sms;PiVbD!* z&TemFKgw?){zi=wPXYr6R@)gIW{Cw5SB>L=zV1X1fNyVhA3z`Q&UfN_{6_`C|0^5D zUr2$j2B8QZ$O3+iFl*m0CZYsj1bYx%Ag)eNAgq!5f(ritQjkN~x)z{UJQRTH!2{sP za*+SCU3eZsEvnOxd*FRFCZKmh8@G`OsU*z7-k+H9CaCvBNhoiBuF~-p@yoGYP9h0? z;nsE%&Od1VTNC(XpUON)q|Lo=Iq65gMmy;D?P?!G00jg{(vJk74IAJ!gbU)wQvd7_ z>K7N}*Fo?4U-2`-0raY7#Sfg|YVTj8&*ug~p2JVjvk&;|$8~QPjSvH-58)4tK6I-% z0pxGF&vD4+w~5|1Kh{2MT42o?IG8}M&#!x9mmvmBu+z&s_|InqXfvwv6pNGFZ`D_B zUI5Sp_T3@4o_&(uKXAscf1tpx4Fls}@{>>p;$W5_&o^N?&V@qYq1L???&3C`_czY( zGFGU~-**Z1+;b>cVrq_^UWKmBJHx!jF|`!~k%8}yC; z00BLyaTb3F)tQ$;jcHCuCmis7_Yd-$-o|pkAOarUFRsd;vkIkQI7mh7#IMOUEalwC>Or>yU zEcG#F_nR8n&B2cZpAW$hf1Fux?78BParfI40r7XESpCEjcU?rXQD#fPrv1s*P|ohd zB{wS{QRtVe^2+t?8XsmaPZwLpo~u)&Nmdmg_2^MBP@*zn2ekC73yeFrKh0Fnq+j+_ zZsW(baR55_Gc~IHbDin+%FBe>c?L@>Ni=)!nbFSL3r0=Y&7!@jw|zLgMnrR%)PkJJ zeA}jn(A!jHfLDHJty##y_H|5}zVoa&^``Yc29(Kpj+PXW8|nyidl7#l6mM^_;>Y?< zsbUf1+{AvJoRk7iEX*z8zqBOJ2=c2N(LW|y%4-7LMWomL&&eB4U&BB_R3I3;#*T?k zmCgInN#JLaI3{v^!5B*{b1x591(e!J+@`}OTaf89;LsFeqt+~Ih8Gq_iF^~+=n323 z~DNs@@fO*QNpViPI(V7M@Kww33N5u)kDiV1w5^-43#uymnKMIX6h z1Dy5SEx4(Wx%WumV#OKh2u!;hFr4vB(kshrVsCF#n_3qj8u_LJb4NtYN~;EFlC^O3 z{X&mgy?wagYuilo62uA)<6nw5L$7VUpJKx;g;|S3?-}=!0i^RY_iGzMVy|IKmDmsX z78QAX^fyhuroMLF!jF%WEOBZ)m+An?@;)O-W~y|+z0s1E$-wdSe4V0s0&Y|V8uK?D;HRX7ArlWknAc}zKqD8y&;ZoG3LjNa>V{j!P4V(8l4#h1`k`hsRboFW z=R-PJJ?cJoF7v|nKEu$Q-k*BZ&D!Wm*4IaL^Fi@7eXwtr;USHua z__eAbga6`_^mc38YZ$rfTM+g>vMqsG+X7S?mCq`qs2W z<@YonElJGN#jS0|)i8*pMRYkhmm2>-v#?t3D4aG+TOpiZVV~Fp0&wq zwg{F!p`1?gr;6a2jc2E-f~>wLi8lrtX3AR}DJiUvV*y4`&G>rH=Vc12x!>U}F}Cii zHk9~~2hsG@G1ww`MPASg5y-6@K0B55K8iN-)p+BlFl?`#{BoPUO={PqFFS9Q#NzcW zE^7ZgImQHU3w@z#6p~YM$0pJhv=?2EBKpqQjbHomOB>e8!Q~Fmi5|fwIo_Z 
z+a>j4jxw$nr4}jh$+le>$Ru*!vk1EBRF66*q%h>)ocw2Gu4V?bwJ6nJ(~o3mq2=Z& zmX!yt;6HUQ3WiRA2ib-!q5d?^XIe(Bp`oc^EZMmjuC9jLESNVQW@S{zT|fMJMpSITRwFr`UJMi_$5EI^-w>LAF$$n>_dL;|^05OV z?lL~6hKds9URcVPZv;=a^FSQNY~Xl8M+$c!>(h?u_KSxY98bujS2!n`d`&{Xd@z%ZXSe52>6e|kvRemw@9U2?c>MOenhJKlO^;|kV{MkX)}^7nCi z6uM49Dn058M%wu;eEOvLA1E1=BKxb*|IX@R9#39xGCL2fb&227)?H3ANz8aeJg};_ z#Gl5hb6!c-^I5CG88vXO_vC`nM>mn_3TauEDTI*o$2^Vm`~C>uxR+gew{Stn>^v2h z)npv~Fn4w1Zr(y8w3ZKzjr64bY=qf3ryT}HMzCQJ@BbJA3LguUP}X9!(kQLU`Bbim zROWmHwNzDdH?403xSS-^jq|B*X7}k1MN~(xl$?fdoys?Fqe)ll1gN05(U75jRl5U3 zNiV$=p4ORG;pQXUmPp80VvLJAiR)JB;f>lq<@A zaoY+4!oGElW||X+gmfs??7Ff0k)-0@^^1g)Bc z@pvU1nG~n^2fVV^Yn}Tvz9wqjwOb^o(fFb*N;Ni+I_Y%@gp%)L)q~k@Zovb34nEqJ z`^Z}Vkm(0-IqGKA5D4nPy0f(3a|Fkg_!F@Ky&4<~$vB17Lhn>cB9OtYNjQt0;)^eI z6DB{PnB1JrB`aXEN#2$PzmxL;6lrr;-B=7)w6SmK9p3B@lHZNZQ|6`;LxKZos!i9n zNpgv&)4jBJZW(CK2lohjkJmPqx2rfI)M-{`yMn-NL!gx%m>v@Bv%c#-2{r0Bruj0E z5lOlxTCj!8z>+FLm*sm0arj$`)2?_5>uw#p328Wl6W)yF2#My>2i0&B0yX@u_A7Sw zjx8^zG3^ON?L}c|D;`YyBh2rJGx$Etm|^mNv3$q+z;^P`83@j1Xwh+QWFfQx#HJmC@*h$qZKttwpi=&{X`v;X6&`0cEfU z5YJ@B$tu~QBl`EFN(>)3HYIjW9&YgGR=npXlXuful9-V#-sL!mN-gS|hZlR;lzV*$LMjam-W z&E0>zX1-})nLQ6+1nJ~n%x-(s4UHBOfp;+?nG}84&kP=o)wxo}k%aL|M7GH-W9+`# z4YnOYqJXvbZXHG58A5Y`E3=LTA>VVng=ZQ}ZJ17!wAV`ArahB51v`I?4MK;4jOC`D z+Un`g6a(uL$|%Cp36Q`Zc|cxLL$9Ak`6dvEH@CYUc1kM~WJP4y^+sm1CnMK_zdk1RFfE-bR*B z++?x8{HpA)JrJ{_Z`=LfD^?qb3j@DVma~oI*CigRYoUcVgE#*^_Gs4X-(#2viOfC~ z+4crcYB0SXKbbG>nV4>oXc?;FpQ%qn1Q^r*Tq(%XrW$tJYMhrXF&1-@AXQ1<1{C{< z-!41Io7RjX&Ro7`?rS7l@19vZa}eI6I-;I78uolmG0{V8Z>6f3a7e%g54^57Thx0m zd?f5Xs>jBnGR;GSC>F6MFmUv)bbPlIiazHQEi5o+t?4XQkD->P5>TEq3x-`hMyoZk z+HA=~yac3*H6tsi%4D|D-LG@0bFS*1(U@p$1k{E)DNV%%HDMxZF(h^br=T&?$a(jy z0~R_QUU(|Mz687Gj%s>5i+*(o@p%HoGhw>2)$Tg`Rxey;Yo2$PZ28MZj<0U4lEz4b z8B4Ne)bml;b~C>&>OD^simsAaTy4|KBn&%|>qwws6uHw(tu7>kea0|$GhzCaYmRlD zG3ofL6hB^txjmy@)_1T1<~%u4<6FAyi&V)v1eA#j**CUYTJm~u-raVi$Ga|j;gqIC(O&`KDz(u z*sdU58|;_?JBw^LIUpR`4^jsl8BjU1kvza31v&pb6z-gDh!K)`t#C$ct=>J0_V2(- 
zjHP^M@cit>(;Qv7a27HArTDoddv>u4tvAqkFnyvx+-p4>h_im`rGYfdR$NrQD<9)4 z6PET(^yU)D8@b+4J@!%Ei9-X*Px+8)6SSj$aEn{0G zi^L!ZypM~JKTyeF1Y`NRVhlz6_%;|^)hQ||JBu7>(}f`jYZH%UWyxPK3%r(mo$@|J zr~Fy?OH$%*`IBouy3Eq1Xb{$6SGMqZE-_k;hzGdifa3(CuaYg62J-MQ9)F@L#1@Cl zwnf_IXCZ2XaiIT-TknrZ7JOV<-z9An<{4{y%m~2?mk8f6ZljSy zC&ZF4$w#C(x)b$BJ}U}K-~su^P1u;eMxya6*1R>nhP-Pnbt`LSF61<7T1&IBKEB_1 zFtC427AgJA8BeBqTsH_b)@@LwVZ-w$sUWwwnk7-x0Z?FbqvN}|^fUHrm7RaW6h#}N zBzTK1rjoEFe1njT6vt~Qyr>GoC&%1x;EGBf+!Cx)k7f>9pGJk+s(owgeuUJZ5l`LK zSz8r6mvTTh*@d2@A8%oV%dqEPyKwnbgucdbytN5o)H_oP$rf>$zWt51e&)L%J{AB- z_HLa0Axs~8TA*W%c!3r)=npz-HmT_T%qbb=&N%tDhSED0QoFyT*sV5#{u}Ef>I9*o z+FHGv;+s%b5BEuu;#6K!-MzE@SiIy~m}(+ww(19;Vs{jv^g;<>D(n{WUyPkoj3+?X zr<>C@r)_iEwr$(CZQGuAZY6=djXUN*#Rst`Jx=@mRK`a8P0-lB!qqs`Dhp9>tDk!$NVZ#MSKB=ZX;!b~ zE#|vNzRj{ve7VO1Gg^JfIrHO^mmxCV;2-ywSZ~q#6~d<@n9R_MWNz|u$C=fy;S$rj zup;$>y%@(_t)#l!FhVe^Cy;S>$r_eCQdyh;f21*)AGUU+WJj`hK;2p?EU3w^VLL4a z3*PrS7FtOu7l(7oOI(N$xVMTco0-Oi8D$pjKBS!`Uv~Kn_O*xcpo2`$V@o(Ev4diZ zP7%YTZB)j{@F~^$EfsF0b2^9lG(qKe3UsIMZ-XiFI`jA?i_HGSv7}%)+eC@r_Se8OM88o!xErpjlI@@sPhW zTB8X-v*NbSN^L>u%GDwRAd;hPahOFT2_vBpJyCvzk!4^w{ECG-a!nDlh-=W}d(asq zvgu0L7R)%H#(t+29>?f>tK`U20rm(GM!I@FV=ML|{Z`!HmzzYp-On*t3{8Jd_}@sZv4G7%BfmF@7L%hgqhNxs#|TPp~>@1Jpw2^xMQS~?bfZih(3Q>NH( z>0CBT&Z1>)wf5}Kj*SKuNvA%)P3eWin#yj1hog-hz6-0y@e9v;Zn%;wPY7CPUTsJK z`C^!eyD+q_R9C^1fIqOZ`7x_=gojUBI9Ehoni4dOSvzWK#Xx z^Cxx)W2Amjv?P(y_ux=))aCo&yE}5KK5>=?rLvHK_L}Vzi0%30`2Hj`yasMnk6!o5AF>RQEh2*r zYP*5g4Gi7n#ez9l>MU%S80pXZ;f9y2U-$rB>xVs~AH$D}x-T^RK|DC{uay>zOO*G| z8ZZu~EeFDyy>_SR~24j@&7rS3%+Rvs^8+cjB^L#B?8lGp~vxBu(*5bN8 z`Fm)KTvi`8{|Sl1F0u^Vu+tar#MZ>C_i@9R&=0q977`rWgv2`-9k}8e+}ZDTb!EVo zEi$Qmn@ZYMc<$jzgQ!uY%&~dk7&>p>J|C1p$g#r4lK+%lP_KKi(GPS;lE|F8uA9e~ zH{@9ASkk3yFAZ53Wi@$!m-N>zR8>=U# z$m;6?9sV*&z2=fs1J#KfX^E6zD_PE-jb=B^$!Kl*d6pUNElI!NCDELa;t zInn=)k^XX@j6&BbdgN*-3Zt@q>^>35URKWC$ud~zTV<_!ZW+4FY9K@)an=8hs14^c ziusroTDa_iZ;&?=9rjyBk;&7GT^#^&pI)1*^}yk*De)`2x$GSp)%Q8`=WNc zjr&s5FyjHFg_3%Cb?EmwmVOpSB3ZR>$}C~Oy^v_<=p21uTi%^>8Gs>h84@6M;;O{~ 
zWK=AFcxAyi|2+Xr?G1>SFw<_!fu)`6{a$FgFj5UWyO0fabsfGXTfVsIpnLY-XJz?WV|GrmB z94C%C-MKKbPJOTA;2wRGZ9X?DHFSKp0RBq;9_faFkgLZfCmoB_Xs6+1)uOTtvt3>h ztZ@=$;~2V;Qd5P0?M5<8PfWm(B!$J0oVurJXPZj2P;S8W9n>3&zx!OX zt+6ye&i9xpctGv+^0+a7&A|p!=13sDT@$IMLIch@EDxXdT0bn-7%QlrCtIDDjgCT3 z5sd>xvgU)DKuz4GO43sr)wENBAB?45D_OvKq)N<167TRx8l+m;-`on5=}^C({;rO9 ze`)GpQC+%yh;~ClU|b>J15P{j@Nre_EJzZ3U#>uA&G&KZ5Xc7W8Tmj0yQ--K2`Xv# zkWB(Jz^^ytdWEd#&8fFLoYq3QEkcbQ9y+kOrwRNR7)C$CDr&25+Kvo$h?%Vq(*ig= zxSbnwveUB>pZT(%Ng;L6HZ>#XxcG0t$}ZsJmIZc^}|u&_+MuyFH~ zTQLN<^E#5y;%t!O6fw#E(C;ztfRpT(&mYa5R?}*pkM`?}nv?cxo<>%tpjh74D%?r* zBCsH$wl|O0LE!t$_mWRU>=6T2koJK-hO$>bhQloZR!EW( z{UCm176WEsbjTGuH~%V_QC!%j01##y#8?6hW6{5GV1E%n0|g-S{DSqp z_QUzxA|LKOJv{DK+j>3pui+IhwSnjv`jP%Z1O{#L2)O#|(SkNvYxDax4+jmvGT4I+ z{9ryBJVVlByCDKW@%Qp0qQ*Mk`L73T0*_t-`KZA8w@^dAhgIE$^@6`zIRSlgfBrzf zq`lPg>z~@0VG|%8?^}XC0Sn;*J?le4{;e+R3%V7x2e$s}+7r%-$PRmEcVGb;y1GgP z)g#pkOfhy2VjYkBy@Zc*3hye?>#?gpT8ro57b0NWNVF)6cf1c0NVLn|bFKu`8RS3e z^RoAKVV{%FKwq}AL+!<|IQ^}L?TS~^6)Ins1tG4oM-PwO|HgI&yU7+kIuIr)eqW{AKJU0 zTG1hJz-LS27vU$LD>eqh;0gUN#dbW%Lsawr_(q^Fy9&nBnN!Qa%iez^)J8>Qm7r7r z{oG%fneE324>00q&!7!;3D>XyOza=)r~ClH_Q_(0$beb|8WiNmV-Ts_BfE{h{6*zJLsY%%WWM{7E87q>hn z$QcO#GbT`7So^^zr@DXu5dA#N@zY6Le9O+iZCC+^ee(Z-LF5O)Lj+qz%XqQ#Uu15p@^NE1c> ziNjLYkYcuZSaoGdG5p6lm5QhYO*RgBF{hEcZ-4&w}E)90!DwFUxUMdk`=51g!v^ruLPc;EQJ z3tk~F>koqsk*G`_@-RwgmYFAJ8J6sJJ}<QAG6yH9w#yyWW^682+vT^k<7Bfdg<4@oo)tJD_}8hi zGtlI4?owW6V_n0&l@i=yN~(I5S$<$@UI~S5)j>njD>%(O31PtizV0~G1^=n`$YqS0 zZC(+ucv-rHv+`UMW;QF&FrLGmw+9Dg&7XX^0Nv~6?klXTDDDu(oY}-_F!D@iGkka) zlI99%ohhW>4iwPD=Svs{-Kv*RZ%~Z^p5Sk%oH;z>$fkb`rIezfL<7M>ehEkY!s?{R z(0q{@8ZZemg}Rf-mZLJ)t5$~T@gn>^PnJQ3HTYufa{ibbc&z|p@bwJ31*pYLjcB

6{W1wu- z%)uHWJqV4&{DD^P{UDNzHe*`CA*us95^!j;>st#DudIzn!_8#kv5O+K{4BZj^jwl_ z6rER)8>nQxgy|BchEhQOx-KVv{RXX#Xt~wG*HE|n8fzTTfiYsU3_+Z7oupV8!8XA_ zvJn<3fqJ-!ly*jyW4Q)vsRmoeUu;tARi6K}e;w;Kqri{Fse5+ykbigUzqPFa{I;4~ zI5(j#$xXvEYbKv8(DgNw*qRQWrT!xE!YUI8t~eR}iIk<>d9G?Q{o~Js97yhG1WZAz zab+|BUwGG)7Z&2UE~j{CiHEnq_Dfr+$ig4zLLr)*B{jfvDUXIqhsYK3q}RmYJY}7j z=?=C6!4TUN8*roOJxBa*5R%T4I(qKomdbHSIPhGC+;>+KZTuXt7p2Yjb49P`3~CL= zN0lPOPks?C%$eqMlO@_qecFxlL<8}kIvOVH6>zI+jt(nO%*?C9~XQQl9C zuO~`jbE!kVK6{#T;M3{PQ2FgO0#wXcJ}sY9qQT>3nTzoxN%x1bB;U?*Jr}Qx%xom? z5Aap~z=ZlA-oGH$u;8ZeKe*h>O_w=o;Jgy8+L|9z!={U^>}ZmwKAYzCa|+R5WN2y_ z^66%4Aw~$!BfDzuUC|gA*tw&$w$Q9K{)Xj+2V2|K`D?41M7{XLx$vVTmL?2c?;Fj{WVdQ+v}R zQ3SAc!vI`+5-ZG`rvg^Enhx59$&aHigiz&{s}e5{s04}`8rtB*9SXRQXL6AFmS7so zbSLVcR8rGSjp6Sgs#DwE;ID%Ht#tw)B;L)ErJemNWnQ??8U2?cZRk#oFySW^pE`%fgb)tv8b_VyG?%=6#mn?p*3TUFJo+gRy=OxzdOUQj@ zJZKm?0B`YE7&0;V))@Sz)o?nsPiQ@kOua;`M~!u@1R0N2Urw-Fe_7;QGf_r|@1^DG z=jx*xZ8blpPbQV~#?2Z(>(Lr%S+HgHYe6d1)`dfYQfMt@^*!yp*41~RdDF?5i)+5W zQxA?H`Ta-<_pHA@`NoB(hfyzjI_{Uk{Y5|*Xt)`!L{J72rU`>) z>;Y{3D&fnR$I?j%L#gI~(GxYYU1>jj?uT~z0b6#QO8=w)D(GCd1K!G^rXA-qmE@I>8-`KA1H~uosSQhX zZia5>yQ|xw&kM3XqqkAt*L`q6-7;rY!{AA>6-^pNy!qh$-I5{K78M32H=643pA{$5 zUzCaIJM!xiMJ7RDk5Te01-HjjAi+sxsS~rbxwnGI+L9uV-lO=QdR@k|^>jH{C=1$X z%g1!5P6N6*^qDltu%dwX-=l-D!#^X1_qZlD7j#S+7il4chDSLaLekl>Oy!`Q;#aOW z^Z0pL!(+t;j26I4L6#}dhyim{R?Pbc>1V!?nq@PEcA61TBVZNBi6^#BpE_xnbV<@D zaUr%ohT)OtV=2PAmfs_rb3m{v+>PL}R2H)2wsyInLvscF;d45+bQm=35mcjG@Le^i z5)%RE-8?XJ@{tO?d(ZVSujKHW)b(Nm;sa}jumVTi^LFyLh_C;X${5wvoX-Z+X}1Ig z$KHqQfc}k%75mMvL{5A|uN2=2A(z%wRTKli?tw7`G%R(YPlYwSGWH^-s7EQqA2H!m zJY0M_&13U$=B#<@w-j@50QX0i=G+wMs>&Hz=!Bv&52G1=-?&_(s?J1+B)1h9Cjw2mrXU$p zk2jANdc3-7trEQ`ktI@MzpLV8r~$1s=GnEfNw8qpomcXM=!CBEZJFHDY^FEwf8Xi6 zENxl@?kwf1pv3!Yf!z80nKH=glEeUtaDGsOtj4Me5fZNUV>ki*DBro#yppqtS<{Mm zcInBgV2@Ns_zCy*J(6Xuzp&7K=tHzp7vVoiF%fRG)+ym_vFjXrU+#D6Z>xk-^9Z*-i$q?RR+&3*OgSp*u0(lFVbGhy zDsFlvmx#<&`%X?ElfDQdNI0L=`J%U%jeOLENo(ip0FN{%`0+w 
z()Qdb$iozi48U%mn*2T3OB4QebDmz+IXC7vJ(i=Uq+tHaU8s8&wHE_#RQfI~3}&XX z5^aGX9xR+$8%cWwjcqL^lT$rMvbP&e`W%`*rJTaWh;I0GJ`oZ8#C>*6zD8+{c2}ZC z&nly0Zu6#;*~+OB`nBFA81_UXQ`$&**KV)NK$$XATuDKbhpiydnQQbWZPvuMs9~m> zcnN!kjh7fT*rf@eQdZTQcd+&@Su)-Zt>LtEK|~{o^ReVV?&LM{^LzPDT6K*2>NWN5rxZaadGeTqEFWyO985Za|0@PM=;0_9n zadN6NQ@LSzgKlc^!#oET|4}IF-`gHr&{nl~VB9kZf;h3@&`3Gxzinbvm>pG7Qkqm( zx&Xy!{}@kjm~oG08Rq@k7dvNspou&pHNA7?<#nA&I0rw)V=N`%AL7)Z9=iN?wqG9_ z>zR&6(~0K3!@#-*v?V1cBXF$bj=~zJ&%c0ftQ#ObU9*iC4H(O!-lx-vw7n=JsK&wb z$e&HTr_OwM9`NqVrPG&DO2zW({OUq4;hpbR;%WAJQX&76v%~pS{7#3^)<@9;AzhTx z3lXzUN;lytQHpa67p}$Jq`t0PB_&9Rm|`iD1)pqO*Jk{KQ2wEHQ32si;o0ePosXE1 zg&c!#--qnXMnE;})GEK+ZY6a@32_Lke0h{SjEh7SV;B&wli743E+c?TcT-7q6)E*X zsF;S#_ypHX?l1Hz3vy3)Op0I8BvKTZ6Vt-XLbxlAu&`z8lL?OG2A9+ zh2gl%K|CR9EGh?6dzmAmCpo@xQD(MA9O-j?g7AgEj&TumJz{vVBcC(a@3yg55*y`l zz!N@pAR@Hn5fmk^ZEaqQ(BEEdaUj?P%zQri7;^UQ@% zQ3Vf{E4>q4ir`Ryf=6inPIWO!?sQQ|u>|d>Y(9a?WJOjUT6J?kD4CVU%_u~CoTu3Q zux=~&l4pfQnGkyq^_9KrtolWvPeW#shbKP=O_1I3+q@lBShz*`>ypB`BAVjr#f--4 z-Pt3t&0mRu+fbIX66rwQAInZsdVLfChJx%sw)L8DAI^a&Mfyewt=9Y!{6pG>e(j@O zkoY4kkz?*f?g(Y{xmtfI(3x?ID#9KKv+~|l$lF@~+W~}kJs1nz%4+MW%=m5Ei@rL} zBZf6z^3J>j+a@0_Cx@Zdx3b(^1c3SL$))eKoKbWlZb1)s(>MHXX@eTRUZU-fsYqt&ykC=&qs3O)sH`28q_@0s$11amvP7La|xs0J#6eZ#@Vy~^@>V+t<`<}jL zfPB4j76!gwbo!d!@WA=Q3{+;AiX1~W$_AltgV{vf-6umlm(EF}+>_o_c&wPKj3vr| zrlU=tLR^D-u=1zxUaCtM0Il^rf!u24U)ju_t=^X&7S$~4adj$Yo_p$C~W zqTf@+LzjGLPVQv_%Z7eiv|a&8Xk5MWjm2W@Vr$X54t|DQ5Z?wITaUqyZ}Ds`5hLuQ z6rayNu^F`@uz~)a>?!+De0g~GM|S9xjaoQh{i@#S7PZ(8=LyBgbeE-aWUP#jGH9`4 z*9}m@e*ayQmikbuNV$5_*!HE_jIw^wgaLp;JWqLHL2~ce;ub^-ft(aB3Px7f~54tnCB3_;AUROnIPz6ygKp1*_2{O>4%=2Nz z%=UaU_ex`XPXb0RD%iXm0iPX-34(vuK*^Pml4+a=IO?<^=$W|8iu--lcXXdg9|me{ zl5Ihkjx2I`gWGy7ZlUAFC61zwBq9Xc@S2{YGs1+oUdopT`qS&SHm^1Z{XDvedE$(liiU#WH9K>Seue<4$B;h%kfB4?) 
z0=MX}11nWZYuK`F)6>c%lHlNq<3%EtW-1!9=xlZI@a!g{1;$p`QVbQ{9ZT*2qQ=$- z8|wldm@OPGSuvfNK}v>@{bF0irLbnFGEokWEiGAyRrd#q{Tgj$mzD?CLO`V-^sTO?WKns?SS%D#`0&kv4r zgglTrS4kJS5Y;#*c-Y_w1u7rLJ`&#GUPSXSgXS+UeD6ODFJ5*^KL)lcKvkuXmU$L_@5FYDnBKstv z^uke@DtYAaaHS{cfqN+UULZY>7|~%QOtrkJ7(J7TBzw3-SYm}B^3l{H3kG`jEaY)+7RMRjSYw* zWxZHHjEoJaC)eg()pvLZT(X>ud?2Wy7E>h(Q zrnq%h5VaRC*<9DnR|}{(X~s0;brU+1-%`7gizW)Uj>yEYga2ze^?L|vZ{S%sVETDW z%kHfV$IM?sk4d`vz_;89@WEN;Q&iq^p^vhYD8!SBOjtS8-lcu5jJ%80CG%d%M(=4@ zJh?GOk4`(C?$+?~K_ON%XlhErSfEF}=l?eh$?@MXBoiCs|6oWKj{i}j|0jlI zWoQ2XAH{TmP)^!vp`9OuoEL~h_CyjqRq>>s7l8cB2!$vKB5~TS2Alw2fC`*IML;CO zDNg^-hz?Fk)U9SmbCUg$_cH8mXB>UF;WX9!tm!rL7Lv6!D=hvy*4KZQztCQgh@g%} zUS(%c5exEHR6wu30uWm(CGtu1D-3Ac6cQ#4a!lfv9+(O&NaTQ#r94hXEi`B9i*BYHi43c>QaK;ciQsA(?G&Y%K2 z^aaeLyFq#gW>F%XfUt<`Afq8b;6GUyZs8Zf@67%n`eU1)`wxEi$;`U4%TfFS9t52h z#{>x(GHWZ`g@i@y;g}WHLNMQrBV^TPxhHpJTIvtNGu_JlngG$J$t)Z)pFUT8`I% zpDGuK7&Tgu(HHy#tsv4~hR?LopVmdc($5{#@5=EXsl%VG_|VSAhZ}~So7W$t&<=u} z?jPja0+*3JxW9438$eGx$}ofY3+KeU;%;d_v6csa-H9m*FYgS|k&z$*hJJR%tA5SH z0J912>%;qz{DY7mCM;$o&<5edM1I{`w;KAlenYq4fgF1~`08DBeqo62AD>QoE~v0@ zKSxXoP!a^FK>?niz3v-miSdMnVspgRc^ zu2=OacJWgTt}$V^4SK-+cb@ewc3edXx)2$9AHlH5FpgiTs_x1ZGlrB-H-OGg=%?l) zQgalL*>lIt0Pvn^Oe|ts%GU!EH7~`#WxK;MGj5HLR@t#08*HcGbETF%BZ=j2wB^^3 z$FF#{ZJNb4g9|gR!iwe#Fv{5Qe{cQmLLdyiQ^`Z%$GF6S*U*jA>lpvy0n&a zRWr+6mCd#F3Nh9IqeaPygZGlvQ)EN=DzwhwqydhL91J**N*0DMIiW-1b`qYQezpjg z{_0f0Nv8a`AD9yJ!g9g79F(MqKQ33mC2ac%s-9MB{H2y1Xd#TXQs`(PLEf0fS?S$e zw37@pm3Zgdz5fn=7t{H$?|NJnhuWb+IY!}_am4$);>FU$iN!{_F;JKs%nmALtoNMi z6FC!U`lMjX_jp^ePF`G96cet(42WWgDH|<3Hn4aF%8{ndv300AHzVpkrfszY?`S+x z`AFnk;#QQ~UYZJV!9V`pIv>kG^nCn^wdFfc-NHH`T8AoUy-Odk&kWUIi((MI?yCjp1IgOKlpID!LOF^bk*h=r!i7>a_6#PUpfYe0aTW^b_VxEJE$++o)n8~ z7cp*ru8JS-Oq+8t9(8k)K<>+a+*1yfVlSH zr%$FaI^m0iuF33%BC@Exhi|e|DxtKDJyar`0t7vHh`*jI%n1l!xPzTOu+Yd=P8s-X z1;&^ebyBx6GlY^15-Kz)3FvwXrPoP2`#pYXRhxBI_^9N0-PEh#*EA@Lc~h?gb+cV` zi*SciVZoeC_A89Y;nUBOBUaSTb@{j1<>#IhxK4)QIzyTaHd21h5JNJBYbUuUGQat@ 
z4bn)M$R+U$YOmVZk&jST^;sGM;SjcR)cAhxuA?dQHdnbkTxuf~LN>vPN6WNBlq%UeN7R|7>{3+VV0o2C+~U zw9S;%Zb|9r5vF>B{ri_oc>P*lA0vkWA2OWXxTNX%&!lxb4a&Db^FTJO{8OWfJ~!r} zfBSVAtw5BkDwJpDx{%yHOc6Jv$L8Lj-1gJ75p^um=JkVLPfwx5!=W~~@K2=h3YqHa zaHSS=RyTwx^Ck?Z<*Eq?z&DIdO|B_qh{&8x$GVz+ztnG>pRE>rWTB ztgLM+*1QTe2_N-0D7Zg~BgVk(%*6|Gt5In>dURoDY4MWjaBVQi6vD(C^a7VbQ^`2* zq=*C;?;gFxTSg#-L`D*mDwT4K*=AclR=hgoC0AaT_X!}|u&h){?X}&qAH73vcC5(l z4jL>7IQ2)p&!eW1kyg;ifUeuN0<>gGqImw*%770hg(E__V(h$>} z{`Gm2jd}IEb~hoiPBM9r>6|R#FCjP6pT5DFg0FQoL?T+jH36C9OrXD08W0BZQUn4S zcuL1h-N@?GIrZypCeW3;AbAFkNTd1LuZDG%SCUsbGAj$nY-lV5aHKcNk8`j*Hv^;X z0ag6xof0Fs2sxIY(5cK&;W3IGuvUSpe7S0_WsTut&8uYctksKgMI+NY-)AE{RxHDQ ze4@b1csjdjHI4Iwelu%Ki7@@oHcTs@36w5F35PKhT$;UE(;IoYyo9{On(Bz+ukSH9 zXPPCYvB1ul)S)yTq+Yw3p{;midPC{jT$9XON3b^OMU*_YAoL}VOnDm^i>Djj2hZIG zavR#&wr|O2Lc&nWm(&XA)gsSXfa@79%Nbp1==&~5{hLg9A0%8hmdUQ`=LK|i5fJ_I zC7xn=U3Xbz0+Vev-tCNv#C;cW^gruTRkdVP9Q~dh>b;jb5!3F(Stz`tc)!J_@>_tC z3tjd$*QTrwNX|15L)a>DC=*0bX$n0g~lswx1U zhE1Z#_m(Q@vYJN)lSRx&pKR5?_nn|}hpiHEPWZ_2tMSTYQt`6`VGv41D`;Bt=Ivc% zo~DyeF3T7x+-0%zz}+A;L{Xxb|QXV7NC6S}Lw z9{Zr`qW{9)GjN&x3IjZj2Es+$x4!5Zlj{WLx0#RHTM+4pW{8F^uv6w>$E*CkYmzXE z?-osd*%UQPPB4Vb3A@Z$F`=w$u-3+4bZXf?k+fVCZ;a`Vq4en*Fq)_GYXUu)dPPj1 zT^?3bB+30@NsK^R(?pW)f)G_+!8NJc;NBw3C?$#Qx{Y(-N$ zvQ(f&s+!z#!41U=Q*cT$bF_uSOJKJltn}-3xGQf-ZP1x!qnvr_z^Tl*i@fOHTcWAn zE)h!qJ6B%g@foh?42+4<6BC=0YhGugi`rYa}m_FscIBG^+$m|Ysc2Ib;q&0Ep; z9>w*;g3KXny*_nf7?4k^sObl~GO!h`qdqEd^n?sfCsXIQP8YE^Vgkr=#g}LB#CgUK zQr4nlo~;H9f3`h=8l&@%nX86HFegpG7jsO-;{&mT(0TJ&5U{7u^u4ylakM{Sl&ZAM z1lKyTm)kEl6&Lpp`L|HYow`8JcPMd3hp9bB?t$!8>vFtIlzuI_*4g)6gZ`@&DKco% z8YRMW7;r|m@}VT&Shnp0RR2+)w~nr276J}0Q$1Hzb5BM41wRL4p7~C`F^^m{+z)O_ zg>Nfs9h87)@9)VgSmMWB{X-Z&_xg8<4TSAB@PHyk?2iwhZ^;?hJ`yMa(vtkNF{Say z;wzLHD3gGXyMd@?l-l4$)8hMBF$l=Y5*Lr!dcVB39+XALsyuXyZtXPs7+pB8QJPK9 zIrGx?8;15-oD}sGBh+MUIe{d8X}`O%QvSWFnyuH?k#kZnVi1);O||4D>7u6-!#qhX<~{kG!SF7P z2esjZ2Bfzv)ZrEn(^tN(2i`;5-yt=}4mS5*J53a^Dx-DpzN!t?p!vJ+8LUKnv0vaw&@v!n*y5XqBsFhbH|apdw?3WpW1kB(;| 
z-S!VF%uy1Jdi+n>z<|Y^noB=bHl4Jc%Ln%E>U&g{<}k|p6CgxUn&0C|-8mDlj_Fr5 zn5-L}6L#W$`U|a!uLKt|9w_Hhi)_Q#GY4WX^!47kmzOkaMhq$rxF5iXaH2-tF~ub5 zi)CWZE^tM1+?`P{ z^Rj9Xfa!co<~p>QwM`e>vY(rL{{jS~j;!|?D&GlXGS^GbWrx_#L|A6H+g7pA!#Ocu z2t|~l@|s0y!!oLUVEsh><6ZJ4yj>_u3FU@SM)G4a6rEMvxct1oXr3RGtEgV`|HPbu zJ!*zjyJc44u5VnXo_Mt*jRzO^S0YuXLoOhWN(hCre-YAN@QBJDjU6qVQd_ zAG+0A?kjNAre~|Zym4+)gwO5(I{6H@XbJajKvWd>XK903FvfoqD!)y z>B*t;f*jf9m+a}HZ0o$2xL&mv{KTV;rDTGJOe?=^dJ)DX+FgRz(k!lAWVO3KGs36@ z@S=xw9!hl{_H-boB=CgTzQ(Gg{JQplx6!mbQk@Q4Aqkl*J|>HFkYqj$$v?}UWT)K^ z({CDklp&P-71svIC?~lj-06G=BLd^?Dj<19C(Cnvi08eNU6)mi*OA6^!qh2w&|h& zL#ci->jyD!9sh_us?(mO)b>MCdffx4RaHovP1%~)+L#rTp!4tXfV%YX=kdj6dXDi) z;N*@$2{&Eveb$E-(52#A(UUkijOe znyz3WcGRr_?eBYn!uydI0D??l(%(y^v)(pQ<9Wfaq}^9FT1?HAZx>EXs+>SFFDb9r zcqmVM0I760W9%K7*(2dYB;*Xv9IwwbMG2Rcuna{BOuNjtitO2$hP&%zBSMm(E}M;eP%6;s>+yK-1*=I(ic)0vZ>JdCp;#t#hrr|HF&vE#n}$N+5cBhxjTDRwKJDX&)%%Z+ zMAq?sFVGMYJxRvClL}dnbLadu+;~|~+lUm%u)grr2;YMzEzSml$lqB5!Ro+mo@3t1 z;&L*Hn9cSr6<-1LPJsm|*JY1$){2*8XKdOR+XusO+KG0&c$zlD70hGmL?h&GA~ol5 zx&!yp(cP6?cDl(?k9dCgW&`7o(}}c!FOiGBafnk~2x>tSqG32dR%I^}aX}YbtkiKT zQL(4CUO2@U5M*oB*1HzL4#?*Su%=qn#c6kgG`lK8Igg&24+dlN2M6Q!Xp$ zyNBk~nl1Lw(3K@9Ro*QU^BS*={Hve_o6__sUPwf~^=^yI{^?_cs(BdxKoUUwpgk$K zK1j%w<44q7Rpz9yVVOTYp%mTz}B+=%rnRLOKe zJ0OGtYKSyP zYS3g&R96`~YO2527ctAuM)p^sQ&S+m#xOkrY~&NDqj*yapTRRJ%GtXz3$C^RruOnf zN@y#H7y#=Ei@{w1fXiG}#mfZq@5=2$qfzE} z$|x!Q27DRdYU@BtG}z6Idv~xHv3p0;Cx7%J5o0o{mm{ex%EPtgw0J}^BBcLG(>cd| z3&o8bvS;4QdPOP`?I!1RO}m0n(lrrE{_xiB(6l@mM7x4LULiCRg&_t9=}0c*)oX{h zxlQMvx%cs51L|z5OXB>%SN7xR^0#O%`V0B0p_GlavY^(k5EN|Q*soHC$w#vCbHTMq ztCsxjjgR^2G%C1jkzx(ckXR1}NfIZtgqvc}&6L968CA6RD6WZoOcKyVRl8}bo9SFu zm1@BR8#Rkr%lB%E3Ydm)s(T*+L7&4jqPRcmE{Zjl~2Nr)I+#&6ty<+ z^tQiOpSyV}(z{BXb7ieAAAvNc156p*qxP;&Pxqil!&~b_ih*)RQ zBX^-512K+!4(f@VmM-1#9$!^1CJXzU1Bifssc*NLPsK0Tpj+UTeR`d21sPy%!o7;Z zZl6mx_?3th4id8JIMLluUSBR2*oA~^YZx z&|6Dor!m>|TdQ{aObA<06}ZrcU&gQL+^_7?M#55YeLqb*ni(1MUq`A$8C}^fuc`9O z83=q`G~;GXiB-}`p*s^7M$?7tOyZpc-EP*m_Uark^hIaRbH$Cdz9ore<2MgFLQmp5 
z5F+Np^!2lD5?hU_5EX2(_o;59ZnjdH9o`piJ`AZPwP=YKXu4;ug-XZjKlaupCyG+C zZ-1bMpo{j~k~e)`NUi*@%z;1tnDm=zL~`wR!-!E=(C%ULsP(;{B`w#e^Ad%qP;ria z)*19jw2K}HWtKq^Z@G^;X1%tsTw>IwpvYm=cd{h(mTr#DVhfNYnt$sGa!)zA-{cjYu zGxL0*+-B9b9djsL<7}j%=Xjdjj!0M@{{(<DF>W=2M?{|MUHa|1*ToXw2^vh)Bc8*88gESZRnt((2MshQ)upa1*>P#RGIn7Ft& zX#Y|N2w4H`&5aDK0kQ^;Wdk_(aX^fky9jcVqacxnHbp zT&z9*Lz$Rc8=L%2!`R7|LDkya&Iu?f`j5;z1p6;$3UmapF)}i8vM>XHb^xHOkr~5p z2P$s1z`vMGzv1^BygY1eYyl?kX#l;z9Q_|YR2Eg@qup=zhUt4lE?{f9qX8gC6|JO19zY&Q$ zSz7+Zr~F&s|A%j2Wp3&AkHq`jIyt`YfUM2?CRqPpQFY+oiz^E>Hg~f6zg$U2gZEt! zvNpB+Zz0Vc#LZoS#tP<+MrMD_#^12&?>)0Lw+1TMIGF#wEC4ztM#lfcdtWZ2FYnjG z;e90kVgcWG=YKtlSsU3H|6Vd?HgZ0uowx0H(mz+mtj`kP{XryPvT&5g|Mjhw9hi?cHS#@~x#?(pTmAp37& z$A5J>0St!!rI?sm-bs4{BcLVF#PJW9?H~B>!T7H-)8FtHpyU6@bFusv|33;`00yJ~ zQf%+OjchF6i}>G&S$`u|R{!SqySfa(^$`oA4}zdH=JK>PQH;tx^wzc6!~fBVAvUNT!tr++hGc`uNi zlZ_+L*zk{5FukLHRFUbuM*pg?{icAyA7v*2lCzLUvH;_k&XSI6?^ZG^B?4UMP2?_hj*i{e~|B9 z-TolotL^@8%U?U>GuoqANsFn7YK9(8o@5j+Zgc#TGR%% z-&YIay3p-T@Xe6zs;5xVc`Vs?I6WdlB~n#o_;1+X3MCHrAg}I-QQinH6TWyHx70(0 zw8ks8J$XLs#VSqjw!tn=VGK;>9}Cs>679QZ+)yuV|Jm$MD-67BQqF9X>QQ>bWb@?9J^A zYS!c%p;Rmltk+2-OM!>hCUexoocmh23bR8OHYv6i7C!h5>ew{7hwwp?6y}0Qabd&; z-$EuM6+OttSU;iXXeZGxZfzM!R7^%o`z0=kF;cI#ItD+DUU-;{U1fb+>RQ`l#04xG z6Bie8iCp>S!oJUYLm@DCq4qS!vd+;^a&8`bTT2i(n#VWN$G>VPwcSUwO}1fRx6xOd zgx5r0LhQWIWFhgc_c*&e^DWX#NNYTLEhbNFXI%GOl{*H?#7>88>i#$FazU7@$T*n4IDl{2Q5!YTda?&5;i;AWXHSt!Y7 zh}uMGVHEpI3%wi+F*S@fo+B%Nez;V+hWnmGP-?}vQgFIW`~-AdHe~;fcu)*nvGD@! 
z(FyYGNJ+RJ)`PjbTZD+8kV!~bX0~5UnnLBYD>BG1grA2;E~~_Q1Wc7urAz8`zmCW7 zD$gkRH+Q`uMVA-w-IQ)?kKy&kFX>j3c&ChM&2(!mx{bIFRYlZ>)C~HmlZ$G{&K!cI zlcP55w$AYobK$iGDR@12eUIJf?435anYWWxK57wy3 zX*n;VK10ClIyJ1>Fk|bLdI=wl)ECxxpg=#26rd0UAC&%;+Yg!8Kl`GDDyqa&b^2_e zpBAx!_we#`eb{6e?`c z>CjOX29PPcP#o3ODe=RayOMj5BWXaPx7#>W#lVZ3C#}CWUDQVxUY@m#ibiz5QbpsvwT)Tgw0ZLh9$?T zuHNZBDv#B@Q&LJV6)6yb_*}*I!bNf|^lOVoU{Ls&bbob_>PRrImb-{jonns`JOt!b zxL=|a9-GcSZcHU7IB^a`)7%#VDNva9%Qx`ieR5=dg#3UajEvIibv$}AGhN{E8&I$4_nbmw!Q zF*G76d{+)`e8e&6?aWeoKip~JW?ep6`WuuMyNoT#RjS&K7wK12TyLkza@9co)I*ap z{bA@rw4Dq;*6z^fH=^<*0i;5-3*69um|NYBNY}@FAjhRGUjOYNzXSp*f?~Hvf79$U zWDmTGO0;B|^{1~u%b$(8C>`T*mr-}U+4`t!xAQxGDZaN0YXFL30^86HL`pu0GEJtI zz1y;{*{Yq><;4sq@{b=>V9E5!Pw+efDY2ycx#>kt%*UmXW$q0A=Si zPS7-1GcTI}_Wios^yzTcm;oG&9?IsIH+5e}66#j#cchU{Ae!y{Xag+MX|0zHlCLz# z31!lmhY-$`gX|8zz5oCPKC@xe3me%J^)dq$1hx3{kL4oDr)8#2=PX%Jjps!nix?s` zh^v4vY$o|wg0--4Y*s=^h3?mOZTHS3Xw`r-lGQc&_lU1hKC z1381ed1CZboOwbSj`}po6E;UnM+wlC6$rio?r8dZM4qoc6$aFmIL zcxHC73$LLB)QsTScBj6rHNI8+`+;)kgQ_t7ECYl?(VHAdeZE}sj*Z$Zo{{Hl_Ke_H zEw%%TuTP7dP#cNrDR_3GGW{LxQ68(m^lQ=Lhm2hka6K0S5hSGE+!>lO4n9$Q*CGx4 zx&)HCC_9nQ8JWKs6+NJZJQf-Snh(v(vEflAW4n&hd#6T4_7QdK;LFG=OF-aBs)|ki zvxv19D}R&LoBdF=N1Hunx2o0~msn}m{H#RfmQyS%*E0X6z74kB_i+aQ{4MW8MAz-4 zXyqW!3==NGg>r!|jSG6GAz@b&2}7}GdKYz`r=V0D+AbK(0%4giU!)s6ptww+O--L^ zK`jqG%gEbteScsf$_^P&=gpGde#P%}CRuIdKjBb4F*@`a?`?!C1EFXFLkqEPmP49K zonVSc+b>jKA1Hf;Dqeb1TZJ!4k#mBi;d)y}V&y;ycauP(obGwu*gz=uL+I%WBd%~C z*+#UH@6k)}EH~16#d8Jj8DQJ)fbWPgDH57>1=O>;(PXv7%hSKZ` zIy=pn$NYMzQfMk%Zwc{nDUwU|Mf^yrN`M$u+g7&+Zu!HA?SxsxVt4jLS6uN$uqqg* z@8n#(d(Qs)cbQ;;I^RdIc+6*R0_kv`%A3a=^*6FHjfB7(2-3w6ZcM{rF58`Y@z2I- z!V?7ZQc9Gd?AmlH)W2AXG9stlG(q$!WE);6!Qm>PtA35hek@ZEA{j7=^;ID@!HXb< z5CGmKziJeb$0>d96B|5+(=BF}@jVLe17U}NzYs8^;v^qKXAm|(HVL>*iYLNsWejaN zcSLTIWhX^%@_O=9=iOkk*frngVBdcs{mxng1g);G1MHW`?9w_< z)fp)$oJ|85uN`liS5Dy8Ye~b6K-7yr-nw%JVUf}dxU6Gi~TwgoW;c5K@ z4{JTJ1!=?_x>4mys4)SFmd!*Yz`cpSh0>C=i;Uq&LeRU3AwbqLeV1q&7^d6_)OJtH 
z0*-?Oc>zW?%&3wdMNiZeePe`6=EQnx2vywtM@mECY`|VhSvoEZ^3yv-FxD!>648 z^P#&d?Xt(`xN#$0`}A#go64r(hL)S0q4jddPw^_NA}^B*fJAwoDZ6byUCswmG`Hp^ z;ZPeHdjww(K9jD-a#9L__SO7nqp?FSM!E>7oV!tDTUybRKbwU2 zxX)rwb0!dQ#2k)4Yg~qh3L3VfYv|xk(K*DuJ87UUR9~9P&x%C@`vO+T=Cr{lMWAx| zSP}mz6djbaSaYih!@Bw7)TGCA4V@oS`~4$yrsLZWFNNom1jjz~+#)I0x_)^p``%(=)HF4dWnbS>1;=Y@=^c5N9LuIz zB|XXKr@VWidu$9V7@F^^2gNq`$zjv~>1Gvu;E3UJyET05iyGdV)1K*1pM;m3@cC0@ zFA7%6#HoZ`!H23&J$cEE77pz6Nd}LY&k*izbjJ4MSgr^rxH%nTgpa5kQXA9r7x#e+ zGhKJ}%HK+MD6(*H-94~djtv_MW{kcl7(#mw_<=0KGUn9u^xhYayW{Ri&QuTuBB%s3 zhbxtm&d?SR_!plrWFH8FE_XzFjY zPy-irHEAO4u4QBMshhrex8ioWueS28yZNc^iA)IwCu`;f<7SEKwvKLCIeiPBV1M$G zKg1TJS21kRLYZsvPD9aS2~*3hKgG_5tjA?lIwX$9$H=dpYB)OILPSI;u?dz+j#BR? z??i9$RM(<>2t1nlBm*-nxEC_PVJzR9w>J`Kryd?`5POMTwOWCh$?bqi2IWjZO)Seh z21$_OcDS%@!6hfD^C4w`Ez)dm4T(-CKJ4MA41wV+dN*yAsk}7>C;IU7P-#Ua5hjk< zY%m26{%a5u1|Vc7IAg?z0{hB9YXD%mk5@cXe#`MdoHJqQtMWrIiC7Ug!NqpMA{{+` z&P;>pu8xP|rg-W@{pUhEB;HBNvn!v+ZZ?T7xIiE$vR;hA#Ai5MLOmGcWE#sd6=Dwp zbM|UIXL~jI6^z_qXwVoF$fW%&|5CGn4O50x*S4H$x#?d6Gpfu2m!=}8Xo`MiXe73< zUYe3n(%SwH{h_qqXYPbj&Xg3!2%!|dnDgGLtAOst=f@yy9_xoXPRNJGDGa1pjz!<9 zcu?yws=$pCkPQhh?`F$o!`BLYU`$B1#eVzsDsx_+piy6gu)O{tnqVLYxyx_hhic!> zx)47ni2Y#vy)?NyVc)rm4XqCBs5OHPh~6h^qD0eV#HU~m_U2Hwq7lo9|C%;CWn~ra zo3_FuR#7E$)Tf*A$+JQ^zNfk-qzOQBGt>#yS#*9sieF!}yINsttmL{_?M+kLr8h^?@ce|~Go&3{Of(X~VmHh|LUJmIxx~N6%|njF0}|1kV8_l< z63lm?%|Kwen^^f$$F~}wWy5uX(UBzt0?o3%KHtCEThO!yAwrTp!cGm2R%#4 zC1j&zP^^M2Pk=a6eb>{MXRcC`{WKCRK?96J$2jaj`r+Eai#Y<@Ls5vz1xM=05lUk- z3f-K=DMAUA1xzK~jxE;Elfe70;`KgNo+fc|W$S*4ur$9zyR-S$Ti2<1cOrM)@ za`vJFtiH28wwRz56#atJG(r-T^hSa<@oh|2-_1+ns4xR_M1aaDZuP_ZRZ`j;$Cfl4 z5vTA-N0cS#F#Re#n9U8rBu2xWQ>ZhnbW@r$QFFZR>>1I~e= zOBbxs`o)7|z!Jl|lvaT3N_#W!5aSWKj9MCrJaM@iEk?Y*Al?gYDKZxzq za2?Voltn7{!1#K8xQ)O+Y0o~*e&!{jD`X_J98R02!ZO& zv|#Nb;b9;k@));Y6;Hlu>KE_pOX&$$Z4#Ij&DvcYe&mEF;u)sDoE*gMr0(mKk-aM) zJljt<>`W+>mBvms@G!KIPM6%Xo>mRCX~uP#GsVATiR+$=^o&*dZig>`a$ZL9Sc0?% zs{A-_BRQ|8(+9PpYd)`HIustMt+%1opm$;sj8~cWn!uJ{RZJnr>4GF1N(MzE#_b#d 
z8#Y252;MwXdRF4I6>m`Jior9f2_L0jX9a&(Gcf>+ygWNAAvSc_G`HiY?ZeTs9>MP| zPG{Nci2zQGQ5Mvg4lLR{6{5ET_()Hb9-7{=8eUh+A3s!OQGjntaT{5bvK0E8FvH5l zHE^?th{W*7Bab)}^ZH;6)d>l3@paCI7{twB4SAqFKFW@^g*D!id&%rsJAEtk!75-~ z$x_oE+oYk-)eAB_DkgQjEsJ3~qHpvXo*$g)Hky8}^7vWC%bV_ea<|&NfidPje;va6 z2saLLku)@ww+6yBW)HcJ)!;{*Vd(GJm^zX|>UKC)I-jF<0?<`Y!s?m-G0UH~w!;XX ziE`sZsE{ul*Q8F3uO%`(kRB$Vd8F*;3vQA^s;D!)^=1F%Z^I&0W6F-)`g3#ORoz(NSf!_c!32hn=Y>abyO?bzVcUhg}xK7<+1%@Tg zs|VR{&s%p=PUvOYm8cN@mr24o+3SzgzQOxym!2g&1mBa;4t;Xatprn7EFSh|FibV0B|(g`?7|o25kc_VN(83}N6#@B{d1!(2+7E2>5} z8ONX1bao9ImRPe?(3$-tXo|1i#DD67Uysu+rS0nal(=Vm8$>Xr@1=`O5ADnlNU2sV zJ21ddESGg*gVyw%^v8b*>BP_;lqHem+$g+^qblYl>fTvJl+~fb)ni+5M`8JcNZHKq;uWsc$hGTT0CpY+0PbX z;}~K#cK%1+P3`AR&-uClF^D#aL77p?`1QxTgP4@tTKr*fomhO3EI2^=OE=#JN*tGL z?BEG(ftjaPv*E%(sa37|T4GRFq`!fU?x)_nC=948V>CJe4p_I-w4zEe(S6U^3 zb|}PU49h-99_^L8Xuj0-&axp^=5CvA`1oJSVUJ;n=NZ+nWQStclQ7yic~Q|dM82v~ zHDQC~m6w>t?xbP0@mz}!Kc1g@@8WI=SgR{%yQ#>%*-2(Z+N5_P1nJjvb4JPG_u~X1 z*7$ttS3s2$#n!BpIZ!OC&r1J=aSp{8K2KAG6Vco=e zWWEcwWs@m5%@#Q%=FJCF%=IJ7>czl-z&X)AzUo;r+=+hW^Y{vrvUWh92M@#+S0Q7) zB)bMphfQv(ufBk*^v+_f@UL45 zvi;tCuj?Mv)~nUJyKTu(C*Iqij(xd}9^!j4gO_STp87KVc^DogF7_vD@*^W>aFI66 zu%Y*{F8uczIdw-+0V+#Bs&m}|ix8!H_ft>8eMBGoIrl@(*zYJiPah80yOATtr05px)?V{%785LkS2A+e$qlJ7GoEx@8g}*1KiomE zhin$rS@qog$XAmiQTWkM^P08qq6arv`NRDS`(DlHRE(1lv1m5HUJW7j$ zkmL*o7385R8_+4t(JWQQaI1^vddYT{Nss5~Q5^vKkp`Ncm$|~XaRbVd?#to~w`g`~ zIY8_AO8l-bmz2@S4C;j3cR!uA+b3?WVH5%?NYYgE0%yhNxH(fD@@^ znRYE5$5nqZTd_jy9`FZIR+4Z(i|H!8hI|qjwl#_Z^b?Q=VrPC~ZPX+++zgWUZrQ5T zI(v!x3B`!Pe7(6VtBl?{3Gw#ExtSKVJyJbkYe7K%UAMX5yXgKeKCc7c znexo*W|pN5);BIyR&cN%?ISf}lV#31*{ zepMU02j>XDB4uT($GYy9SnQwN;n#IK3=W)Y_;b$Wt(4m1q}F8}X%;tgBN* z#@_24P?%-wOoyt{gzKN}FtfkTj3?lUTMgG$%hQH}KpvD#6(W|6EF?1YV~U9?^H(al zW-hO+cOVnDDOrHrB1uHdRG3?Q#4C=JLTF?g9s8vFEpNkW**Vd9lRb zk|wXS=jqhRkbsO97$(x?+f*&>*0es`;>_KlEbb?5Dsv#i4vOE>qD(w4pvK>P8nguZ zY*t8fMnD*w-a2~;yqFQ8-{e&%4_-?n37k0#=*BhT-N<@VvVW})q<@u!8jvU+alV;8VNb?FjosRC|3k&MRw3~i}i;}UP$ 
zyBPmiA5R-=A9Z?2!n)1vV{n&Y{YL(XUEik5jB;9dYK}={s#fka3OmujLytDscyx;O zp>;~}S9H_qCzT6+vTSRRj8w)r5h>RuhVeM0UV!wAgR)HMX>0q40 zSiRkf0jMtrdb;r+J(b&?XjH8x?>2-cni1q-ReNmiRmVON@V6(>w}-$KkE*I0R9^K{ zbc2IiwuXA5iBr6tbnHJD}KE{z=41BqCHS-c@OLrvnGKk7&!B zk$2qXwRI(&SAha{WNnkT{&m0x$DKRl9x*NK>n%y&8_x+ZiPHd+ctu~y&DPdA_3-mF zqg>Xf4j`winqt?JKQ#5o!ddzET#zaKP$@OLw5&-Hl4MWnKx4YL2xG?R*0=7{Aq7#i zEZAez4-0Y>d)Vf2c*4fhcl%=BNu4(ud?LqBCg@^aSa;Q%a83pHEeyHnpH4@MMElo% zq$+Z2=|yw36{C^j5e|y~Y!{7A8+`P~H&o83`w3reAJ|DSmP?=0BLl5w+}-$gQIn*t zbm(DxE<$VeiwcWP45kuG5(_nvZ47!*friCDh)Cq)*Bw`6kes3m!M3Mga_5?GabvD= zLbE0$LKJ=>c7=!uMGbnrd(EP!4};lU#}1lB-Fm_NP+%kq=FbJYlU>&u@g=s~ zS;jJ-1`IjXl#*8Q4cQ$+ECQNGyqd|htXPVxm++S7Ld{W}_5`jFB;BVM zzBbx8d`h?eEF<~iD~Rc}m+W=ULw2leCdTU&;5DnGtEt;B)^7tmVRnE#cx}cym=+-kg(<(QPD}^)cDvDQuS91??LQ)gkW7%<#D~GP1tHHpkzfcfWUtF9<3oxE3 z^0dF4f7C)?BUbnPMX13$qmX5aRq3=~?2CP2kA$+wi#ltr50mf(r+tC-z{4^!9P*Bm z!0Eu|b?;$n=QV*}P(8bT-2{Ygczu0QKP;ab!`0Qw-z0AtgavA?Hp=XBixh`&wzT;` zCxQ0t!{&X!+@hO&Y21S_FE^Ho<7Vy>siU%c4HqBbgt%HqAttX-4QATS>5FY3D2cqd zD1RP8(B?CDOiP>r?;B2B0a~5@lQ&zI>bO={?_;CDFx~ELy-VI(J_s2f@nP7*LG3b0 zM&xpP`i|H!k%N5r*Ca}NDxKog|9gfo_PyyiTv!5IHZaA^xQ%!YQW-kWD z4w;g7LNO>?6s`^=P7S4i?d()!Ueh~Qx&hmULn zY)h`SW#;KVU$E)v=xOzPa}T3Tcns@3;3|m|jyL6{rk_6~yEA+wJwSTqR2pkh6D+NU z;<6(qIC?g@pu4R7G5Ab1*%mBdwxK8AJSbsjSuoP2E7l6T;F?RpD+pm}1Yj4H_Ek8l zE5g$v*tL6Iwz`Vr?B|2vfnlW0giiTqug-)f`J|{@iV9h;#c{V>Q zhD<2Cu*6EN=;f#90-aEw5f;gdGiBylMM{*h_=?`rcS`Noh+1K140U+R=|JMuh_{VS zZ?cIet4KrGEpLz2mX;^-L9&M@s;7op;rTI~nK9h%SNJhiABRWIvX8h00rd~6IqG2- zmE6EjZZ@ztd?Pm7?(P1<-fL(la1`KOFhw9YXFBHx`A5j+b?<&v3)<~t`f-$w|& zI&&jCgdqVp;?PPBkhY6}E7i+OActj5Lb?Y-sdpeZwHGEqMoV#yK#qb89ui8I53Ge! 
zxc4DfXFZ{VTn)NqV&ZuX4w#Q1Ef=xchK+B=z=Mbb76wW zErYm$8H2p7rWqefUlVv5h9fBITa|{wawJAsFLYM8s!09x{Mot2OU<*G!pC`lc9ZN> z&`AR-Y!~eWvr9f17lQnH!S2w?SdlCoG0RHs!!pc)m^3n;=={h(F0CXeJ{li2D7a&% z{fn3COYU@^zcSXi{g`yM`rD~(<4m|$5#RAB`O$}RRaC3V8rb`jX4W(dxdy3TD;k5R zAd?bkVNI(8aGsI9y(DQ(HkCxb5yUM7ds;r+T;Mb(&L?JB$KkFHT$|KX@*!w`mHCty z1~P3`W1|6!&u@H!YK)l1>DE(eT4@pB!OzAW5nue!qA>eqTe(MO*o2jbp{GQKMIVEI zrUu}RHy$Nj7CyveE2BW?dW&%jd&iwH#)2swlEr+DJNMWTDm{1_E%X?5VZGtM!gp3L zdMAuoLc>uehI{2lB_eZ$jm?ZtwD!S-T1EXbIg*5j z#@Hy-xneIPGWnL%gJXZ%)mn5sl&#Pyb|>V9k|6oNFdKkCuqUoNkUn zDpWc!DM~Bp*%@?F{?3k}UtpEG+_r#nD=HA3N>k7_d{`Er4)VdqBN%FjLwM?V1Vh#! zF^HIIvhbLSZ!D@^Vx)*t*DNf(Qw?GF`l#b%2X~m_*BWHhP{Fs|eUt7=zVgD^0w$yf z7y}zvOh_TkW~zp~A9299U*uH}!osI?+2K8P1yr5F8mq^ShHPG|viYQyTdp<5IWg%t2;$JW63u5TP=ac7f4MBQlFmP=xpkT`AC8Q310+v8t7;dP@3(ClKB8oh zcf)*ythEZzQ9&hsfONo6rp=Cwz?!?BdE!OgA3q>$nqJ%<18v#+F;ktehUIbzz2FeH z4nMs~L8>&{F0#OW!o_Cnc07~8Yd?^^+vjC{M;qA!-h;PuP*31`5<>okdXiu+V)X~) zc01o^9AGQt8cYudGfRwr7a2)R7F)itVQ zTxyS|_m<#DI2+d$*mqk3$Pa#{#0g|tX-hCm)<)>Z4St8|rEN zNE5s|@@EE-S&DHVkVt#h3mFFTF(+j@wU8Ra-DP!$>g+}^H{n8WVU=u|Sm-({!5n+I zLWU5emD;fn=c=A6f<4jMNzr7Z+y=> zByE?~_sq{Xy?!E;W~F1+%Fp+4sb?h35ZxG&vvA@TUEMjJ))o+IT|bi+#ePGfM)bQN z!>>!hu(J#oHAHi(`T{Dbj)NQ$tvYI$1!f?tYC#`DetXnm>qAFg98dm&5WNo?1mP3S z#R@5*nl-KB_h1?+PZBWnfLCA%K*Epk)4*!mar^;R+o>1G97+J{My#b(*V-|jQplrl z3BQ4q1>}ts3PGldWJL?jmdl^a&Ss~+YTJD!ldhNxwQlO|$@P1V$MfqI0w-UNhKDnk zhD3{cv7+(d!_yXurq;QL@6y+_Au{$#zK%kLM+6yjU8)fOOgj;K2ySb+?2DKAiA=Nr zUxH-^KEF#Foq52C*hlcVu;zwx6g^Xm{_tm#TyzCJ4$E+B*?@j=SVE^5?cLX20Xew} z45?~coZ^&|I?PF}U`UQYeS@?ybw6urqsh|lx@(@<&cUl{J$tMhG@IDd z8m1NR8i_pcc~WH?&HvR(#I{Uc2$?bz@4oETKP>%r3E^0)^HecNg?`4#f3@1JlCvf^tKXO`>PQRi7#jMB{S-+8($DZCcj_4sf(IM$`^ zolJb8-J+8s`G$+Y5{60nx+%hA%RkRLl66w%hi??;r{8ioCc}*o*UtD|_EnC7v~FVh zq?mW1RD>^iWr4)+ROJ@ST4&&bMJU{aA5+pMaE;Z>)uM5vigGd(+_)9XA_Y~d(+hvH zr>uv9RVPbRu5V#`KZef$5O@J~q@0R{k*mM|)@+8Ae>*z?*5WL>rmG#CqwiGFdj&u$}2gHI^|+sqLBrBnj|W8%Yteq%W)E?hK?G$*FOpXon5D)E|=|ZcOyx@@|){d|mQ)8Iwe_W_4A*==bTCOxB>o 
zW3U`lypgYx`B0EVxWn5W5)FS!E2us9f>PjfDc?ig7B%nOhOre9l zHTW>x(``e7g6T$3JJK&zEqt~T?g^}5I~kn!x)OG9chZHi^i5vBkt$MhLqXdwm~UWN zc>bFE(c1VFwB9%Ztb4KQtDC&DCe`8{$T{f~@D;pgnIC6nUkxDP%{ z#orZ$);ddap9umMqF=(<_*+~se3>v0pu)>3BS5QJG)zh9oKi$_iz&1#Tn9rIJwYYl z=*h9lL$k~LD-NzKj~9Jsr$~_wEGL0c$>_5+d7t82LOgVsF)SLmpsB()av5q}TLUBr zJ;96?2wM`A*g-)f$7q+v7M^}oOb|vfd@GGhpD86eY(Y&wHlOH8pTXGpmKY^W@*r*d zvaeKns3VVuxCu!C)wf|gD4V_zbJOe)I;ps0kN9GO76|Rne8w|FJ6Zf z0qe+!8QpE8Y>S(olqjp@xr@X@)P?%Neb^SZ5|VBV{m1wJ>;Um^4`;gv!zl9uAY^X= z?cZD-f~%yMa==~KEH*)rF(Hy@;glxCi05w zL&=L79)S?|Ws{%IsV#i+!Wk7`z8Hn1g9H=O!Emys)SlhJMUTs7&I(vKxEkPq7)bF{x4id(~`dPZs$=d!PrFjry-q6h8cOBq}vRCT&r8N0OD zB5JZ((a_F{*vVKdH7P1h4`zn;SAW9>igazoX^cLcUOBKTNH?ejJVf@z0w!Ncwjj+w z&S|2ZkZ07h5-4bHOd?4$B5y3y3L$HN=buBY`a#$^*P)axw5L4%ftwa~fVLl>ZyWMz zcDWUyreMLv)l*LHY_W6Eiliwe{1H-`96TeCH_((V(G}wI)pz(-UbKggtqs%8R%8$P z^co|vRjbm?8%uIU9Ls54sL-s=g@>Iu8VGpoT4@`yZi&R%UzvahJ z&)|KE4fu9^M!~@jmb)puXqIigKIUD~hAeDEDWQS${j_eFy#|ll?;D=}1lBV)X^_f? zbK;NSb#wY@gcE5r9iLc4qq6iWIr`!}zg8mFSv4sP2XVOT={C`$V#n&G1XnVMm7b&B zrSC&&G<^P*WgBDKZzXb^y2L0S|%K&%&nu?cQYVd*j$dMr<)HQhGR(J!7^K7+4i zn94MSC{OE%h(5Z65uUq=*SA5?w94XReDFCfaeRgz={a-{mgAqhb`BSy8I+7;yStSa zOp}G(5_&M34fNOhmMlH_Qw{YZdNnzvBaJ(y9~zHH_Nx*2b<89_1vko}iQHPN?hTln z@;&?pd2UqBm+|S^)TRQ%mGwECtJK2BT@)%myZa?UUP0w`{|I?(X8sQlQMIJrm268J2v7`<_&hp+7;P z$dee)g5%O{Aq6|@iW(@><935-GYtA_1$TlNHuX$;d=F6qG^IUaA(9x}^4TO8&ia@S z;l6BXRp#ZAJUrNV3&2hGWP#Wqnxx<2921`CNXjiFhLswc3hiLv7TaVm$+gOwr!40+ zh!quP^(%J|@eC#=JL{)Y!oXYCrPev)x4v(Q0&5`s0&(oz{3II^MrZgDt|ZlO;$e=^ zApukjn33kX9nBM>wS782dDI+!M z>}K$T>|Sd1kPKUao(c!~R$00Fe%EV0rJtOKHzj=(jnAl&m4`kGFfn?Hr*AV2ox4p~ z0*NAYK72n;v9;#=@?BLDciBoBNV&b%zDGh*DwZVvy+vWu2MpTJ0g0#=)EUj|3?yA> zgpf&i2S4Usi|*A`3?WTsE(}%J2!Qh_YfzN@T3BcEbFqb(Cyw9_J#b029u5dt+V9@E|k;T+p={M1P$9K2PW zN5zvyw%c)(IU!20x-BF(f+elk;mTCJrBAqxpWf-ne!UH zW@eTw28)@QnbBghn3tZs=B%v*W0 z)x7WAW<|5#W(or(FH1$I==_v(P7BV?Pr&c1^BS;$yMR9jUpOto1IgPH1VH=)0&9J9XfCdQMMR({pl4+GOSl9UW;*tNmHjnZ0xJ_e!+)IZ 
z*x?`9jtSSYf6<9POy>AH9C(X{fWp$jj&_bt5)iJ5iiwH?Vl?{_tWjqm5fO+6yC0so zjk`>IRNi@5RMtIsp1(A|G~+v=X(4u`Gj}89hI{8d1%P=^15rn(Cn6#OfrUZ=`E>iA zoK5-Ka7sJOWAMFTtE_-9{# zb)PLDYPx0kZD-^u=u;pd!m@0{Ujw|FHr!Q$c^UcO;A^X^V4aaH!*5F@2^8zE7wV5)pL_<%Tu zQ$b9BFyRBiHoSv_ab_d3-s`H+%>sXS0e#Hiz{sX5f&|)7d@SOLM@2gJ7ygu;4G%cc z^SZZRI3{jm0q&po1uRnd^ZT&O=TLvuwc9np%LyAj8-MK8lZ!Yw6f4V{QdkRgQhq_U z*(LDY!beE^FGQ~mqiDVm!N5ZP`NRQ0h$cY2Ys1o)DnOtJq=VLb)>l$+JlfJwq~2pv zn27j4;db{04+B721b|HQVR{Encu_yNK0|*3F3V9i_5Rw622J|f@PJLmBZcTc6*>NQxU=Xr1@#hXgrolntb_7|5egpYaGL6((6*CemmYfv%PDg6!>&kK$fyoPaT<8UT`$mZ z0fHt$JHdCaWs0tH#QBc; zZ>o|wK*B^_cz`2W6i{Iyz^(pP^FvtM?`Z)*{J?A=9tI#bcKk0u-w{Q-b-u(P0z<9a zfWK1SUufsylz9K z#TG;uSYi#=w!Lv1Bht{K5xqic8kqr2j4rYZX}cKWEho3l;7T=Uzh1U1>w+;0U9f1s zeb@V3#M?Y2$;K$K#{FunQt{lQ1tRR}Cz7_VGQJ-5rneZ8e0F@QHck_p-^{LPVx>iK zK8GK=59wRPa?3K(#~kd55^*JhDV+L*E?H%w(VmNJJM5jG+CG?(#xd)FbI!TvE- zjXN@F(u7u=cw)p*jqTZe+CHb34`YshK-fgU4gJO6PB*?s!jn@h!+5!r?n2=q+zoWN zJ;1phDVj5FTfO;>sDO28j)4d#)^r=!gAb`Gv`&fh8YJ!;1p80l5Q_B==fW`_L+Z44 zo0#<&dww&X5EjSXzk*ohHb(YsAVLxMrl$SBe^vwS5rR*fbgZ4r%(>)45AqFd}!{1*h+j z+A#ydz5~e@lgKgDhZwDdc+`C_zM{J-+ubErlgJgZ1~daSb-Z$6Whmh?*blogc z#yjaf)=Rkilf-A**VjO^mS z4w9{T4GHcfP&S;C&;n4%v~6=`lLLgR+QOVSc_&D9K*Do0;KpkAR$m^QL{!izjI2|2 zHiJ^6=E;68RFYzFA&;C~gcLM?wi*?;wMZv@JHR1R%<`9%XpTt74w~wUj7yInb002c zF%0|C!>d2=avv-XNh&vFp|0*Lgs2S%K_jp$TVht=Gxi)4yX~vVGj;&d-yUy8gBUQ+ z%)5rUr+2*n;@Etoy!OtvbHO0afs@rpe;AmFv?F=Vn59f0P25QAq&8G0dEhF5d^1j| z!B$}zMLHVm`leR$32*-B8>3e8qF2l)ADY{IT^UjmWFCyY4jBxy`A9p%a#4ZJSNl+f zOoQG;U16$(YLo$4WX-M_D0E@SXWnWoVgzf!Z`NYkYr|1r`leB|%F(5;R0VI1sx}!E zF0JU*6wpbWSmdhvb55qYH&tqLeZYrnhAdcOAKeZ$8YwU8IT~U44D&u^DvvIaA7o?I z1F!5f`>t9>Sj;psl^ellykO^8w5GR5sTW56tnABm(MJie1&a{`!^&(8ibEUy;j810 z{Uf*G%TJHgmr*OH%LDXGJCf-r%H^L=dZXvx>dXsemj=>kUVw8Zj+$Iw_ZT>T$i?7u znMfHq89cId__16-M0-m`)}lK&^v)H=NZ!MI8&3cOXArzgUSOfc7En@N)YgSga{k6U zF;%KMf0N?4kAiD=^vxc?R_xfiAdd3Gd?l&&Z;bKFFu7LPI@S z1m@=Uxcsy`cDiyl8B@|Mc0xuA`fjg@#uAKBz1ZzNGkp)H+=XEJ6ImsH*sJ?Jyi2dI 
zsT@jhZ8Ip#*U~~%?B9%D`Q;apTZrLipdBgu#6cKwa(-PHPz{4@*h&iKS4jmrqkM`l z6Fa?Jqqr83hs)!1_zXTL$%98y%;#Nkl*!K|&){37;a~DaSZhQctvBN0@;!Uj{);x7 zDE&~FfSAuS_m!-H$qA{Z#EP6Dc<|)+7!vk)Ch1#g1L}=up!Ku+0-#ks)+AjGtR&-k ztYYTxG;mNuu55S}Opz^&@oxI=ORR44O;OLq?%dj|Dd*Lk{#nFW*yl=~m^o!tX3THl z>rg0U%9l6T!d2QP({m&IbuCCE6my@C;`~-!@RAp4_ZH$#lrH`2*7PMmy)Awlm|@N_ zNilYK=Y>kNgo~Zcgi?m%K=VI;UA5$FUU4;9XZ?~5E ziMU{7#%jS(+wCZp+c?_VSwVG$2h}SG0n@9?7JcSHezUkhC|B};G45sK*7t6vQZ&@_ z+m!ASE6$b@{j`FjXgxvzw!}r8_;C(dOj3f0zsRl|x_od*}gg`XG{^t=(m((Bv&ee1``~{9lLwZjay9iAy zKZ!+a(nwrJ+p+aU?GBn;*VNgAl+~Er8}d4;L;m7WXTvWwU^}wRmhpT8gWc<5@Y$E9zO-B9$9?8ovpiA2C#i`hKnmNW6EFK(zM$Qri%Q@a0^#)pN=GF=oznRq`JE zMGkxM@Q`15Rzb$H?JODcVzXEKm*0NlB(+o_98KW-WE<_d(EHh$=;r0Kj3%G$d}hE` zW5nSu5tX}Zin*<07+ZOo8v)(&@)-?K_Tpx7vC#XneLtwmuhi)5}zKARD_&?-8p&Ak)@W~klAb$&n zI#1)eiayDX^dK{!oK(F(6NkW+>+JbpQQxR4cM|<&pfE=h|K5s~*yYPF%ls4BVI<+I zo?>J4dJxl|Rk;FLFiHod*w)g>CxVa>PCI1`gf*vY1n3G4I)jzY3*B?+O17e!E;fS)Kul|)}t zr7^YxZJMzwqn!uvO1EWq1y%*wj1X#^IEU{jh}2B?RsLkdE=$f+YI8v<;(@g=yA!b; zC;iz-qxzOYd~$TESL)Wj^`;0f_l6W}?WQNYrl0Na@Tnhf9osk(CyNRFEQ9N8RC#VL zAJa@5v0DE=wH?Q?K8MFw(ENSCjAh8tRAmzlOZf`uD9r7u&-wUPTTZgAw3{`mhNcfL z1W`*t$W?rFESRCYdVJp4h>Bswp|BhZX}vD>GH3BY`YCrskj9MC=P{{U2X)WSl3|pT z?WScC-npEaP;y|pN5E~S(So7tneU07@&X|vrN!D=dLti4t5*V$&SbsEi`Ru$vT0jC zK$psqv^6sNJAo7(RlLoOUl%DUo@Sa;SqXI!qP684>95aWkH_*%HN81t`#J%Fcf5+^ zGy2%D;qg{@RtYs%5ID7TN!`g0#ld;LCnXK~%sTMgw`_S^I}PalsZ|zzUG{)b&v2-$ z@#6`y@as?gGpm5;=rK~(5*DpqODMpS6r<15z6 zLS8VHEHH#G`r{R;si^fcO*g9r%x~aD0Z9(aFQb7;D4v|q56IsK^!T-?fC`(Im_DiE z#EW^ce&3l#qH2~EMiZ~%>!u!(cIztskdD$=td&d__w(P~j%edKylXaCuhF3vt9Wf@ z{Fs*&fcbPj;eh?+*rep4av!SDS(1sAVaix$PPZOngh^PP@UG&~BV66L$&Ehm=9e9$ z@SH6_4@J^uEfC>~A0qYFyf_cC6+`E1He0JeV0Zpz-;RR1?5rvLy+WEc)+@i58qY>` zVoEFQRH80pR9BXxD%0a=uq!t@q&D<=7iN=_Gg<7;a#YcLuHqNdX5T3<<(e{7byS_h)_&2`;2oI>F(VJA91yYZuDH zVBu2jV;9<~tgLBPvp@N|m$tx`j8I^y$`$T2DX&1)B#qPWQ(3pa!#qwf8fs1?OR=hu zP7v{OEm8i}hG1j0=w;1NmoMf5LdD%MZBaS6LNfGMjW8Z^vSP+m#!o5~&_u4 ziFMvdiJc@3HY5%Y+4T$j=mwT=N|m{6I8D;ZoxRPU2{96@NVPBd?V`EF2#SpFy233B 
z*Y1~F8Og;)k3~e?8I^9Cp3?Tv^wL%&doJ#dNoeA#_)`pd?|x<{VY4BWE~uSm>cB6G z1g8fJe@=l5Yh6);O8An2FN;a-&0_;gwN(?r3y~wvN%L*VX?<22ki{P5Et*<**-fsk z5E=}E<9Iec{Lqp|;(hVPXVXZlLi?6J5`xvwU}Xuyd;# zhi>(99*XmdSi4+<48KNNPd1izt{cgC8d$V1n1EOo8Tsz(@y5@N1dFUlXZT$z#_mfe zh_|v_L?a^V$V!s0Q7gL;lsW$Jv)q}k?)5#ao#KR1>dA}(ap*AvWUyJ{kz;?{-J>U} ze!uT@Zkq@F0Z0@r-<*QQ<_9*8mzHjGAz*y)Y?scRGo3-QQ=#yF$QWc~DfED| zV<-FlvmoscrZ4HT245Z)C%OA7$5#yU98^x7Z{{qPkGD6@H0$StdJSag7LmUvCkm@3 zM+#kwj^4X`wsaFDl7UJK)Svd{`Ch*v7ZnQQpG3@UWFiyags+VLaP2`JCzL`LxGo)k z;G7?3Axo&K{4U6fz`C!YH}=}ln5ROLL1a1UrZ_e3T!^N7KSY#cU8h#5lQv;|L0kh= z{WZ_Uo{?dcc>Qn+#dEJIj8+J7C8VjIVt+}MlF!gVpx`XBl}SDhDUHhd!^e-O!^y>) zKVgt=e#>A8uq*l4MUwaYmyN;Z_7~0*?W2s7#r7Mdedp8odK+e?Pnq{W0y0!XA>w^s zk0Axk7OZt(P>jY7`>Z!tuI#NO9($T|Ga*bqb7ijI;toNPWVOe8NUOib&bdQI?QRkt z_xZ`s#()?TTCFqC@h&fCV_k*!9g8p@6FjZ|SXNK*zw<^MOEI;sy$K(8*n;^E4)4+s z7C1q?d}32}G$zV_u3vT)0wwppD^ahn_nfFRp7M%!pyG51v$9<{tLyz!< zrt0c*ia>>@s{YfbO0iISyKe7kmqqt>*Dom$MOokZW;XFa4=AUfuA81hyseGi?;Z5G zOynXTlEy#xl6hmZ|As|J=D0z5@y*Kc2NP?G#PkM305Y(Mx&(`RzD)PeE#U0q-}BHNaf*Zuqnb#`s!DH0Te_u8sC$g5XTbEtz4R#gdDPr-lpCC0Mr4tA3Z~(gmG+E$HB*R$I-sct zeNeC1nS<1_`_~;84a*Zv-iyh^1QL>!=S07XRQ0FmMLXT-myJ6pcNY zP(P9#e=UhV$ITkFu21VRTJ>X}6+51+~rP>bu{rLi!pE8bdiQtHFWb4_PXR_ZcE>32SlijY46~&?D&^?PyCG+!N){? 
zSVrb#`;q!2`$ly=mPGilDTrY!O=bz)`GAA+JJPf3MH868fn4fmUo8k0yqoy5tr&}( z(ZCxrHBy4u4a5owqmvsuumZK_Hr&qnlqQV z$dR~OK8ArB{M~S|-d%4rd;9EkUdO({l7aGq`jy_6YGHselKGWI1w23>NNEeX=c{Wi zLD0H3ZhiCmH9^FxM&d%!UHLDC-lrUQjzF!Uc&y0fE<>?=M!M?2blQ}4DC8|0yq)!*(5I7 zUeuL~nbt*8z=*Dj+sd85tvO;h`n>vgpiuhOv%^l$bETP|7Dv%DRc6m|_|PD+KSv;H z^vzl;~a==Q?@r#)ZNen1Nc+-OCEjMlt&N)k+5$+D; z;9F8sCW3q=5#p-c37Ia9w|nI!vvQTA*sg^*19{HUWmIOte8L+L>fPuMB6oy(d^lil zO+}o(!J=A=sAY~c4rYuq2r8s~N)=mf?d8cf&8N^rHKS zsWf+?pDUxQ(33t6FrSq4xzaq9GsyjhkCNsWWhkrJRsvrEj3R}gcV2V9ib#eWuWM(d zU@elo594An^R(GUoqD95Y>s!U`)LI2w7mb!(c%h%bSv_}PBzXyY8%H6c>ajr+=KR7 z@T<*4eLu$G$sT3H9%*f31`*gz+HMtUfW-n?3ztDWbomTAZSJ79rAMu6#RfeXh&-)3 z`K*S!mFWvVHKB~ZK}6|gn%x5KwZPL*K^V|HbWCH-SP2Vy-f9po{IicWuQt4Do#s!+ z;iNbk0i484>1Cy+e0wZb8>4eR|lT`77KB z4DH%1sw4w}LQjg@3C(2iGX#0)#NaH$@f`d)nEvrm%}YzF1Fx2XD1%0iiNQ9RuxW@0 zv1SOz{7qxD=swhuGIgT#8!nbKc!K7ER!te1boAhYl>A-&^x%wFJr>w5Hz9%X@HV1m zB1m^Zf?OftiKaefxLiWH%%4bSjO_8~Y*V^g?(%{!fvztSs;?79xMy+0JE-aVN*S7r z46m+lSyZ_TBwfC1wEH!QoR>hGX_Vs3gz|nKfls%V$8Uv43*SPQkjENZ8>@L)#o|q$ z^IBjwBMd7ZCuHyyQxD?<*DT4xm19RIj-jWN>kepN>YY*arjXwm7U`Cq2nPZv`ym-|S?QHFn?+L6xtp}|ts5Ja^E(po*#UtF+^o$Wjt zPP+*T7m`2og+Xhp64O$bIwA}zi>O;m4#@?Y=)x&L;pUN4d~NMpa%50cjuJFq4fn>; zulJ?ngWh8l?Q*I#SvxY)sJCXlJwd1Hkh5yvsCkr>;|kEzU8|;BVmW!lH>e@nVs~F= zdL3ph3stc!Q%mVxLlIl`)H-}5y9$PH!@fdWw4eNlkEAGE7Hb-!zST?+|c7^aSzEU#lktZ93w!O z&HgHWpdujCvZbX|`@xZH3$M!~p5|vx=9s6!LJH>A)F-}(f3_w7k9TpK0dqHh3bo0? zugzUuS;)mq-2R~12RA>??Xe;~Zeq2-DXQKh!9HVlZ1K4pQ$giGp#;B-eU;ql;;8UZ z!S0iQ%)q+1u=pEEH|-Lx(>~N#v}f}Ab3H*^!cyKFsm_P zL3#YxUuE1=G(}#SZM9dR9@*cM?MkJuI`IZnc;<4_!f);vC(KSdaz)+|f|s}^{?6CR z{BOQi0w#8*fA~-d{`Vp0E#suD^XL!*FJ7T92<2U~lcU^~ph9~2hxJxULX_J`?umt< zAuyv^K0LYcMCX*?`J+tMF*`uxhAJQ0vPz%9JYPqbAb&Yp6F;VH4D_c-exrVYc6fl! 
z=8IUm4`@qMKhH6k8IwMBSmIW@k{+?B)K%n0E8mhow*Iu22!i+2|Gbq> zza)Om#!g0U>A|P(MPr4(ZsuTFLIKTv(QqXt?dEY8+*;U&{4PMsOTD((V}nVB4?EmV ztsf=bQiSv4q#6W`EB&g1eHBYLhBIhaVjg;VpR5b#4!kw~CcHo>#wB~st0`yO0WV)) zuRz}npKIKtLVvXHNBJuU7Hx#q!k$xx8ayt#{ciBK#%}U9<*tA7mx*Alv-Df=Yu|15 z>sYpL+MlhX?YA*pq)yM55k&@lRSB#^kavC01+s=V@tKD?YS-ydaqW;C|mK*1z&g8oi?SpHpnSlAf< zbMc9nvEHUb?7MhESr;t0=c~?$<_m~EeT`xJu909~ry?rJibO1j1pjt_l0qn{bO+>` z(psJMrm%m$aAIX#a=&T3m4{+&WlHODWPEhtfxfzCWa+}*V(jizVK;Z{e)S zc0OZSZ+U#xmDV8T(yVA@_x8Q!JkM$ozq91p ziFD+5_#M10g*;6Rw5F`{a*{IfP>L(pCtx>w96fv7Z28v3V^`CJN^Pbdue2m{zz+zJaY>6RxfaQOS)E~BH#1{Zt7+t5N&Pt{ly$IR0f}ID484>lmFhkUfajQ5&uD9;N7W_3<3e$|WU@f{=Q$)q#6*%+FB~i3esjoSe2kVi%v@ zl=+E%Ve?e5AlgjHRw*|+Zfa~IU~EEUY=SmA-a9%@JUWhHY(ix;$T#CMltr%)$Ect{ zr$CuftTZ}nZfq`SZ2raA9DQ`Qe{_~)be1NiSc_i4BDuJ3bhdDG)_8Pwe{>dYlqq49 zNoka6Zj=cwMFMNhd21n12i^UhuEHv>p!K#bi-T>(Cb1Kx82bmdo7M_H`n!#`G$U4~ zcIYPWEor#R-^o7f-^f1uKXP>t{3ri96{UX5e%pbHGeW{LXdgbY4>aM@r@xB6m@HJ# z`^z0Ra+xwWqEzO_XzLY>{lkeP=af=I1M=PAQX2PmPj^!-Cb@Drqy$4Q*hRJl3>2bHTd}QoW8HUXT(b>!0;^T9Ph|B zAj4m~apT|ua^mds-J=@D-wxa#G+SgPJMX4*>aHoPE$D{ZGqs89&B(3hSe3q)1fAv3 zAjK*hBsPS90S>k+nFzskTNMhpb^69(W&i>|4|R2W@) zzoxMNRyYV9LJwv>Hgcf-F-~;Q7Oc4;7i^Lk&@oGT%iWc$$<0;sh*!(|F{}zqzU^Ri zY#wD2Ue=fU1T=`N(E*Fzbr3i>nh^aG`-~{NBea10RPZhEeYD<#K>Z8LNY&?de~As& zfjZF*RuSBz7c~Eub}N1KA|4WW^Wnwlx?RU~pUPHDr00Yy`c>tWhlK6b9q{eCfe;x- z;<+qaFO@@V1=_FlZkmr6&H^}u-bJ$#WTv9UNs5z+`GY2J`{KpS9jm)S*%*=>lN#^l;nbSIpkqFF1kLK}tDQ+&+>t=FS9 z3$bPQ*@z-te5Vg9PnltZ4FC`_PI_cn{{aN5qZJq9u+#d%)m)vzlaqQE{p}cgt3N8> zGt0BCYfN`xUZcmF;+hb`3cnTsap|#LWBBS1mtO}yeQ6V(bOVmH)_!}jQX8j+ybf8? 
zJA!l`M-(%2V~{N!3C2Y%2Qx>y1b$b#b@~ZQE0OabH}|HCw;E9@<1Yt8bVvQZ4hfb< z&olK*YxPfbUH=3Eoz&firU0eWOH8UNMb-|b;uMAI@T5LwUMF=uxMW}|w7mutOxPC+ zRU9eG8{0rGOgG4#aAel4j1GekV`)m%7?5$7-Qr|9V z2UWHncgY>o)7g-c-5Wg$vR48<$&auYH~s0{gJuf4&~4jieFh|N*c%F_8VNLw=FV%y{3}lK64$VOvFXHXF8nTuZ8#v&uhWjAg~ zXkGG)yIT-nl9pr9L9xCvt)wMbk5d%J-%v>lJ}KsfChDH9%K2sqVZYJp-m*%w<|r-W zH0XSW+#RZrR?Dcl`APSIb@+62-<<;&R*?>`Ou3A~ZDrEcYaI_R&C}#k+~tD{lVnXb zx}>ID(rY=faD@e*MAfcce6B|zgKOOlXd+4|r?bI)!!hs`lli0z6_jId!z z3>oN(Ig)J>>cMe+Ne31A1qvDYsf8ygV@>@qUZ6SCkSOx1i|uX)ygvoM2;<=6QbgKT z15}0$fv8sv%DIX(1xR9BAj<}?F%y^*fx!{yfZuqM;t&f3UX55$rPE&hgl!yq0w?I4 zAP9Y{H3aG%{D}f5i>n|q4p{^w4xynE#bITV_N$fwL^@!xBHES0M&KCJ7kL`f>@bAp zb7#u8-jL1^yK5LYIsW!Z#{J%)K;k@($4^&`w_>72EfRfVq~i*ovar7bbjzWAo-q$A z#dVWh^+bfYyHz~kS3D{>_wn}H6=N1@!p6QBDnQEs6pp(4SX`pcO}-#Nx7@f)#{iU# zN2bH}P5uFojO*4-e!@iDJdX^=oy#;p4~+wgwr=9>vWgTRxeBHa$TPGqM&GX2XmPEP z*QZWORWe(~d4kWY9T9Ld8=L7D9v){itA;&R1$o{rb#LXtynywwiwFJ=H2>uD90DyyVuUO^$wvMoWxp=y)+hjO{&bfrJ)-vOQE}7Ll?BICQ zZ^av08+S5u)yRt2?Odf^J$(9I#tojA1+u45hpnwjdm}1Pj1?`v{5$0gK5M}d%UIf{ z#PrI=8Zu=cvl0aRPvwi}!shzL?va(ULZ!Oof?-DCAPtLXbeQH8qn3nJRFOY*ZL7iq z`aYJhyl3^ELAZ)I2Uo5kBPti zX0X#{C%xXnL2IoBOoqJZLY!nTJr72aXKN?h%?`@WEB^Z+d45!?O`=LdLS$W0MC7sT*yR(JP&JV zO>VYGMjNN+{!RY%30LtphfJ!@fmqmt%k;H3jp+FuHGEW0gST^@%O7M572xSP&% zdXQCE_pd4NSH37WMY)X+czq7f4Be!4U9Xt`JK!+Z?J1Dl}hHc3F+5FqqB)| zo-Y=5L@uZQ3pa1hOaHH3GxzUyjVIs%^ym za;e0Y=>iz|W3MUk1MQXIS*PQ7dOy=XLB5pZ!6)8v`z%qsC>mKUSyWwfzKG_#5q3CFevTX9RSJe=L0A^A@hT5B?Z7+5a+Z67OL>Eu#XZIvQNQwt1w(##vE`sx2uO@>)vF zSf>3zrefErHj*m?nNiHzM^dj|iJI+|f5Oqp303Zr4@4Wm0Jo~9)@7JgOkNRCRmG75 z7&MqLVaR*XXpkddablsun%O}D2z8C;`i>FMu}?pLzP6mvEW0#zGDP;vUt_4s<3c-& zs)9~H9dv#^>+~_`u9z?~X+YyBh^`<_2{v%2lYVpdvE3ga?|<@ojZs*g??fK=gR(^f z!8;Wr>5OfhBu$5`FzFoklLTNS8uJ%)Gem$O+NU|x%k+7U$eVt~fgH}kEg?2*B4025 zcIW|8nc|(@-k8qao-<1DHI`cmv4vz{QvWT^{7mTdXxG3^e+OYK*IWrO8{mI2LqFj?1zur2kfx8$;iWy6eA)2sAlIMucHTIdkrq~eU_O_9b<8509~`Uz;!fvAT|LoT>$ao46;Zw9=VPH z2+@AzGTXb!uLHmcplASSv(F;c0w|i*19DUznc`a)agzm3`wsU9iGdaWhTG@!goBD& 
znOrxUHNha#3aimODV{pgGCTZ-+bh{>=kJq*i=d1pN&aha$X#;@O_UYR8lIecoyHvnKSZn(DEt@FzKQ8lFiRg+a;a8y4Q^cPk0@!tcQ+W)_x z;r$=b`~@^gx6h})K|`DCX7w-72>lZ@@lhYR7;PwBI#bN0vwsH~rhiv8>~#O%K(oSu z*m|N;6$EtD#mq}APXh~@ZwShM3o3PaSrOU0`zhE+upZ&*A*$9Q%`!$Xe9oqclM^TK zN0ukgxO<0JB;$=xsLoX*uZ&hY^G1Ep_IbtHj{_|nnGJ(e11~LZPgLF_j|U8y?RFPa z{Qj+5_OHC*Yb#Ar3uRGp0`9o|>ApPM@8s{CCr$%DzM4!NY@IQMGluLIZQUlu@@S%e zLhFUinv?b`uNQWv(|DBQ=PiBVG_EmG#OVT4f@Jea*LyVf5I};(O0SZe1tu*0q zN1`No|D%0Pi2lQ-Ut@gcg%Db5Bhu+n8(-*E*iSK^emn9Opicy`FEAur&q-BF_ zwiwd(?pC)A=TNr~Z%bv5H_T2g7kGk+B;T-?e4@SpXKmYe_DP-)_eslXpVqC^%N%MW zH!c_=4y=m>rcY^vHk=AIC=pc06nx^!17q!B6dyLZaUHdKd>6$StU5#88ddZ08cN03 zRZOqc3kbca?59H2X~9aT&PoE6Ibnlf9r3_DUXj3^{k57T`Mt?n;wtr;a@eF?d~}

E_H)<;!=c+oH@bn|Q=EHCN9f9P9H-71Z z(C{OOWqxnkA#ZNOUUNyEjuW^*Z9{SX5=g(E`K#yMTkBTDdh710 zZy`$Rinl`b81L^u{Wre#|6@V@;CkKwq0snGU zDyx$p|4B8C);wT37r%LCFU+$=3kft0e(G5fA zzR!+)zhip)im45`XQJS>t{$fQ7xBJXxvXV&|&rZ2K3kjzM+2-Pgplv65hekA+mH*1e~_RE;9drF{Hy^tlpM ziVS@tsGro{NPI%x)WE7h@ZJyDKzMwc)L1~r#fc5hKo&sQAUHr$aI$Kp0l<84RG{!c zb6@mJgaY~DdYHMK<0<-3T_sYs_0-DoCfVfXV zyfr{=Hg|Dg{Au9B;4SNB8G!M;AJhb*hPZFC)j+Vo(wO0Xq92!;@kML`;LP030YG+T zjM@x@J@a#+8OP~K@%*vt@S4}kkv>#8>anPhFZg9_>UN?&kDZXM*khnR&l3by!CqT5 ze+@CT$d{v^Mk_=l<8xRB!*HeFGIim<+M(cw2;ufoBtu;J+Gn>Uo`RE=p$`1aV;)#6 zO%W7>C|RP2@7PSf<{pUOxCSJ|u!uwtekwz0#5|^iyT{s3SxwVdPL_I1Rjr^7nyV(f&N1j{2BX7Qzq65Q3%r^YPt$RQ{^2 zv}@_wh5hnZq=8NC>OxL?+bqA-h{sTG;!8bBC3#1#`+P|+)%iB1jZJ<6D_v!aDfkE5 zyttnq<&$)$M5KpM(#BjF=pN)f#`xf8d<*iTJ&c3xd7K%?RU25IJL}A=*yBq$rKJyT zlMlDJw~LWeFCC)~x3sqlt^1D3+7G|ccLa(T-ow)m-I@=Q2UT0xxWMT z-|cu{{b$gD|H$jxWd+zU({_XUd&5I?KMQayMW`WmwB2I8Ilw4HwrLg(;pc;Yd$|i2 zG;TQtzQyQBaoHi=KOZ=;Dm=c|HM#m?V(kRj@L*)Jf6yLwetuowoL-KdRbFn%odFMU zc3GEZ;4ZZ-?Q1NABV@KZ&K=3Ggxqn9twi;I27j)4`SIeWoG~DGxwJK%-rqmQ7QK;n zwJWxf$%t>L;|Ck(x#Vf8{?Oa|V`YG43ed~7{j2D^NN5nCm=jHo|7u^S)f+s+%mlJ6;fYaBK zrL0~=XKLc3l~;r1!>J$p;H|b+a=QVlxum{cu%YsM#}9lKb^CU`lbY3U^K%~=UkP_? 
z(toBDU4L!WPxzi&hk`eP0B552h1j!{?h(h4s|z=Vn}3cNQ8P& zgnCE>H!jMRpXb=$eU|*`C*!po?T|%PbL{Ir`}N6`?a37E$&~oX zRR6k<%F1&KKhO*1b*n1MY9z1uwrzv5@t2KOXRc{_+gV-3837EwX7CI|U}X}+GfbtF z|KBbj{>?TAdOC*xILwBnvXu1-8)DmuiYWr{w4X)nc90?lgy;|iK;lh(`?PUG#Nc{s z2v>q$U#`a%_l;xxB8sNmTQMUu9Uc}Y22$|vCv25HQ1>dGSF?(E#+N%-f6q-<9{w>m z#o4xLb9+^OPrrXN;7WKn6MUevlF^C1=~?W^2y(uAUM25cI7`-&Eo$`&{TO!|7?TcN zx^a1NT8?A{J}5dnqGtDS?U3xh&a2MP_S$w>Js+?xc%|?c4V+{9^@6+ndv=;&=lS)a z1~=%ayT~1)MWDoP@xEvT_3fdB2?30^9bd-wfv){DtS>kHL!33L=qQ8DlYF1IkPtA} ztZV%crCxL8!iTs1%p@*Hm9&SbBi}#!@i0b{cq8N1`UFsA`o)Lp43(a|y*qj{17?iv z{rZHxHJW7$BSeusALry ziUk6(AoPpW{Z;)}_{202YoQuu(`(31_(nfhi`UZ;1X(Ns9m9j#Y1Z?L@`*;f=u_L2 zS|Lj)bVRCzuC%^vq#fBYkxLh{b+Kb9e&&LXVgai&Tohp>X#uY%MX3FZ}bP6>ihg$rNMg(FgkHZ-ZzCr#ZRZ`rBhWK+jhiz6xY*cH+$!2rKTK}$}K zinbBB;kX<4iE_WQdbEpU6ZHTrD|BMKtKukJl3?^@CKr8d6~x?ZAB|3Mj4@})eBZcB zXCXbvrYG5*>!8=_F!F4(OeMTatrF9(c8i-enE5BAKO|&N%hVL;SW8zF??u%&=^lhK z{-{8=_(3yrlY&iyrx>N57EJ?@r>kGiirQ|2ajRAh!qO8>N{7a4=Wg>U_r;Qpf0{L{ zedXQXXX_vyeWJ9nPkQw?Q;uXBzDj{k57=q;aR_=br^|ZgeK`fU%~qwYRd!CO>sC>t zY#mqMlA4FUs8N38un4fOQFhL%>;9oe`FdQ4N^F#`S@TY2=$Roz{F90(XP5}d0WC(i zCKxPDh`14OW1I*nm5Qj#A-npcXBBZiFZ;ARTojLng(0lZ}_)y*s6?UF2{Yvpqwh`9gSz-T1e=`2n(v`Kd zHgq&FvLhg~F*H^*Ql+M+VWOj9U?zvA<#%*2wYDQ8r&Tm__-;g|$;iqA__Au!(a|aV zCf2`+@o(brn^^uPdcTSDZ=y-Z1o+a?N&Y5(s{Z)>%jh?;`%TP#6RY3EDkyA84}P6>wUL3f~J*K6jG%T zcKrTbU(d?wx11=Nn%NToKL3yxDgtFAJA1(62pDPTndq6>8CaMZsTpWk{v}9&21cp` zl4edu1OQGLX}=lio9S7R6Ij^jIhfiT=@B^nvpow9BMm_8czAw`?C;o9EG&Pu-V|MI zj0k9D^vx9>UVL=ve<1h>C!biT$5jU;p1WaA4{4O-inozs(`^8XajD8BIr-CL?_!x{?CJaSn>yBt{S_jt+i~l6R8uDE;=8+r z@;#|` z37GL>JJUgFV$^1tVZH6LT!OFSp@E2RpeKQ5u+3O?^GW0MVjc)kAfA9K=wuK-*S9#8hu`r9^~2c=qUO^1<2gt0v~``-@1Kmc+o5Pv!muaY_e! 
zNb#+fhE5HU1kpF6f*ZnkG3<}!_g#<4;W0*{r zuR8A$5%~QFwEdtY5@)D6$;#sT`lgB3I!gFOxbV&r#0IVmP3VwIVD<>l75Z!PMMl_T zP1gp>8z$$%@O^qf89H6Td^38{pzAWU14r8%m!E(q8-~Xo=gP`!JpRE#A2<^(tT4Jv zmeuA3;f#4m$g8{h7gu>udI=BOm((!SWzRy+Wb06`|EXG+1k^?GcMd1L9LAaZtSclV{<|(Z^inV*!b(=1?dcj9Z z_r6ekTjG`<&Q9GnV71-lUAACL{qExYg6HC*<|xpNP52vLM98R^y}sOlh0h1&JCL z1oZ|~8h5|aR)_9&4ZA%+Za$^5dK2ZYuEGOD`$AP^!hR5&wFBpJ`$(Jh_p)$?^q1T=iwaweWjuG=OrDh3 zt31E$jO$~#?`ko?iZaDKN>+|5$du&tePZu&K&vujMHt=p=w4lgcxrS9FEYx@YN#@= ziPyy(>^crf5;9UFe|1mTBC&0!+A&XVWclQV{W_$4!e8@^b8>bWXq{ctvRYRYdAY4s zZmTZtt+BEp3@5?hw|B9KR`qw)BFp2usu0gOrJL$5L}S6PM8!ZOVkaXiN}vsWhlFxI z#=)u^Ofc@)3p+7B`U=`9c-WKHT`Y7r1u8DepnIDyQ>N#T*{K@OLvebAQA3v0w zn0Q+LWpw{pOvQ>IO_6?Ft{6114#QhM{@i>X;N9#H?OIS1%x+`*BX#8@Zp3$8FPY); z46i}P&O+G=zkr>1&9noRb;aC8XVXenCLNJ)a|H>y&%<^}9(@F9+V{j*^M>nWYVt5O z>LCZXlT<@`+)YBNlj2p#@FStfVBW}=+X=S_5=9b6H?8|CeWF^Ll zXi8VU){t#~IFU^)I7nnoiJlc8G?VtKnJxvN!^=^s#vJb&^RE|?FR-J)cNfm=p9Ggwp>A;Kbnf_WTZk1NC=5+B4pY_d{z^Xj}ay*%w(e^Iu1QBn= zGGs6y$iAW@uJQCI6o~$u3jEy1J5&Xi_}rMj)T$I{;|>-n-f@SSG`}8aJyiiK3tzqd zFy9Dk|{mf@OrHW1!5i}gzT+Kgu6n-u|YH>Pz zpm1BQb-r6dPd=On%BkJ^SYAO{aJSvD+RU3i%wJE z%_vb=u$lZJBTeUp5dXr*K#c4;7~+4v`onNHc70oi93pG2H+0l_(m6Az_zm*n$QqY) z{FP%W%h}>Mb`m(C>etMK^Jtou^}9I!xxLZ(;;j=_so{O4EA`fV=*QTF$xqr%ZdIuG zWDXCy(Fte(1njlH3zc+faC39FaO5lZ$&#PY%qcO4`$?1?bjKZ21qN5i+HMHYYubXS zV4^d?;D=Gk%)GVmyF+L)d-z#xY-p(O7SEFmTSYWjzek>5$+4e6RE7JKC28i%cXBmK zav}{FQl}26fxWQIjQru@D)0CN%?Uy0npNq;pTL%9bs+Z%+1U$(5el{|7jSgG|Bc-c zZ@kPUR#omrL{A{#YLiT4el2h;v%I!P9+IWifgL1kVvnLO9c^eGpS2z%>8}$X8SkGt z-+Uxl(`rwqeIukPwRL8}XF7 zD!qJ!Qe?&xC!BA9R#6lWns@y|((&H$zampoECC;qN}ffID@Xa*p-<4egKj|h2XkP? 
z%tg8~Y_vz%g{kpS&Tn@7gGYRhXvgUG+GfjKrSC~ITgE1v$Kh}-0Kwo7zB6xgJiN$V z8UreHmjOw<}K=ZTZ-Gxb=1E%VB+_pJC~a5#hxuqGXr8;%AU6k%NC~@AeN$K zL&(E_*!p>QH^3SADYXL&X{)eVXvk9v^iz6?Ieuv*-OkTFu9{yOMp&K%=3SeNTR)CV zWh&<9o>sL|zW$bh;ch)eS2!(9Hwb0i#k&l(uVABXYb_Ty@HmsoJUTtnnrs zx_cs{szWgwWvuabDn55(9N0TjP}|WD4EaNl1lG%qqxR5gDG#g~z1q*na<&pMb1+iVW?| z(aaD+_;#gzV=w>yLa(VvCz4$gfh{=vOk=V^55WYW>6~tz?1wi!v8d=%}oz$OjJf0P~QFdOqm0_opzr`^AC}DkB)$ zO7P-}Y$jCJ6k_{5S*caRroOk?98H{W6>f0zShBr694S8-=X<+w3{iDF) z6Ri0a=7EpkV8U-|sCKh))Kl)9)-g|~DhK6AKT>yCisPYcZU5OF{H$R^SZiAoNVJdv z_tGHA;`2kmigQ90fMPp-`^8o1wEfK&S3>Ozo(EHS*OSk?QdrA7E5xR-mbv(h8t%*I z^^NM20@4$wVAt9gBQpU}$RNg;lG_C!J?k!A<(w^qzVb|paS|{6cF9Png@0yC@+@QR z9UqS?d6zk~1{$=2uu^iR$kJC~{bhdo%0UkL`pUrdRy%_mO?~ztkq8gr62Eq_C;Cl- zat;VxD3{?aRbWX;;X^N$W?F^^;4KsjJ=6stcDE5z5+_KOge3V=AX#cyl@_TGi5p;7 zQ8W@@z$fhE6PNUobe<|)=3fIkd+R1$QNmg00;Wb+#cpm4gfD4dNotYt$)P%X)#%|k z(QR|&?Dyeq{qz#0JQmD!@)So?wCD%oV)MkZy{5iUWv!1 zibqeuk%7N_li?P7IKBROO?CMF5>9;pGURbXK9TNVypcNM{wk*j02w;ekbiZ6I?%qk z_wY6)+0E^SBBsYyR%uAjdg`&^3?3Rs57vIJ(^V;M=$m1LYBUDcK%=Bv=E#$pZ!)cB zrBU}WS}@_36}3y&ip{Zrbo=unougEMlXT!UGkS)?zgxZdYfxyQ3%4c{)o`?edvO|z zaY{=_JtY&>v-N@hE|Q5p;WP$vN^k-%D|MZm+%GGC>tl3(dssVNTECY=ep{-X#&SxM zU@;kSFi1rVBngsMmXwr~0LcJBO5&0rkP=7*tibv2O)j6K1$VS@f6fWK47~qKtM>2F z=&ezWuuP4cUv^3=A-Hv4feiP9{MxN=Y4QpF*kAq}RE$0@eckE!c?~6Me7j0eCpf$z zNQs_^kV;`5N<%|L#m#NXMoK9Y(ERbVcq{LubT#1I`%}Cw%b9B&(V3)Hr-7#J#5`Z|z1=@;$*eR+T~!l{uuX{pV!mi$zB*u9Ofao}B43n6zFLU1 z$Ru0bm+h$qn9$5s5cX?H-~td z33%opJ`kT&ix<7k*zh}XXu7WVyt&RZ8!;C29Iy7A zWY4Tf&z#(yS?)V?LX?Y=2UyQ8VeKk9*1mw;QFII|I@}l?4M9ixqGO4@!tQxR33x@= zc*QVzg+K9%vU{nR%Z-Y!A^KgeILT2FeBk$vX}?&p(eI3y3=Xu?W0n_bbZ zDAWfTR^FA|XR&$&|3}&%iY~b?>TD1aAdgF(wnr4_4=~@D%?R^SLU7EjZW#8#XdogQFFf@~B&j1N?ggqB~@jJ4CdCO=$J({|i)$VlU1(3)P!e>DfGA zbGs;Z#%T+1`N;(_NBkRv-@m3eotRJRGPY+&P=5C$$zOlN9dU^!q~brXzsrAu^7}}7 zfO7+)BHH31G!RM-lVlXHbotAug2QCpAz-!T8# zBmhbmWi{Fxf9J*s%CRdG=R}W@{Y|q!+=R64v3AyJ2_iu2bieL_16;AC%Jnq*dX$XS+qm|b5km>xWy^7=yxac`L8RYnUMMoZF;YWi-mpe1Ln4gPA{(key?x8v3mL^nt#Y~T50 
zRq89e+U@YB(bOVkfdb z74v%d!mc#Q+xcfHqE3j)G@!8@w%ihW;aCdt-p;#)sFR{PKSY&}EVV2*YkO}y9> zxKN!-pK?s7D}b!^}ekJdUIQ literal 0 HcmV?d00001 diff --git a/reduction_tiling_docs/softmax_lowering_flow.tex b/reduction_tiling_docs/softmax_lowering_flow.tex new file mode 100644 index 00000000..bc378c29 --- /dev/null +++ b/reduction_tiling_docs/softmax_lowering_flow.tex @@ -0,0 +1,567 @@ +\documentclass[aspectratio=169]{beamer} +\usepackage[utf8]{inputenc} +\usepackage{listings} +\usepackage{xcolor} +\usepackage{amsmath} +\usepackage{booktabs} + +\usetheme{default} +\usecolortheme{default} + +% MLIR syntax highlighting - simplified for slides +\lstdefinelanguage{MLIR}{ + morekeywords={func, memref, tensor, linalg, scf, forall, for, arith, math, gpu, xegpu, vector}, + sensitive=true, + morecomment=[l]{//}, + morestring=[b]", +} + +\lstset{ + language=MLIR, + basicstyle=\ttfamily\small, + keywordstyle=\color{blue}\bfseries, + commentstyle=\color{gray}\itshape, + stringstyle=\color{red}, + numbers=none, + backgroundcolor=\color{white}, + showspaces=false, + showstringspaces=false, + showtabs=false, + frame=single, + tabsize=2, + breaklines=true, + breakatwhitespace=false, + escapeinside={\%*}{*)} +} + +\title{Softmax Lowering Flow: IR Transformation Stages} +\author{} +\date{} + +\begin{document} + +\begin{frame} +\titlepage +\end{frame} + +\begin{frame}{Problem Setup} +\textbf{Input Shape}: $1024 \times 512 \times f32$ (1024 rows, 512 columns) + +\textbf{Softmax Dimension}: dim=1 (along the 512-element rows) + +\vspace{0.5cm} + +\textbf{Softmax Formula}: +$$\text{softmax}(x_i) = \frac{e^{x_i - \max(x)}}{\sum_j e^{x_j - \max(x)}}$$ +\end{frame} + +\begin{frame}[fragile]{Stage 1: Initial IR} +Single high-level \texttt{linalg.softmax} operation on the full tensor. 
+ +\begin{lstlisting} +// Pseudo code: +output = softmax(input[1024x512], dim=1) +\end{lstlisting} + +\vspace{0.3cm} + +\textbf{Key Points}: +\begin{itemize} +\item Single operation on entire tensor +\item No parallelism or tiling yet +\end{itemize} +\end{frame} + +\begin{frame}[fragile]{Stage 2: After Tiling Parallel Dimension} +Parallel dimension (rows) tiled into 16 chunks of 64 rows each. + +\begin{lstlisting} +// Pseudo code: +parallel for tile_id in [0..16): + row_offset = tile_id * 64 + slice = input[row_offset:row_offset+64, 0:512] + output_slice = softmax(slice, dim=1) + output[row_offset:row_offset+64, :] = output_slice +\end{lstlisting} + +\vspace{0.3cm} + +\textbf{Key Points}: +\begin{itemize} +\item 16 parallel tiles: $1024 / 64 = 16$ +\item Each tile: $64 \times 512$ +\item Introduces \texttt{scf.forall} for parallel execution +\end{itemize} +\end{frame} + +\begin{frame}[fragile]{Stage 3: After Decomposing Softmax} +Softmax decomposed into 4 operations: +\textbf{max} $\rightarrow$ \textbf{center+exp} $\rightarrow$ \textbf{sum} $\rightarrow$ \textbf{division} + +\begin{lstlisting} +// Pseudo code: +parallel for tile_id in [0..16): + slice = input[tile_id*64:(tile_id+1)*64, :] + + // Step 1: Max reduction (64,512) -> (64,) + max_vals = reduce_max(slice, dim=1) + + // Step 2: Center and exp (64,512) -> (64,512) + exp_vals = exp(slice - max_vals) + + // Step 3: Sum reduction (64,512) -> (64,) + sum_vals = reduce_sum(exp_vals, dim=1) + + // Step 4: Division (64,512) -> (64,512) + output_slice = exp_vals / sum_vals +\end{lstlisting} +\end{frame} + +\begin{frame}[fragile]{Stage 4: After Tiling Division} +Division operation tiled along dimension 1 into chunks of 16 columns. 
+ +\begin{lstlisting} +// Pseudo code: +parallel for tile_id in [0..16): + slice = input[tile_id*64:(tile_id+1)*64, :] + + max_vals = reduce_max(slice, dim=1) // (64,512)->(64,) + exp_vals = exp(slice - max_vals) // (64,512)->(64,512) + sum_vals = reduce_sum(exp_vals, dim=1) // (64,512)->(64,) + + // Division tiled: 32 iterations (512/16 = 32) + for col_offset in [0:512:16]: + exp_tile = exp_vals[:, col_offset:col_offset+16] + output[:, col_offset:col_offset+16] = exp_tile / sum_vals +\end{lstlisting} + +\vspace{0.2cm} +\textbf{Key}: Division loop operates on $64 \times 16$ tiles +\end{frame} + +\begin{frame}[fragile]{Stage 5: Fusing Center+Exp into Division Loop} +Recompute center-and-exp on-the-fly in division loop. + +\begin{lstlisting} +// Pseudo code: +parallel for tile_id in [0..16): + slice = input[tile_id*64:(tile_id+1)*64, :] + + max_vals = reduce_max(slice, dim=1) // (64,) + + // Still materialized for sum reduction + exp_vals = exp(slice - max_vals) // (64,512) + sum_vals = reduce_sum(exp_vals, dim=1) // (64,) + + // Division loop: recompute exp on-the-fly + for col_offset in [0:512:16]: + input_tile = slice[:, col_offset:col_offset+16] + exp_tile = exp(input_tile - max_vals) // Recomputed + output[:, col_offset:col_offset+16] = exp_tile / sum_vals +\end{lstlisting} + +\vspace{0.2cm} +\textbf{Benefit}: Division no longer reads \texttt{exp\_vals}; its buffer can be dropped once the sum loop is fused too (Stage 7) +\end{frame} + +\begin{frame}[fragile]{Stage 6: After Tiling Sum Reduction} +Sum reduction tiled into 16-column chunks with partial sums.
+ +\begin{lstlisting} +// Pseudo code: +parallel for tile_id in [0..16): + slice = input[tile_id*64:(tile_id+1)*64, :] + + max_vals = reduce_max(slice, dim=1) // (64,) + exp_vals = exp(slice - max_vals) // (64,512) + + // Tiled sum reduction: accumulate into buffer (64,16) + sum_buffer = zeros(64, 16) + for col_offset in [0:512:16]: + exp_tile = exp_vals[:, col_offset:col_offset+16] + sum_buffer += exp_tile // Accumulate + + sum_vals = reduce_sum(sum_buffer, dim=1) // (64,16)->(64,) + + // Division loop (same as before) + for col_offset in [0:512:16]: ... +\end{lstlisting} + +\vspace{0.2cm} +\textbf{Key}: Sum uses partial accumulation buffer +\end{frame} + +\begin{frame}[fragile]{Stage 7: Fusing into Sum Reduction Loop} +Fuse center-and-exp into sum reduction loop as well. + +\begin{lstlisting} +// Pseudo code: +parallel for tile_id in [0..16): + slice = input[tile_id*64:(tile_id+1)*64, :] + + max_vals = reduce_max(slice, dim=1) // (64,) + + // Sum reduction with fused center+exp + sum_buffer = zeros(64, 16) + for col_offset in [0:512:16]: + input_tile = slice[:, col_offset:col_offset+16] + exp_tile = exp(input_tile - max_vals) // Fused + sum_buffer += exp_tile + + sum_vals = reduce_sum(sum_buffer, dim=1) // (64,) + + // Division loop with fused center+exp+div + for col_offset in [0:512:16]: + input_tile = slice[:, col_offset:col_offset+16] + exp_tile = exp(input_tile - max_vals) // Recomputed + output[:, col_offset:col_offset+16] = exp_tile / sum_vals +\end{lstlisting} +\end{frame} + +\section{Stage 8: After Tiling Max Reduction} + +Max reduction also tiled into 16-column chunks with partial max followed by final reduction. 
+ +\begin{lstlisting} +func.func @payload(%arg0: memref<1024x512xf32>, %arg1: memref<1024x512xf32>) { + %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x512xf32>) { + %slice = tensor.extract_slice %0[%3, 0] [64, 512] [1, 1] + + // Tiled max reduction: accumulate into 64x16 buffer + %8 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %7) -> (tensor<64x16xf32>) { + %slice_7 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] + + // Max accumulation + %14 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_7 : tensor<64x16xf32>) outs(%slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %15 = arith.maxnumf %in, %out : f32 + linalg.yield %15 : f32 + } -> tensor<64x16xf32> + + %inserted = tensor.insert_slice %14 into %arg5[0, 0] [64, 16] [1, 1] + scf.yield %inserted : tensor<64x16xf32> + } + + // Final max reduction: (64,16) -> (64,) + %reduced = linalg.reduce ins(%8 : tensor<64x16xf32>) outs(%5 : tensor<64xf32>) dimensions = [1] { + (%in: f32, %init: f32) { + %14 = arith.maxnumf %in, %init : f32 + linalg.yield %14 : f32 + } + } + + // Sum reduction loop with fused center+exp + %12 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) { + %slice_7 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] + + // Fused center+exp using reduced max + %14 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_7, %reduced : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_9: f32, %out: f32): + %16 = arith.subf %in, %in_9 : f32 + %17 = math.exp %16 : f32 + linalg.yield %17 : f32 + } -> tensor<64x16xf32> + + // Sum accumulation + %15 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%14 : tensor<64x16xf32>) outs(%arg5 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %16 = arith.addf %in, %out : f32 + linalg.yield %16 : f32 + } -> tensor<64x16xf32> + + scf.yield %15 : tensor<64x16xf32> + } + + // 
Final sum reduction: (64,16) -> (64,) + %reduced_6 = linalg.reduce ins(%12 : tensor<64x16xf32>) outs(%9 : tensor<64xf32>) dimensions = [1] { + (%in: f32, %init: f32) { + %14 = arith.addf %in, %init : f32 + linalg.yield %14 : f32 + } + } + + // Division loop with fused center+exp+div + %13 = scf.for %arg4 = %c0 to %c512 step %c16 iter_args(%arg5 = %slice_1) -> (tensor<64x512xf32>) { + %slice_7 = tensor.extract_slice %slice[0, %arg4] [64, 16] [1, 1] + + // Fused center+exp + %14 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%slice_7, %reduced : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_9: f32, %out: f32): + %16 = arith.subf %in, %in_9 : f32 + %17 = math.exp %16 : f32 + linalg.yield %17 : f32 + } -> tensor<64x16xf32> + + // Division + %15 = linalg.generic {iterator_types = ["parallel", "parallel"]} + ins(%14, %reduced_6 : tensor<64x16xf32>, tensor<64xf32>) outs(%slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_9: f32, %out: f32): + %16 = arith.divf %in, %in_9 : f32 + linalg.yield %16 : f32 + } -> tensor<64x16xf32> + + %inserted = tensor.insert_slice %15 into %arg5[0, %arg4] [64, 16] [1, 1] + scf.yield %inserted : tensor<64x512xf32> + } + + scf.forall.in_parallel { + tensor.parallel_insert_slice %13 into %arg3[%3, 0] [64, 512] [1, 1] + } + } +} +\end{lstlisting} + +\begin{frame}[fragile]{Stage 8: After Tiling Max Reduction (Part 1)} +Max reduction tiled with partial buffers, creating 3 loops total. 
+ +\begin{lstlisting} +// Pseudo code (continued from Stage 7): +parallel for tile_id in [0..16): + slice = input[tile_id*64:(tile_id+1)*64, :] + + // Loop 1: Tiled max reduction + max_buffer = fill(-inf, 64, 16) + for col_offset in [0:512:16]: + input_tile = slice[:, col_offset:col_offset+16] + max_buffer = max(max_buffer, input_tile) + max_vals = reduce_max(max_buffer, dim=1) // (64,) + + // Loop 2: Sum reduction with fused center+exp + sum_buffer = zeros(64, 16) + for col_offset in [0:512:16]: + input_tile = slice[:, col_offset:col_offset+16] + exp_tile = exp(input_tile - max_vals) + sum_buffer += exp_tile + sum_vals = reduce_sum(sum_buffer, dim=1) // (64,) +\end{lstlisting} +\end{frame} + +\begin{frame}[fragile]{Stage 8: After Tiling Max Reduction (Part 2)} +\begin{lstlisting} + // Loop 3: Division with fused center+exp+div + for col_offset in [0:512:16]: + input_tile = slice[:, col_offset:col_offset+16] + exp_tile = exp(input_tile - max_vals) + output[:, col_offset:col_offset+16] = exp_tile / sum_vals +\end{lstlisting} + +\vspace{0.3cm} + +\textbf{Key Points}: +\begin{itemize} +\item All 3 operations (max, sum, div) are now tiled +\item 3 separate loops over columns (32 iterations each) +\item Each loop processes $64 \times 16$ tiles +\end{itemize} +\end{frame} + +\begin{frame}[fragile]{Stage 9: Final Vectorized XeGPU Version} +After vectorization, bufferization, and XeGPU lowering. 
+ +\begin{lstlisting} +// Pseudo code: +gpu.kernel: + block_id = get_block_id() + slice = input[block_id*64:(block_id+1)*64, :] + + // Allocate SLM buffer (64x16) for partial reductions + slm_buffer = alloc_shared_memory(64, 16) + + // Loop 1: Max reduction (32 iterations) + slm_buffer = fill(-inf) + for col_offset in [0:512:16]: + tile = load_vector(slice[:, col_offset:col_offset+16]) + slm_buffer = max(slm_buffer, tile) // Update in SLM + max_vals = reduce_across_cols(slm_buffer) + + // Loop 2: Sum reduction (32 iterations) + slm_buffer = zeros() + for col_offset in [0:512:16]: + tile = load_vector(slice[:, col_offset:col_offset+16]) + exp_tile = exp(tile - max_vals) + slm_buffer += exp_tile // Accumulate in SLM + sum_vals = reduce_across_cols(slm_buffer) +\end{lstlisting} +\end{frame} + +\begin{frame}[fragile]{Stage 9: Final Vectorized XeGPU Version (cont'd)} +\begin{lstlisting} + // Loop 3: Division (32 iterations) + for col_offset in [0:512:16]: + tile = load_vector(slice[:, col_offset:col_offset+16]) + exp_tile = exp(tile - max_vals) + result = exp_tile / sum_vals + store_vector(output[:, col_offset:col_offset+16], result) +\end{lstlisting} + +\vspace{0.3cm} + +\textbf{Key Features}: +\begin{itemize} +\item Uses vector operations ($64 \times 16$ SIMD) +\item Shared Local Memory (SLM) for partial reductions +\item XeGPU dialect for Intel GPU operations +\end{itemize} +\end{frame} + +\begin{frame}{Summary of Transformations} +\begin{table} +\centering +\small +\begin{tabular}{@{}rll@{}} +\toprule +\textbf{Stage} & \textbf{Key Transformation} & \textbf{Loop Structure} \\ +\midrule +1 & Initial high-level softmax & No loops \\ +2 & Tile parallel dimension & \texttt{forall(16)} \\ +3 & Decompose softmax & \texttt{forall(16)} + 4 ops \\ +4 & Tile division & \texttt{forall(16)} $\rightarrow$ \texttt{for(32)} \\ +5 & Fuse into division loop & Recompute center+exp \\ +6 & Tile sum reduction & Add sum loop \\ +7 & Fuse into sum loop & Recompute center+exp \\ +8 & Tile 
max reduction & 3 loops total \\ +9 & Vectorize + XeGPU & GPU with SLM \\ +\bottomrule +\end{tabular} +\end{table} + +\vspace{0.3cm} + +\textbf{Final pattern per GPU block}: 3 loops of 32 iterations each +\begin{enumerate} +\item Max reduction $\rightarrow$ SLM $\rightarrow$ final reduction +\item Sum reduction (fused center+exp) $\rightarrow$ SLM $\rightarrow$ final reduction +\item Division (fused center+exp+div) $\rightarrow$ global memory +\end{enumerate} +\end{frame} + +\begin{frame}{Optimization: Fusing Max and Sum Loops} +After Stage 8, we can fuse max and sum loops into one. + +\textbf{Result}: 3 loops $\rightarrow$ 2 loops + +\vspace{0.5cm} + +\textbf{Key Insight}: \emph{Online Softmax Algorithm} + +Incrementally update both global max and sum as we process each tile: + +\begin{enumerate} +\item Compute \textbf{local max} for the tile +\item Update \textbf{global max}: $\text{new\_max} = \max(\text{old\_max}, \text{local\_max})$ +\item Compute \textbf{local sum} = $\sum \exp(x - \text{local\_max})$ +\item \textbf{Rescale} global sum by $\exp(\text{old\_max} - \text{new\_max})$ +\item \textbf{Add} local sum, rescaled by $\exp(\text{local\_max} - \text{new\_max})$, to global sum +\end{enumerate} + +\vspace{0.3cm} + +Maintains numerical stability while reducing memory traffic! +\end{frame} + +\begin{frame}[fragile]{Before Fusion: 3 Separate Loops} +\begin{lstlisting} +// Pseudo code: +parallel for tile_id in [0..16): + slice = input[tile_id*64:(tile_id+1)*64, :] + + // Loop 1: Max reduction + max_buffer = fill(-inf, 64, 16) + for col_offset in [0:512:16]: + tile = slice[:, col_offset:col_offset+16] + max_buffer = max(max_buffer, tile) + max_vals = reduce_max(max_buffer, dim=1) + + // Loop 2: Sum reduction + sum_buffer = zeros(64, 16) + for col_offset in [0:512:16]: + tile = slice[:, col_offset:col_offset+16] + exp_tile = exp(tile - max_vals) // Uses final max + sum_buffer += exp_tile + sum_vals = reduce_sum(sum_buffer, dim=1) + + // Loop 3: Division + for col_offset in [0:512:16]: ...
+\end{lstlisting} +\end{frame} + +\begin{frame}[fragile]{After Fusion: 2 Loops (Online Softmax)} +\begin{lstlisting} +// Pseudo code: +parallel for tile_id in [0..16): + slice = input[tile_id*64:(tile_id+1)*64, :] + + // Loop 1: Fused max+sum (online softmax algorithm) + global_max = fill(-inf, 64) + global_sum = zeros(64) + + for col_offset in [0:512:16]: + tile = slice[:, col_offset:col_offset+16] + + // Update max incrementally + local_max = reduce_max(tile, dim=1) + new_max = max(global_max, local_max) + + // Compute local sum centered on local_max + local_sum = sum(exp(tile - local_max), dim=1) + + // Rescale and accumulate sum + correction = exp(global_max - new_max) + local_correction = exp(local_max - new_max) + global_sum = global_sum * correction + local_sum * local_correction + + global_max = new_max +\end{lstlisting} +\end{frame} + +\begin{frame}[fragile]{After Fusion: 2 Loops (cont'd)} +\begin{lstlisting} + // Loop 2: Division (same as before) + for col_offset in [0:512:16]: + tile = slice[:, col_offset:col_offset+16] + exp_tile = exp(tile - global_max) + output[:, col_offset:col_offset+16] = exp_tile / global_sum +\end{lstlisting} + +\vspace{0.5cm} + +\textbf{Benefits}: +\begin{itemize} +\item Reduced loop count: 3 $\rightarrow$ 2 loops +\item Less memory traffic: input read twice instead of three times +\item Same numerical stability (still uses max-centering) +\end{itemize} + +\vspace{0.3cm} + +\textbf{Trade-off}: More computation per iteration (exponentials for rescaling) +\end{frame} + +\begin{frame}{Summary} +\textbf{Softmax Lowering Journey}: 9 transformation stages + +\begin{enumerate} +\item Start: High-level \texttt{linalg.softmax} operation +\item Decompose into: max $\rightarrow$ center+exp $\rightarrow$ sum $\rightarrow$ div +\item Tile parallel dimension (16 workgroups) +\item Tile reduction dimension (32 iterations per loop) +\item Fuse operations to reduce memory footprint +\item 
\textbf{Optional}: Fuse max+sum loops (online softmax) +\end{enumerate} + +\vspace{0.5cm} + +\textbf{Key Techniques}: +\begin{itemize} +\item Tiling for parallelism and memory hierarchy +\item Fusion for memory efficiency (recomputation) +\item Online softmax for bandwidth optimization +\end{itemize} +\end{frame} + +\end{document}