From eecee073a311b1ad4aad4bd24b97fdfc2caf3d03 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 13 Mar 2026 23:32:23 +0000 Subject: [PATCH 01/38] save work --- examples/xegpu/softmax.py | 336 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 336 insertions(+) create mode 100644 examples/xegpu/softmax.py diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py new file mode 100644 index 00000000..bb90f812 --- /dev/null +++ b/examples/xegpu/softmax.py @@ -0,0 +1,336 @@ +# RUN: %PYTHON %s --dump-kernel=xegpu-wg | FileCheck %s +# CHECK: module attributes {gpu.container_module} { + +""" +XeGPU softmax benchmark. +""" + +import argparse +import ctypes +from typing import Optional +from functools import cached_property + +import numpy as np +from mlir import ir +from mlir.execution_engine import ExecutionEngine +from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func + +from lighthouse.workload import benchmark +from lighthouse.utils.memref import to_ctype as memref_to_ctype +from lighthouse.utils.numpy import numpy_to_ctype +from lighthouse.utils.mlir import func_cif +from lighthouse.ingress.mlir_gen import get_mlir_elem_type +from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor + +from xegpu_workload import XeGPUWorkload + + +def softmax_complexity(M: int, N: int, nbytes: int): + """ + Complexity of softmax operation. + + For each row: + - O(N) to find max + - O(N) to compute exp(x - max) and sum + - O(N) to normalize + Total: 3*N operations per row, but with transcendental (exp) operations + """ + # Approximation: 5 FLOPs per element (max, sub, exp, sum, div) + # exp is expensive but we count it as ~1 FLOP for simplicity + flop_count = M * N * 5 + memory_reads = M * N * nbytes # read input + memory_writes = M * N * nbytes # write output + return flop_count, memory_reads, memory_writes + + +class XeGPUSoftmax(XeGPUWorkload): + """ + Softmax workload on XeGPU. + + Computes softmax along the last dimension (rows): + output[i, j] = exp(input[i, j] - max_i) / sum_i(exp(input[i, j] - max_i)) + + where max_i and sum_i are computed over row i. 
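+
+ For reference, the same numerically stable computation as a NumPy
+ sketch (illustration only; the host-side check in _reference_solution
+ below does exactly this):
+
+ m = x.max(axis=1, keepdims=True)
+ e = np.exp(x - m)
+ y = e / e.sum(axis=1, keepdims=True)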
+ """ + + def __init__( + self, + M: int, + N: int, + dtype: str = "f32", + ): + super().__init__() + self.M = M + self.N = N + self.shape = (M, N) + assert dtype == "f32", "Only f32 type is supported for softmax" + self.dtype_str = dtype + type_str_to_numpy = { + "f16": np.float16, + "f32": np.float32, + } + self.dtype = type_str_to_numpy[dtype] + + @cached_property + def _initial_host_arrays(self) -> tuple[np.ndarray]: + """Generate initial values on host with numpy.""" + np.random.seed(42) + # Use values in range [-0.5, 0.5] to avoid numerical issues + input_arr = np.random.uniform(-0.5, 0.5, self.shape).astype(self.dtype) + return (input_arr,) + + @cached_property + def _reference_solution(self) -> np.ndarray: + """Compute reference solution on host with numpy.""" + (input_arr,) = self._initial_host_arrays + # Use float32 for computation + x = input_arr.astype(np.float32) + # Compute softmax along axis 1 (each row independently) + # Numerically stable version: subtract max before exp + max_vals = np.max(x, axis=1, keepdims=True) + exp_vals = np.exp(x - max_vals) + sum_vals = np.sum(exp_vals, axis=1, keepdims=True) + output = exp_vals / sum_vals + return output.astype(self.dtype) + + def _get_input_arrays( + self, execution_engine: ExecutionEngine + ) -> list[ctypes.Structure]: + # Allocate device memory for input and output + input_gpu = self._allocate_array( + "input", self.shape, self.dtype_str, execution_engine + ) + output_gpu = self._allocate_array( + "output", self.shape, self.dtype_str, execution_engine + ) + + # Copy input to device + (input_host,) = self._initial_host_arrays + copy_fn = f"gpu_copy_2d_{self.dtype_str}" + execution_engine.invoke( + copy_fn, numpy_to_ctype(input_host), memref_to_ctype(input_gpu) + ) + + # Return memrefs: [output, input] + return [output_gpu, input_gpu] + + def check_correctness( + self, execution_engine: ExecutionEngine, verbose: int = 0 + ) -> bool: + # Copy result from device to host + output_gpu = self.gpu_memrefs[("output", self.dtype_str)] + output_host = np.zeros(self.shape, dtype=self.dtype) + execution_engine.invoke( + f"gpu_copy_2d_{self.dtype_str}", + memref_to_ctype(output_gpu), + numpy_to_ctype(output_host), + ) + + output_ref = self._reference_solution + output_computed = output_host.astype(np.float32) + + if verbose > 1: + print("Reference solution (first 5 rows):") + print(output_ref[:5]) + print("Computed solution (first 5 rows):") + print(output_computed[:5]) + + # Check row sums are close to 1.0 + row_sums = np.sum(output_computed, axis=1) + sums_ok = np.allclose(row_sums, 1.0, rtol=1e-5, atol=1e-6) + + # Check values match reference + values_ok = np.allclose(output_computed, output_ref, rtol=1e-4, atol=1e-6) + + success = sums_ok and values_ok + + if verbose: + if success: + print("PASSED") + else: + print("FAILED!") + if not sums_ok: + print(f" Row sums check failed. Min: {row_sums.min():.6f}, Max: {row_sums.max():.6f}") + if not values_ok: + max_diff = np.abs(output_computed - output_ref).max() + print(f" Values mismatch. 
Max abs diff: {max_diff:.6e}") + return success + + def get_complexity(self) -> tuple[int, int, int]: + nbytes = np.dtype(self.dtype).itemsize + return softmax_complexity(self.M, self.N, nbytes) + + def payload_module(self) -> ir.Module: + """Generate MLIR module for softmax payload.""" + mod = ir.Module.create() + dtype = get_mlir_elem_type(self.dtype_str) + memref_t = ir.MemRefType.get(self.shape, dtype) + + with ir.InsertionPoint(mod.body): + # Function signature: payload(output, input) + @func_cif(memref_t, memref_t, name=self.payload_function_name) + def payload(output, input_arg): + # Convert memrefs to tensors + output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) + input_tensor = emit_buf_to_tensor(input_arg, restrict=True) + + # Create intermediate buffer for softmax (used internally by linalg.softmax) + # This stores the sum of exp values + M, N = self.shape + softmax_buf_type = ir.MemRefType.get((M,N), dtype) + softmax_buf = gpu.alloc(softmax_buf_type, None, [], [], []) + softmax_buf_tensor = emit_buf_to_tensor(softmax_buf, restrict=True, writable=True) + + # Compute softmax along dimension 1 (rows) + # linalg.softmax performs: exp(x - max(x)) / sum(exp(x - max(x))) + result = linalg.softmax( + (input_tensor.type,), input_tensor, softmax_buf_tensor, dimension=1 + ) + + # Materialize result back to output memref + bufferization.materialize_in_destination( + None, result, output, restrict=True, writable=True + ) + + # Cleanup + gpu.dealloc(None, [], softmax_buf) + + # Emit utility functions for GPU memory management + emit_gpu_util_funcs(dtype, rank=2) + + return mod + + def schedule_module( + self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None + ) -> ir.Module: + """ + Generate transform schedule for softmax. + + For now, returns an empty schedule. In the future, this would contain + tiling, vectorization, and XeGPU-specific lowering transformations. 
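+
+ A rough sketch of the planned pipeline, in the order later revisions
+ are expected to apply it (stage names tentative):
+
+ tile last linalg.generic (scf.forall) -> fuse producers into the
+ loop -> vectorize -> one-shot bufferize -> forall to scf.parallel
+ -> gpu.launch mapping -> outline kernel -> vector to XeGPU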
+ """ + # TODO: Implement proper transform schedule + # For now, create a minimal schedule that just applies bufferization + mod = ir.Module.create() + with ir.InsertionPoint(mod.body): + from mlir.dialects import transform + + # Create a simple transform sequence + @func_cif(name="__transform_main") + def transform_main(): + # Empty transform - just identity + # In a full implementation, this would tile, vectorize, + # and lower to XeGPU operations + pass + + return mod + + def shared_libs(self) -> list[str]: + return ["libmlir_levelzero_runtime.so"] + + +def parse_cli(): + parser = argparse.ArgumentParser( + description="Softmax using MLIR XeGPU", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--sizes", + type=int, + nargs=2, + default=[1024, 512], + help="M,N matrix sizes (MxN)", + ) + parser.add_argument( + "--wg-tile", + type=int, + nargs=2, + default=[64, 32], + help="Workgroup tile size M,N.", + ) + parser.add_argument( + "--nruns", + type=int, + default=1000, + help="Number of runs to average the execution time.", + ) + parser.add_argument( + "--nwarmup", + type=int, + default=20, + help="Number of warm-up iterations before benchmarking.", + ) + parser.add_argument( + "--check-result", + action="store_true", + help="Check the result of the softmax computation.", + ) + parser.add_argument( + "--dump-kernel", + type=str, + choices=[ + "initial", + "tiled", + "vectorized", + "bufferized", + "xegpu-initial", + "xegpu-wg", + "final", + ], + help="Dump kernel IR at different stages of lowering and exit without " + "executing the kernel.", + ) + parser.add_argument( + "--dump-schedule", + action="store_true", + help="Dump transform schedule.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = parse_cli() + + params = { + "wg_m": args.wg_tile[0], + "wg_n": args.wg_tile[1], + } + + M, N = args.sizes + dtype = "f32" + + with ir.Context(), ir.Location.unknown(): + wload = XeGPUSoftmax(M=M, N=N, dtype=dtype) + + if args.dump_kernel or args.dump_schedule: + wload.lower_payload( + dump_payload=args.dump_kernel, + dump_schedule=args.dump_schedule, + schedule_parameters=params, + ) + else: + times = benchmark( + wload, + nruns=args.nruns, + nwarmup=args.nwarmup, + schedule_parameters=params, + check_correctness=args.check_result, + verbose=1, + ) + times *= 1e6 # convert to microseconds + elapsed = np.mean(times) + flop_count = wload.get_complexity()[0] + gflops = flop_count / (elapsed * 1e-6) / 1e9 + + def list2str(a): + return ",".join(map(str, a)) + + parts = [ + f"sizes={list2str(args.sizes)}", + f"dt={dtype}", + f"wg-tile={list2str(args.wg_tile)}", + f"time(us): {elapsed:.2f}", + f"GFLOPS: {gflops:.2f}", + ] + print(" ".join(parts)) From f991027da04fd09f164f10b771ea470b9b577519 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 16 Mar 2026 22:46:40 +0000 Subject: [PATCH 02/38] save work --- examples/xegpu/softmax.py | 93 ++++++++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 17 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index bb90f812..ab61f1e5 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -13,9 +13,9 @@ import numpy as np from mlir import ir from mlir.execution_engine import ExecutionEngine -from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func +from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func, math -from lighthouse.workload import benchmark +from lighthouse.workload import 
benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype from lighthouse.utils.numpy import numpy_to_ctype from lighthouse.utils.mlir import func_cif @@ -174,35 +174,94 @@ def payload(output, input_arg): output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) input_tensor = emit_buf_to_tensor(input_arg, restrict=True) - # Create intermediate buffer for softmax (used internally by linalg.softmax) - # This stores the sum of exp values M, N = self.shape - softmax_buf_type = ir.MemRefType.get((M,N), dtype) - softmax_buf = gpu.alloc(softmax_buf_type, None, [], [], []) - softmax_buf_tensor = emit_buf_to_tensor(softmax_buf, restrict=True, writable=True) - # Compute softmax along dimension 1 (rows) - # linalg.softmax performs: exp(x - max(x)) / sum(exp(x - max(x))) - result = linalg.softmax( - (input_tensor.type,), input_tensor, softmax_buf_tensor, dimension=1 + # Define affine maps for indexing + # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) + # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) + d0 = ir.AffineDimExpr.get(0) + d1 = ir.AffineDimExpr.get(1) + map_2d = ir.AffineMap.get(2, 0, [d0, d1]) + map_1d = ir.AffineMap.get(2, 0, [d0]) + + # Step 1: Find max - linalg.generic reduction + neg_inf = arith.constant(dtype, float('-inf')) + max_init = tensor.empty((M,), dtype) + max_filled = linalg.fill(neg_inf, outs=[max_init]) + + @linalg.generic( + [input_tensor], # inputs + [max_filled], # outputs + [map_2d, map_1d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + ) + def row_max(in_val, acc): + return arith.maximumf(in_val, acc) + + # Step 2: Subtract max (broadcast) - linalg.generic elementwise + output_init = tensor.empty((M, N), dtype) + + @linalg.generic( + [input_tensor, row_max], # inputs + [output_init], # outputs + [map_2d, map_1d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def shifted(in_val, max_val, out): + return arith.subf(in_val, max_val) + + # Step 3: Compute exp - linalg.generic elementwise + @linalg.generic( + [shifted], # inputs + [output_init], # outputs + [map_2d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def exp_vals(in_val, out): + return math.exp(in_val) + + # Step 4: Sum exp values - linalg.generic reduction + # Create collapsed tensor for sum init + # sum_init_2d = tensor.empty((M, 1), dtype) + sum_init = tensor.empty((M,), dtype) + # sum_init = tensor.CollapseShapeOp(sum_init_2d, [[0, 1]]) + + + zero = arith.constant(dtype, 0.0) + sum_filled = linalg.fill(zero, outs=[sum_init]) + + @linalg.generic( + [exp_vals], # inputs + [sum_filled], # outputs + [map_2d, map_1d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types ) + def row_sum(in_val, acc): + return arith.addf(in_val, acc) + + # Step 5: Divide by sum (broadcast) - linalg.generic elementwise + @linalg.generic( + [exp_vals, row_sum], # inputs + [output_init], # outputs + [map_2d, map_1d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def result(exp_val, sum_val, out): + return arith.divf(exp_val, sum_val) # Materialize result back to output memref bufferization.materialize_in_destination( None, result, output, restrict=True, writable=True ) - - # Cleanup - gpu.dealloc(None, [], softmax_buf) # Emit utility functions for GPU memory 
management emit_gpu_util_funcs(dtype, rank=2) return mod - def schedule_module( + def schedule_modules( self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None - ) -> ir.Module: + ) -> list[ir.Module]: """ Generate transform schedule for softmax. @@ -223,7 +282,7 @@ def transform_main(): # and lower to XeGPU operations pass - return mod + return [get_bench_wrapper_schedule(self), mod] def shared_libs(self) -> list[str]: return ["libmlir_levelzero_runtime.so"] From 22415bb54f34727337b66392f4585f09110d2b6a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 16 Mar 2026 22:48:34 +0000 Subject: [PATCH 03/38] save work --- examples/xegpu/softmax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index ab61f1e5..dff8bd99 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -224,7 +224,7 @@ def exp_vals(in_val, out): # Create collapsed tensor for sum init # sum_init_2d = tensor.empty((M, 1), dtype) sum_init = tensor.empty((M,), dtype) - # sum_init = tensor.CollapseShapeOp(sum_init_2d, [[0, 1]]) + # tensor.CollapseShapeOp(sum_init, sum_init_2d, [[0, 1]]) zero = arith.constant(dtype, 0.0) From cb9ead174134465610ef797dbb33fbee0196e1cd Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 17 Mar 2026 22:05:47 +0000 Subject: [PATCH 04/38] save work --- examples/xegpu/softmax.py | 64 +++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index dff8bd99..97abd49a 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -269,20 +269,66 @@ def schedule_modules( tiling, vectorization, and XeGPU-specific lowering transformations. 
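+ As a first step, the schedule below tiles the last linalg.generic
+ (the divide) with transform.structured.tile_using_forall; fusion,
+ vectorization, and bufferization are still to come.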
""" # TODO: Implement proper transform schedule - # For now, create a minimal schedule that just applies bufferization + # For now, create a minimal schedule that prints the last linalg operation mod = ir.Module.create() + mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() + with ir.InsertionPoint(mod.body): from mlir.dialects import transform + from mlir.dialects.transform import structured + + # Create a transform sequence with proper signature + named_sequence = transform.named_sequence( + "__transform_main", + [transform.AnyOpType.get()], # input: module + [], # no outputs + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}] + ) - # Create a simple transform sequence - @func_cif(name="__transform_main") - def transform_main(): - # Empty transform - just identity - # In a full implementation, this would tile, vectorize, - # and lower to XeGPU operations - pass + with ir.InsertionPoint(named_sequence.body): + # Get the input module (bodyTarget) + payload_mod = named_sequence.bodyTarget + + # Match all linalg.generic operations + # We have 5 generic ops in softmax: max, sub, exp, sum, div + generic_ops = structured.structured_match( + transform.AnyOpType.get(), + payload_mod, + ops=["linalg.generic"] + ) + + # Split the handle into individual operation handles + # For softmax, we have 5 operations + anytype = transform.AnyOpType.get() + split_ops = transform.split_handle( + (anytype, anytype, anytype, anytype, anytype), # 5 result types + generic_ops + ) + + # The last operation (index 4) is the division + last_op = split_ops[-1] + + # Print the last operation before tiling + # transform.print_(target=last_op, name="last_linalg_generic_before_tiling") + + # Tile the last operation using tile_using_forall + # Tile sizes: [64, 64] for the two parallel dimensions (M, N) + tiled_op, for_op = structured.structured_tile_using_forall( + anytype, anytype, + last_op, + num_threads=[], + tile_sizes=[], + static_tile_sizes=[64, 64], + ) + + # Print the tiled operation + # transform.print_(target=tiled_op, name="tiled_linalg_generic") + # transform.print_(target=for_op, name="forall_op") + + # Required: yield to end the transform sequence + transform.yield_() - return [get_bench_wrapper_schedule(self), mod] + return [mod] def shared_libs(self) -> list[str]: return ["libmlir_levelzero_runtime.so"] From ac39be33c3b90e6eb77c16237611fd33d4490695 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 18 Mar 2026 22:41:32 +0000 Subject: [PATCH 05/38] save work --- examples/xegpu/softmax.py | 86 +++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 8 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 97abd49a..6d6f87eb 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -14,6 +14,8 @@ from mlir import ir from mlir.execution_engine import ExecutionEngine from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func, math +from mlir.dialects import transform +from mlir.dialects.transform import structured, loop from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype @@ -21,9 +23,25 @@ from lighthouse.utils.mlir import func_cif from lighthouse.ingress.mlir_gen import get_mlir_elem_type from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor +from lighthouse.pipeline.helper import ( + apply_registered_pass, + canonicalize, + match, +) +from mlir.dialects.transform import 
bufferization as transform_bufferization +from mlir.dialects.bufferization import LayoutMapOption from xegpu_workload import XeGPUWorkload +def match_and_split(*args, nhandles=1, **kwargs): + """Helper function that splits matched handles.""" + matched = match(*args, **kwargs) + anytype = transform.AnyOpType.get() + matched_ops = transform.split_handle((anytype,) * nhandles, matched) + if nhandles == 1: + matched_ops = [matched_ops] + return matched_ops + def softmax_complexity(M: int, N: int, nbytes: int): """ @@ -274,8 +292,7 @@ def schedule_modules( mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() with ir.InsertionPoint(mod.body): - from mlir.dialects import transform - from mlir.dialects.transform import structured + # Create a transform sequence with proper signature named_sequence = transform.named_sequence( @@ -305,8 +322,11 @@ def schedule_modules( generic_ops ) - # The last operation (index 4) is the division - last_op = split_ops[-1] + # Reverse split_ops to have operations in reverse order + split_ops = list(reversed(split_ops)) + + # The first operation (after reversal) is the division - this is the consumer + last_op = split_ops[0] # Print the last operation before tiling # transform.print_(target=last_op, name="last_linalg_generic_before_tiling") @@ -318,12 +338,62 @@ def schedule_modules( last_op, num_threads=[], tile_sizes=[], - static_tile_sizes=[64, 64], + static_tile_sizes=(64,), ) - # Print the tiled operation - # transform.print_(target=tiled_op, name="tiled_linalg_generic") - # transform.print_(target=for_op, name="forall_op") + # Fuse the producer operations into the forall loop + # Iterate through remaining operations (already in reverse order) + current_forall = for_op + for producer_op in split_ops[1:]: + fused_op, current_forall = structured.structured_fuse_into_containing_op( + anytype, anytype, + producer_op, + current_forall + ) + + func = transform.get_parent_op( + anytype, + current_forall, + op_name="func.func", + deduplicate=True, + ) + transform.apply_cse(func) + canonicalize(func) + func = apply_registered_pass(func, "eliminate-empty-tensors") + func = structured.VectorizeChildrenAndApplyPatternsOp( + func, + fold_type_extensions_into_contract=True, + ).result + identity_layout = LayoutMapOption.IdentityLayoutMap + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + payload_mod = transform_bufferization.OneShotBufferizeOp( + payload_mod, + allow_return_allocs_from_loops=True, + bufferize_function_boundaries=True, + function_boundary_type_conversion=identity_layout, + ).result + payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") + transform.apply_cse(payload_mod) + canonicalize(payload_mod) + + # convert forall to parallel + wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) + + # convert to scf.parallel to gpu.launch + func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") + func = apply_registered_pass(func, "lower-affine") + transform.apply_cse(func) + canonicalize(func) + # Required: yield to end the transform sequence transform.yield_() From 51d494e23aa34e3166210e080aa23663e98924c2 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 20 Mar 2026 17:10:46 +0000 Subject: [PATCH 06/38] save work --- 
examples/xegpu/softmax.py | 67 +++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 6d6f87eb..be29cdd0 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -15,7 +15,7 @@ from mlir.execution_engine import ExecutionEngine from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func, math from mlir.dialects import transform -from mlir.dialects.transform import structured, loop +from mlir.dialects.transform import structured, loop, xegpu from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype @@ -338,7 +338,7 @@ def schedule_modules( last_op, num_threads=[], tile_sizes=[], - static_tile_sizes=(64,), + static_tile_sizes=(parameters["wg_rows"],), ) # Fuse the producer operations into the forall loop @@ -393,6 +393,39 @@ def schedule_modules( func = apply_registered_pass(func, "lower-affine") transform.apply_cse(func) canonicalize(func) + # set the number of threads for the gpu.launch operation + launch_op = match_and_split(func, ops={"gpu.launch"}) + num_threads = parameters["sg_rows"] * parameters["subgroup_size"] + xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) + + # outline gpu func + func = apply_registered_pass(func, "lower-affine") + canonicalize(func) + func = apply_registered_pass(func, "gpu-launch-sink-index-computations") + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") + # transform.PrintOp(target=payload_mod, name="before_gpu_outlining") + # transform.apply_cse(payload_mod) + + # set xevm target + # payload_mod = apply_registered_pass( + # payload_mod, + # "xevm-attach-target", + # options={"O": "3", "chip": "bmg"}, + # ) + + # # convert vector to xegpu + # gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) + # for gpu_mod in gpu_mod_ops: + # gpu_func = match(gpu_mod, ops={"gpu.func"}) + # gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + # transform.apply_cse(gpu_func) + # Required: yield to end the transform sequence @@ -413,15 +446,26 @@ def parse_cli(): "--sizes", type=int, nargs=2, - default=[1024, 512], + default=[1024, 64], help="M,N matrix sizes (MxN)", ) parser.add_argument( - "--wg-tile", + "--wg-rows", type=int, - nargs=2, - default=[64, 32], - help="Workgroup tile size M,N.", + default=64, + help="Number of rows per workgroup.", + ) + parser.add_argument( + "--sg-rows", + type=int, + default=8, + help="Number of rows per subgroup.", + ) + parser.add_argument( + "--subgroup-size", + type=int, + default=16, + help="Subgroup size.", ) parser.add_argument( "--nruns", @@ -468,8 +512,9 @@ def parse_cli(): args = parse_cli() params = { - "wg_m": args.wg_tile[0], - "wg_n": args.wg_tile[1], + "wg_rows": args.wg_rows, + "sg_rows": args.sg_rows, + "subgroup_size": args.subgroup_size, } M, N = args.sizes @@ -504,7 +549,9 @@ def list2str(a): parts = [ f"sizes={list2str(args.sizes)}", f"dt={dtype}", - f"wg-tile={list2str(args.wg_tile)}", + f"wg-rows={args.wg_rows}", + f"sg-rows={args.sg_rows}", + f"subgroup-size={args.subgroup_size}", f"time(us): {elapsed:.2f}", f"GFLOPS: {gflops:.2f}", ] From 7ac8852412f06100c2cf188c37c2f18254e38c83 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 20 Mar 2026 21:37:04 +0000 Subject: [PATCH 07/38] save work --- 
examples/xegpu/softmax.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index be29cdd0..56b43c21 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -408,23 +408,34 @@ def schedule_modules( op_name="builtin.module", deduplicate=True, ) - payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") + # payload = match(payload_mod, ops={"func.func"}) # transform.PrintOp(target=payload_mod, name="before_gpu_outlining") - # transform.apply_cse(payload_mod) + payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") + transform.apply_cse(payload_mod) # set xevm target - # payload_mod = apply_registered_pass( - # payload_mod, - # "xevm-attach-target", - # options={"O": "3", "chip": "bmg"}, - # ) - - # # convert vector to xegpu - # gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) + payload_mod = apply_registered_pass( + payload_mod, + "xevm-attach-target", + options={"O": "3", "chip": "bmg"}, + ) + + # convert vector to xegpu + gpu_mod = match_and_split(payload_mod, ops={"gpu.module"}) # for gpu_mod in gpu_mod_ops: - # gpu_func = match(gpu_mod, ops={"gpu.func"}) - # gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - # transform.apply_cse(gpu_func) + gpu_func = match(gpu_mod[0], ops={"gpu.func"}) + gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + + # Set layout attributes for xegpu.store_nd operations + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + # for store_op in store_ops: + xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) + + payload_mod = apply_registered_pass( + payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + From d65bf9fe47d2c95c5fdc986177e604ec4c0b6f82 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 20 Mar 2026 22:58:52 +0000 Subject: [PATCH 08/38] save work --- examples/xegpu/softmax.py | 88 ++++++++++++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 20 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 56b43c21..2debc8c8 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -311,14 +311,14 @@ def schedule_modules( generic_ops = structured.structured_match( transform.AnyOpType.get(), payload_mod, - ops=["linalg.generic"] + ops=["linalg.generic", "linalg.fill"] ) # Split the handle into individual operation handles # For softmax, we have 5 operations anytype = transform.AnyOpType.get() split_ops = transform.split_handle( - (anytype, anytype, anytype, anytype, anytype), # 5 result types + (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types generic_ops ) @@ -350,20 +350,23 @@ def schedule_modules( producer_op, current_forall ) - - func = transform.get_parent_op( - anytype, - current_forall, - op_name="func.func", - deduplicate=True, - ) - transform.apply_cse(func) - canonicalize(func) - func = apply_registered_pass(func, "eliminate-empty-tensors") + transform.annotate(current_forall, "gpu_loop") + + transform.apply_cse(payload_mod) + # canonicalize(payload_mod) + + # Vectorize and bufferize sequence + func = match(payload_mod, ops={"func.func"}) func = structured.VectorizeChildrenAndApplyPatternsOp( func, fold_type_extensions_into_contract=True, ).result + loops = match_and_split(payload_mod, ops={"scf.forall"}) + 
loop.loop_hoist_loop_invariant_subsets(loops[0]) + transform.apply_cse(payload_mod) + canonicalize(payload_mod) + # transform.PrintOp(target=payload_mod, name="vectorize") + identity_layout = LayoutMapOption.IdentityLayoutMap payload_mod = transform.get_parent_op( anytype, @@ -377,28 +380,73 @@ def schedule_modules( bufferize_function_boundaries=True, function_boundary_type_conversion=identity_layout, ).result + # payload_mod = transform_bufferization.OneShotBufferizeOp( + # payload_mod, + # allow_return_allocs_from_loops=False, + # bufferize_function_boundaries=True, + # function_boundary_type_conversion=identity_layout, + # ).result payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") + payload_mod = apply_registered_pass(payload_mod, "drop-equivalent-buffer-results") + payload_mod = apply_registered_pass( + payload_mod, + "buffer-results-to-out-params", + options={ + "add-result-attr": "true", + "hoist-dynamic-allocs": "true", + "hoist-static-allocs": "true", + "modify-public-functions": "true" + } + ) transform.apply_cse(payload_mod) canonicalize(payload_mod) + # # # transform.PrintOp(target=payload_mod, name="bufferize") + + # # func = match(payload_mod, ops={"func.func"}) + gpu_loop = match(payload_mod, op_attrs={"gpu_loop": ir.UnitAttr.get()}) + # # gpu_loop = transform.split_handle(anytype, gpu_loop) + gpu_loop = loop.loop_forall_to_parallel([anytype], gpu_loop) + + # # func = apply_registered_pass(payload_mod, "eliminate-empty-tensors") + # # func = structured.VectorizeChildrenAndApplyPatternsOp( + # # func, + # # fold_type_extensions_into_contract=True, + # # ).result + # # identity_layout = LayoutMapOption.IdentityLayoutMap + payload_mod = transform.get_parent_op( + anytype, + gpu_loop, + op_name="func.func", + deduplicate=True, + ) + # payload_mod = transform_bufferization.OneShotBufferizeOp( + # payload_mod, + # allow_return_allocs_from_loops=True, + # bufferize_function_boundaries=True, + # function_boundary_type_conversion=identity_layout, + # ).result + # payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") + # transform.apply_cse(payload_mod) + # canonicalize(payload_mod) - # convert forall to parallel - wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) - for wg_loop in wg_loops: - wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - func = transform.get_parent_op(anytype, wg_loop) + # # convert forall to parallel + # wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) + # for wg_loop in wg_loops: + # wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + # func = transform.get_parent_op(anytype, wg_loop) # convert to scf.parallel to gpu.launch - func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(payload_mod, "gpu-map-parallel-loops") func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") func = apply_registered_pass(func, "lower-affine") transform.apply_cse(func) canonicalize(func) - # set the number of threads for the gpu.launch operation + # # set the number of threads for the gpu.launch operation launch_op = match_and_split(func, ops={"gpu.launch"}) num_threads = parameters["sg_rows"] * parameters["subgroup_size"] xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - # outline gpu func + # # outline gpu func func = apply_registered_pass(func, "lower-affine") canonicalize(func) func = apply_registered_pass(func, "gpu-launch-sink-index-computations") From 0bf3eb32a7fa9fbd66b0a7a238fb302607c2896a Mon Sep 17 00:00:00 2001 From: 
Charitha Saumya Date: Tue, 24 Mar 2026 20:32:41 +0000 Subject: [PATCH 09/38] save working version --- examples/xegpu/softmax.py | 138 ++++++++++++-------------------------- 1 file changed, 42 insertions(+), 96 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 2debc8c8..e95286b5 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -17,6 +17,7 @@ from mlir.dialects import transform from mlir.dialects.transform import structured, loop, xegpu +from lighthouse import dialects as lh_dialects from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype from lighthouse.utils.numpy import numpy_to_ctype @@ -54,7 +55,6 @@ def softmax_complexity(M: int, N: int, nbytes: int): Total: 3*N operations per row, but with transcendental (exp) operations """ # Approximation: 5 FLOPs per element (max, sub, exp, sum, div) - # exp is expensive but we count it as ~1 FLOP for simplicity flop_count = M * N * 5 memory_reads = M * N * nbytes # read input memory_writes = M * N * nbytes # write output @@ -303,8 +303,15 @@ def schedule_modules( ) with ir.InsertionPoint(named_sequence.body): - # Get the input module (bodyTarget) - payload_mod = named_sequence.bodyTarget + # match the payload module + anytype = transform.AnyOpType.get() + func = match(named_sequence.bodyTarget, ops={"func.func"}) + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) # Match all linalg.generic operations # We have 5 generic ops in softmax: max, sub, exp, sum, div @@ -315,7 +322,7 @@ def schedule_modules( ) # Split the handle into individual operation handles - # For softmax, we have 5 operations + # For softmax, we have 7 operations anytype = transform.AnyOpType.get() split_ops = transform.split_handle( (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types @@ -350,117 +357,59 @@ def schedule_modules( producer_op, current_forall ) - transform.annotate(current_forall, "gpu_loop") - - transform.apply_cse(payload_mod) - # canonicalize(payload_mod) + + func = transform.get_parent_op( + anytype, + current_forall, + op_name="func.func", + deduplicate=True, + ) + transform.apply_cse(func) + canonicalize(func) - # Vectorize and bufferize sequence - func = match(payload_mod, ops={"func.func"}) func = structured.VectorizeChildrenAndApplyPatternsOp( func, fold_type_extensions_into_contract=True, ).result - loops = match_and_split(payload_mod, ops={"scf.forall"}) - loop.loop_hoist_loop_invariant_subsets(loops[0]) - transform.apply_cse(payload_mod) - canonicalize(payload_mod) - # transform.PrintOp(target=payload_mod, name="vectorize") - + transform.apply_cse(func) + canonicalize(func) + payload_mod = apply_registered_pass(payload_mod, "eliminate-empty-tensors") identity_layout = LayoutMapOption.IdentityLayoutMap - payload_mod = transform.get_parent_op( - anytype, - func, - op_name="builtin.module", - deduplicate=True, - ) payload_mod = transform_bufferization.OneShotBufferizeOp( payload_mod, allow_return_allocs_from_loops=True, bufferize_function_boundaries=True, function_boundary_type_conversion=identity_layout, ).result - # payload_mod = transform_bufferization.OneShotBufferizeOp( - # payload_mod, - # allow_return_allocs_from_loops=False, - # bufferize_function_boundaries=True, - # function_boundary_type_conversion=identity_layout, - # ).result + # fold memref.subviews into vector.transfer_read/write ops payload_mod = 
apply_registered_pass(payload_mod, "fold-memref-alias-ops") - payload_mod = apply_registered_pass(payload_mod, "drop-equivalent-buffer-results") - payload_mod = apply_registered_pass( - payload_mod, - "buffer-results-to-out-params", - options={ - "add-result-attr": "true", - "hoist-dynamic-allocs": "true", - "hoist-static-allocs": "true", - "modify-public-functions": "true" - } - ) transform.apply_cse(payload_mod) canonicalize(payload_mod) - # # # transform.PrintOp(target=payload_mod, name="bufferize") - - # # func = match(payload_mod, ops={"func.func"}) - gpu_loop = match(payload_mod, op_attrs={"gpu_loop": ir.UnitAttr.get()}) - # # gpu_loop = transform.split_handle(anytype, gpu_loop) - gpu_loop = loop.loop_forall_to_parallel([anytype], gpu_loop) - # # func = apply_registered_pass(payload_mod, "eliminate-empty-tensors") - # # func = structured.VectorizeChildrenAndApplyPatternsOp( - # # func, - # # fold_type_extensions_into_contract=True, - # # ).result - # # identity_layout = LayoutMapOption.IdentityLayoutMap - payload_mod = transform.get_parent_op( - anytype, - gpu_loop, - op_name="func.func", - deduplicate=True, - ) - # payload_mod = transform_bufferization.OneShotBufferizeOp( - # payload_mod, - # allow_return_allocs_from_loops=True, - # bufferize_function_boundaries=True, - # function_boundary_type_conversion=identity_layout, - # ).result - # payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") - # transform.apply_cse(payload_mod) - # canonicalize(payload_mod) - - # # convert forall to parallel - # wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) - # for wg_loop in wg_loops: - # wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - # func = transform.get_parent_op(anytype, wg_loop) - + # convert forall to parallel + wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) # convert to scf.parallel to gpu.launch - func = apply_registered_pass(payload_mod, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "gpu-map-parallel-loops") func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") func = apply_registered_pass(func, "lower-affine") transform.apply_cse(func) canonicalize(func) - # # set the number of threads for the gpu.launch operation + + # set the number of threads for the gpu.launch operation launch_op = match_and_split(func, ops={"gpu.launch"}) num_threads = parameters["sg_rows"] * parameters["subgroup_size"] xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - # # outline gpu func + # outline gpu func func = apply_registered_pass(func, "lower-affine") canonicalize(func) func = apply_registered_pass(func, "gpu-launch-sink-index-computations") - payload_mod = transform.get_parent_op( - anytype, - func, - op_name="builtin.module", - deduplicate=True, - ) - # payload = match(payload_mod, ops={"func.func"}) - # transform.PrintOp(target=payload_mod, name="before_gpu_outlining") payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") transform.apply_cse(payload_mod) - + # set xevm target payload_mod = apply_registered_pass( payload_mod, @@ -469,12 +418,12 @@ def schedule_modules( ) # convert vector to xegpu - gpu_mod = match_and_split(payload_mod, ops={"gpu.module"}) - # for gpu_mod in gpu_mod_ops: - gpu_func = match(gpu_mod[0], ops={"gpu.func"}) - gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - 
transform.apply_cse(gpu_func) - + gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) + for gpu_mod in gpu_mod_ops: + gpu_func = match(gpu_mod, ops={"gpu.func"}) + gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + # Set layout attributes for xegpu.store_nd operations store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) # for store_op in store_ops: @@ -483,14 +432,10 @@ def schedule_modules( payload_mod = apply_registered_pass( payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} ) - - - - # Required: yield to end the transform sequence transform.yield_() - return [mod] + return [get_bench_wrapper_schedule(self), mod] def shared_libs(self) -> list[str]: return ["libmlir_levelzero_runtime.so"] @@ -580,6 +525,7 @@ def parse_cli(): dtype = "f32" with ir.Context(), ir.Location.unknown(): + lh_dialects.register_and_load() wload = XeGPUSoftmax(M=M, N=N, dtype=dtype) if args.dump_kernel or args.dump_schedule: From fabd656c1267437174b7c7b9ed472c0b76556ee9 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 24 Mar 2026 21:48:44 +0000 Subject: [PATCH 10/38] save working version --- examples/xegpu/softmax.py | 288 +----------------- .../ingress/mlir_gen/gpu_softmax_payload.py | 121 ++++++++ lighthouse/schedule/xegpu/softmax_schedule.py | 195 ++++++++++++ 3 files changed, 332 insertions(+), 272 deletions(-) create mode 100644 lighthouse/ingress/mlir_gen/gpu_softmax_payload.py create mode 100644 lighthouse/schedule/xegpu/softmax_schedule.py diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index e95286b5..474eb1a3 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -13,36 +13,17 @@ import numpy as np from mlir import ir from mlir.execution_engine import ExecutionEngine -from mlir.dialects import linalg, gpu, bufferization, arith, tensor, func, math -from mlir.dialects import transform -from mlir.dialects.transform import structured, loop, xegpu from lighthouse import dialects as lh_dialects from lighthouse.workload import benchmark, get_bench_wrapper_schedule from lighthouse.utils.memref import to_ctype as memref_to_ctype from lighthouse.utils.numpy import numpy_to_ctype -from lighthouse.utils.mlir import func_cif from lighthouse.ingress.mlir_gen import get_mlir_elem_type -from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor -from lighthouse.pipeline.helper import ( - apply_registered_pass, - canonicalize, - match, -) -from mlir.dialects.transform import bufferization as transform_bufferization -from mlir.dialects.bufferization import LayoutMapOption +from lighthouse.ingress.mlir_gen.gpu_softmax_payload import generate_gpu_softmax_payload +from lighthouse.schedule.xegpu.softmax_schedule import get_softmax_schedule_module from xegpu_workload import XeGPUWorkload -def match_and_split(*args, nhandles=1, **kwargs): - """Helper function that splits matched handles.""" - matched = match(*args, **kwargs) - anytype = transform.AnyOpType.get() - matched_ops = transform.split_handle((anytype,) * nhandles, matched) - if nhandles == 1: - matched_ops = [matched_ops] - return matched_ops - def softmax_complexity(M: int, N: int, nbytes: int): """ @@ -180,262 +161,25 @@ def get_complexity(self) -> tuple[int, int, int]: def payload_module(self) -> ir.Module: """Generate MLIR module for softmax payload.""" - mod = ir.Module.create() dtype = get_mlir_elem_type(self.dtype_str) - memref_t = ir.MemRefType.get(self.shape, dtype) - 
- with ir.InsertionPoint(mod.body): - # Function signature: payload(output, input) - @func_cif(memref_t, memref_t, name=self.payload_function_name) - def payload(output, input_arg): - # Convert memrefs to tensors - output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) - input_tensor = emit_buf_to_tensor(input_arg, restrict=True) - - M, N = self.shape - - # Define affine maps for indexing - # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) - # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) - d0 = ir.AffineDimExpr.get(0) - d1 = ir.AffineDimExpr.get(1) - map_2d = ir.AffineMap.get(2, 0, [d0, d1]) - map_1d = ir.AffineMap.get(2, 0, [d0]) - - # Step 1: Find max - linalg.generic reduction - neg_inf = arith.constant(dtype, float('-inf')) - max_init = tensor.empty((M,), dtype) - max_filled = linalg.fill(neg_inf, outs=[max_init]) - - @linalg.generic( - [input_tensor], # inputs - [max_filled], # outputs - [map_2d, map_1d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types - ) - def row_max(in_val, acc): - return arith.maximumf(in_val, acc) - - # Step 2: Subtract max (broadcast) - linalg.generic elementwise - output_init = tensor.empty((M, N), dtype) - - @linalg.generic( - [input_tensor, row_max], # inputs - [output_init], # outputs - [map_2d, map_1d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types - ) - def shifted(in_val, max_val, out): - return arith.subf(in_val, max_val) - - # Step 3: Compute exp - linalg.generic elementwise - @linalg.generic( - [shifted], # inputs - [output_init], # outputs - [map_2d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types - ) - def exp_vals(in_val, out): - return math.exp(in_val) - - # Step 4: Sum exp values - linalg.generic reduction - # Create collapsed tensor for sum init - # sum_init_2d = tensor.empty((M, 1), dtype) - sum_init = tensor.empty((M,), dtype) - # tensor.CollapseShapeOp(sum_init, sum_init_2d, [[0, 1]]) - - - zero = arith.constant(dtype, 0.0) - sum_filled = linalg.fill(zero, outs=[sum_init]) - - @linalg.generic( - [exp_vals], # inputs - [sum_filled], # outputs - [map_2d, map_1d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types - ) - def row_sum(in_val, acc): - return arith.addf(in_val, acc) - - # Step 5: Divide by sum (broadcast) - linalg.generic elementwise - @linalg.generic( - [exp_vals, row_sum], # inputs - [output_init], # outputs - [map_2d, map_1d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types - ) - def result(exp_val, sum_val, out): - return arith.divf(exp_val, sum_val) - - # Materialize result back to output memref - bufferization.materialize_in_destination( - None, result, output, restrict=True, writable=True - ) - - # Emit utility functions for GPU memory management - emit_gpu_util_funcs(dtype, rank=2) - - return mod + return generate_gpu_softmax_payload( + func_name=self.payload_function_name, + M=self.M, + N=self.N, + dtype=dtype, + ) def schedule_modules( self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None ) -> list[ir.Module]: - """ - Generate transform schedule for softmax. - - For now, returns an empty schedule. In the future, this would contain - tiling, vectorization, and XeGPU-specific lowering transformations. 
- """ - # TODO: Implement proper transform schedule - # For now, create a minimal schedule that prints the last linalg operation - mod = ir.Module.create() - mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() - - with ir.InsertionPoint(mod.body): - - - # Create a transform sequence with proper signature - named_sequence = transform.named_sequence( - "__transform_main", - [transform.AnyOpType.get()], # input: module - [], # no outputs - arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}] - ) - - with ir.InsertionPoint(named_sequence.body): - # match the payload module - anytype = transform.AnyOpType.get() - func = match(named_sequence.bodyTarget, ops={"func.func"}) - payload_mod = transform.get_parent_op( - anytype, - func, - op_name="builtin.module", - deduplicate=True, - ) - - # Match all linalg.generic operations - # We have 5 generic ops in softmax: max, sub, exp, sum, div - generic_ops = structured.structured_match( - transform.AnyOpType.get(), - payload_mod, - ops=["linalg.generic", "linalg.fill"] - ) - - # Split the handle into individual operation handles - # For softmax, we have 7 operations - anytype = transform.AnyOpType.get() - split_ops = transform.split_handle( - (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types - generic_ops - ) - - # Reverse split_ops to have operations in reverse order - split_ops = list(reversed(split_ops)) - - # The first operation (after reversal) is the division - this is the consumer - last_op = split_ops[0] - - # Print the last operation before tiling - # transform.print_(target=last_op, name="last_linalg_generic_before_tiling") - - # Tile the last operation using tile_using_forall - # Tile sizes: [64, 64] for the two parallel dimensions (M, N) - tiled_op, for_op = structured.structured_tile_using_forall( - anytype, anytype, - last_op, - num_threads=[], - tile_sizes=[], - static_tile_sizes=(parameters["wg_rows"],), - ) - - # Fuse the producer operations into the forall loop - # Iterate through remaining operations (already in reverse order) - current_forall = for_op - for producer_op in split_ops[1:]: - fused_op, current_forall = structured.structured_fuse_into_containing_op( - anytype, anytype, - producer_op, - current_forall - ) - - func = transform.get_parent_op( - anytype, - current_forall, - op_name="func.func", - deduplicate=True, - ) - transform.apply_cse(func) - canonicalize(func) - - func = structured.VectorizeChildrenAndApplyPatternsOp( - func, - fold_type_extensions_into_contract=True, - ).result - transform.apply_cse(func) - canonicalize(func) - payload_mod = apply_registered_pass(payload_mod, "eliminate-empty-tensors") - identity_layout = LayoutMapOption.IdentityLayoutMap - payload_mod = transform_bufferization.OneShotBufferizeOp( - payload_mod, - allow_return_allocs_from_loops=True, - bufferize_function_boundaries=True, - function_boundary_type_conversion=identity_layout, - ).result - # fold memref.subviews into vector.transfer_read/write ops - payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") - transform.apply_cse(payload_mod) - canonicalize(payload_mod) - - # convert forall to parallel - wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) - for wg_loop in wg_loops: - wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - func = transform.get_parent_op(anytype, wg_loop) - # convert to scf.parallel to gpu.launch - func = apply_registered_pass(func, "gpu-map-parallel-loops") - func = apply_registered_pass(func, 
"convert-parallel-loops-to-gpu") - func = apply_registered_pass(func, "lower-affine") - transform.apply_cse(func) - canonicalize(func) - - # set the number of threads for the gpu.launch operation - launch_op = match_and_split(func, ops={"gpu.launch"}) - num_threads = parameters["sg_rows"] * parameters["subgroup_size"] - xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - - # outline gpu func - func = apply_registered_pass(func, "lower-affine") - canonicalize(func) - func = apply_registered_pass(func, "gpu-launch-sink-index-computations") - payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") - transform.apply_cse(payload_mod) - - # set xevm target - payload_mod = apply_registered_pass( - payload_mod, - "xevm-attach-target", - options={"O": "3", "chip": "bmg"}, - ) - - # convert vector to xegpu - gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) - for gpu_mod in gpu_mod_ops: - gpu_func = match(gpu_mod, ops={"gpu.func"}) - gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - transform.apply_cse(gpu_func) - - # Set layout attributes for xegpu.store_nd operations - store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) - # for store_op in store_ops: - xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) - - payload_mod = apply_registered_pass( - payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - # Required: yield to end the transform sequence - transform.yield_() - - return [get_bench_wrapper_schedule(self), mod] + """Generate transform schedule for softmax.""" + return [ + get_bench_wrapper_schedule(self), + get_softmax_schedule_module( + stop_at_stage=stop_at_stage, + parameters=parameters, + ), + ] def shared_libs(self) -> list[str]: return ["libmlir_levelzero_runtime.so"] diff --git a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py new file mode 100644 index 00000000..26fd9042 --- /dev/null +++ b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py @@ -0,0 +1,121 @@ +"""Generate MLIR payload for GPU softmax operation.""" + +from mlir import ir +from mlir.dialects import linalg, bufferization, arith, tensor, func, math + +from lighthouse.utils.mlir import func_cif +from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor + + +def generate_gpu_softmax_payload( + func_name: str, + M: int, + N: int, + dtype: ir.Type, +) -> ir.Module: + """ + Generate MLIR module for softmax payload. + + Computes softmax along the last dimension (rows): + output[i, j] = exp(input[i, j] - max_i) / sum_i(exp(input[i, j] - max_i)) + + where max_i and sum_i are computed over row i. 
+ + Args: + func_name: Name of the payload function + M: Number of rows + N: Number of columns + dtype: MLIR element type (e.g., F32Type) + + Returns: + MLIR module containing the softmax payload function + """ + mod = ir.Module.create() + shape = (M, N) + memref_t = ir.MemRefType.get(shape, dtype) + + with ir.InsertionPoint(mod.body): + # Function signature: payload(output, input) + @func_cif(memref_t, memref_t, name=func_name) + def payload(output, input_arg): + # Convert memrefs to tensors + output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) + input_tensor = emit_buf_to_tensor(input_arg, restrict=True) + + # Define affine maps for indexing + # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) + # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) + d0 = ir.AffineDimExpr.get(0) + d1 = ir.AffineDimExpr.get(1) + map_2d = ir.AffineMap.get(2, 0, [d0, d1]) + map_1d = ir.AffineMap.get(2, 0, [d0]) + + # Step 1: Find max - linalg.generic reduction + neg_inf = arith.constant(dtype, float('-inf')) + max_init = tensor.empty((M,), dtype) + max_filled = linalg.fill(neg_inf, outs=[max_init]) + + @linalg.generic( + [input_tensor], # inputs + [max_filled], # outputs + [map_2d, map_1d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + ) + def row_max(in_val, acc): + return arith.maximumf(in_val, acc) + + # Step 2: Subtract max (broadcast) - linalg.generic elementwise + output_init = tensor.empty((M, N), dtype) + + @linalg.generic( + [input_tensor, row_max], # inputs + [output_init], # outputs + [map_2d, map_1d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def shifted(in_val, max_val, out): + return arith.subf(in_val, max_val) + + # Step 3: Compute exp - linalg.generic elementwise + @linalg.generic( + [shifted], # inputs + [output_init], # outputs + [map_2d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def exp_vals(in_val, out): + return math.exp(in_val) + + # Step 4: Sum exp values - linalg.generic reduction + sum_init = tensor.empty((M,), dtype) + zero = arith.constant(dtype, 0.0) + sum_filled = linalg.fill(zero, outs=[sum_init]) + + @linalg.generic( + [exp_vals], # inputs + [sum_filled], # outputs + [map_2d, map_1d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + ) + def row_sum(in_val, acc): + return arith.addf(in_val, acc) + + # Step 5: Divide by sum (broadcast) - linalg.generic elementwise + @linalg.generic( + [exp_vals, row_sum], # inputs + [output_init], # outputs + [map_2d, map_1d, map_2d], # indexing_maps + [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + ) + def result(exp_val, sum_val, out): + return arith.divf(exp_val, sum_val) + + # Materialize result back to output memref + bufferization.materialize_in_destination( + None, result, output, restrict=True, writable=True + ) + + # Emit utility functions for GPU memory management + emit_gpu_util_funcs(dtype, rank=2) + + return mod diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py new file mode 100644 index 00000000..e01f9af6 --- /dev/null +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -0,0 +1,195 @@ +"""Generate MLIR transform schedule for XeGPU softmax operation.""" + +from typing import Optional + +from mlir import ir +from mlir.dialects import transform +from 
mlir.dialects.transform import structured, loop, xegpu +from mlir.dialects.transform import bufferization as transform_bufferization +from mlir.dialects.bufferization import LayoutMapOption + +from lighthouse.pipeline.helper import ( + apply_registered_pass, + canonicalize, + match, +) + + +def match_and_split(*args, nhandles=1, **kwargs): + """Helper function that splits matched handles.""" + matched = match(*args, **kwargs) + anytype = transform.AnyOpType.get() + matched_ops = transform.split_handle((anytype,) * nhandles, matched) + if nhandles == 1: + matched_ops = [matched_ops] + return matched_ops + + +def get_softmax_schedule_module( + stop_at_stage: Optional[str] = None, + parameters: Optional[dict] = None, +) -> ir.Module: + """ + Generate transform schedule for softmax operation. + + The schedule performs the following transformations: + 1. Tile the consumer operation (division) using forall + 2. Fuse producer operations into the forall loop + 3. Vectorize operations + 4. Bufferize tensors + 5. Convert to GPU dialect + 6. Lower to XeGPU operations + + Args: + stop_at_stage: Optional stage name to stop early (for debugging) + parameters: Dictionary with scheduling parameters: + - wg_rows: Number of rows per workgroup + - sg_rows: Number of rows per subgroup + - subgroup_size: Size of subgroup + + Returns: + MLIR module containing the transform schedule + """ + assert parameters is not None, "Schedule parameters must be provided" + + mod = ir.Module.create() + mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() + + with ir.InsertionPoint(mod.body): + # Create a transform sequence with proper signature + named_sequence = transform.named_sequence( + "__transform_main", + [transform.AnyOpType.get()], # input: module + [], # no outputs + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}] + ) + + with ir.InsertionPoint(named_sequence.body): + # match the payload module + anytype = transform.AnyOpType.get() + func = match(named_sequence.bodyTarget, ops={"func.func"}) + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + + # Match all linalg.generic and linalg.fill operations + # We have 7 operations in softmax: + # fill(max_init), max, sub, exp, fill(sum_init), sum, div + generic_ops = structured.structured_match( + transform.AnyOpType.get(), + payload_mod, + ops=["linalg.generic", "linalg.fill"] + ) + + # Split the handle into individual operation handles + anytype = transform.AnyOpType.get() + split_ops = transform.split_handle( + (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types + generic_ops + ) + + # Reverse split_ops to have operations in reverse order + split_ops = list(reversed(split_ops)) + + # The first operation (after reversal) is the division - this is the consumer + last_op = split_ops[0] + + # Tile the last operation using tile_using_forall + tiled_op, for_op = structured.structured_tile_using_forall( + anytype, anytype, + last_op, + num_threads=[], + tile_sizes=[], + static_tile_sizes=(parameters["wg_rows"],), + ) + + # Fuse the producer operations into the forall loop + # Iterate through remaining operations (already in reverse order) + current_forall = for_op + for producer_op in split_ops[1:]: + fused_op, current_forall = structured.structured_fuse_into_containing_op( + anytype, anytype, + producer_op, + current_forall + ) + + func = transform.get_parent_op( + anytype, + current_forall, + op_name="func.func", + deduplicate=True, + ) + 
transform.apply_cse(func) + canonicalize(func) + + func = structured.VectorizeChildrenAndApplyPatternsOp( + func, + fold_type_extensions_into_contract=True, + ).result + transform.apply_cse(func) + canonicalize(func) + payload_mod = apply_registered_pass(payload_mod, "eliminate-empty-tensors") + identity_layout = LayoutMapOption.IdentityLayoutMap + payload_mod = transform_bufferization.OneShotBufferizeOp( + payload_mod, + allow_return_allocs_from_loops=True, + bufferize_function_boundaries=True, + function_boundary_type_conversion=identity_layout, + ).result + # fold memref.subviews into vector.transfer_read/write ops + payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") + transform.apply_cse(payload_mod) + canonicalize(payload_mod) + + # convert forall to parallel + wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) + # convert scf.parallel to gpu.launch + func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") + func = apply_registered_pass(func, "lower-affine") + transform.apply_cse(func) + canonicalize(func) + + # set the number of threads for the gpu.launch operation + launch_op = match_and_split(func, ops={"gpu.launch"}) + num_threads = parameters["sg_rows"] * parameters["subgroup_size"] + xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) + + # outline gpu func + func = apply_registered_pass(func, "lower-affine") + canonicalize(func) + func = apply_registered_pass(func, "gpu-launch-sink-index-computations") + payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") + transform.apply_cse(payload_mod) + + # set xevm target + payload_mod = apply_registered_pass( + payload_mod, + "xevm-attach-target", + options={"O": "3", "chip": "bmg"}, + ) + + # convert vector to xegpu + gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) + for gpu_mod in gpu_mod_ops: + gpu_func = match(gpu_mod, ops={"gpu.func"}) + gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + + # Set layout attributes for xegpu.store_nd operations + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) + + payload_mod = apply_registered_pass( + payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + # Required: yield to end the transform sequence + transform.yield_() + + return mod From 1e63d7de4e88047d2251643e5649a477fd0a7c1b Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 24 Mar 2026 22:15:39 +0000 Subject: [PATCH 11/38] save working version --- lighthouse/pipeline/helper.py | 16 + lighthouse/schedule/xegpu/mlp_schedule.py | 18 +- lighthouse/schedule/xegpu/softmax_schedule.py | 307 +++++++++++------- 3 files changed, 203 insertions(+), 138 deletions(-) diff --git a/lighthouse/pipeline/helper.py b/lighthouse/pipeline/helper.py index 213b45df..9c4820d5 100644 --- a/lighthouse/pipeline/helper.py +++ b/lighthouse/pipeline/helper.py @@ -35,3 +35,19 @@ def cleanup_func(target): func = structured.MatchOp.match_op_names(target, ["func.func"]).result transform.apply_cse(func) canonicalize(func) + + +class PipelineInterrupt(Exception): + """Exception to signal early termination of the transform schedule.""" + + pass + + +def match_and_split(*args, nhandles=1, 
**kwargs): + """Helper function that splits matched handles.""" + matched = match(*args, **kwargs) + anytype = transform.AnyOpType.get() + matched_ops = transform.split_handle((anytype,) * nhandles, matched) + if nhandles == 1: + matched_ops = [matched_ops] + return matched_ops diff --git a/lighthouse/schedule/xegpu/mlp_schedule.py b/lighthouse/schedule/xegpu/mlp_schedule.py index e9fa7909..94cbdf01 100644 --- a/lighthouse/schedule/xegpu/mlp_schedule.py +++ b/lighthouse/schedule/xegpu/mlp_schedule.py @@ -11,6 +11,8 @@ apply_registered_pass, canonicalize, match, + match_and_split, + PipelineInterrupt, ) from lighthouse.dialects import smt_ext, transform_smt_ext as td_smt_ext @@ -33,22 +35,6 @@ MIN_NB_THREADS = 16 -class PipelineInterrupt(Exception): - """Exception to signal early termination of the transform schedule.""" - - pass - - -def match_and_split(*args, nhandles=1, **kwargs): - """Helper function that splits matched handles.""" - matched = match(*args, **kwargs) - anytype = transform.AnyOpType.get() - matched_ops = transform.split_handle((anytype,) * nhandles, matched) - if nhandles == 1: - matched_ops = [matched_ops] - return matched_ops - - @KnobValue.ast_rewrite(in_exprs=True) def params_with_constraints_imposed( params: dict[str, int | None], knob_name_prefix="" diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index e01f9af6..98acd4e6 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -12,19 +12,11 @@ apply_registered_pass, canonicalize, match, + match_and_split, + PipelineInterrupt, ) -def match_and_split(*args, nhandles=1, **kwargs): - """Helper function that splits matched handles.""" - matched = match(*args, **kwargs) - anytype = transform.AnyOpType.get() - matched_ops = transform.split_handle((anytype,) * nhandles, matched) - if nhandles == 1: - matched_ops = [matched_ops] - return matched_ops - - def get_softmax_schedule_module( stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None, @@ -75,121 +67,192 @@ def get_softmax_schedule_module( deduplicate=True, ) - # Match all linalg.generic and linalg.fill operations - # We have 7 operations in softmax: - # fill(max_init), max, sub, exp, fill(sum_init), sum, div - generic_ops = structured.structured_match( - transform.AnyOpType.get(), + xegpu_softmax_transform_schedule( payload_mod, - ops=["linalg.generic", "linalg.fill"] - ) - - # Split the handle into individual operation handles - anytype = transform.AnyOpType.get() - split_ops = transform.split_handle( - (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types - generic_ops - ) - - # Reverse split_ops to have operations in reverse order - split_ops = list(reversed(split_ops)) - - # The first operation (after reversal) is the division - this is the consumer - last_op = split_ops[0] - - # Tile the last operation using tile_using_forall - tiled_op, for_op = structured.structured_tile_using_forall( - anytype, anytype, - last_op, - num_threads=[], - tile_sizes=[], - static_tile_sizes=(parameters["wg_rows"],), + parameters=parameters, + stop_at_stage=stop_at_stage or "", ) + + return mod - # Fuse the producer operations into the forall loop - # Iterate through remaining operations (already in reverse order) - current_forall = for_op - for producer_op in split_ops[1:]: - fused_op, current_forall = structured.structured_fuse_into_containing_op( - anytype, anytype, - producer_op, - current_forall - ) - - func = 
transform.get_parent_op( - anytype, - current_forall, - op_name="func.func", - deduplicate=True, - ) - transform.apply_cse(func) - canonicalize(func) - - func = structured.VectorizeChildrenAndApplyPatternsOp( - func, - fold_type_extensions_into_contract=True, - ).result - transform.apply_cse(func) - canonicalize(func) - payload_mod = apply_registered_pass(payload_mod, "eliminate-empty-tensors") - identity_layout = LayoutMapOption.IdentityLayoutMap - payload_mod = transform_bufferization.OneShotBufferizeOp( - payload_mod, - allow_return_allocs_from_loops=True, - bufferize_function_boundaries=True, - function_boundary_type_conversion=identity_layout, - ).result - # fold memref.subviews into vector.transfer_read/write ops - payload_mod = apply_registered_pass(payload_mod, "fold-memref-alias-ops") - transform.apply_cse(payload_mod) - canonicalize(payload_mod) - - # convert forall to parallel - wg_loops = match_and_split(payload_mod, ops={"scf.forall"}) - for wg_loop in wg_loops: - wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - func = transform.get_parent_op(anytype, wg_loop) - # convert scf.parallel to gpu.launch - func = apply_registered_pass(func, "gpu-map-parallel-loops") - func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") - func = apply_registered_pass(func, "lower-affine") - transform.apply_cse(func) - canonicalize(func) - - # set the number of threads for the gpu.launch operation - launch_op = match_and_split(func, ops={"gpu.launch"}) - num_threads = parameters["sg_rows"] * parameters["subgroup_size"] - xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - - # outline gpu func - func = apply_registered_pass(func, "lower-affine") - canonicalize(func) - func = apply_registered_pass(func, "gpu-launch-sink-index-computations") - payload_mod = apply_registered_pass(payload_mod, "gpu-kernel-outlining") - transform.apply_cse(payload_mod) - - # set xevm target - payload_mod = apply_registered_pass( - payload_mod, - "xevm-attach-target", - options={"O": "3", "chip": "bmg"}, - ) - # convert vector to xegpu - gpu_mod_ops = match_and_split(payload_mod, ops={"gpu.module"}) - for gpu_mod in gpu_mod_ops: - gpu_func = match(gpu_mod, ops={"gpu.func"}) - gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - transform.apply_cse(gpu_func) - - # Set layout attributes for xegpu.store_nd operations - store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) - xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) - - payload_mod = apply_registered_pass( - payload_mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - # Required: yield to end the transform sequence - transform.yield_() +def xegpu_softmax_transform_schedule( + mod: ir.Value[transform.AnyOpType], + parameters: dict, + stop_at_stage: str = "", +): + """Transform schedule for softmax payload.""" + try: + mod = bundle_xegpu_softmax_schedule( + mod, + parameters=parameters, + stop_at_stage=stop_at_stage, + ) + + mod = bundle_xegpu_to_binary( + mod, + stop_at_stage=stop_at_stage, + ) + except PipelineInterrupt: + pass + finally: + transform.yield_() + + +def bundle_xegpu_softmax_schedule( + mod: ir.Value[transform.AnyOpType], + parameters: dict, + stop_at_stage: str = "", +) -> ir.Value[transform.AnyOpType]: + """Schedule for lowering softmax payload to xegpu wg level.""" + if stop_at_stage == "initial": + raise PipelineInterrupt() + + anytype = transform.AnyOpType.get() + + # Match all linalg.generic and linalg.fill 
operations + # We have 7 operations in softmax: + # fill(max_init), max, sub, exp, fill(sum_init), sum, div + generic_ops = structured.structured_match( + transform.AnyOpType.get(), + mod, + ops=["linalg.generic", "linalg.fill"] + ) + + # Split the handle into individual operation handles + split_ops = transform.split_handle( + (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types + generic_ops + ) + + # Reverse split_ops to have operations in reverse order + split_ops = list(reversed(split_ops)) + + # The first operation (after reversal) is the division - this is the consumer + last_op = split_ops[0] + + # Tile the last operation using tile_using_forall + tiled_op, for_op = structured.structured_tile_using_forall( + anytype, anytype, + last_op, + num_threads=[], + tile_sizes=[], + static_tile_sizes=(parameters["wg_rows"],), + ) + + # Fuse the producer operations into the forall loop + # Iterate through remaining operations (already in reverse order) + current_forall = for_op + for producer_op in split_ops[1:]: + fused_op, current_forall = structured.structured_fuse_into_containing_op( + anytype, anytype, + producer_op, + current_forall + ) + + func = transform.get_parent_op( + anytype, + current_forall, + op_name="func.func", + deduplicate=True, + ) + transform.apply_cse(func) + canonicalize(func) + + if stop_at_stage == "tiled": + raise PipelineInterrupt() + + # vectorize + func = structured.VectorizeChildrenAndApplyPatternsOp( + func, + fold_type_extensions_into_contract=True, + ).result + transform.apply_cse(func) + canonicalize(func) + + if stop_at_stage == "vectorized": + raise PipelineInterrupt() + + # bufferize + mod = apply_registered_pass(mod, "eliminate-empty-tensors") + identity_layout = LayoutMapOption.IdentityLayoutMap + mod = transform_bufferization.OneShotBufferizeOp( + mod, + allow_return_allocs_from_loops=True, + bufferize_function_boundaries=True, + function_boundary_type_conversion=identity_layout, + ).result + # fold memref.subviews into vector.transfer_read/write ops + mod = apply_registered_pass(mod, "fold-memref-alias-ops") + transform.apply_cse(mod) + canonicalize(mod) + + if stop_at_stage == "bufferized": + raise PipelineInterrupt() + + # convert forall to parallel + wg_loops = match_and_split(mod, ops={"scf.forall"}) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) + + # convert scf.parallel to gpu.launch + func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") + func = apply_registered_pass(func, "lower-affine") + transform.apply_cse(func) + canonicalize(func) + + # set the number of threads for the gpu.launch operation + launch_op = match_and_split(func, ops={"gpu.launch"}) + num_threads = parameters["sg_rows"] * parameters["subgroup_size"] + xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) + + # outline gpu func + func = apply_registered_pass(func, "lower-affine") + canonicalize(func) + func = apply_registered_pass(func, "gpu-launch-sink-index-computations") + mod = apply_registered_pass(mod, "gpu-kernel-outlining") + transform.apply_cse(mod) + + # set xevm target + mod = apply_registered_pass( + mod, + "xevm-attach-target", + options={"O": "3", "chip": "bmg"}, + ) + + # convert vector to xegpu + gpu_mod_ops = match_and_split(mod, ops={"gpu.module"}) + for gpu_mod in gpu_mod_ops: + gpu_func = match(gpu_mod, ops={"gpu.func"}) + gpu_func = 
apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + + if stop_at_stage == "xegpu-initial": + raise PipelineInterrupt() + + # Set layout attributes for xegpu.store_nd operations + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) + + if stop_at_stage == "xegpu-wg": + raise PipelineInterrupt() + + return mod + + +def bundle_xegpu_to_binary( + mod: ir.Value, stop_at_stage: str = "" +) -> ir.Value[transform.AnyOpType]: + """Schedule for lowering xegpu wg level to binary.""" + # upstream xegpu/xevm pipeline is payload independent. + mod = apply_registered_pass( + mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + + if stop_at_stage == "final": + raise PipelineInterrupt() + return mod From 64b5d73f52813059ad51293e028c0c1f2cabd459 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 24 Mar 2026 22:47:05 +0000 Subject: [PATCH 12/38] save working version --- examples/xegpu/softmax.py | 1 + lighthouse/schedule/xegpu/softmax_schedule.py | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 474eb1a3..c31af47a 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -260,6 +260,7 @@ def parse_cli(): args = parse_cli() params = { + "sizes": args.sizes, "wg_rows": args.wg_rows, "sg_rows": args.sg_rows, "subgroup_size": args.subgroup_size, diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 98acd4e6..26bc5f3e 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -206,7 +206,8 @@ def bundle_xegpu_softmax_schedule( # set the number of threads for the gpu.launch operation launch_op = match_and_split(func, ops={"gpu.launch"}) - num_threads = parameters["sg_rows"] * parameters["subgroup_size"] + num_subgroups = parameters["wg_rows"] // parameters["sg_rows"] + num_threads = num_subgroups * parameters["subgroup_size"] xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) # outline gpu func @@ -233,9 +234,12 @@ def bundle_xegpu_softmax_schedule( if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() - # Set layout attributes for xegpu.store_nd operations + # Set layout attributes for xegpu.store_nd operations. + # FIXME: currently ecah subgroup is handling the entire row. 
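+    # Illustration (hypothetical numbers): with sg_rows=8 and sizes=(1024, 64),
+    # sg_layout=[8, 1] arranges the subgroups in an 8x1 grid, and
+    # sg_data=[8, 64] assigns each subgroup an 8x64 block, i.e. 8 full rows.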
store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) - xegpu.set_op_layout_attr(store_ops[0], sg_layout=[8, 1], sg_data=[8, 64]) + sg_layout = [parameters["sg_rows"], 1] + sg_data = [parameters["sg_rows"], parameters["sizes"][1]] + xegpu.set_op_layout_attr(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() From 108f2c09e8ee111b6b96f97553bc9bacead0013e Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 25 Mar 2026 20:48:48 +0000 Subject: [PATCH 13/38] save working version --- lighthouse/schedule/xegpu/mlp_schedule.py | 13 +------------ lighthouse/schedule/xegpu/softmax_schedule.py | 17 ++--------------- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/lighthouse/schedule/xegpu/mlp_schedule.py b/lighthouse/schedule/xegpu/mlp_schedule.py index 94cbdf01..ab11a75c 100644 --- a/lighthouse/schedule/xegpu/mlp_schedule.py +++ b/lighthouse/schedule/xegpu/mlp_schedule.py @@ -14,6 +14,7 @@ match_and_split, PipelineInterrupt, ) +from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary from lighthouse.dialects import smt_ext, transform_smt_ext as td_smt_ext from lighthouse.dialects.transform_tune_ext import knob, KnobValue @@ -600,15 +601,3 @@ def annotate_ab_load(tile, layout_load, layout_dpas): canonicalize(gpu_func) transform.apply_cse(gpu_func) - - -def bundle_xegpu_to_binary( - mod: ir.Value, stop_at_stage: str = "" -) -> ir.Value[transform.AnyOpType]: - """Schedule for lowering xegpu wg level to binary.""" - # upstream xegpu/xevm pipeline is payload independent. - mod = apply_registered_pass( - mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - - return mod diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 26bc5f3e..9e5226b9 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -15,6 +15,7 @@ match_and_split, PipelineInterrupt, ) +from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary def get_softmax_schedule_module( @@ -38,6 +39,7 @@ def get_softmax_schedule_module( - wg_rows: Number of rows per workgroup - sg_rows: Number of rows per subgroup - subgroup_size: Size of subgroup + - sizes: Tuple with the sizes of the input tensors (e.g. (M, N)) Returns: MLIR module containing the transform schedule @@ -245,18 +247,3 @@ def bundle_xegpu_softmax_schedule( raise PipelineInterrupt() return mod - - -def bundle_xegpu_to_binary( - mod: ir.Value, stop_at_stage: str = "" -) -> ir.Value[transform.AnyOpType]: - """Schedule for lowering xegpu wg level to binary.""" - # upstream xegpu/xevm pipeline is payload independent. 
- mod = apply_registered_pass( - mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - - if stop_at_stage == "final": - raise PipelineInterrupt() - - return mod From a7e1e6c7535c5dd085c14ebea39ae13ac25eeb69 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 25 Mar 2026 20:58:29 +0000 Subject: [PATCH 14/38] save working version --- lighthouse/schedule/xegpu/helper.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 lighthouse/schedule/xegpu/helper.py diff --git a/lighthouse/schedule/xegpu/helper.py b/lighthouse/schedule/xegpu/helper.py new file mode 100644 index 00000000..1452301e --- /dev/null +++ b/lighthouse/schedule/xegpu/helper.py @@ -0,0 +1,21 @@ +"""Helper functions for XeGPU scheduling.""" + +from mlir import ir +from mlir.dialects import transform + +from lighthouse.pipeline.helper import apply_registered_pass, PipelineInterrupt + + +def bundle_xegpu_to_binary( + mod: ir.Value, stop_at_stage: str = "" +) -> ir.Value[transform.AnyOpType]: + """Schedule for lowering xegpu wg level to binary.""" + # upstream xegpu/xevm pipeline is payload independent. + mod = apply_registered_pass( + mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + + if stop_at_stage == "final": + raise PipelineInterrupt() + + return mod From df53caa54ff2c066393f3fca9479203cc0ef471a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 25 Mar 2026 21:09:29 +0000 Subject: [PATCH 15/38] precommit issues --- examples/xegpu/softmax.py | 12 +-- .../ingress/mlir_gen/gpu_softmax_payload.py | 68 +++++++++------ lighthouse/schedule/xegpu/helper.py | 2 +- lighthouse/schedule/xegpu/softmax_schedule.py | 87 ++++++++++--------- 4 files changed, 97 insertions(+), 72 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index c31af47a..e3cf5840 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -28,7 +28,7 @@ def softmax_complexity(M: int, N: int, nbytes: int): """ Complexity of softmax operation. - + For each row: - O(N) to find max - O(N) to compute exp(x - max) and sum @@ -127,7 +127,7 @@ def check_correctness( output_ref = self._reference_solution output_computed = output_host.astype(np.float32) - + if verbose > 1: print("Reference solution (first 5 rows):") print(output_ref[:5]) @@ -137,10 +137,10 @@ def check_correctness( # Check row sums are close to 1.0 row_sums = np.sum(output_computed, axis=1) sums_ok = np.allclose(row_sums, 1.0, rtol=1e-5, atol=1e-6) - + # Check values match reference values_ok = np.allclose(output_computed, output_ref, rtol=1e-4, atol=1e-6) - + success = sums_ok and values_ok if verbose: @@ -149,7 +149,9 @@ def check_correctness( else: print("FAILED!") if not sums_ok: - print(f" Row sums check failed. Min: {row_sums.min():.6f}, Max: {row_sums.max():.6f}") + print( + f" Row sums check failed. Min: {row_sums.min():.6f}, Max: {row_sums.max():.6f}" + ) if not values_ok: max_diff = np.abs(output_computed - output_ref).max() print(f" Values mismatch. 
Max abs diff: {max_diff:.6e}") diff --git a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py index 26fd9042..4568448e 100644 --- a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py +++ b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py @@ -1,10 +1,13 @@ """Generate MLIR payload for GPU softmax operation.""" from mlir import ir -from mlir.dialects import linalg, bufferization, arith, tensor, func, math +from mlir.dialects import linalg, bufferization, arith, tensor, math from lighthouse.utils.mlir import func_cif -from lighthouse.ingress.mlir_gen.gpu_utils import emit_gpu_util_funcs, emit_buf_to_tensor +from lighthouse.ingress.mlir_gen.gpu_utils import ( + emit_gpu_util_funcs, + emit_buf_to_tensor, +) def generate_gpu_softmax_payload( @@ -15,33 +18,33 @@ def generate_gpu_softmax_payload( ) -> ir.Module: """ Generate MLIR module for softmax payload. - + Computes softmax along the last dimension (rows): output[i, j] = exp(input[i, j] - max_i) / sum_i(exp(input[i, j] - max_i)) - + where max_i and sum_i are computed over row i. - + Args: func_name: Name of the payload function M: Number of rows - N: Number of columns + N: Number of columns dtype: MLIR element type (e.g., F32Type) - + Returns: MLIR module containing the softmax payload function """ mod = ir.Module.create() shape = (M, N) memref_t = ir.MemRefType.get(shape, dtype) - + with ir.InsertionPoint(mod.body): # Function signature: payload(output, input) @func_cif(memref_t, memref_t, name=func_name) def payload(output, input_arg): # Convert memrefs to tensors - output_tensor = emit_buf_to_tensor(output, restrict=True, writable=True) + emit_buf_to_tensor(output, restrict=True, writable=True) input_tensor = emit_buf_to_tensor(input_arg, restrict=True) - + # Define affine maps for indexing # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) @@ -49,67 +52,82 @@ def payload(output, input_arg): d1 = ir.AffineDimExpr.get(1) map_2d = ir.AffineMap.get(2, 0, [d0, d1]) map_1d = ir.AffineMap.get(2, 0, [d0]) - + # Step 1: Find max - linalg.generic reduction - neg_inf = arith.constant(dtype, float('-inf')) + neg_inf = arith.constant(dtype, float("-inf")) max_init = tensor.empty((M,), dtype) max_filled = linalg.fill(neg_inf, outs=[max_init]) - + @linalg.generic( [input_tensor], # inputs [max_filled], # outputs [map_2d, map_1d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.reduction, + ], # iterator_types ) def row_max(in_val, acc): return arith.maximumf(in_val, acc) - + # Step 2: Subtract max (broadcast) - linalg.generic elementwise output_init = tensor.empty((M, N), dtype) - + @linalg.generic( [input_tensor, row_max], # inputs [output_init], # outputs [map_2d, map_1d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.parallel, + ], # iterator_types ) def shifted(in_val, max_val, out): return arith.subf(in_val, max_val) - + # Step 3: Compute exp - linalg.generic elementwise @linalg.generic( [shifted], # inputs [output_init], # outputs [map_2d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.parallel, + ], # iterator_types ) def exp_vals(in_val, out): return math.exp(in_val) - + # Step 4: Sum 
exp values - linalg.generic reduction sum_init = tensor.empty((M,), dtype) zero = arith.constant(dtype, 0.0) sum_filled = linalg.fill(zero, outs=[sum_init]) - + @linalg.generic( [exp_vals], # inputs [sum_filled], # outputs [map_2d, map_1d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.reduction], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.reduction, + ], # iterator_types ) def row_sum(in_val, acc): return arith.addf(in_val, acc) - + # Step 5: Divide by sum (broadcast) - linalg.generic elementwise @linalg.generic( [exp_vals, row_sum], # inputs [output_init], # outputs [map_2d, map_1d, map_2d], # indexing_maps - [linalg.IteratorType.parallel, linalg.IteratorType.parallel], # iterator_types + [ + linalg.IteratorType.parallel, + linalg.IteratorType.parallel, + ], # iterator_types ) def result(exp_val, sum_val, out): return arith.divf(exp_val, sum_val) - + # Materialize result back to output memref bufferization.materialize_in_destination( None, result, output, restrict=True, writable=True diff --git a/lighthouse/schedule/xegpu/helper.py b/lighthouse/schedule/xegpu/helper.py index 1452301e..0c1d93a7 100644 --- a/lighthouse/schedule/xegpu/helper.py +++ b/lighthouse/schedule/xegpu/helper.py @@ -14,7 +14,7 @@ def bundle_xegpu_to_binary( mod = apply_registered_pass( mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} ) - + if stop_at_stage == "final": raise PipelineInterrupt() diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 9e5226b9..d9701b84 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -24,7 +24,7 @@ def get_softmax_schedule_module( ) -> ir.Module: """ Generate transform schedule for softmax operation. - + The schedule performs the following transformations: 1. Tile the consumer operation (division) using forall 2. Fuse producer operations into the forall loop @@ -32,32 +32,32 @@ def get_softmax_schedule_module( 4. Bufferize tensors 5. Convert to GPU dialect 6. Lower to XeGPU operations - + Args: stop_at_stage: Optional stage name to stop early (for debugging) parameters: Dictionary with scheduling parameters: - wg_rows: Number of rows per workgroup - - sg_rows: Number of rows per subgroup + - sg_rows: Number of rows per subgroup - subgroup_size: Size of subgroup - sizes: Tuple with the sizes of the input tensors (e.g. 
(M, N)) - + Returns: MLIR module containing the transform schedule """ assert parameters is not None, "Schedule parameters must be provided" - + mod = ir.Module.create() mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() - + with ir.InsertionPoint(mod.body): # Create a transform sequence with proper signature named_sequence = transform.named_sequence( "__transform_main", [transform.AnyOpType.get()], # input: module [], # no outputs - arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}] + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}], ) - + with ir.InsertionPoint(named_sequence.body): # match the payload module anytype = transform.AnyOpType.get() @@ -68,13 +68,13 @@ def get_softmax_schedule_module( op_name="builtin.module", deduplicate=True, ) - + xegpu_softmax_transform_schedule( payload_mod, parameters=parameters, stop_at_stage=stop_at_stage or "", ) - + return mod @@ -107,36 +107,43 @@ def bundle_xegpu_softmax_schedule( stop_at_stage: str = "", ) -> ir.Value[transform.AnyOpType]: """Schedule for lowering softmax payload to xegpu wg level.""" - + if stop_at_stage == "initial": raise PipelineInterrupt() - + anytype = transform.AnyOpType.get() - + # Match all linalg.generic and linalg.fill operations - # We have 7 operations in softmax: + # We have 7 operations in softmax: # fill(max_init), max, sub, exp, fill(sum_init), sum, div generic_ops = structured.structured_match( - transform.AnyOpType.get(), - mod, - ops=["linalg.generic", "linalg.fill"] + transform.AnyOpType.get(), mod, ops=["linalg.generic", "linalg.fill"] ) - + # Split the handle into individual operation handles split_ops = transform.split_handle( - (anytype, anytype, anytype, anytype, anytype, anytype, anytype), # 7 result types - generic_ops + ( + anytype, + anytype, + anytype, + anytype, + anytype, + anytype, + anytype, + ), # 7 result types + generic_ops, ) - + # Reverse split_ops to have operations in reverse order split_ops = list(reversed(split_ops)) - + # The first operation (after reversal) is the division - this is the consumer last_op = split_ops[0] # Tile the last operation using tile_using_forall tiled_op, for_op = structured.structured_tile_using_forall( - anytype, anytype, + anytype, + anytype, last_op, num_threads=[], tile_sizes=[], @@ -148,11 +155,9 @@ def bundle_xegpu_softmax_schedule( current_forall = for_op for producer_op in split_ops[1:]: fused_op, current_forall = structured.structured_fuse_into_containing_op( - anytype, anytype, - producer_op, - current_forall + anytype, anytype, producer_op, current_forall ) - + func = transform.get_parent_op( anytype, current_forall, @@ -161,10 +166,10 @@ def bundle_xegpu_softmax_schedule( ) transform.apply_cse(func) canonicalize(func) - + if stop_at_stage == "tiled": raise PipelineInterrupt() - + # vectorize func = structured.VectorizeChildrenAndApplyPatternsOp( func, @@ -172,10 +177,10 @@ def bundle_xegpu_softmax_schedule( ).result transform.apply_cse(func) canonicalize(func) - + if stop_at_stage == "vectorized": raise PipelineInterrupt() - + # bufferize mod = apply_registered_pass(mod, "eliminate-empty-tensors") identity_layout = LayoutMapOption.IdentityLayoutMap @@ -189,36 +194,36 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "fold-memref-alias-ops") transform.apply_cse(mod) canonicalize(mod) - + if stop_at_stage == "bufferized": raise PipelineInterrupt() - + # convert forall to parallel wg_loops = match_and_split(mod, ops={"scf.forall"}) for wg_loop in wg_loops: wg_loop = 
loop.loop_forall_to_parallel([anytype], wg_loop) func = transform.get_parent_op(anytype, wg_loop) - + # convert scf.parallel to gpu.launch func = apply_registered_pass(func, "gpu-map-parallel-loops") func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") func = apply_registered_pass(func, "lower-affine") transform.apply_cse(func) canonicalize(func) - + # set the number of threads for the gpu.launch operation launch_op = match_and_split(func, ops={"gpu.launch"}) num_subgroups = parameters["wg_rows"] // parameters["sg_rows"] num_threads = num_subgroups * parameters["subgroup_size"] xegpu.set_gpu_launch_threads(launch_op[0], threads=[num_threads, 1, 1]) - + # outline gpu func func = apply_registered_pass(func, "lower-affine") canonicalize(func) func = apply_registered_pass(func, "gpu-launch-sink-index-computations") mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) - + # set xevm target mod = apply_registered_pass( mod, @@ -232,18 +237,18 @@ def bundle_xegpu_softmax_schedule( gpu_func = match(gpu_mod, ops={"gpu.func"}) gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") transform.apply_cse(gpu_func) - + if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() - + # Set layout attributes for xegpu.store_nd operations. # FIXME: currently ecah subgroup is handling the entire row. store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) sg_layout = [parameters["sg_rows"], 1] sg_data = [parameters["sg_rows"], parameters["sizes"][1]] xegpu.set_op_layout_attr(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) - + if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() - + return mod From 9bcc6538adc23d8cb2f9186bfb24cb581919c68d Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 27 Mar 2026 22:36:53 +0000 Subject: [PATCH 16/38] use linalg.softmax --- .../ingress/mlir_gen/gpu_softmax_payload.py | 90 ++----------------- lighthouse/schedule/xegpu/softmax_schedule.py | 60 ++++--------- 2 files changed, 27 insertions(+), 123 deletions(-) diff --git a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py index 4568448e..05dc148a 100644 --- a/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py +++ b/lighthouse/ingress/mlir_gen/gpu_softmax_payload.py @@ -1,7 +1,7 @@ """Generate MLIR payload for GPU softmax operation.""" from mlir import ir -from mlir.dialects import linalg, bufferization, arith, tensor, math +from mlir.dialects import linalg, bufferization, tensor from lighthouse.utils.mlir import func_cif from lighthouse.ingress.mlir_gen.gpu_utils import ( @@ -45,88 +45,16 @@ def payload(output, input_arg): emit_buf_to_tensor(output, restrict=True, writable=True) input_tensor = emit_buf_to_tensor(input_arg, restrict=True) - # Define affine maps for indexing - # #map = affine_map<(d0, d1) -> (d0, d1)> (identity 2D) - # #map1 = affine_map<(d0, d1) -> (d0)> (broadcast/reduce along d1) - d0 = ir.AffineDimExpr.get(0) - d1 = ir.AffineDimExpr.get(1) - map_2d = ir.AffineMap.get(2, 0, [d0, d1]) - map_1d = ir.AffineMap.get(2, 0, [d0]) + # Create output tensor and fill with zeros + output_init = tensor.empty(shape, dtype) - # Step 1: Find max - linalg.generic reduction - neg_inf = arith.constant(dtype, float("-inf")) - max_init = tensor.empty((M,), dtype) - max_filled = linalg.fill(neg_inf, outs=[max_init]) - - @linalg.generic( - [input_tensor], # inputs - [max_filled], # outputs - [map_2d, map_1d], # indexing_maps - [ - linalg.IteratorType.parallel, - 
linalg.IteratorType.reduction, - ], # iterator_types - ) - def row_max(in_val, acc): - return arith.maximumf(in_val, acc) - - # Step 2: Subtract max (broadcast) - linalg.generic elementwise - output_init = tensor.empty((M, N), dtype) - - @linalg.generic( - [input_tensor, row_max], # inputs - [output_init], # outputs - [map_2d, map_1d, map_2d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.parallel, - ], # iterator_types - ) - def shifted(in_val, max_val, out): - return arith.subf(in_val, max_val) - - # Step 3: Compute exp - linalg.generic elementwise - @linalg.generic( - [shifted], # inputs - [output_init], # outputs - [map_2d, map_2d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.parallel, - ], # iterator_types - ) - def exp_vals(in_val, out): - return math.exp(in_val) - - # Step 4: Sum exp values - linalg.generic reduction - sum_init = tensor.empty((M,), dtype) - zero = arith.constant(dtype, 0.0) - sum_filled = linalg.fill(zero, outs=[sum_init]) - - @linalg.generic( - [exp_vals], # inputs - [sum_filled], # outputs - [map_2d, map_1d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.reduction, - ], # iterator_types - ) - def row_sum(in_val, acc): - return arith.addf(in_val, acc) - - # Step 5: Divide by sum (broadcast) - linalg.generic elementwise - @linalg.generic( - [exp_vals, row_sum], # inputs - [output_init], # outputs - [map_2d, map_1d, map_2d], # indexing_maps - [ - linalg.IteratorType.parallel, - linalg.IteratorType.parallel, - ], # iterator_types + # Apply softmax along dimension 1 (last dimension) + result = linalg.softmax( + result=[ir.RankedTensorType.get(shape, dtype)], + input=input_tensor, + output=output_init, + dimension=1, ) - def result(exp_val, sum_val, out): - return arith.divf(exp_val, sum_val) # Materialize result back to output memref bufferization.materialize_in_destination( diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index d9701b84..35c3ab4c 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -26,12 +26,11 @@ def get_softmax_schedule_module( Generate transform schedule for softmax operation. The schedule performs the following transformations: - 1. Tile the consumer operation (division) using forall - 2. Fuse producer operations into the forall loop - 3. Vectorize operations - 4. Bufferize tensors - 5. Convert to GPU dialect - 6. Lower to XeGPU operations + 1. Tile the linalg.softmax operation using forall + 2. Vectorize operations + 3. Bufferize tensors + 4. Convert to GPU dialect + 5. 
Lower to XeGPU operations Args: stop_at_stage: Optional stage name to stop early (for debugging) @@ -113,57 +112,34 @@ def bundle_xegpu_softmax_schedule( anytype = transform.AnyOpType.get() - # Match all linalg.generic and linalg.fill operations - # We have 7 operations in softmax: - # fill(max_init), max, sub, exp, fill(sum_init), sum, div - generic_ops = structured.structured_match( - transform.AnyOpType.get(), mod, ops=["linalg.generic", "linalg.fill"] + # Match linalg.softmax operation + # We have only 1 operation: linalg.softmax + softmax_op = structured.structured_match( + transform.AnyOpType.get(), mod, ops=["linalg.softmax"] ) - # Split the handle into individual operation handles - split_ops = transform.split_handle( - ( - anytype, - anytype, - anytype, - anytype, - anytype, - anytype, - anytype, - ), # 7 result types - generic_ops, - ) - - # Reverse split_ops to have operations in reverse order - split_ops = list(reversed(split_ops)) - - # The first operation (after reversal) is the division - this is the consumer - last_op = split_ops[0] - - # Tile the last operation using tile_using_forall + # Tile the softmax operation using tile_using_forall tiled_op, for_op = structured.structured_tile_using_forall( anytype, anytype, - last_op, + softmax_op, num_threads=[], tile_sizes=[], static_tile_sizes=(parameters["wg_rows"],), ) - # Fuse the producer operations into the forall loop - # Iterate through remaining operations (already in reverse order) - current_forall = for_op - for producer_op in split_ops[1:]: - fused_op, current_forall = structured.structured_fuse_into_containing_op( - anytype, anytype, producer_op, current_forall - ) - func = transform.get_parent_op( anytype, - current_forall, + for_op, op_name="func.func", deduplicate=True, ) + # Decompose softmax into linalg.generic operations + softmax_ops = structured.structured_match( + transform.AnyOpType.get(), func, ops=["linalg.softmax"] + ) + structured.structured_decompose_interface(anytype, softmax_ops) + transform.apply_cse(func) canonicalize(func) From 3f5cbceacfca0cf15ea47bab5e29e1b62de48b4e Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 30 Mar 2026 19:28:26 +0000 Subject: [PATCH 17/38] save work --- examples/xegpu/softmax.py | 17 +++++++++-- lighthouse/schedule/xegpu/softmax_schedule.py | 29 +++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index e3cf5840..27a58ec2 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -217,6 +217,12 @@ def parse_cli(): default=16, help="Subgroup size.", ) + parser.add_argument( + "--reduction-step-size", + type=int, + default=16, + help="Step size for reduction loop tiling (optional).", + ) parser.add_argument( "--nruns", type=int, @@ -266,6 +272,7 @@ def parse_cli(): "wg_rows": args.wg_rows, "sg_rows": args.sg_rows, "subgroup_size": args.subgroup_size, + "reduction_step_size": args.reduction_step_size, } M, N = args.sizes @@ -304,7 +311,13 @@ def list2str(a): f"wg-rows={args.wg_rows}", f"sg-rows={args.sg_rows}", f"subgroup-size={args.subgroup_size}", - f"time(us): {elapsed:.2f}", - f"GFLOPS: {gflops:.2f}", ] + if args.reduction_step_size is not None: + parts.append(f"reduction-step-size={args.reduction_step_size}") + parts.extend( + [ + f"time(us): {elapsed:.2f}", + f"GFLOPS: {gflops:.2f}", + ] + ) print(" ".join(parts)) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 35c3ab4c..11410612 100644 --- 
a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -39,6 +39,7 @@ def get_softmax_schedule_module( - sg_rows: Number of rows per subgroup - subgroup_size: Size of subgroup - sizes: Tuple with the sizes of the input tensors (e.g. (M, N)) + - reduction_step_size: Optional step size for tiling reduction loops Returns: MLIR module containing the transform schedule @@ -140,6 +141,34 @@ def bundle_xegpu_softmax_schedule( ) structured.structured_decompose_interface(anytype, softmax_ops) + linalg_ops = match_and_split( + func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 + ) + init_max_reduction = linalg_ops[0] + max_reduction = linalg_ops[1] + max_center_and_exp_op = linalg_ops[2] + init_sum_reduction = linalg_ops[3] + sum_reduction = linalg_ops[4] + div_op = linalg_ops[5] + + reduction_step_size = parameters["reduction_step_size"] + + # Tile the max reduction using TileReductionUsingFor + _, _, _, for_op = structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=max_reduction, + tile_sizes=[0, reduction_step_size], + ) + + # Fuse the init_max_reduction into the for loop + # fused_init, new_for_loop = structured.structured_fuse_into_containing_op( + # anytype, anytype, init_max_reduction, for_op + # ) + transform.PrintOp(target=init_max_reduction) + transform.apply_cse(func) canonicalize(func) From 6204d6c2a59cb8d816f87b2fc811d96163a613a3 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 30 Mar 2026 22:21:34 +0000 Subject: [PATCH 18/38] add inner dim tiling --- lighthouse/schedule/xegpu/softmax_schedule.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 11410612..9bda77fc 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -144,30 +144,28 @@ def bundle_xegpu_softmax_schedule( linalg_ops = match_and_split( func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 ) - init_max_reduction = linalg_ops[0] max_reduction = linalg_ops[1] max_center_and_exp_op = linalg_ops[2] - init_sum_reduction = linalg_ops[3] sum_reduction = linalg_ops[4] div_op = linalg_ops[5] reduction_step_size = parameters["reduction_step_size"] - # Tile the max reduction using TileReductionUsingFor - _, _, _, for_op = structured.structured_tile_reduction_using_for( - [anytype], - anytype, - anytype, - anytype, - target=max_reduction, - tile_sizes=[0, reduction_step_size], - ) - - # Fuse the init_max_reduction into the for loop - # fused_init, new_for_loop = structured.structured_fuse_into_containing_op( - # anytype, anytype, init_max_reduction, for_op - # ) - transform.PrintOp(target=init_max_reduction) + # Tile all reduction ops using the same step size + reduction_ops = [max_reduction, sum_reduction] + for reduction_op in reduction_ops: + structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=reduction_op, + tile_sizes=[0, reduction_step_size], + ) + # Tile elementwise ops to match the reduction tile size + elementwise_ops = [max_center_and_exp_op, div_op] + for elementwise_op in elementwise_ops: + structured.TileUsingForOp(elementwise_op, sizes=[0, reduction_step_size]) transform.apply_cse(func) canonicalize(func) From 1feb0d48d94e6cc46fb6cbf7c01dc9825e43f73a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 1 Apr 2026 22:09:26 +0000 Subject: [PATCH 19/38] save fused 
version --- examples/xegpu/softmax.py | 1 + lighthouse/schedule/xegpu/softmax_schedule.py | 76 +++++++++++++++---- 2 files changed, 62 insertions(+), 15 deletions(-) diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py index 324d6e0c..8049f0f4 100644 --- a/examples/xegpu/softmax.py +++ b/examples/xegpu/softmax.py @@ -212,6 +212,7 @@ def parse_cli(): "tiled", "vectorized", "bufferized", + "gpu-outlining", "xegpu-initial", "xegpu-wg", "final", diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 361dfdfe..b42b6479 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -140,6 +140,7 @@ def bundle_xegpu_softmax_schedule( transform.AnyOpType.get(), func, ops=["linalg.softmax"] ) structured.structured_decompose_interface(anytype, softmax_ops) + # transform.print_(target=func, name="After structured_decompose_interface") linalg_ops = match_and_split( func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 @@ -151,22 +152,60 @@ def bundle_xegpu_softmax_schedule( reduction_step_size = parameters["reduction_step_size"] - # Tile all reduction ops using the same step size - reduction_ops = [max_reduction, sum_reduction] - for reduction_op in reduction_ops: - structured.structured_tile_reduction_using_for( - [anytype], - anytype, - anytype, - anytype, - target=reduction_op, - tile_sizes=[0, reduction_step_size], - ) - # Tile elementwise ops to match the reduction tile size - elementwise_ops = [max_center_and_exp_op, div_op] - for elementwise_op in elementwise_ops: - structured.TileUsingForOp(elementwise_op, sizes=[0, reduction_step_size]) + # Tile the division op and fuse the sub+exp producer into it + _, div_loop = structured.TileUsingForOp( + div_op, sizes=[0, reduction_step_size] + ).results + + # Fuse max_center_and_exp_op into the div loop + _, fused_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=max_center_and_exp_op, + containing_op=div_loop, + ) + + # Tile the sum reduction and fuse the sub+exp producer into it + _, _, _, sum_loop = structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=sum_reduction, + tile_sizes=[0, reduction_step_size], + ) + + func = transform.get_parent_op( + anytype, + fused_loop, + op_name="func.func", + deduplicate=True, + ) + # Re-match and split linalg generic ops, there are 5 at this point + linalg_ops = match_and_split(func, ops={"linalg.generic"}, nhandles=5) + max_center_and_exp_op = linalg_ops[1] + + # Fuse max_center_and_exp_op into the sum reduction loop + _, fused_sum_loop = structured.structured_fuse_into_containing_op( + anytype, + anytype, + producer_op=max_center_and_exp_op, + containing_op=sum_loop, + ) + + # Tile the max reduction. + max_reduction = linalg_ops[0] + structured.structured_tile_reduction_using_for( + [anytype], + anytype, + anytype, + anytype, + target=max_reduction, + tile_sizes=[0, reduction_step_size], + ) + + # Cleanup after tiling and fusion transform.apply_cse(func) canonicalize(func) @@ -227,6 +266,9 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) + if stop_at_stage == "gpu-outlining": + raise PipelineInterrupt() + # set xevm target mod = apply_registered_pass( mod, @@ -241,6 +283,10 @@ def bundle_xegpu_softmax_schedule( gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") transform.apply_cse(gpu_func) + # Cleanup. 
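+    # Module-scope CSE/canonicalization, following the per-gpu_func cleanup above.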
+ transform.apply_cse(mod) + canonicalize(mod) + if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() From a28cf4a1035cf12956aa9922ff0fbd3a46e51314 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 1 Apr 2026 22:30:40 +0000 Subject: [PATCH 20/38] save work --- lighthouse/schedule/xegpu/softmax_schedule.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index b42b6479..18186b89 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -237,6 +237,17 @@ def bundle_xegpu_softmax_schedule( transform.apply_cse(mod) canonicalize(mod) + # promote memref.alloc to memref.alloca in payload function + func = match(mod, ops={"func.func"}) + func = apply_registered_pass( + func, + "promote-buffers-to-stack", + options={ + "max-alloc-size-in-bytes": "8192", + "max-rank-of-allocated-memref": "2", + }, + ) + if stop_at_stage == "bufferized": raise PipelineInterrupt() From 79e2f737caae45431f207744187a7119c5504b12 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 19:16:35 +0000 Subject: [PATCH 21/38] save work --- lighthouse/schedule/xegpu/softmax_schedule.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py index 18186b89..bda5f819 100644 --- a/lighthouse/schedule/xegpu/softmax_schedule.py +++ b/lighthouse/schedule/xegpu/softmax_schedule.py @@ -303,10 +303,10 @@ def bundle_xegpu_softmax_schedule( # Set layout attributes for xegpu.store_nd operations. # FIXME: currently ecah subgroup is handling the entire row. - store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=5) sg_layout = [parameters["sg_rows"], 1] - sg_data = [parameters["sg_rows"], parameters["sizes"][1]] - xegpu.set_anchor_layout(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) + sg_data = [parameters["sg_rows"], parameters["reduction_step_size"]] + xegpu.set_anchor_layout(store_ops[-1], sg_layout=sg_layout, sg_data=sg_data) if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() From 55c175c0f69c511f6d753b0f7c55b5480d5303b3 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 22:24:15 +0000 Subject: [PATCH 22/38] save work --- docs/softmax_lowering.md | 402 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 docs/softmax_lowering.md diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md new file mode 100644 index 00000000..f67779ae --- /dev/null +++ b/docs/softmax_lowering.md @@ -0,0 +1,402 @@ +# Linalg softmax lowering in XeGPU pipeline + +## Overview + +The lowering process consists of seven stages: +1. **initial** - High-level tensor operations +2. **tiled-softmax** - Tiled softmax operations +3. **decomposed** - Decomposition into constituent operations +4. **vectorized** - Vector operations +5. **bufferized** - Memory-based representation +6. **xegpu-initial** - GPU kernel with XeGPU operations +7. **xegpu-wg** - Work-group optimized XeGPU + +--- + +## Stage 1: Initial + +**Code:** +```mlir +func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { + // ... + %2 = tensor.empty() : tensor<1024x64xf32> + %3 = linalg.softmax dimension(1) ins(%1 : tensor<1024x64xf32>) + outs(%2 : tensor<1024x64xf32>) -> tensor<1024x64xf32> + // ... 
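+  // (elided: the bufferization.to_tensor / materialize_in_destination glue
+  //  emitted by the payload generator)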
+  return
+}
+```
+---
+
+## Stage 2: Tiled Softmax
+
+**Key Characteristics:**
+- Work distribution via `scf.forall` (16 parallel iterations)
+- Each tile processes 64x64 elements
+
+**Code:**
+```mlir
+func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
+  // ...
+  %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) {
+    %4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
+    // Extract 64x64 input slice
+    %extracted_slice = tensor.extract_slice ...
+    // Extract 64x64 output slice
+    %extracted_slice_0 = tensor.extract_slice ...
+    // Apply softmax to the tile
+    %5 = linalg.softmax dimension(1) ins(%extracted_slice : tensor<64x64xf32>)
+         outs(%extracted_slice_0 : tensor<64x64xf32>) -> tensor<64x64xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %5 into %arg3[%4, %c0] [64, 64] [1, 1] :
+        tensor<64x64xf32> into tensor<1024x64xf32>
+    }
+  }
+  // ...
+  return
+}
+```
+
+---
+
+## Stage 3: Decomposed
+
+**Key Characteristics:**
+- Softmax decomposed into 4 constituent `linalg.generic` ops: max, sub+exp, sum, divide
+- Decomposition is driven by `structured.structured_decompose_interface`, which invokes the decomposition interface implemented by `linalg.softmax`
+
+**Code:**
+```mlir
+func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_0 = arith.constant 0xFFC00000 : f32 // NaN: neutral element for maxnumf
+  %0 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32>
+  %1 = tensor.empty() : tensor<1024x64xf32>
+
+  %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x64xf32>) {
+    %3 = affine.apply #map(%arg2) // %3 = %arg2 * 64
+    %extracted_slice = tensor.extract_slice %0[%3, 0] [64, 64] [1, 1] :
+      tensor<1024x64xf32> to tensor<64x64xf32>
+    // Output slice, used as the init of the elementwise steps below
+    %extracted_slice_1 = tensor.extract_slice %arg3[%3, 0] [64, 64] [1, 1] :
+      tensor<1024x64xf32> to tensor<64x64xf32>
+
+    // Step 1: Find max along dimension 1
+    %4 = tensor.empty() : tensor<64xf32>
+    %5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32>
+    %6 = linalg.generic {indexing_maps = [#map1, #map2],
+                         iterator_types = ["parallel", "reduction"]}
+        ins(%extracted_slice : tensor<64x64xf32>) outs(%5 : tensor<64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %11 = arith.maxnumf %in, %out : f32
+      linalg.yield %11 : f32
+    } -> tensor<64xf32>
+
+    // Step 2: Subtract max and exponentiate
+    %7 = linalg.generic {indexing_maps = [#map1, #map2, #map1],
+                         iterator_types = ["parallel", "parallel"]}
+        ins(%extracted_slice, %6 : tensor<64x64xf32>, tensor<64xf32>)
+        outs(%extracted_slice_1 : tensor<64x64xf32>) {
+    ^bb0(%in: f32, %in_2: f32, %out: f32):
+      %11 = arith.subf %in, %in_2 : f32
+      %12 = math.exp %11 : f32
+      linalg.yield %12 : f32
+    } -> tensor<64x64xf32>
+
+    // Step 3: Sum exponentials
+    %8 = linalg.fill ins(%cst : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32>
+    %9 = linalg.generic {indexing_maps = [#map1, #map2],
+                         iterator_types = ["parallel", "reduction"]}
+        ins(%7 : tensor<64x64xf32>) outs(%8 : tensor<64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %11 = arith.addf %in, %out : f32
+      linalg.yield %11 : f32
+    } -> tensor<64xf32>
+
+    // Step 4: Normalize by sum
+    %10 = linalg.generic {indexing_maps = [#map1, #map2, #map1],
+                          iterator_types = ["parallel", "parallel"]}
+        ins(%7, %9 : tensor<64x64xf32>, tensor<64xf32>)
+        outs(%extracted_slice_1 : tensor<64x64xf32>) {
+    ^bb0(%in: f32, %in_2: f32, %out: f32):
+      %11 = arith.divf %in, %in_2 : f32
+      linalg.yield %11 : f32
+    } -> tensor<64x64xf32>
+
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %10 into %arg3[%3, 0] [64, 64] [1, 1] :
+        tensor<64x64xf32> into tensor<1024x64xf32>
+    }
+  }
+  return
+}
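+// Summary of the SSA values above: %6 holds the row maxima, %7 = exp(x - max),
+// %9 holds the row sums, and %10 = %7 / %9 is the numerically stable softmax
+// of one 64x64 tile.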
+```

**What Happens:**
- The 1024x64 input is divided into 16 tiles of 64x64 each
- Softmax algorithm made explicit:
  1. **Max reduction**: Find maximum value per row (for numerical stability)
  2. **Exp**: Compute exp(x - max) for each element
  3. **Sum reduction**: Sum exponentials per row
  4. **Normalize**: Divide each element by its row sum
- Each tile is processed independently, enabling parallelization
- Results are inserted back into the output tensor

---

## Stage 4: Vectorized

**Key Characteristics:**
- `linalg.generic` operations replaced with vector operations
- SIMD-friendly representation using `vector<64x64xf32>`
- Explicit vector multi-reductions
- Vector transfers for reading/writing data

**Code:**
```mlir
func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
  %cst = arith.constant dense<0.000000e+00> : vector<64xf32>
  %0 = ub.poison : f32
  %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32>
  %c0 = arith.constant 0 : index
  %1 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32>
  %2 = tensor.empty() : tensor<1024x64xf32>

  %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) {
    %4 = affine.apply #map(%arg2) // %4 = %arg2 * 64
    %extracted_slice = tensor.extract_slice %arg3[%4, 0] [64, 64] [1, 1]

    // Vector read: Load 64x64 tile
    %5 = vector.transfer_read %1[%4, %c0], %0 {in_bounds = [true, true]} :
      tensor<1024x64xf32>, vector<64x64xf32>

    // Max reduction: Reduce dimension 1 -> vector<64xf32>
    %6 = vector.multi_reduction <maxnumf>, %5, %cst_0 [1] :
      vector<64x64xf32> to vector<64xf32>

    // Broadcast max values back to 64x64 and transpose
    %7 = vector.broadcast %6 : vector<64xf32> to vector<64x64xf32>
    %8 = vector.transpose %7, [1, 0] : vector<64x64xf32> to vector<64x64xf32>

    // Subtract max and exponentiate
    %9 = arith.subf %5, %8 : vector<64x64xf32>
    %10 = math.exp %9 : vector<64x64xf32>

    // Sum reduction: Reduce dimension 1 -> vector<64xf32>
    %11 = vector.multi_reduction <add>, %10, %cst [1] :
      vector<64x64xf32> to vector<64xf32>

    // Broadcast sums back to 64x64 and transpose
    %12 = vector.broadcast %11 : vector<64xf32> to vector<64x64xf32>
    %13 = vector.transpose %12, [1, 0] : vector<64x64xf32> to vector<64x64xf32>

    // Normalize
    %14 = arith.divf %10, %13 : vector<64x64xf32>

    // Vector write
    %15 = vector.transfer_write %14, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} :
      vector<64x64xf32>, tensor<64x64xf32>

    scf.forall.in_parallel {
      tensor.parallel_insert_slice %15 into %arg3[%4, 0] [64, 64] [1, 1]
    }
  }
  return
}
```

**What Happens:**
- Linalg operations converted to vector dialect operations
- `vector.transfer_read` loads entire 64x64 tile at once
- `vector.multi_reduction` performs SIMD reductions (max and sum)
- `vector.broadcast` and `vector.transpose` handle dimension alignment
- All arithmetic operations work on vectors, enabling SIMD execution
- `vector.transfer_write` stores results back

---

## Stage 5: Bufferized

**Key Characteristics:**
- Tensors eliminated, working directly with memrefs
- Vector operations read/write directly from/to memory
- No more tensor extract/insert operations
- Simplified control flow

**Code:**
```mlir
func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
  %cst = arith.constant dense<0.000000e+00> : vector<64xf32>
  %0 = ub.poison : f32
  %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32>
  %c0 = arith.constant 0 : index

  scf.forall (%arg2) in (16) {
    %1 = affine.apply #map(%arg2) // %1 = %arg2 * 64

    // Direct memref read
    %2 = vector.transfer_read %arg1[%1, %c0], %0 {in_bounds = [true, true]} :
      memref<1024x64xf32>, vector<64x64xf32>

    // Max reduction
    %3 = vector.multi_reduction <maxnumf>, %2, %cst_0 [1] :
      vector<64x64xf32> to vector<64xf32>
    %4 = vector.broadcast %3 : vector<64xf32> to vector<64x64xf32>
    %5 = vector.transpose %4, [1, 0] : vector<64x64xf32> to vector<64x64xf32>

    // Subtract and exp
    %6 = arith.subf %2, %5 : vector<64x64xf32>
    %7 = math.exp %6 : vector<64x64xf32>

    // Sum reduction
    %8 = vector.multi_reduction <add>, %7, %cst [1] :
      vector<64x64xf32> to vector<64xf32>
    %9 = vector.broadcast %8 : vector<64xf32> to vector<64x64xf32>
    %10 = vector.transpose %9, [1, 0] : vector<64x64xf32> to vector<64x64xf32>

    // Normalize
    %11 = arith.divf %7, %10 : vector<64x64xf32>

    // Direct memref write
    vector.transfer_write %11, %arg0[%1, %c0] {in_bounds = [true, true]} :
      vector<64x64xf32>, memref<1024x64xf32>
  }
  return
}
```

**What Happens:**
- All tensor operations converted to memref-based operations
- `scf.forall` no longer uses `shared_outs`, simplified to pure side effects
- Vector transfers work directly on input/output memrefs
- Memory layout is now explicit
- This representation is ready for GPU kernel extraction

---

## Stage 6: XeGPU-Initial

**Key Characteristics:**
- GPU kernel separated from host code
- `gpu.launch_func` invocation with grid/block dimensions
- XeGPU tensor descriptors for memory access
- Block-based load/store operations

**Code:**

**Host Side:**
```mlir
func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
  %c1 = arith.constant 1 : index
  %c16 = arith.constant 16 : index
  %c128 = arith.constant 128 : index
  gpu.launch_func @payload_kernel::@payload_kernel
    blocks in (%c16, %c1, %c1)
    threads in (%c128, %c1, %c1)
    args(%arg1 : memref<1024x64xf32>, %arg0 : memref<1024x64xf32>)
  return
}
```

**GPU Kernel:**
```mlir
gpu.module @payload_kernel [#xevm.target] {
  gpu.func @payload_kernel(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) kernel
    attributes {known_block_size = array<i32: 128, 1, 1>,
                known_grid_size = array<i32: 16, 1, 1>} {
    %cst = arith.constant dense<0.000000e+00> : vector<64xf32>
    %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32>
    %c64 = arith.constant 64 : index
    %block_id_x = gpu.block_id x
    %0 = arith.muli %block_id_x, %c64 overflow<nsw> : index

    // Create XeGPU tensor descriptor for load
    %1 = xegpu.create_nd_tdesc %arg0 : memref<1024x64xf32> ->
      !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>>

    // XeGPU block load
    %2 = xegpu.load_nd %1[%0, 0] :
      !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>> ->
      vector<64x64xf32>

    // Same compute operations as before
    %3 = vector.multi_reduction <maxnumf>, %2, %cst_0 [1] :
      vector<64x64xf32> to vector<64xf32>
    %4 = vector.broadcast %3 : vector<64xf32> to vector<64x64xf32>
    %5 = vector.transpose %4, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
    %6 = arith.subf %2, %5 : vector<64x64xf32>
    %7 = math.exp %6 : vector<64x64xf32>
    %8 = vector.multi_reduction <add>, %7, %cst [1] :
      vector<64x64xf32> to vector<64xf32>
    %9 = vector.broadcast %8 : vector<64xf32> to vector<64x64xf32>
    %10 = vector.transpose %9, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
    %11 = arith.divf %7, %10 : vector<64x64xf32>

    // Create XeGPU tensor descriptor for store
    %12 = xegpu.create_nd_tdesc %arg1 : memref<1024x64xf32> ->
      !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>>

    // XeGPU block store
    xegpu.store_nd %11, %12[%0, 0] :
      vector<64x64xf32>,
      !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>>

    gpu.return
  }
}
```

**What Happens:**
- Host function now calls `gpu.launch_func` with 16 blocks, 128 threads per block
- Separate `gpu.module` contains the kernel code
- `gpu.block_id x` replaces the loop iterator
- `xegpu.create_nd_tdesc` creates tensor descriptors for memory regions
- `xegpu.load_nd` performs hardware-optimized block loads
- `xegpu.store_nd` performs hardware-optimized block stores
- XeGPU operations map directly to Intel GPU instructions
- Boundary checking disabled for performance (sizes known at compile time)

---

## Stage 7: XeGPU-WG (Work-Group Optimized)

**Key Characteristics:**
- Additional layout hints for work-group optimization
- Sub-group layout specification: `sg_layout` and `sg_data`
- Optimized memory access patterns for Intel XeGPU

**Code (differences from xegpu-initial):**
```mlir
// Store operation now includes layout hints
xegpu.store_nd %11, %12[%0, 0]
  <{layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [8, 64]>}> :
  vector<64x64xf32>,
  !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
```

**What Happens:**
- The `layout` attribute provides explicit sub-group (SG) tiling information:
  - `sg_layout = [8, 1]`: Data is distributed across 8 sub-groups in the first dimension
  - `sg_data = [8, 64]`: Each sub-group handles an 8x64 slice of data
- This layout specification:
  - Matches the 64x64 total data size (8 sub-groups × 8 rows = 64 rows, 64 columns)
  - Optimizes coalesced memory accesses
  - Enables efficient SIMD execution within each sub-group
  - Aligns with Intel GPU hardware execution model (128 threads = 8 sub-groups of 16 threads)
- The layout is applied to the store operation to optimize write patterns
- Load operation remains unchanged as reads are typically more flexible

---

## Summary

The lowering pipeline progressively transforms abstract operations into hardware-specific instructions:

| Stage | Abstraction Level | Key Operations |
|-------|------------------|----------------|
| **initial** | High-level ML | `linalg.softmax` on full tensor |
| **tiled-softmax** | Tiled high-level | `linalg.softmax` on tiles, `scf.forall` |
| **decomposed** | Tiled computation | `linalg.generic` with reductions |
| **vectorized** | Vector operations | `vector.multi_reduction`, `vector.transfer_read/write` |
| **bufferized** | Memory-based | Direct memref operations with vectors |
| **xegpu-initial** | GPU-specific | `xegpu.load_nd`, `xegpu.store_nd`, `gpu.launch_func` |
| **xegpu-wg** | Hardware-optimized | Layout hints for sub-group optimization |

Each stage maintains the same computational semantics while providing increasingly detailed control over execution and memory access patterns, ultimately targeting efficient execution on Intel XeGPU hardware.
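
For reference, the row-wise computation that all seven stages preserve can be summarized in a short numpy sketch (the helper name and the explicit tile loop are illustrative only; in the pipeline the tile loop is the parallel `scf.forall` and, from stage 6 on, the GPU grid):

```python
import numpy as np

def tiled_softmax(x: np.ndarray, tile_rows: int = 64) -> np.ndarray:
    """Row-wise softmax over a (1024, 64) input, one 64x64 tile at a time."""
    out = np.empty_like(x)
    for r in range(0, x.shape[0], tile_rows):        # 16 tiles, scf.forall in the IR
        tile = x[r:r + tile_rows]                    # 64x64 slice of the input
        row_max = tile.max(axis=1, keepdims=True)    # Step 1: max reduction
        p = np.exp(tile - row_max)                   # Step 2: subtract + exp
        row_sum = p.sum(axis=1, keepdims=True)       # Step 3: sum reduction
        out[r:r + tile_rows] = p / row_sum           # Step 4: normalize
    return out

x = np.random.rand(1024, 64).astype(np.float32)
assert np.allclose(tiled_softmax(x).sum(axis=1), 1.0, atol=1e-5)
```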
From bf3a8c66a058cf4f7e8443d7247f2494437672a1 Mon Sep 17 00:00:00 2001
From: Charitha Saumya 
Date: Fri, 3 Apr 2026 22:51:42 +0000
Subject: [PATCH 23/38] save work

---
 docs/softmax_lowering.md | 158 ++++++++------------------------------
 1 file changed, 31 insertions(+), 127 deletions(-)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index f67779ae..c5608e37 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -1,7 +1,10 @@
-# Linalg softmax lowering in XeGPU pipeline
+# Linalg softmax lowering to XeGPU (currently supported in lighthouse)
 
 ## Overview
 
+**Assumptions:**
+The softmax dimension size is small (64 in this example).
+
 The lowering process consists of seven stages:
 1. **initial** - High-level tensor operations
 2. **tiled-softmax** - Tiled softmax operations
@@ -30,7 +33,7 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
 
 ## Stage 2: Tiled Softmax
 
-**Key Characteristics:**
+**Notes:**
 - Work distribution via `scf.forall` (16 parallel iterations)
 - Each tile processes 64x64 elements
 
@@ -61,63 +64,45 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
 
 ## Stage 3: Decomposed
 
-**Key Characteristics:**
+**Notes:**
 - Softmax decomposed into 4 constituent `linalg.generic` ops: max, sub+exp, sum, divide
 - Uses `structured.structured_decompose_interface` implemented by `linalg.softmax`
 
 **Code:**
 ```mlir
 func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
-  %cst = arith.constant 0.000000e+00 : f32
-  %cst_0 = arith.constant 0xFFC00000 : f32 // NaN: the identity for arith.maxnumf
-  %0 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32>
-  %1 = tensor.empty() : tensor<1024x64xf32>
+  // ...
 
   %2 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %1) -> (tensor<1024x64xf32>) {
     %3 = affine.apply #map(%arg2) // %3 = %arg2 * 64
-    %extracted_slice = tensor.extract_slice %0[%3, 0] [64, 64] [1, 1] :
-      tensor<1024x64xf32> to tensor<64x64xf32>
-    %extracted_slice_1 = tensor.extract_slice %arg3[%3, 0] [64, 64] [1, 1] :
-      tensor<1024x64xf32> to tensor<64x64xf32>
+    %extracted_slice = tensor.extract_slice ...
 
     // Step 1: Find max along dimension 1
     %4 = tensor.empty() : tensor<64xf32>
     %5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32>
-    %6 = linalg.generic {indexing_maps = [#map1, #map2],
-                         iterator_types = ["parallel", "reduction"]}
-      ins(%extracted_slice : tensor<64x64xf32>) outs(%5 : tensor<64xf32>) {
-      ^bb0(%in: f32, %out: f32):
+    %6 = linalg.generic // ...
       %11 = arith.maxnumf %in, %out : f32
-      linalg.yield %11 : f32
+      // ...
     } -> tensor<64xf32>
 
     // Step 2: Subtract max and exponentiate
-    %7 = linalg.generic {indexing_maps = [#map1, #map2, #map1],
-                         iterator_types = ["parallel", "parallel"]}
-      ins(%extracted_slice, %6 : tensor<64x64xf32>, tensor<64xf32>)
-      outs(%extracted_slice_1 : tensor<64x64xf32>) {
-      ^bb0(%in: f32, %in_2: f32, %out: f32):
+    %7 = linalg.generic // ...
      %11 = arith.subf %in, %in_2 : f32
      %12 = math.exp %11 : f32
-      linalg.yield %12 : f32
+      // ...
     } -> tensor<64x64xf32>
 
     // Step 3: Sum exponentials
     %8 = linalg.fill ins(%cst : f32) outs(%4 : tensor<64xf32>) -> tensor<64xf32>
-    %9 = linalg.generic {indexing_maps = [#map1, #map2],
-                         iterator_types = ["parallel", "reduction"]}
-      ins(%7 : tensor<64x64xf32>) outs(%8 : tensor<64xf32>) {
-      ^bb0(%in: f32, %out: f32):
+    %9 = linalg.generic // ...
      %11 = arith.addf %in, %out : f32
-      linalg.yield %11 : f32
+      // ...
 } -> tensor<64xf32>
 
     // Step 4: Normalize by sum
-    %10 = linalg.generic {indexing_maps = [#map1, #map2, #map1],
-                          iterator_types = ["parallel", "parallel"]}
-      ins(%7, %9 : tensor<64x64xf32>, tensor<64xf32>)
-      outs(%extracted_slice_1 : tensor<64x64xf32>) {
-      ^bb0(%in: f32, %in_2: f32, %out: f32):
+    %10 = linalg.generic // ...
      %11 = arith.divf %in, %in_2 : f32
-      linalg.yield %11 : f32
+      // ...
     } -> tensor<64x64xf32>
 
     scf.forall.in_parallel {
@@ -129,39 +114,21 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
 }
 ```
 
-**What Happens:**
-- The 1024x64 input is divided into 16 tiles of 64x64 each
-- Softmax algorithm made explicit:
-  1. **Max reduction**: Find maximum value per row (for numerical stability)
-  2. **Exp**: Compute exp(x - max) for each element
-  3. **Sum reduction**: Sum exponentials per row
-  4. **Normalize**: Divide each element by its row sum
-- Each tile is processed independently, enabling parallelization
-- Results are inserted back into the output tensor
-
 ---
 
 ## Stage 4: Vectorized
 
-**Key Characteristics:**
+**Notes:**
 - `linalg.generic` operations replaced with vector operations
-- SIMD-friendly representation using `vector<64x64xf32>`
-- Explicit vector multi-reductions
 - Vector transfers for reading/writing data
 
 **Code:**
 ```mlir
 func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
-  %cst = arith.constant dense<0.000000e+00> : vector<64xf32>
-  %0 = ub.poison : f32
-  %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32>
-  %c0 = arith.constant 0 : index
-  %1 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32>
-  %2 = tensor.empty() : tensor<1024x64xf32>
-
+  // ...
   %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) {
     %4 = affine.apply #map(%arg2) // %4 = %arg2 * 64
-    %extracted_slice = tensor.extract_slice %arg3[%4, 0] [64, 64] [1, 1]
+    %extracted_slice = tensor.extract_slice ...
 
     // Vector read: Load 64x64 tile
     %5 = vector.transfer_read %1[%4, %c0], %0 {in_bounds = [true, true]} :
@@ -201,32 +168,17 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
   return
 }
 ```
-
-**What Happens:**
-- Linalg operations converted to vector dialect operations
-- `vector.transfer_read` loads entire 64x64 tile at once
-- `vector.multi_reduction` performs SIMD reductions (max and sum)
-- `vector.broadcast` and `vector.transpose` handle dimension alignment
-- All arithmetic operations work on vectors, enabling SIMD execution
-- `vector.transfer_write` stores results back
-
 ---
 
 ## Stage 5: Bufferized
 
-**Key Characteristics:**
+**Notes:**
 - Tensors eliminated, working directly with memrefs
-- Vector operations read/write directly from/to memory
-- No more tensor extract/insert operations
-- Simplified control flow
 
 **Code:**
 ```mlir
 func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
-  %cst = arith.constant dense<0.000000e+00> : vector<64xf32>
-  %0 = ub.poison : f32
-  %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32>
-  %c0 = arith.constant 0 : index
+  // ...
scf.forall (%arg2) in (16) { %1 = affine.apply #map(%arg2) // %1 = %arg2 * 64 @@ -262,31 +214,21 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { } ``` -**What Happens:** -- All tensor operations converted to memref-based operations -- `scf.forall` no longer uses `shared_outs`, simplified to pure side effects -- Vector transfers work directly on input/output memrefs -- Memory layout is now explicit -- This representation is ready for GPU kernel extraction - --- ## Stage 6: XeGPU-Initial -**Key Characteristics:** -- GPU kernel separated from host code +**Notes** +- GPU kernel separated from host code (Gpu Outlining) - `gpu.launch_func` invocation with grid/block dimensions -- XeGPU tensor descriptors for memory access -- Block-based load/store operations +- Use `vector-to-xegpu` **Code:** **Host Side:** ```mlir func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) { - %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index - %c128 = arith.constant 128 : index + // ... gpu.launch_func @payload_kernel::@payload_kernel blocks in (%c16, %c1, %c1) threads in (%c128, %c1, %c1) @@ -301,9 +243,7 @@ gpu.module @payload_kernel [#xevm.target] { gpu.func @payload_kernel(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) kernel attributes {known_block_size = array, known_grid_size = array} { - %cst = arith.constant dense<0.000000e+00> : vector<64xf32> - %cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32> - %c64 = arith.constant 64 : index + // ... %block_id_x = gpu.block_id x %0 = arith.muli %block_id_x, %c64 overflow : index @@ -343,24 +283,14 @@ gpu.module @payload_kernel [#xevm.target] { } ``` -**What Happens:** -- Host function now calls `gpu.launch_func` with 16 blocks, 128 threads per block -- Separate `gpu.module` contains the kernel code -- `gpu.block_id x` replaces the loop iterator -- `xegpu.create_nd_tdesc` creates tensor descriptors for memory regions -- `xegpu.load_nd` performs hardware-optimized block loads -- `xegpu.store_nd` performs hardware-optimized block stores -- XeGPU operations map directly to Intel GPU instructions -- Boundary checking disabled for performance (sizes known at compile time) - --- ## Stage 7: XeGPU-WG (Work-Group Optimized) -**Key Characteristics:** -- Additional layout hints for work-group optimization -- Sub-group layout specification: `sg_layout` and `sg_data` -- Optimized memory access patterns for Intel XeGPU +**Notes** +- Sets the layout for anchor xegpu ops. Each Wg consistes of [8, 1] subgroups + doing 8x64 softmax slice. +- Only sets the layotu for `store_nd`. Layout propagation does the rest. 
**Code (differences from xegpu-initial):**
 
 ```mlir
 // Store operation now includes layout hints
 xegpu.store_nd %11, %12[%0, 0]
   <{layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [8, 64]>}> :
   vector<64x64xf32>,
   !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
 ```
 
-**What Happens:**
-- The `layout` attribute provides explicit sub-group (SG) tiling information:
-  - `sg_layout = [8, 1]`: Data is distributed across 8 sub-groups in the first dimension
-  - `sg_data = [8, 64]`: Each sub-group handles an 8x64 slice of data
-- This layout specification:
-  - Matches the 64x64 total data size (8 sub-groups × 8 rows = 64 rows, 64 columns)
-  - Optimizes coalesced memory accesses
-  - Enables efficient SIMD execution within each sub-group
-  - Aligns with Intel GPU hardware execution model (128 threads = 8 sub-groups of 16 threads)
-- The layout is applied to the store operation to optimize write patterns
-- Load operation remains unchanged as reads are typically more flexible
-
 ---
-## Summary
-
-The lowering pipeline progressively transforms abstract operations into hardware-specific instructions:
-
-| Stage | Abstraction Level | Key Operations |
-|-------|------------------|----------------|
-| **initial** | High-level ML | `linalg.softmax` on full tensor |
-| **tiled-softmax** | Tiled high-level | `linalg.softmax` on tiles, `scf.forall` |
-| **decomposed** | Tiled computation | `linalg.generic` with reductions |
-| **vectorized** | Vector operations | `vector.multi_reduction`, `vector.transfer_read/write` |
-| **bufferized** | Memory-based | Direct memref operations with vectors |
-| **xegpu-initial** | GPU-specific | `xegpu.load_nd`, `xegpu.store_nd`, `gpu.launch_func` |
-| **xegpu-wg** | Hardware-optimized | Layout hints for sub-group optimization |
-
-Each stage maintains the same computational semantics while providing increasingly detailed control over execution and memory access patterns, ultimately targeting efficient execution on Intel XeGPU hardware.
+# Supporting larger Softmax dimension sizes.

From b083887154b083d184430c0c87baf887d155dcca Mon Sep 17 00:00:00 2001
From: Charitha Saumya 
Date: Fri, 3 Apr 2026 23:03:15 +0000
Subject: [PATCH 24/38] save work

---
 examples/xegpu/softmax.py                     |  8 --
 lighthouse/schedule/xegpu/softmax_schedule.py | 90 +------------------
 2 files changed, 3 insertions(+), 95 deletions(-)

diff --git a/examples/xegpu/softmax.py b/examples/xegpu/softmax.py
index 8049f0f4..afca86e3 100644
--- a/examples/xegpu/softmax.py
+++ b/examples/xegpu/softmax.py
@@ -181,12 +181,6 @@ def parse_cli():
         default=16,
         help="Subgroup size.",
     )
-    parser.add_argument(
-        "--reduction-step-size",
-        type=int,
-        default=16,
-        help="Step size for reduction loop tiling (optional).",
-    )
     parser.add_argument(
         "--nruns",
         type=int,
@@ -212,7 +206,6 @@ def parse_cli():
             "tiled",
             "vectorized",
             "bufferized",
-            "gpu-outlining",
             "xegpu-initial",
             "xegpu-wg",
             "final",
@@ -244,7 +237,6 @@ def parse_cli():
         "wg_rows": args.wg_rows,
         "sg_rows": args.sg_rows,
         "subgroup_size": args.subgroup_size,
-        "reduction_step_size": args.reduction_step_size,
     }
 
     M, N = args.sizes
diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py
index bda5f819..0907bc5a 100644
--- a/lighthouse/schedule/xegpu/softmax_schedule.py
+++ b/lighthouse/schedule/xegpu/softmax_schedule.py
@@ -39,7 +39,6 @@ def get_softmax_schedule_module(
         - sg_rows: Number of rows per subgroup
         - subgroup_size: Size of subgroup
         - sizes: Tuple with the sizes of the input tensors (e.g. 
(M, N)) - - reduction_step_size: Optional step size for tiling reduction loops Returns: MLIR module containing the transform schedule @@ -140,72 +139,7 @@ def bundle_xegpu_softmax_schedule( transform.AnyOpType.get(), func, ops=["linalg.softmax"] ) structured.structured_decompose_interface(anytype, softmax_ops) - # transform.print_(target=func, name="After structured_decompose_interface") - linalg_ops = match_and_split( - func, ops={"linalg.generic", "linalg.fill"}, nhandles=6 - ) - max_reduction = linalg_ops[1] - max_center_and_exp_op = linalg_ops[2] - sum_reduction = linalg_ops[4] - div_op = linalg_ops[5] - - reduction_step_size = parameters["reduction_step_size"] - - # Tile the division op and fuse the sub+exp producer into it - _, div_loop = structured.TileUsingForOp( - div_op, sizes=[0, reduction_step_size] - ).results - - # Fuse max_center_and_exp_op into the div loop - _, fused_loop = structured.structured_fuse_into_containing_op( - anytype, - anytype, - producer_op=max_center_and_exp_op, - containing_op=div_loop, - ) - - # Tile the sum reduction and fuse the sub+exp producer into it - _, _, _, sum_loop = structured.structured_tile_reduction_using_for( - [anytype], - anytype, - anytype, - anytype, - target=sum_reduction, - tile_sizes=[0, reduction_step_size], - ) - - func = transform.get_parent_op( - anytype, - fused_loop, - op_name="func.func", - deduplicate=True, - ) - - # Re-match and split linalg generic ops, there are 5 at this point - linalg_ops = match_and_split(func, ops={"linalg.generic"}, nhandles=5) - max_center_and_exp_op = linalg_ops[1] - - # Fuse max_center_and_exp_op into the sum reduction loop - _, fused_sum_loop = structured.structured_fuse_into_containing_op( - anytype, - anytype, - producer_op=max_center_and_exp_op, - containing_op=sum_loop, - ) - - # Tile the max reduction. - max_reduction = linalg_ops[0] - structured.structured_tile_reduction_using_for( - [anytype], - anytype, - anytype, - anytype, - target=max_reduction, - tile_sizes=[0, reduction_step_size], - ) - - # Cleanup after tiling and fusion transform.apply_cse(func) canonicalize(func) @@ -237,17 +171,6 @@ def bundle_xegpu_softmax_schedule( transform.apply_cse(mod) canonicalize(mod) - # promote memref.alloc to memref.alloca in payload function - func = match(mod, ops={"func.func"}) - func = apply_registered_pass( - func, - "promote-buffers-to-stack", - options={ - "max-alloc-size-in-bytes": "8192", - "max-rank-of-allocated-memref": "2", - }, - ) - if stop_at_stage == "bufferized": raise PipelineInterrupt() @@ -277,9 +200,6 @@ def bundle_xegpu_softmax_schedule( mod = apply_registered_pass(mod, "gpu-kernel-outlining") transform.apply_cse(mod) - if stop_at_stage == "gpu-outlining": - raise PipelineInterrupt() - # set xevm target mod = apply_registered_pass( mod, @@ -294,19 +214,15 @@ def bundle_xegpu_softmax_schedule( gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") transform.apply_cse(gpu_func) - # Cleanup. - transform.apply_cse(mod) - canonicalize(mod) - if stop_at_stage == "xegpu-initial": raise PipelineInterrupt() # Set layout attributes for xegpu.store_nd operations. # FIXME: currently ecah subgroup is handling the entire row. 
- store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=5) + store_ops = match_and_split(gpu_func, ops={"xegpu.store_nd"}, nhandles=1) sg_layout = [parameters["sg_rows"], 1] - sg_data = [parameters["sg_rows"], parameters["reduction_step_size"]] - xegpu.set_anchor_layout(store_ops[-1], sg_layout=sg_layout, sg_data=sg_data) + sg_data = [parameters["sg_rows"], parameters["sizes"][1]] + xegpu.set_anchor_layout(store_ops[0], sg_layout=sg_layout, sg_data=sg_data) if stop_at_stage == "xegpu-wg": raise PipelineInterrupt() From 76a02bdac6643f889a51be9e40f3d9f857822eab Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Apr 2026 23:37:26 +0000 Subject: [PATCH 25/38] tiled reduction doc --- docs/softmax_lowering.md | 331 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 330 insertions(+), 1 deletion(-) diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md index c5608e37..a31e6f4c 100644 --- a/docs/softmax_lowering.md +++ b/docs/softmax_lowering.md @@ -303,4 +303,333 @@ xegpu.store_nd %11, %12[%0, 0] --- -# Supporting larger Softmax dimension sizes. +# Supporting larger Softmax dimension sizes + +When the softmax dimension is larger than what can fit efficiently in registers, additional tiling and fusion transformations are applied to the reduction dimension. This section shows the intermediate stages between "decomposed" and "vectorized". + +**Approach:** Tile reductions along dimension 1 (step size = 16) and fuse producers into consumers to enable streaming computation. + +--- + +## Decomposed → Tiled: Stage A - Tile div op + +**Notes:** +- Tile the division operation with step size 16 along dimension 1 +- Creates `scf.for` loop iterating over 64 elements in chunks of 16 + +**Key Changes:** +```mlir +// Before: Single division linalg.generic over 64x64 +%11 = linalg.generic {...} ins(%8, %10 : tensor<64x64xf32>, tensor<64xf32>) + outs(%extracted_slice_0 : tensor<64x64xf32>) { ... } -> tensor<64x64xf32> + +// After: Division tiled into 64x16 chunks +%11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) { + %extracted_slice_3 = tensor.extract_slice %8[0, %arg4] [64, 16] [1, 1] + %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) + outs(%extracted_slice_5 : tensor<64x16xf32>) { ... } -> tensor<64x16xf32> + %inserted_slice = tensor.insert_slice %12 into %arg5[0, %arg4] [64, 16] [1, 1] + scf.yield %inserted_slice +} +``` + +--- + +## Stage B - Fuse sub+exp into div loop + +**Notes:** +- Fuse the `sub+exp` producer (max_center_and_exp_op) into the div loop +- Recomputes exp values on-the-fly instead of materializing full 64x64 tensor + +**Key Changes:** +```mlir +%11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) { + %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1] + + // Fused: sub+exp computed per 16-element chunk + %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) + outs(%extracted_slice_5 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_8: f32, %out: f32): + %14 = arith.subf %in, %in_8 : f32 + %15 = math.exp %14 : f32 + linalg.yield %15 : f32 + } -> tensor<64x16xf32> + + // Division operation + %13 = linalg.generic {...} ins(%12, %extracted_slice_6 : tensor<64x16xf32>, tensor<64xf32>) + outs(%extracted_slice_7 : tensor<64x16xf32>) { ... } -> tensor<64x16xf32> + // ... 
+} +``` + +--- + +## Stage C - Tile sum reduction + +**Notes:** +- Tile the sum reduction using `structured_tile_reduction_using_for` +- Creates intermediate accumulator tensor (64x16) +- Final reduction via `linalg.reduce` over dimension 1 + +**Key Changes:** +```mlir +// Tiled sum reduction with intermediate accumulator +%10 = tensor.empty() : tensor<64x16xf32> +%11 = linalg.fill ins(%cst_2 : f32) outs(%10 : tensor<64x16xf32>) -> tensor<64x16xf32> + +%12 = scf.for %arg4 = %c0_3 to %c64 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) { + %extracted_slice_7 = tensor.extract_slice %8[0, %arg4] [64, 16] [1, 1] + %14 = linalg.generic {...} ins(%extracted_slice_7 : tensor<64x16xf32>) + outs(%extracted_slice_8 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %15 = arith.addf %in, %out : f32 + linalg.yield %15 : f32 + } -> tensor<64x16xf32> + // ... +} + +// Final reduction to 64xf32 +%reduced = linalg.reduce ins(%12 : tensor<64x16xf32>) outs(%9 : tensor<64xf32>) dimensions = [1] + (%in: f32, %init: f32) { + %14 = arith.addf %in, %init : f32 + linalg.yield %14 : f32 + } +``` + +--- + +## Stage D - Fuse sub+exp into sum loop + +**Notes:** +- Fuse `sub+exp` into the sum reduction loop +- Stream computation: compute exp and accumulate in same loop + +**Key Changes:** +```mlir +%12 = scf.for %arg4 = %c0_3 to %c64 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) { + %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1] + + // Fused: sub+exp + %14 = linalg.generic {...} ins(%extracted_slice_7, %extracted_slice_8 : tensor<64x16xf32>, tensor<64xf32>) + outs(%extracted_slice_9 : tensor<64x16xf32>) { + ^bb0(%in: f32, %in_11: f32, %out: f32): + %16 = arith.subf %in, %in_11 : f32 + %17 = math.exp %16 : f32 + linalg.yield %17 : f32 + } -> tensor<64x16xf32> + + // Accumulate sum + %15 = linalg.generic {...} ins(%14 : tensor<64x16xf32>) + outs(%extracted_slice_10 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %16 = arith.addf %in, %out : f32 + linalg.yield %16 : f32 + } -> tensor<64x16xf32> + // ... +} +``` + +--- + +## Stage E - Tile max reduction + +**Notes:** +- Tile max reduction similar to sum reduction +- Creates 64x16 intermediate accumulator +- Final reduction via `linalg.reduce` with maxnumf + +**Key Changes:** +```mlir +// Tiled max reduction +%7 = tensor.empty() : tensor<64x16xf32> +%8 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<64x16xf32>) -> tensor<64x16xf32> + +%9 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %8) -> (tensor<64x16xf32>) { + %extracted_slice_12 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1] + %16 = linalg.generic {...} ins(%extracted_slice_12 : tensor<64x16xf32>) + outs(%extracted_slice_13 : tensor<64x16xf32>) { + ^bb0(%in: f32, %out: f32): + %17 = arith.maxnumf %in, %out : f32 + linalg.yield %17 : f32 + } -> tensor<64x16xf32> + // ... +} + +// Final max reduction +%reduced = linalg.reduce ins(%9 : tensor<64x16xf32>) outs(%6 : tensor<64xf32>) dimensions = [1] + (%in: f32, %init: f32) { + %16 = arith.maxnumf %in, %init : f32 + linalg.yield %16 : f32 + } +``` + +**Result:** Now all three major computations (max, sum, div) are tiled and operate on 64x16 chunks, with exp computation fused into both sum and div loops. 
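+
+To sanity-check the algebra of Stages A-E, here is a hedged numpy sketch of the same streaming schedule (the helper and chunk loop are illustrative; the pipeline expresses them as the `scf.for` loops above, with `step` playing the role of the reduction step size):
+
+```python
+import numpy as np
+
+def streaming_row_softmax(x: np.ndarray, step: int = 16) -> np.ndarray:
+    rows = x.shape[0]
+    # Stage E: tiled max reduction into a (rows, step) accumulator,
+    # then a final reduce (the linalg.reduce over dimension 1).
+    acc = np.full((rows, step), -np.inf, dtype=x.dtype)
+    for j in range(0, x.shape[1], step):
+        acc = np.maximum(acc, x[:, j:j + step])
+    m = acc.max(axis=1)
+    # Stages C/D: tiled sum of exp(x - max), with exp fused into the loop.
+    acc = np.zeros((rows, step), dtype=x.dtype)
+    for j in range(0, x.shape[1], step):
+        acc += np.exp(x[:, j:j + step] - m[:, None])
+    s = acc.sum(axis=1)
+    # Stages A/B: tiled normalize, recomputing exp per chunk (the fusion).
+    out = np.empty_like(x)
+    for j in range(0, x.shape[1], step):
+        out[:, j:j + step] = np.exp(x[:, j:j + step] - m[:, None]) / s[:, None]
+    return out
+```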
+
+---
+
+## Stage F - Vectorization
+
+**Notes:**
+- Convert tiled linalg operations to vector operations
+- `scf.for` loops remain but operate on vectors
+- Vector size: 64x16 for tiled operations
+
+**Code:**
+```mlir
+func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
+  // ...
+  %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) {
+    // ...
+
+    // Vectorized max reduction loop
+    %6 = vector.transfer_write %cst_1, %5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32>
+    %7 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %6) -> (tensor<64x16xf32>) {
+      %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32>
+      %16 = vector.transfer_read %arg5[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32>
+      %17 = arith.maxnumf %15, %16 : vector<64x16xf32>
+      %18 = vector.transfer_write %17, %arg5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32>
+      scf.yield %18 : tensor<64x16xf32>
+    }
+    %8 = vector.transfer_read %7[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32>
+    %9 = vector.multi_reduction <maxnumf>, %8, %cst_2 [1] : vector<64x16xf32> to vector<64xf32>
+
+    // Vectorized sum reduction loop with fused sub+exp
+    %11 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %10) -> (tensor<64x16xf32>) {
+      %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32>
+      %16 = vector.broadcast %9 : vector<64xf32> to vector<16x64xf32>
+      %17 = vector.transpose %16, [1, 0] : vector<16x64xf32> to vector<64x16xf32>
+      %18 = arith.subf %15, %17 : vector<64x16xf32>
+      %19 = math.exp %18 : vector<64x16xf32>
+      %20 = vector.transfer_read %arg5[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32>
+      %21 = arith.addf %19, %20 : vector<64x16xf32>
+      %22 = vector.transfer_write %21, %arg5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32>
+      scf.yield %22 : tensor<64x16xf32>
+    }
+    %12 = vector.transfer_read %11[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32>
+    %13 = vector.multi_reduction <add>, %12, %cst_0 [1] : vector<64x16xf32> to vector<64xf32>
+
+    // Vectorized div loop with fused sub+exp
+    %14 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %extracted_slice) -> (tensor<64x64xf32>) {
+      %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32>
+      %16 = vector.broadcast %9 : vector<64xf32> to vector<16x64xf32>
+      %17 = vector.transpose %16, [1, 0] : vector<16x64xf32> to vector<64x16xf32>
+      %18 = arith.subf %15, %17 : vector<64x16xf32>
+      %19 = math.exp %18 : vector<64x16xf32>
+      %20 = vector.broadcast %13 : vector<64xf32> to vector<16x64xf32>
+      %21 = vector.transpose %20, [1, 0] : vector<16x64xf32> to vector<64x16xf32>
+      %22 = arith.divf %19, %21 : vector<64x16xf32>
+      %23 = vector.transfer_write %22, %arg5[%c0, %arg4] : vector<64x16xf32>, tensor<64x64xf32>
+      scf.yield %23 : tensor<64x64xf32>
+    }
+  }
+  // ...
+}
+```
+
+---
+
+## Stage G - Bufferization
+
+**Notes:**
+- Convert tensors to memrefs
+- Allocate a buffer for the 64x16 accumulator: `memref.alloc()`
+
+**Code:**
+```mlir
+func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
+  // ...
+  scf.forall (%arg2) in (16) {
+    %1 = affine.apply #map(%arg2)
+    %subview = memref.subview %arg0[%1, 0] [64, 64] [1, 1]
+
+    // Allocate accumulator buffer
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<64x16xf32>
+
+    // Max reduction loop
+    vector.transfer_write %cst_1, %alloc[%c0, %c0] : vector<64x16xf32>, memref<64x16xf32>
+    scf.for %arg3 = %c0 to %c64 step %c16 {
+      %6 = vector.transfer_read %arg1[%1, %arg3], %0 : memref<1024x64xf32>, vector<64x16xf32>
+      %7 = vector.transfer_read %alloc[%c0, %c0], %0 : memref<64x16xf32>, vector<64x16xf32>
+      %8 = arith.maxnumf %6, %7 : vector<64x16xf32>
+      vector.transfer_write %8, %alloc[%c0, %c0] : vector<64x16xf32>, memref<64x16xf32>
+    }
+    %2 = vector.transfer_read %alloc[%c0, %c0], %0 : memref<64x16xf32>, vector<64x16xf32>
+    %3 = vector.multi_reduction <maxnumf>, %2, %cst_2 [1] : vector<64x16xf32> to vector<64xf32>
+
+    // Sum reduction loop (reuses %alloc)
+    // ...
+
+    // Div loop (writes to %subview)
+    // ...
+  }
+}
+```
+
+---
+
+## Stage H - Promote buffers to stack
+
+**Notes:**
+- Convert `memref.alloc()` to `memref.alloca()` for stack allocation
+- Reduces memory allocation overhead
+
+**Code:**
+```mlir
+scf.forall (%arg2) in (16) {
+  %1 = affine.apply #map(%arg2)
+  %subview = memref.subview %arg0[%1, 0] [64, 64] [1, 1]
+
+  // Stack allocation instead of heap
+  %alloca = memref.alloca() {alignment = 64 : i64} : memref<64x16xf32>
+
+  // ... same operations using %alloca ...
+}
+```
+
+---
+
+## Stage I - GPU outlining
+
+**Notes:**
+- Convert `scf.forall` to `scf.parallel`, then to `gpu.launch`
+- Extract GPU kernel into separate `gpu.module`
+- Set thread count: 128 threads = (64 rows / 8 sg_rows) × 16 subgroup_size
+
+**Host Side:**
+```mlir
+func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
+  %c16 = arith.constant 16 : index
+  %c1 = arith.constant 1 : index
+  %c128 = arith.constant 128 : index
+  gpu.launch_func @payload_kernel::@payload_kernel
+    blocks in (%c16, %c1, %c1)
+    threads in (%c128, %c1, %c1)
+    args(%arg0 : memref<1024x64xf32>, %arg1 : memref<1024x64xf32>)
+  return
+}
+```
+
+**GPU Kernel:**
+```mlir
+gpu.module @payload_kernel {
+  gpu.func @payload_kernel(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) kernel
+    attributes {known_block_size = array<i32: 128, 1, 1>,
+                known_grid_size = array<i32: 16, 1, 1>} {
+    %block_id_x = gpu.block_id x
+    %1 = arith.muli %block_id_x, %c64 overflow<nsw> : index
+    %subview = memref.subview %arg0[%1, 0] [64, 64] [1, 1]
+    %alloca = memref.alloca() {alignment = 64 : i64} : memref<64x16xf32>
+
+    // Three reduction loops (max, sum, div) with same structure
+    scf.for %arg2 = %c0 to %c64 step %c16 {
+      // Max: accumulate max values
+      // Sum: compute & accumulate exp(x - max)
+      // Div: compute exp(x - max) / sum
+    }
+
+    gpu.return
+  }
+}
+```
+
+**Summary:** At this stage, the kernel processes 64x16 chunks in streaming fashion through three sequential loops, minimizing memory footprint.
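+
+The launch geometry follows directly from the schedule parameters. A hypothetical helper illustrating the arithmetic (parameter names follow the example driver; not part of the pipeline itself):
+
+```python
+def launch_dims(M: int = 1024, wg_rows: int = 64, sg_rows: int = 8,
+                subgroup_size: int = 16) -> tuple[int, int]:
+    blocks = M // wg_rows                           # 1024 // 64 = 16 workgroups
+    threads = (wg_rows // sg_rows) * subgroup_size  # (64 // 8) * 16 = 128 threads
+    return blocks, threads
+```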
From 4d6ae0a5b491fac31bfb2f4edfb9b1541fc35e3f Mon Sep 17 00:00:00 2001
From: Charitha Saumya 
Date: Fri, 3 Apr 2026 23:55:14 +0000
Subject: [PATCH 26/38] save work

---
 docs/softmax_lowering.md | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index a31e6f4c..311746bc 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -305,9 +305,9 @@ xegpu.store_nd %11, %12[%0, 0]
 
 # Supporting larger Softmax dimension sizes
 
-When the softmax dimension is larger than what can fit efficiently in registers, additional tiling and fusion transformations are applied to the reduction dimension. This section shows the intermediate stages between "decomposed" and "vectorized".
+Previously we tiled all ops in the parallel dimension only (i.e., the non-softmax dim). Handling a larger softmax dimension requires tiling the softmax constituent ops in that dimension.
 
-**Approach:** Tile reductions along dimension 1 (step size = 16) and fuse producers into consumers to enable streaming computation.
+**Approach:** Tile reductions along dimension 1 (using appropriate step size) and fuse producers into consumers to enable streaming computation.
 
 ---
 
@@ -320,16 +320,27 @@
 **Key Changes:**
 ```mlir
 // Before: Single division linalg.generic over 64x64
-%11 = linalg.generic {...} ins(%8, %10 : tensor<64x64xf32>, tensor<64xf32>)
-    outs(%extracted_slice_0 : tensor<64x64xf32>) { ... } -> tensor<64x64xf32>
+scf.forall ... {
+  // Max, Center+Exp, Sum ops ...
+  %11 = linalg.generic {...} ins(%8, %10 : tensor<64x64xf32>, tensor<64xf32>) outs(%extracted_slice_0 : tensor<64x64xf32>) {
+  ^bb0(%in: f32, %in_2: f32, %out: f32):
+    %12 = arith.divf %in, %in_2 : f32
+    linalg.yield %12 : f32
+  } -> tensor<64x64xf32>
+}
 
 // After: Division tiled into 64x16 chunks
-%11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) {
-  %extracted_slice_3 = tensor.extract_slice %8[0, %arg4] [64, 16] [1, 1]
-  %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>)
-    outs(%extracted_slice_5 : tensor<64x16xf32>) { ... } -> tensor<64x16xf32>
-  %inserted_slice = tensor.insert_slice %12 into %arg5[0, %arg4] [64, 16] [1, 1]
-  scf.yield %inserted_slice
+scf.forall ... {
+  %11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) {
+    // Max, Center+Exp, Sum ops ...
+    %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) outs(%extracted_slice_5 : tensor<64x16xf32>) {
+    ^bb0(%in: f32, %in_6: f32, %out: f32):
+      %13 = arith.divf %in, %in_6 : f32
+      linalg.yield %13 : f32
+    } -> tensor<64x16xf32>
+    %inserted_slice = tensor.insert_slice %12 into %arg5[0, %arg4] [64, 16] [1, 1] : tensor<64x16xf32> into tensor<64x64xf32>
+    scf.yield %inserted_slice : tensor<64x64xf32>
+  }
+}
+```

From 5ba1e679f0d04ec37a5de715b9703af0c85dad07 Mon Sep 17 00:00:00 2001
From: Charitha Saumya 
Date: Wed, 8 Apr 2026 01:18:39 +0000
Subject: [PATCH 27/38] add attention lowering doc:

---
 docs/IREEAttentionLowering.md | 323 ++++++++++++++++++++++++++++++++++
 1 file changed, 323 insertions(+)
 create mode 100644 docs/IREEAttentionLowering.md

diff --git a/docs/IREEAttentionLowering.md b/docs/IREEAttentionLowering.md
new file mode 100644
index 00000000..e378e6fa
--- /dev/null
+++ b/docs/IREEAttentionLowering.md
@@ -0,0 +1,323 @@
+# IREE Attention Lowering Pipeline
+
+## 1. ConvertAttentionToOnlineAttentionPass
+
+### Overview
+
+The `ConvertAttentionToOnlineAttentionPass` transforms a standard (offline) attention operation (`iree_linalg_ext.attention`) into an **online attention** operation (`iree_linalg_ext.online_attention`). Online attention computes attention in a **tiled/streaming** fashion, maintaining running max and running sum accumulators to perform numerically stable softmax incrementally — this is the core idea behind **FlashAttention**.
+
+### Why?
+
+Standard attention computes `softmax(Q @ K^T * scale) @ V` in a single monolithic step, requiring the entire attention matrix to be materialized in memory. Online attention tiles the computation over the key/value sequence dimension, updating partial results with running statistics, so the attention matrix is never materialized and the extra memory is **O(1)** in the key/value sequence length.
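+
+To make the running-statistics idea concrete before looking at the IR, here is a hedged numpy sketch of a single online-attention step over one K/V tile (pure illustration; the variable names mirror the decomposition detailed in section 2):
+
+```python
+import numpy as np
+
+def online_attention_step(q, k_tile, v_tile, scale, acc, m_old, s_old):
+    # One K/V tile update, mirroring Steps 1-5 of DecomposeAttentionPass.
+    s_mat = (q @ k_tile.T) * scale                # Step 1: S = Q @ K^T * scale
+    m_new = np.maximum(m_old, s_mat.max(axis=1))  # Step 2: running max
+    p = np.exp(s_mat - m_new[:, None])            # Step 3: P = exp(S - new_max)
+    alpha = np.exp(m_old - m_new)                 #         correction factor
+    s_new = alpha * s_old + p.sum(axis=1)         # Step 4: running sum
+    acc_new = alpha[:, None] * acc + p @ v_tile   # Step 5: rescale acc, add P @ V
+    return acc_new, m_new, s_new
+
+# After streaming over all K/V tiles, normalize once: out = acc / s[:, None]
+```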
+ +### Before the Pass + +A standard `iree_linalg_ext.attention` op with Q, K, V, scale, and output: + +```mlir +%result = iree_linalg_ext.attention { + indexing_maps = [ + affine_map<(b, m, k1, k2, n) -> (b, m, k1)>, // Q + affine_map<(b, m, k1, k2, n) -> (b, k2, k1)>, // K + affine_map<(b, m, k1, k2, n) -> (b, k2, n)>, // V + affine_map<(b, m, k1, k2, n) -> ()>, // scale + affine_map<(b, m, k1, k2, n) -> (b, m, n)> // output + ]} + ins(%Q, %K, %V, %scale : tensor<1x4048x64xf16>, tensor<1x4048x64xf16>, + tensor<1x4048x64xf16>, f16) + outs(%output : tensor<1x4048x64xf32>) + -> tensor<1x4048x64xf32> +``` + +### After the Pass + +The op is converted to `iree_linalg_ext.online_attention` with two additional accumulator outputs — **running max** and **running sum** — initialized to `-inf` and `0` respectively: + +```mlir +// Initialize accumulators +%neg_inf = arith.constant dense<0xFF800000> : tensor<1x4048xf32> // -inf for max +%zeros = arith.constant dense<0.0> : tensor<1x4048xf32> // 0 for sum + +// Online attention with streaming accumulators +%result:3 = iree_linalg_ext.online_attention { + indexing_maps = [ + affine_map<(b, m, k1, k2, n) -> (b, m, k1)>, // Q + affine_map<(b, m, k1, k2, n) -> (b, k2, k1)>, // K + affine_map<(b, m, k1, k2, n) -> (b, k2, n)>, // V + affine_map<(b, m, k1, k2, n) -> ()>, // scale + affine_map<(b, m, k1, k2, n) -> (b, m, n)>, // output (acc) + affine_map<(b, m, k1, k2, n) -> (b, m)>, // running max + affine_map<(b, m, k1, k2, n) -> (b, m)> // running sum + ]} + ins(%Q, %K, %V, %scale : tensor<1x4048x64xf16>, tensor<1x4048x64xf16>, + tensor<1x4048x64xf16>, f16) + outs(%output, %neg_inf, %zeros : tensor<1x4048x64xf32>, + tensor<1x4048xf32>, + tensor<1x4048xf32>) + -> tensor<1x4048x64xf32>, tensor<1x4048xf32>, tensor<1x4048xf32> + +// Final normalization: divide accumulated output by the final sum +%final = linalg.generic { + indexing_maps = [ + affine_map<(b, m, n) -> (b, m, n)>, // accumulated output + affine_map<(b, m, n) -> (b, m)>, // final sum + affine_map<(b, m, n) -> (b, m, n)> // normalized output + ], + iterator_types = ["parallel", "parallel", "parallel"]} + ins(%result#0, %result#2 : ...) + outs(%empty : ...) { + ^bb0(%acc: f32, %sum: f32, %out: f32): + %normalized = arith.divf %acc, %sum : f32 + linalg.yield %normalized : f32 +} -> tensor<1x4048x64xf32> +``` + +### Key Transformations + +| Aspect | Before | After | +|--------|--------|-------| +| **Op** | `iree_linalg_ext.attention` | `iree_linalg_ext.online_attention` | +| **Outputs** | 1 (result) | 3 (result, max, sum) | +| **Max accumulator** | N/A | Initialized to `-inf` | +| **Sum accumulator** | N/A | Initialized to `0.0` | +| **Post-processing** | None | Division by final sum (normalization) | +| **Memory** | Materializes full attention matrix | Streams over K/V tiles | + +### Significance in the Pipeline + +This pass is a critical step in IREE's attention lowering pipeline. After this conversion, subsequent passes can **tile the online_attention op along the K2 (key sequence) dimension**, processing chunks of keys/values at a time while maintaining numerically stable softmax via the running max/sum — exactly the FlashAttention algorithm. + +--- + +## 2. DecomposeAttentionPass + +### Overview + +The `DecomposeAttentionPass` (`iree-linalg-ext-decompose-attention`) runs **after** tiling has been applied to the online attention op. 
It decomposes each tiled `iree_linalg_ext.online_attention` op into a sequence of primitive `linalg.generic` operations that implement the online softmax + attention algorithm explicitly. + +This is the pass that eliminates all custom attention ops and produces standard linalg operations that the rest of the compiler knows how to handle (vectorize, bufferize, map to hardware intrinsics, etc.). + +### Pipeline Context + +By the time `DecomposeAttentionPass` runs, the IR has been through: + +1. `ConvertAttentionToOnlineAttention` — introduced online_attention + max/sum accumulators +2. `TileAndDistributeToWorkgroups` — tiled across batch and query-sequence dims +3. `GPUApplyTilingLevel` (multiple times) — tiled the K2 (key-sequence) reduction dimension into chunks (e.g., tiles of 64 or 128) + +So the input to this pass is a **tiled** online_attention operating on a slice of K/V. + +### Before the Pass (Tiled Online Attention) + +After tiling, the online attention operates on a K2-tile (e.g., 64 keys at a time): + +```mlir +// Inside an scf.for loop over K2 tiles: +%results:3 = iree_linalg_ext.online_attention { + indexing_maps = [ + affine_map<(m, k1, k2, n) -> (m, k1)>, // Q tile + affine_map<(m, k1, k2, n) -> (k2, k1)>, // K tile (64 x 64) + affine_map<(m, k1, k2, n) -> (k2, n)>, // V tile (64 x 64) + affine_map<(m, k1, k2, n) -> ()>, // scale + affine_map<(m, k1, k2, n) -> (m, n)>, // acc output + affine_map<(m, k1, k2, n) -> (m)>, // running max + affine_map<(m, k1, k2, n) -> (m)> // running sum + ]} + ins(%q_tile, %k_tile, %v_tile, %scale : ...) + outs(%acc, %old_max, %old_sum : tensor<64x64xf32>, + tensor<64xf32>, + tensor<64xf32>) + -> tensor<64x64xf32>, tensor<64xf32>, tensor<64xf32> +``` + +### After the Pass (Decomposed to linalg.generic) + +The pass decomposes the single online_attention op into **5 steps**: + +#### Step 1: Compute S = Q @ K^T * scale (matmul + scale) + +```mlir +// S[m, k2] = sum_k1(Q[m, k1] * K[k2, k1]) * scale +%empty_S = tensor.empty() : tensor<64x64xf32> +%zero_S = linalg.fill ins(%cst_0) outs(%empty_S) +%S = linalg.generic { + indexing_maps = [ + affine_map<(m, k2, k1) -> (m, k1)>, // Q + affine_map<(m, k2, k1) -> (k2, k1)>, // K + affine_map<(m, k2, k1) -> ()>, // scale + affine_map<(m, k2, k1) -> (m, k2)> // S (output) + ], + iterator_types = ["parallel", "parallel", "reduction"]} + ins(%q_tile, %k_tile, %scale : ...) 
+ outs(%zero_S : tensor<64x64xf32>) { + ^bb0(%q: f16, %k: f16, %s: f16, %out: f32): + %q_ext = arith.extf %q : f16 to f32 + %k_ext = arith.extf %k : f16 to f32 + %s_ext = arith.extf %s : f16 to f32 + %mul = arith.mulf %q_ext, %k_ext : f32 + %scaled = arith.mulf %mul, %s_ext : f32 + %add = arith.addf %scaled, %out : f32 + linalg.yield %add : f32 +} -> tensor<64x64xf32> +``` + +#### Step 2: Compute new_max = max(old_max, rowmax(S)) + +```mlir +// Row-wise max of S, then element-wise max with old_max +%new_max = linalg.generic { + indexing_maps = [ + affine_map<(m, k2) -> (m, k2)>, // S + affine_map<(m, k2) -> (m)> // max accumulator + ], + iterator_types = ["parallel", "reduction"]} + ins(%S : tensor<64x64xf32>) + outs(%old_max : tensor<64xf32>) { + ^bb0(%s_val: f32, %cur_max: f32): + %m = arith.maximumf %s_val, %cur_max : f32 + linalg.yield %m : f32 +} -> tensor<64xf32> +``` + +#### Step 3: Compute P = exp(S - new_max) and correction factor alpha = exp(old_max - new_max) + +```mlir +// Subtract new_max from S and exponentiate: P[m, k2] = exp(S[m, k2] - new_max[m]) +%P = linalg.generic { + indexing_maps = [ + affine_map<(m, k2) -> (m, k2)>, // S + affine_map<(m, k2) -> (m)>, // new_max + affine_map<(m, k2) -> (m, k2)> // P (output) + ], + iterator_types = ["parallel", "parallel"]} + ins(%S, %new_max : ...) + outs(%empty_S : tensor<64x64xf32>) { + ^bb0(%s_val: f32, %max_val: f32, %out: f32): + %sub = arith.subf %s_val, %max_val : f32 + %exp = math.exp %sub : f32 + linalg.yield %exp : f32 +} -> tensor<64x64xf32> + +// Correction factor: alpha[m] = exp(old_max[m] - new_max[m]) +%alpha = linalg.generic { + indexing_maps = [ + affine_map<(m) -> (m)>, // old_max + affine_map<(m) -> (m)>, // new_max + affine_map<(m) -> (m)> // alpha + ], + iterator_types = ["parallel"]} + ins(%old_max, %new_max : ...) + outs(%empty_alpha : tensor<64xf32>) { + ^bb0(%old_m: f32, %new_m: f32, %out: f32): + %sub = arith.subf %old_m, %new_m : f32 + %exp = math.exp %sub : f32 + linalg.yield %exp : f32 +} -> tensor<64xf32> +``` + +#### Step 4: Update sum = alpha * old_sum + rowsum(P) + +```mlir +// Scale old sum by correction factor, then add row sums of P +// new_sum[m] = alpha[m] * old_sum[m] + sum_k2(P[m, k2]) +%scaled_sum = linalg.generic { + indexing_maps = [ + affine_map<(m) -> (m)>, // old_sum + affine_map<(m) -> (m)>, // alpha + affine_map<(m) -> (m)> // output + ], + iterator_types = ["parallel"]} + ins(%old_sum, %alpha : ...) + outs(%empty_sum : tensor<64xf32>) { + ^bb0(%s: f32, %a: f32, %out: f32): + %mul = arith.mulf %s, %a : f32 + linalg.yield %mul : f32 +} -> tensor<64xf32> + +%new_sum = linalg.generic { + indexing_maps = [ + affine_map<(m, k2) -> (m, k2)>, // P + affine_map<(m, k2) -> (m)> // sum accumulator + ], + iterator_types = ["parallel", "reduction"]} + ins(%P : tensor<64x64xf32>) + outs(%scaled_sum : tensor<64xf32>) { + ^bb0(%p_val: f32, %cur_sum: f32): + %add = arith.addf %p_val, %cur_sum : f32 + linalg.yield %add : f32 +} -> tensor<64xf32> +``` + +#### Step 5: Update output = alpha * old_acc + P @ V + +```mlir +// Scale old accumulator by alpha: corrected_acc[m, n] = alpha[m] * old_acc[m, n] +%corrected_acc = linalg.generic { + indexing_maps = [ + affine_map<(m, n) -> (m)>, // alpha + affine_map<(m, n) -> (m, n)>, // old_acc + affine_map<(m, n) -> (m, n)> // output + ], + iterator_types = ["parallel", "parallel"]} + ins(%alpha, %old_acc : ...) 
+ outs(%empty_acc : tensor<64x64xf32>) { + ^bb0(%a: f32, %acc: f32, %out: f32): + %mul = arith.mulf %a, %acc : f32 + linalg.yield %mul : f32 +} -> tensor<64x64xf32> + +// new_acc[m, n] = corrected_acc[m, n] + sum_k2(P[m, k2] * V[k2, n]) +%new_acc = linalg.generic { + indexing_maps = [ + affine_map<(m, n, k2) -> (m, k2)>, // P + affine_map<(m, n, k2) -> (k2, n)>, // V + affine_map<(m, n, k2) -> (m, n)> // acc (output) + ], + iterator_types = ["parallel", "parallel", "reduction"]} + ins(%P, %v_tile : ...) + outs(%corrected_acc : tensor<64x64xf32>) { + ^bb0(%p_val: f32, %v_val: f16, %acc: f32): + %v_ext = arith.extf %v_val : f16 to f32 + %mul = arith.mulf %p_val, %v_ext : f32 + %add = arith.addf %mul, %acc : f32 + linalg.yield %add : f32 +} -> tensor<64x64xf32> +``` + +### Summary of Decomposition + +The online attention op is decomposed into these primitive operations: + +``` +┌─────────────────────────────────────────────────┐ +│ iree_linalg_ext.online_attention (1 tiled op) │ +└────────────────────┬────────────────────────────┘ + │ DecomposeAttentionPass + ▼ +┌─────────────────────────────────────────────────┐ +│ 1. S = Q @ K^T * scale (linalg.generic) │ +│ 2. new_max = max(old_max, rowmax(S)) (generic) │ +│ 3. P = exp(S - new_max) (generic) │ +│ alpha = exp(old_max - new_max) (generic) │ +│ 4. new_sum = alpha*old_sum + rowsum(P) (generic) │ +│ 5. new_acc = alpha*old_acc + P @ V (generic) │ +└─────────────────────────────────────────────────┘ +``` + +| Step | Operation | Type | Dims | +|------|-----------|------|------| +| 1 | `S = Q @ K^T * scale` | Matmul + scale | `[m, k2]` ← `[m, k1] × [k2, k1]` | +| 2 | `new_max = max(old_max, rowmax(S))` | Row reduction | `[m]` ← `[m, k2]` | +| 3a | `P = exp(S - new_max)` | Elementwise | `[m, k2]` | +| 3b | `alpha = exp(old_max - new_max)` | Elementwise | `[m]` | +| 4 | `new_sum = alpha * old_sum + Σ P` | Scale + row reduction | `[m]` | +| 5 | `new_acc = alpha * old_acc + P @ V` | Scale + matmul | `[m, n]` ← `[m, k2] × [k2, n]` | + +### Why This Matters + +After decomposition, all ops are standard `linalg.generic` operations. 
This enables: + +- **Vectorization** via IREE's vector distribution pipeline +- **Mapping to MMA intrinsics** (e.g., MFMA on MI300X) for the two matmuls (Steps 1 and 5) +- **Register-level tiling** and shared memory promotion for GPU targets +- The `scf.for` loop around these ops implements the streaming/online iteration over K/V chunks From 2b5f9ae508182ecf5658da40811276f823180761 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 8 Apr 2026 21:13:23 +0000 Subject: [PATCH 28/38] save work --- docs/IREEAttentionLowering.md | 176 ++++++++++++++++++---------------- 1 file changed, 94 insertions(+), 82 deletions(-) diff --git a/docs/IREEAttentionLowering.md b/docs/IREEAttentionLowering.md index e378e6fa..d17b6fe6 100644 --- a/docs/IREEAttentionLowering.md +++ b/docs/IREEAttentionLowering.md @@ -17,16 +17,16 @@ A standard `iree_linalg_ext.attention` op with Q, K, V, scale, and output: ```mlir %result = iree_linalg_ext.attention { indexing_maps = [ - affine_map<(b, m, k1, k2, n) -> (b, m, k1)>, // Q - affine_map<(b, m, k1, k2, n) -> (b, k2, k1)>, // K - affine_map<(b, m, k1, k2, n) -> (b, k2, n)>, // V - affine_map<(b, m, k1, k2, n) -> ()>, // scale - affine_map<(b, m, k1, k2, n) -> (b, m, n)> // output + affine_map<(d0, d1, d2, d3) -> (d0, d1)>, // Q: (m, k1) + affine_map<(d0, d1, d2, d3) -> (d2, d1)>, // K: (k2, k1) + affine_map<(d0, d1, d2, d3) -> (d2, d3)>, // V: (k2, n) + affine_map<(d0, d1, d2, d3) -> ()>, // scale + affine_map<(d0, d1, d2, d3) -> (d0, d3)> // output: (m, n) ]} - ins(%Q, %K, %V, %scale : tensor<1x4048x64xf16>, tensor<1x4048x64xf16>, - tensor<1x4048x64xf16>, f16) - outs(%output : tensor<1x4048x64xf32>) - -> tensor<1x4048x64xf32> + ins(%Q, %K, %V, %scale : tensor<16x64xf32>, tensor<4048x64xf32>, + tensor<4048x64xf32>, f32) + outs(%output : tensor<16x64xf32>) + -> tensor<16x64xf32> ``` ### After the Pass @@ -35,41 +35,52 @@ The op is converted to `iree_linalg_ext.online_attention` with two additional ac ```mlir // Initialize accumulators -%neg_inf = arith.constant dense<0xFF800000> : tensor<1x4048xf32> // -inf for max -%zeros = arith.constant dense<0.0> : tensor<1x4048xf32> // 0 for sum +%empty_output = tensor.empty() : tensor<16x64xf32> +%empty_max = tensor.empty() : tensor<16xf32> +%cst_0 = arith.constant 0.000000e+00 : f32 // 0 for output +%cst_neg_inf = arith.constant -3.40282347E+38 : f32 // -inf for max +%cst_zero = arith.constant 0.000000e+00 : f32 // 0 for sum + +%output_acc = linalg.fill ins(%cst_0 : f32) outs(%empty_output) -> tensor<16x64xf32> +%max_init = linalg.fill ins(%cst_neg_inf : f32) outs(%empty_max) -> tensor<16xf32> +%sum_init = linalg.fill ins(%cst_zero : f32) outs(%empty_max) -> tensor<16xf32> // Online attention with streaming accumulators %result:3 = iree_linalg_ext.online_attention { indexing_maps = [ - affine_map<(b, m, k1, k2, n) -> (b, m, k1)>, // Q - affine_map<(b, m, k1, k2, n) -> (b, k2, k1)>, // K - affine_map<(b, m, k1, k2, n) -> (b, k2, n)>, // V - affine_map<(b, m, k1, k2, n) -> ()>, // scale - affine_map<(b, m, k1, k2, n) -> (b, m, n)>, // output (acc) - affine_map<(b, m, k1, k2, n) -> (b, m)>, // running max - affine_map<(b, m, k1, k2, n) -> (b, m)> // running sum + affine_map<(d0, d1, d2, d3) -> (d0, d1)>, // Q: (m, k1) + affine_map<(d0, d1, d2, d3) -> (d2, d1)>, // K: (k2, k1) + affine_map<(d0, d1, d2, d3) -> (d2, d3)>, // V: (k2, n) + affine_map<(d0, d1, d2, d3) -> ()>, // scale + affine_map<(d0, d1, d2, d3) -> (d0, d3)>, // output: (m, n) + affine_map<(d0, d1, d2, d3) -> (d0)>, // running max: (m) + affine_map<(d0, d1, d2, d3) 
-> (d0)> // running sum: (m) ]} - ins(%Q, %K, %V, %scale : tensor<1x4048x64xf16>, tensor<1x4048x64xf16>, - tensor<1x4048x64xf16>, f16) - outs(%output, %neg_inf, %zeros : tensor<1x4048x64xf32>, - tensor<1x4048xf32>, - tensor<1x4048xf32>) - -> tensor<1x4048x64xf32>, tensor<1x4048xf32>, tensor<1x4048xf32> + ins(%Q, %K, %V, %scale : tensor<16x64xf32>, tensor<4048x64xf32>, + tensor<4048x64xf32>, f32) + outs(%output_acc, %max_init, %sum_init : tensor<16x64xf32>, + tensor<16xf32>, + tensor<16xf32>) { + ^bb0(%arg: f32): + iree_linalg_ext.yield %arg : f32 +} -> tensor<16x64xf32>, tensor<16xf32>, tensor<16xf32> // Final normalization: divide accumulated output by the final sum %final = linalg.generic { indexing_maps = [ - affine_map<(b, m, n) -> (b, m, n)>, // accumulated output - affine_map<(b, m, n) -> (b, m)>, // final sum - affine_map<(b, m, n) -> (b, m, n)> // normalized output + affine_map<(d0, d1) -> (d0)>, // final sum: (m) + affine_map<(d0, d1) -> (d0, d1)>, // accumulated output: (m, n) + affine_map<(d0, d1) -> (d0, d1)> // normalized output: (m, n) ], - iterator_types = ["parallel", "parallel", "parallel"]} - ins(%result#0, %result#2 : ...) - outs(%empty : ...) { - ^bb0(%acc: f32, %sum: f32, %out: f32): - %normalized = arith.divf %acc, %sum : f32 + iterator_types = ["parallel", "parallel"]} + ins(%result#2, %result#0 : tensor<16xf32>, tensor<16x64xf32>) + outs(%empty_output : tensor<16x64xf32>) { + ^bb0(%sum: f32, %acc: f32, %out: f32): + %cst_1 = arith.constant 1.000000e+00 : f32 + %inv_sum = arith.divf %cst_1, %sum : f32 + %normalized = arith.mulf %inv_sum, %acc : f32 linalg.yield %normalized : f32 -} -> tensor<1x4048x64xf32> +} -> tensor<16x64xf32> ``` ### Key Transformations @@ -77,8 +88,11 @@ The op is converted to `iree_linalg_ext.online_attention` with two additional ac | Aspect | Before | After | |--------|--------|-------| | **Op** | `iree_linalg_ext.attention` | `iree_linalg_ext.online_attention` | -| **Outputs** | 1 (result) | 3 (result, max, sum) | -| **Max accumulator** | N/A | Initialized to `-inf` | +| **Q shape** | `tensor<16x64xf32>` | `tensor<16x64xf32>` (unchanged) | +| **K shape** | `tensor<4048x64xf32>` | `tensor<4048x64xf32>` (unchanged) | +| **V shape** | `tensor<4048x64xf32>` | `tensor<4048x64xf32>` (unchanged) | +| **Outputs** | 1 (result: `16x64xf32`) | 3 (result: `16x64xf32`, max: `16xf32`, sum: `16xf32`) | +| **Max accumulator** | N/A | Initialized to `-inf` (`-3.40282347E+38`) | | **Sum accumulator** | N/A | Initialized to `0.0` | | **Post-processing** | None | Division by final sum (normalization) | | **Memory** | Materializes full attention matrix | Streams over K/V tiles | @@ -109,25 +123,27 @@ So the input to this pass is a **tiled** online_attention operating on a slice o ### Before the Pass (Tiled Online Attention) -After tiling, the online attention operates on a K2-tile (e.g., 64 keys at a time): +After tiling, the online attention operates on a K2-tile (e.g., 16 keys at a time). 
This example shows a 16x64 Q-tile processing a 16x64 K-tile and V-tile: ```mlir // Inside an scf.for loop over K2 tiles: %results:3 = iree_linalg_ext.online_attention { indexing_maps = [ - affine_map<(m, k1, k2, n) -> (m, k1)>, // Q tile - affine_map<(m, k1, k2, n) -> (k2, k1)>, // K tile (64 x 64) - affine_map<(m, k1, k2, n) -> (k2, n)>, // V tile (64 x 64) - affine_map<(m, k1, k2, n) -> ()>, // scale - affine_map<(m, k1, k2, n) -> (m, n)>, // acc output - affine_map<(m, k1, k2, n) -> (m)>, // running max - affine_map<(m, k1, k2, n) -> (m)> // running sum + affine_map<(d0, d1, d2, d3) -> (d0, d1)>, // Q tile: (m, k1) + affine_map<(d0, d1, d2, d3) -> (d2, d1)>, // K tile: (k2, k1) + affine_map<(d0, d1, d2, d3) -> (d2, d3)>, // V tile: (k2, n) + affine_map<(d0, d1, d2, d3) -> ()>, // scale + affine_map<(d0, d1, d2, d3) -> (d0, d3)>, // acc output: (m, n) + affine_map<(d0, d1, d2, d3) -> (d0)>, // running max: (m) + affine_map<(d0, d1, d2, d3) -> (d0)> // running sum: (m) ]} - ins(%q_tile, %k_tile, %v_tile, %scale : ...) - outs(%acc, %old_max, %old_sum : tensor<64x64xf32>, - tensor<64xf32>, - tensor<64xf32>) - -> tensor<64x64xf32>, tensor<64xf32>, tensor<64xf32> + ins(%q_tile, %k_tile, %v_tile, %scale : tensor<16x64xf32>, + tensor<16x64xf32>, + tensor<16x64xf32>, f32) + outs(%acc, %old_max, %old_sum : tensor<16x64xf32>, + tensor<16xf32>, + tensor<16xf32>) + -> tensor<16x64xf32>, tensor<16xf32>, tensor<16xf32> ``` ### After the Pass (Decomposed to linalg.generic) @@ -138,7 +154,7 @@ The pass decomposes the single online_attention op into **5 steps**: ```mlir // S[m, k2] = sum_k1(Q[m, k1] * K[k2, k1]) * scale -%empty_S = tensor.empty() : tensor<64x64xf32> +%empty_S = tensor.empty() : tensor<16x16xf32> %zero_S = linalg.fill ins(%cst_0) outs(%empty_S) %S = linalg.generic { indexing_maps = [ @@ -149,16 +165,13 @@ The pass decomposes the single online_attention op into **5 steps**: ], iterator_types = ["parallel", "parallel", "reduction"]} ins(%q_tile, %k_tile, %scale : ...) - outs(%zero_S : tensor<64x64xf32>) { - ^bb0(%q: f16, %k: f16, %s: f16, %out: f32): - %q_ext = arith.extf %q : f16 to f32 - %k_ext = arith.extf %k : f16 to f32 - %s_ext = arith.extf %s : f16 to f32 - %mul = arith.mulf %q_ext, %k_ext : f32 - %scaled = arith.mulf %mul, %s_ext : f32 + outs(%zero_S : tensor<16x16xf32>) { + ^bb0(%q: f32, %k: f32, %s: f32, %out: f32): + %mul = arith.mulf %q, %k : f32 + %scaled = arith.mulf %mul, %s : f32 %add = arith.addf %scaled, %out : f32 linalg.yield %add : f32 -} -> tensor<64x64xf32> +} -> tensor<16x16xf32> ``` #### Step 2: Compute new_max = max(old_max, rowmax(S)) @@ -171,12 +184,12 @@ The pass decomposes the single online_attention op into **5 steps**: affine_map<(m, k2) -> (m)> // max accumulator ], iterator_types = ["parallel", "reduction"]} - ins(%S : tensor<64x64xf32>) - outs(%old_max : tensor<64xf32>) { + ins(%S : tensor<16x16xf32>) + outs(%old_max : tensor<16xf32>) { ^bb0(%s_val: f32, %cur_max: f32): %m = arith.maximumf %s_val, %cur_max : f32 linalg.yield %m : f32 -} -> tensor<64xf32> +} -> tensor<16xf32> ``` #### Step 3: Compute P = exp(S - new_max) and correction factor alpha = exp(old_max - new_max) @@ -191,12 +204,12 @@ The pass decomposes the single online_attention op into **5 steps**: ], iterator_types = ["parallel", "parallel"]} ins(%S, %new_max : ...) 
- outs(%empty_S : tensor<64x64xf32>) { + outs(%empty_S : tensor<16x16xf32>) { ^bb0(%s_val: f32, %max_val: f32, %out: f32): %sub = arith.subf %s_val, %max_val : f32 %exp = math.exp %sub : f32 linalg.yield %exp : f32 -} -> tensor<64x64xf32> +} -> tensor<16x16xf32> // Correction factor: alpha[m] = exp(old_max[m] - new_max[m]) %alpha = linalg.generic { @@ -207,12 +220,12 @@ The pass decomposes the single online_attention op into **5 steps**: ], iterator_types = ["parallel"]} ins(%old_max, %new_max : ...) - outs(%empty_alpha : tensor<64xf32>) { + outs(%empty_alpha : tensor<16xf32>) { ^bb0(%old_m: f32, %new_m: f32, %out: f32): %sub = arith.subf %old_m, %new_m : f32 %exp = math.exp %sub : f32 linalg.yield %exp : f32 -} -> tensor<64xf32> +} -> tensor<16xf32> ``` #### Step 4: Update sum = alpha * old_sum + rowsum(P) @@ -228,11 +241,11 @@ The pass decomposes the single online_attention op into **5 steps**: ], iterator_types = ["parallel"]} ins(%old_sum, %alpha : ...) - outs(%empty_sum : tensor<64xf32>) { + outs(%empty_sum : tensor<16xf32>) { ^bb0(%s: f32, %a: f32, %out: f32): %mul = arith.mulf %s, %a : f32 linalg.yield %mul : f32 -} -> tensor<64xf32> +} -> tensor<16xf32> %new_sum = linalg.generic { indexing_maps = [ @@ -240,12 +253,12 @@ The pass decomposes the single online_attention op into **5 steps**: affine_map<(m, k2) -> (m)> // sum accumulator ], iterator_types = ["parallel", "reduction"]} - ins(%P : tensor<64x64xf32>) - outs(%scaled_sum : tensor<64xf32>) { + ins(%P : tensor<16x16xf32>) + outs(%scaled_sum : tensor<16xf32>) { ^bb0(%p_val: f32, %cur_sum: f32): %add = arith.addf %p_val, %cur_sum : f32 linalg.yield %add : f32 -} -> tensor<64xf32> +} -> tensor<16xf32> ``` #### Step 5: Update output = alpha * old_acc + P @ V @@ -260,11 +273,11 @@ The pass decomposes the single online_attention op into **5 steps**: ], iterator_types = ["parallel", "parallel"]} ins(%alpha, %old_acc : ...) - outs(%empty_acc : tensor<64x64xf32>) { + outs(%empty_acc : tensor<16x64xf32>) { ^bb0(%a: f32, %acc: f32, %out: f32): %mul = arith.mulf %a, %acc : f32 linalg.yield %mul : f32 -} -> tensor<64x64xf32> +} -> tensor<16x64xf32> // new_acc[m, n] = corrected_acc[m, n] + sum_k2(P[m, k2] * V[k2, n]) %new_acc = linalg.generic { @@ -275,13 +288,12 @@ The pass decomposes the single online_attention op into **5 steps**: ], iterator_types = ["parallel", "parallel", "reduction"]} ins(%P, %v_tile : ...) 
- outs(%corrected_acc : tensor<64x64xf32>) { - ^bb0(%p_val: f32, %v_val: f16, %acc: f32): - %v_ext = arith.extf %v_val : f16 to f32 - %mul = arith.mulf %p_val, %v_ext : f32 + outs(%corrected_acc : tensor<16x64xf32>) { + ^bb0(%p_val: f32, %v_val: f32, %acc: f32): + %mul = arith.mulf %p_val, %v_val : f32 %add = arith.addf %mul, %acc : f32 linalg.yield %add : f32 -} -> tensor<64x64xf32> +} -> tensor<16x64xf32> ``` ### Summary of Decomposition @@ -306,12 +318,12 @@ The online attention op is decomposed into these primitive operations: | Step | Operation | Type | Dims | |------|-----------|------|------| -| 1 | `S = Q @ K^T * scale` | Matmul + scale | `[m, k2]` ← `[m, k1] × [k2, k1]` | -| 2 | `new_max = max(old_max, rowmax(S))` | Row reduction | `[m]` ← `[m, k2]` | -| 3a | `P = exp(S - new_max)` | Elementwise | `[m, k2]` | -| 3b | `alpha = exp(old_max - new_max)` | Elementwise | `[m]` | -| 4 | `new_sum = alpha * old_sum + Σ P` | Scale + row reduction | `[m]` | -| 5 | `new_acc = alpha * old_acc + P @ V` | Scale + matmul | `[m, n]` ← `[m, k2] × [k2, n]` | +| 1 | `S = Q @ K^T * scale` | Matmul + scale | `[16, 16]` ← `[16, 64] × [16, 64]` | +| 2 | `new_max = max(old_max, rowmax(S))` | Row reduction | `[16]` ← `[16, 16]` | +| 3a | `P = exp(S - new_max)` | Elementwise | `[16, 16]` | +| 3b | `alpha = exp(old_max - new_max)` | Elementwise | `[16]` | +| 4 | `new_sum = alpha * old_sum + Σ P` | Scale + row reduction | `[16]` | +| 5 | `new_acc = alpha * old_acc + P @ V` | Scale + matmul | `[16, 64]` ← `[16, 16] × [16, 64]` | ### Why This Matters From 35598a8258224797a28b9c6bf9d5d616988e84ac Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 8 Apr 2026 22:18:55 +0000 Subject: [PATCH 29/38] save work --- docs/IREEAttentionLowering.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/IREEAttentionLowering.md b/docs/IREEAttentionLowering.md index d17b6fe6..2b863bb1 100644 --- a/docs/IREEAttentionLowering.md +++ b/docs/IREEAttentionLowering.md @@ -6,9 +6,6 @@ The `ConvertAttentionToOnlineAttentionPass` transforms a standard (offline) attention operation (`iree_linalg_ext.attention`) into an **online attention** operation (`iree_linalg_ext.online_attention`). Online attention computes attention in a **tiled/streaming** fashion, maintaining running max and running sum accumulators to perform numerically stable softmax incrementally — this is the core idea behind **FlashAttention**. -### Why? - -Standard attention computes `softmax(Q @ K^T / scale) @ V` in a single monolithic step, requiring the entire attention matrix to be materialized in memory. Online attention tiles the computation over the key/value sequence dimension, updating partial results with running statistics, enabling **O(1) memory** in the sequence length. 
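The streaming recurrence behind that **O(1) memory** claim can be sketched in a few lines of NumPy. This is an illustration only, not anything the pass emits: `online_attention` is a hypothetical helper, the 16x64 / 4096x64 shapes follow the running example, and the 0.125 scale (1/sqrt(64)) is an assumed value.

```python
import numpy as np

def online_attention(Q, K, V, scale, tile=16):
    """Attention with the softmax computed tile-by-tile over K/V (sketch)."""
    m, n = Q.shape[0], V.shape[1]
    acc = np.zeros((m, n), np.float32)          # running unnormalized output
    run_max = np.full(m, -np.inf, np.float32)   # running row-wise max
    run_sum = np.zeros(m, np.float32)           # running row-wise sum
    for s in range(0, K.shape[0], tile):
        S = (Q @ K[s:s + tile].T) * scale               # Step 1: S = Q @ K^T * scale
        new_max = np.maximum(run_max, S.max(axis=1))    # Step 2: new_max
        P = np.exp(S - new_max[:, None])                # Step 3a: P
        alpha = np.exp(run_max - new_max)               # Step 3b: correction factor
        run_sum = alpha * run_sum + P.sum(axis=1)       # Step 4: new_sum
        acc = alpha[:, None] * acc + P @ V[s:s + tile]  # Step 5: new_acc
        run_max = new_max
    return acc / run_sum[:, None]                       # final normalization

rng = np.random.default_rng(0)
Q = rng.random((16, 64)).astype(np.float32)
K = rng.random((4096, 64)).astype(np.float32)
V = rng.random((4096, 64)).astype(np.float32)
S = (Q @ K.T) * 0.125
P = np.exp(S - S.max(axis=1, keepdims=True))
ref = (P / P.sum(axis=1, keepdims=True)) @ V            # monolithic reference
assert np.allclose(online_attention(Q, K, V, 0.125), ref, atol=1e-4)
```

Only the running statistics, the 16x64 accumulator, and one 16x16 tile of `S` are live at any point in the loop, so the full 16x4096 attention matrix is never materialized.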
 ### Before the Pass
 
@@ -23,8 +20,8 @@ A standard `iree_linalg_ext.attention` op with Q, K, V, scale, and output:
     affine_map<(d0, d1, d2, d3) -> ()>,       // scale
     affine_map<(d0, d1, d2, d3) -> (d0, d3)>   // output: (m, n)
   ]}
-  ins(%Q, %K, %V, %scale : tensor<16x64xf32>, tensor<4048x64xf32>,
-                           tensor<4048x64xf32>, f32)
+  ins(%Q, %K, %V, %scale : tensor<16x64xf32>, tensor<4096x64xf32>,
+                           tensor<4096x64xf32>, f32)
   outs(%output : tensor<16x64xf32>)
   -> tensor<16x64xf32>
 ```
@@ -56,8 +53,8 @@ The op is converted to `iree_linalg_ext.online_attention` with two additional ac
     affine_map<(d0, d1, d2, d3) -> (d0)>,      // running max: (m)
     affine_map<(d0, d1, d2, d3) -> (d0)>       // running sum: (m)
   ]}
-  ins(%Q, %K, %V, %scale : tensor<16x64xf32>, tensor<4048x64xf32>,
-                           tensor<4048x64xf32>, f32)
+  ins(%Q, %K, %V, %scale : tensor<16x64xf32>, tensor<4096x64xf32>,
+                           tensor<4096x64xf32>, f32)
   outs(%output_acc, %max_init, %sum_init : tensor<16x64xf32>,
                                            tensor<16xf32>,
                                            tensor<16xf32>) {
@@ -83,14 +80,16 @@ The op is converted to `iree_linalg_ext.online_attention` with two additional ac
 } -> tensor<16x64xf32>
 ```
 
+**Question**: Not sure why it is done this way; why not embed the normalization inside the op as well?
+
 ### Key Transformations
 
 | Aspect | Before | After |
 |--------|--------|-------|
 | **Op** | `iree_linalg_ext.attention` | `iree_linalg_ext.online_attention` |
 | **Q shape** | `tensor<16x64xf32>` | `tensor<16x64xf32>` (unchanged) |
-| **K shape** | `tensor<4048x64xf32>` | `tensor<4048x64xf32>` (unchanged) |
-| **V shape** | `tensor<4048x64xf32>` | `tensor<4048x64xf32>` (unchanged) |
+| **K shape** | `tensor<4096x64xf32>` | `tensor<4096x64xf32>` (unchanged) |
+| **V shape** | `tensor<4096x64xf32>` | `tensor<4096x64xf32>` (unchanged) |
 | **Outputs** | 1 (result: `16x64xf32`) | 3 (result: `16x64xf32`, max: `16xf32`, sum: `16xf32`) |
 | **Max accumulator** | N/A | Initialized to `-inf` (`-3.40282347E+38`) |
 | **Sum accumulator** | N/A | Initialized to `0.0` |
 | **Post-processing** | None | Division by final sum (normalization) |
 | **Memory** | Materializes full attention matrix | Streams over K/V tiles |
@@ -127,6 +126,7 @@
 ```mlir
 // Inside an scf.for loop over K2 tiles:
+// Step size is 16.
 %results:3 = iree_linalg_ext.online_attention {
   indexing_maps = [
     affine_map<(d0, d1, d2, d3) -> (d0, d1)>,  // Q tile: (m, k1)

From a54fa4baf2531db023d6189a04accac9aefb1a06 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Wed, 8 Apr 2026 23:28:24 +0000
Subject: [PATCH 30/38] save work

---
 docs/softmax_lowering.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index 311746bc..e0c5a0e8 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -301,6 +301,33 @@
 xegpu.store_nd %11, %12[%0, 0]
   !xegpu.tensor_desc<64x64xf32, #xegpu.block_tdesc_attr>
 ```
+---
+## What is reused between matmul and softmax?
+
+The softmax implementation leverages several reusable lighthouse infrastructure components. The workload-specific code is minimal (payload generation plus the tiling/decomposition strategy); most infrastructure (execution, benchmarking, XeGPU lowering) is shared across operations such as matmul and softmax. The shared components are:
+
+### Pipeline Infrastructure (`lighthouse.pipeline`)
+- **`TransformDriver`**: Orchestrates application of transform schedules to payload modules
+- **`apply_registered_pass`**: Applies named MLIR passes (e.g., `eliminate-empty-tensors`, `gpu-kernel-outlining`)
+- **`canonicalize`, `match`, `match_and_split`, `PipelineInterrupt`**: Helper utilities for constructing transform sequences
+
+### Execution Infrastructure (`lighthouse.execution`)
+- **`execute`**: Runs compiled MLIR modules on GPU with memory management
+- **`benchmark`**: Benchmarks kernel execution with warmup and timing utilities
+- **`GPUMemoryManager`**: Manages host-device memory transfers for GPU execution
+- **`get_bench_wrapper_schedule`**: Wraps payload functions with benchmarking infrastructure
+
+### XeGPU Lowering (`lighthouse.schedule.xegpu.helper`)
+- **`bundle_xegpu_to_binary`**: Common lowering path from XeGPU to executable binary (shared with matmul and other XeGPU workloads)
+  - Handles XeGPU peephole optimizations, layout propagation, and GPU-to-SPIRV/binary compilation
+
+### Payload Generation (`lighthouse.ingress.mlir_gen`)
+- **`get_mlir_elem_type`**: Type conversion utilities for constructing MLIR types
+
+**Key Insight**: Only the payload generation and the tiling/decomposition schedule are softmax-specific; execution, benchmarking, and the XeGPU-to-binary lowering path are reused as-is from the matmul workload (see the composition sketch after this patch).
+
 ---
 
 # Supporting larger Softmax dimension sizes
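To make the reuse concrete, here is a hypothetical composition sketch in Python. The module paths and names come from the section above, but every signature and call shape is assumed for illustration; the real wiring lives in the workload classes.

```python
# Hypothetical glue code: signatures are NOT taken from the lighthouse
# sources; only the module/function names come from the section above.
from lighthouse.pipeline import TransformDriver
from lighthouse.execution import benchmark
from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary

def run_xegpu_workload(payload_module, schedule):
    # 1. Workload-specific part: apply the tiling/decomposition schedule.
    driver = TransformDriver(payload_module)   # constructor shape assumed
    tiled = driver.apply(schedule)             # method name assumed
    # 2. Shared part: the same XeGPU -> binary lowering path matmul uses.
    binary = bundle_xegpu_to_binary(tiled)     # argument list assumed
    # 3. Shared part: benchmarking with warmup and timing.
    return benchmark(binary)                   # argument list assumed
```

Swapping softmax for matmul would change only `payload_module` and `schedule`; steps 2 and 3 stay identical.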
From 6e1c7f5eec3068674c7f483e5fec3098d0f69087 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Wed, 8 Apr 2026 23:35:40 +0000
Subject: [PATCH 31/38] save work

---
 docs/softmax_lowering.pptx | Bin 0 -> 45031 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 docs/softmax_lowering.pptx

diff --git a/docs/softmax_lowering.pptx b/docs/softmax_lowering.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..42e7908862d02c963e89ddf7ea4bd97ed5e6955f

From 6a21b02e88c153e59db0ba7df3dfa0d6bf6f7c06 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Wed, 8 Apr 2026 23:37:26 +0000
Subject: [PATCH 32/38] save work

---
 docs/softmax_lowering.pptx | Bin 45031 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 docs/softmax_lowering.pptx

diff --git a/docs/softmax_lowering.pptx b/docs/softmax_lowering.pptx
deleted file mode 100644
index 42e7908862d02c963e89ddf7ea4bd97ed5e6955f..0000000000000000000000000000000000000000
z`#f%ot|{l|bds?a%ehXmXY)??4~>#lP+|xKF$8I4)~3DvJr+N7?=Vrab*Jea9oiTAT8yLy;A%BLFpuoT}o7JeTr}C`TPR;)Q*Ir1u|QR)<woB_J1I2It@Z1f1bT@{kO53g;ulJD*|4dD3H-RENhCjs$<`dd78<=3AUk3xVtk(rQ~d+xd%G4*?8RLHIAga zK!AsRATn9MF$%KJmjY_HDNpM{Pe?foOZP#M6!R3*9L@cOCnSH7EP%1zj!G0rTjpYx zQ|`c348B4$Ki>B}S=*wd-Iix7%~osrdtvboeFfNiaXANFRDO5}d?i=i^A6ARxtVgx z)o*(DXghxzl7RcupQ~*NhgR$EWvyL1UQ+QFh8^BXIRsi&CV@;aNAiu2w#M;$WeoH3 zh%jimg%8A<_bJlgI|}zy^Tm`xy69AcQptPD@q1KA&bOkdQN8;lZzS(tJ~Z(tlsqiU zi=5657skzxV?!+NmT^N;droK0XE#fSwWG!S*=@Yno;x>lhoz%{NnOvSW(ghdrA3LV zrin)h9sk-(M(6i|iH+Yp9Uo95B7tU{QhAb;i9;Z9yyLF`Zny75nlM0Fcojk$G(XwT zQ7mUB*j`Ft*jaeE)&+_{T#cc$AP zfKBb&1>wNC4^lg;1Yi;iG;P{o2VRhJV&76cE@NZ43fAP=R!-DjC^XJJO5xnqh+WAu z(z!2*Twraia2T}J)DhW?-}^JHTg*w8l-K3(Yjkf^RW=(Rn{o@?9_zO`WK$)%Azz?@ z_yU#uKnn3(c(L)f2f|cQ5v?Xr#09Rm%(aF#OUY0;w(-PsM+faXTx9OO!!|F`wxx&_qzRCvpW_h?SqNXAg^9OV0H&)Mb9n^ zn^H`r5M`1ZUqMScwV1&{E3`jf<0>uZTT<|$BnYmaLjG_tEb6Pj!MMeCBtX1aO!*X;_UNwjNF)9~BG*uf|PoOtc<*D`)u^e8O zKE^Ys-mxEKAyJ2FcnC)^fywbw50>=$jv&xdl=NDbkVH_IIpAc38->zjP*CRm%7-bQ z22f#qedJ-3nv?@)KMgRWyhqPMlT$dM<)==|F0Fnmum~vX1WD9GO?3I_R^R;BwCQ9> zpoxB^4e(!X;eUhXZ_Dr(Wg6p0|8fgrkCJa9dmYw1QnyM8G?qaj{XyV90ZE*dg-^t* zzC`_9sKPL4?CUk@(?=Uz?yTkaQg9oA*rQ2fz?ueI-z(+YWdFtUo9 z6}(6h!!C!D%+%Wh5lVJvn8c1AWrfm@g{t#%y&9GlqB&$MBj?U|l+1?j&gOatC($e5 z5!5WMIsaR%ygvig($z$qmS2qAqjND3fvXz`otCvnmw5r0+Ho(1Nj?E3G%&ljuFP6P z)wC{(gpd*X!^nhh9nsFHg6)%$_s&=D?$tZO{a-_F&O#BL{uS~d(0>@Tze4V2Y3ly} zggjcgM}GH9`K$W{G&+M5po9TwPC^bsS_PpUxaMg%Tuu9qGNX%^WOkOYd=BXxyPkgb zgIOE)UVRHl-+8oe-)MH>Dx4~tbjS76hen~zu?lSq9Wksr!S^2PBL$T8E7^S7!NkEZ zXfJ!0*E4@GT#Dj8CZGRGquHh=0Xi|HX(r?~}ZQlJJ5T^BTu$m8! z29O2B#r5whd9T$!Z`M2mNwOM0mXeWv`yh2^lbB^6RsA$8WETmY{L_LjDgJ~4AjO{mrWj{i)yVJ73DUf_oHR?nM$mX?hfrTtND3S}$P3m-15Nc*Ch9`e|JjYiN zZL?D;bTeOM9!p}ancM|3Q`X!Lg{Nhx;v!As#WnrzmpZ#8x;c#8I@Gl5No9>; z2RK2R+{H;xkXa=)wbGRj_#pIaH)6e;R#BI1M>sG)2-mAlI zlKR;c=*-igv_8Q0L0jH@L>Y9yPi$j7P1{=S8s~Em#IIxaxaAbx$2ioUVN1KQ@j$I3 z$bRDLBJU%!($2jy$ld&@5!~l<H`6hmO zCY64;o0I3*>MQ}VuZzp`>1=PCkusJbllN^|&;xWRQ2y!WmGR@_LHTZR_EL}kV@KHrWXfw+o6ut$a)axoMfWZJnk5Hh5s+};RHVB5 zu03W5MnXK=vEOBnzCt)5{uOmw6^{X3l$$6jKTDy2b$SgsA)<{V*ZWsrXCH+kIyD*} z6S2MFgzFx=z)>itbQJe2=Ugq}FsP(rlv^t^O;0&``~)!qrykCQcVfxy{DUq|$N`vC zcGQnjGN|f>-H>P|KqQ5}0A$=S&R|#HauTpfGHw8Pe}(uB%cnWmLA#!cyeau59~tu1zs! zk*C{WNa}tkHLWSD8BF3pJ=p6;FXXDyO!F>d)xM>egYP}I1~;$FiVPL)iFzvCbcUF% zd5a`=veJqYK11!RGs~WRQg1)bk^w|II@$INKNzvqws4C`*jlew{>Hd*m}wGsun96u z1QJMf!x;-|vGt_kxH{9ysP|aUP?aCoHFFRXB`H`LWF9N6lbBtkxt9HGTa|rx_y&QM zHf*KEsx|Rh@`I01ix~%5fnGCJF4EkmB76z)os`wYxHe4a(kx4tKWXatbQTb>W`_Y# zr(ersKhWPqV++SXn>Yxbw$u5$s2~;ga5gl1J<9+S z6RVJGrK#mRaTr5l$lNLH(j$vz4rWlDq^dBTFLvgMG=jM=?O_~8E0Kyft6-3Dv2rev zi8{JlR?k@J>FREx;ReVzMM}CLY=hg z#_KUKw#oWuw#K4rTkuo+qFx%@8j32rrj`qabRzhlg(}IfJO9+@ADj)Mm}fIw+-IKS_?%~5Sb5iq+MW}{ zk7Il!?({(tdE-`b7$;Z%0W*6xP&a`3+Dh{}Re)W{e+~w{HJVfZen$LPNm+W2jxGGM&rM&~U(?XP z%z=xck&UUci>HmL^IwVrKPH6`^tJBW?f|%3eE8CfD3YP|64AP!0MjHA9}8OKqwKDm zKHtPMjtfZVrD&X)u!4SW0i(3BtrtK-#CiuC3obaWJ6bjv)WE=cHfU%O(=`#^VlxFgmQ)f2#ENXGBUF`#1a9NK*r%-H9g=M%SOojlc6|+^BaCCH5oH!F>_GXgu59b{2uy*Uril$=xoYxyI zlo+E4B;z;BTYE#Nj*Q&3G*Y*THzel43Epa%KF%*G&8A&D$LUgA`GfN+-}O@zY9MfR>D+V4eckocvI{NgEg2nF1|Fix05BIQ zXwOKaS&xHJsWeYTb{Q(>Rn19x1x6{<+w*uU?9)L1!CwvGc4i1?E%Tno)<{2{nOZR} z$xqWF4mg1RgS`Ym(@MeHd9|UAv|O-IVJt23Ag|uzGhaDVdbg+p`oa0br&)93^?IbK z><6Bs_^qnOpIPL|cw89_3>g+nHqUCqfD<5+d0{2SVzD~jj)I_%8sb*RxQFO#fHBDN zwZo12c0tXFIO>W7D>dY@oRz=ScTXg~Kd#x~G_ptl(9hG@GE7LN(x09C<>Al0-AWs? 
zj;fX5$;N8h+B4qU@#=KcH*-5@$=BxL5{zj6e3SjRg|Cc>{0zy$6WR0xT`E58>omTb z)X{R%*~+b*LQk)f$0|Xtgv6Ls+3cm!on(!m+1*{ybK+T-)k>>~;bDIkvzM)3f6U66 z@pbNJMc|K^Bcbcirq`8%=(f%(e@y*EjmAfk`dHIRfcbe@h}>> zQgELGfNgImflO>g*^gqDKD?&R&b1|bSqKV>YC%jhUkwHd+&HNRJ3rZ#ie;lwNEzrKkYlyMlPrL};=DPuRgZ)DutWgu< zAHTCzS$g{3MVZ#kJ+jjfeuNR;QDJD3M3ns+;gG7eNW+)ceCrq0G-=!1Zk3xA6$$x? zJ7g%?-ZG3jHO|}EB)>W&+42;VykK(BGr_=jb%`G}gNr3+COQ2xk60`iSq>#xX3GL1 zGziXgU>7&So@(iPO)ioMaVQEF%-k=o$s1gUP_LzhhAsbUXfu^XtP&K=yH1UfN6TQ* zFr`O)Uc8+@hz@YAv+8Kq{PzSt9fLA!2#hN|r>-9F`0XWqk?vP7_3I`0WLK zxEEXS-0jl{8#R5nms`d8gs7iaTF3#b#yQCib7=>*U`eYq}F z45==O9zy|d;ONWC70m{@N-CH`)HZ#fulAAtcP%y#oNeEFEM}L=*(u`7;yN0=s=XWM zUY9{J8B1CM@X_)$u#@+{7xk#}i7>f`FFpuJx-=UyF&L0qNAsg%?>Dkgj8133&<17) zzCX`aKTxRL1Mjfqj1UIX_`{XGS(e-(H9CZB`Of?R6(b`H0~=yQ+!CAy?he%Ae8WJP zmAKdn%-iao1?|uxUnM2*i-f$`n!d9@#EPYV-X;JdXc-7@@E}SFdHQ+%G}(oSiZIiQ z?6O5ffPjDm zgPa~|bZDjV0Lx`=SLC9@leNv+Qy*a*W2`mn>Zf1$PoKlS@kIN1L;4m~g1-`nn^xMsNN7D6@BuopGk& zD{vcjLQJ`Ya&YJ&=N)hSJT+CpxEK@B^3}SB;bJmQig$6Ndc$xQ5eIXT^6-W_9uEXd zt-XGQj5YwH+iBTSZuc#?&4nL`xCln_GM08scHfhrW|9v*&|!^PV1X z2i_oRedj>=CX;q73C$&A8QH_MJcy47v_XbD*;OV8Cqby)zgB9xv$OEs+bABLm1sI zuMTQ#EoEX@y=d=?=vk455f42Th24$co>mfkWay<%y&hsi9rQ^Zi_Kl_WfJlGk0zt) zFuwe{d1NpI1E>?=&tOyJeObF9Eo5S*jkg0_x6ek3 zCN3lOs#3J`0Z=31ATLgub9Ex+lS1X-NkGPCgi1=QX)*LuV&kv_M+GB?xUi`nwV@nU zuCOR3EXny1iA+}n!jVx-COgPbp$9JcvcD=0gOq&LEB#ULEBw&R9#-%31;whSwa{Rj z8CRIrWSKdhEMAn|WyI5eP~D@?38mNWYgm-~*AjrDP+3FNA5PmmJCLwzZ)AoKu7h~k zD`u*ST$plaxoV-gr|3!OEIjhQQYj7)f2$NpIAq9^($haRJq65s$JogOI>d>rkjKZ5 zc;p(M1^JQ7ahFNqYQGise)OGo5*p&5V}U3fgySm6nB3%d$Vxx&=E*YQ77UDqxN&6s zBRvbVk<$2Fvf}OcQ$i7GK_=3gNjk0NOEZl4LTN1W`3XeV^cs@aa5M>4Ml>&xf;F7%?Rtr@AWBsmslO5l?8vP52H^fpQrB~`Mi#ijv|KWruK=$LmQS!CeWuSxseN zh%XwLm7WSq;wPJQ9vIjF`u%P$5zu|l2hFXZG;zM@JnsGR4Nv7em{K0@gzvqfo0S{i%gN z&QKe#*^aOOhSdPm?eONWzGweJEB*he|NjZA|4f0uxf(s$fzV5Y1bMYyOhhLud}*Kr8av9oZA(-7jddZ%^~^416RVX z+NgTGx^>=bLRs#~y_{Bm%CjTjcI0h-g*0X?p6xG6m6kEU@_kT6LlV?%yzuRUG1*F! zq{Rxi(WCht=wAs4^nM0U^R?Ry{&g(iUshrNCj$Pp(j3>&jag+!^1Gyly6;hdxfhXY zK(;p+F3;;Wb4RvE=uWcGh9WN`J71y(+0#0Abc$C>H1YIb;SK-G}0+3BIMxj|B*e;}|ppD!ujUA?Qlk}`gu!(@^J#sW1d z<7a83T3MDLRXrO$z^Bhz>)2j~K|1fQT%$+l=j4jZaLyqoF`bm6W@aa+STyRy{4C7-ghwJG_QICN3_>YS!&h1P62w4cWCfJ4n(itM%hT-$AY{rU%P0)@0myPo;>~g4Ox- zG&ir#62Qu3lj z(-}DBaw5&(kpt!t>-v#8&(mv8V>!6bu59sqBhPgf+fU?1Yva1+dGzu zO1(~Q)xt;=UHjF*Sf-=ig^kiwCVQYc*nOa8?=~nXpF-FM5%=l^bfY$M%5>)*vfZNF zTe5EyGzy}-2if4v&?hw@7}$Xm=z$lHI|Z5(4@g2HfOJpaVq6T=fEUPaO3z|?8~Ex8 z$^lChQ_E$yjgT;tz?}2K)c@t<4VCbo@*{%^<^p^8D6L*6_-fMe=1|)ET};0gTKl26 z>vnTU;4M&lUTL!$m+=E`$AsP8!9agk)516kgeNQCng^g5KbB#TL2+d5uRYmVUPfi9 zRly##VVmemqs?$fa`Xa9iqrkf1??rTG?DJYX$GDdoK(nwCo)QiOL7Op$MWg??G@Q< z0ypl;fzcL+%HcOPVfFwy=O|y&{>oi>?iXxl{(|l8^3sC4`ntdn)T*&Uu{|C{7RMk} zucMcw*0P75rAzU_$)mmh`^)?4ruiRLkF}~3+{b8@btoIrTztT;;}JP$lQCu?ds-~1 zcXDW8CIr8*RFpZu|Gm-DTbFehjLr7$(sMK5_mLQ^-rua9PmfB`Wkj_8NR3FtEXs7x ztODj(ZN3a$0t5lQ;>+P^f3jK_u12M!8Se0vUN%ukN!M>vn3XY5kIwL zvLTt|YU{NU3{iuKL zhMdG7(z8yEbX;IbLx;9^gfo5@vzm1i%B$&z zSHB3eRi!zzIg^0%wQIZ@76priE49FO-7!M7+mjdQ)%G@@UYrMntw7}jRou1eW3UNi z{15(ee!bypjSr!q&|@Fj7h5myNOp6Un=)fPY^vho9-ok~2~&mS6^e=_Y)W^phDZzX zLakE#frlUIVeleH9@H~!ZVtPh+FXJQG=*4f-okTY(o!G5R1gkAXdcl8y&y{lcltTR z@H`_cIvY+{A_Dp=h{2ml0}t{u?=r?0hm^x(-e!#@&Jb z=v^kK1XGXWBB@_Ru#9PDmQRIU{4kkiBy1CZ-nF()8(RtF%OenjosVJ(Y4Sj^Iw=(O z6QaJ|ViKs4x9UCM;e>KrD639`h?8j@1A3R zcJ*UQIwWE-T;IMz(#m_X&<$`3(K7od7@F6jgwYS=e+sL=JN7>ZR*?qhTE<_tP~-o? 
z7P9=q7OvX;wWYnO40ZCO8chks73Ksj8P67UhOA^3(tvCQLmFRvrm7=XP~X}+gMV03 zx(<)Sa(x2UiBxIM|LFtt`F8P?BRw9+y4OQdeK*b*LIR?+j+?P;z~kB0jc7>Kzzf4dV+n|K^xxsrWIxF$;UHrOiBQ30obq_yU7jpH)-4 zX9&Bt9=|ZW7r22!Xj=cN43mKEliad%#36JS=FWMR48pFgA{D@TJi^%dnljEMuF0Y9SA<5g^)~znJY2% znU_*Fb0X<~3Xmf9-B_heJF-(D7{!F4tK1CHFrn$|5JOAg<1u?ZB3p^ZrWg{_)umNt zek|mQ#nW^Rz;*HbTqCh+!1Ov5B77?DyS(VFUh|Q06i;SLO7C%ykAoh0EmCQ~{^L(W z{I!A%%>$pefVIVznv*uc+F$gh80;SG3IQP?;rBfR(qOajHI}9M*vuaJE+Lauo5xF!@OJShwyUu0rLpIwjjT0u~9cxaE)fFacjrAiO<5 z_XxVyvrz4sw%{S#Va~2!&$+HcK{qTAt1rPBox8*o7JPo_p`1m$`nQ9Y2p3laz4hR4 znV;~UoW7yK-sDXDE-txkWpc$!{x$LHh1jKg)njb>EJP2%(>1Xlt*d#Rf=QV`4aaxN zobjZFMKT#td^96%lt_kpBxV`Je~&N7Acrm{HDgq3v}Z;>X5428N(t1&;_YF}heaZ4ffie!xbIjpSx(^Nyj`SaKns?_+;8qTENAU&il>trMf=< zr*n`F`0IK#uXk0;k=bAb`pi$x7#~V#W7;M~t6WPojfA})8#v{e9*T|SFF%R)0u(_X)Lgfp1QvMV zQdHge%IO=Q6cSCG3*Mx200H|v>;4gYe*3UqmSZ5*@<)^FtVLZO@+Y&YrXD{6Pj(Ri z-!0&0%NUa>Ikww^sLO`#=R*??RNDFuo3>;QmGrl65o-ohwt(*i&PoQ312jry-YkQ# zbo2TjP%JUoXs$xJhn&E44tiSmo_-^q?CJZ|*!3=oKb7nxIde5=?gx0QMf=n;d(|9C z(4x6Is^fLkO)Q2f-5#%s!Qo;U=VPpOX=i7l#{~hfjo*BYh|CIASPzm*iZYNoXi~av zt(4ZCop<_fvVwVM;qIki7dWi2*6^5NpU$3)>-&S}qcC5}c!AYKMr*sZK7~6Z)KCi2 z#~aq=4&5PZj^7S|aM(e%V!qqXi?^m6ctmr}>L`J%RjNjJ=Czp~n*&Q1olPIukFAoh zv!7bC*{@ZVRt&^LBM}b|XxVYI$hn7F1OB&oda7m#%P^4R}UgSAJ zH}~%)yGSv{+KViNl7W|@8ox;R7{g$URi4oE#-%GPcAMIF<0W{Y3&=qg<-%ECsbF%B5#aa{GHx{@MQgalQ$=N3(_xRYKWY`STuz&Z7zrkmt8GU}z%2AZ^2X@pz_) z^5nMTjvSHifaY%BdD0UU2|-P1rgSZ_Fvh8KXEdU?M{OQ7zNb18;iUsY`ylEu;KcIH zG1F_Xw*O^%3YHHsm;YqmP^pbgJ)eFpibpo$`v-x#yG#Pv56_7Kv+|MAA#MpRs5NY4 z-A~B>snz`(CjU7cJuc{v-TmSyJKlfKQMSK1s<|GY_cup#mSC%~punIOmXzt&?QB&Q z+!v2{1#yf<5s=scO%n?$u)RElReqlOE}l4>+u)fna>9Ga2%4w8=k6(vBg`U|D(KXtfL>78N{Q_0p zFiljSRfr9RQusBzl}MIWHARdR8NH3_k&0M~K)@(ez?PVijYb;P+Iwte`*r=IVy(*P zU*J+){M5ZETT&~8&JcL_@|e7Z5G+x{822Q!&*X+Y zpWb;5z9IbKbcl45$IK(R_9)YD+63-lW6doq4G`H&azLsagT-69tbPfrZ`t^oBsNzA zZud%=TFHX7%5Nc8oH-!DB@FbO#S-~o7+jUDr*3Qb(~i&|@D;xyB5gwC16^@7sskHs z_cV*Uz2GalAWlV!UHjd7Uf}7h4A9M9f1U@3!Q0e8oVI>Q5yL((r>%&>RFs^GhANxt zpx|v9v}7B8E2-AYYyGKakCf_+6Gtr|=fGWMF~D5y7_<&iWIlat$9s|0 zk|iPI`A&wfO?{-)gX5nnomB}qNYm!B_nb2|Xu`zxveDo=?E$y%4_%5vtJ*=N&3N(Q z2erxZ*izcvs}jAj6wubO7kB{v9AyfM%I)pit6>kym)74b?)L5wlZ%<9U?0vAsj{UJ zyBWJ&6TwxczahQZwqy=|-&v*TozIlNXM3|Yk9iJ%qIK2`w0O<+8$=`Cnt8#8MV#jDtQ<09eYaMUOO5^xE;pXSZ{m`TT#Lx1Bm*p5A z(ltlL=mu(;0^)bSt!3g9Twfqah<0e7U9YXy!ERR~xYr|Y7rU-f780P}$fWx{vzLy( zCg5A;BIR@dtbG@{4Kdbo`L3~`P9(Ld0$w0cHCGe1zIff>ZV-kW{OX8!mCDYPk+f0g za%K^w2V_jddS;2b8#c^jWwCexb?TY$U)GR*>YxAPX~^H4{LhgH`UZ-7l&{fk3)Fuv z5$u0SM5Efe-3rr})mGM623IdlKqAWN5fx^Uvyxz2fxJQioyJ?OhYyxZdeB46qTk~> z?tlj7tCJAn4e5Q4;$5qw2A&BpQdYLZp^RWQ0_xOLS8CqalWCU{V{36-@E2zGmKw?` zh0AxIcpz{pvx4oNN*8<>2ucc1zyWJkgXLoL&fBldo4a^=$c%^e%+KfvJC7DtwZzAR z3ep}WsO_Ej>JE!|ykZiC;z!mm%dCc;9fusAoFud; zid*&*6f^}$N6Rg0_fKmrg>LNj!-o4Vm?$tcWe^|` zmE$|>#~2YR#Va}Uw$!jg`MA66u%&S8H#zCO&N(q-lIF^Id7c5YtC(AY>w7usW)XU=Z(LsmL*Af~7AZUz z_8eLGB%Oqogxh22fR(1*JGeoF12%2oU}F%lP=!EeXI6ZQR8$4Xu-A>9yD(1YiVM*X z5sBn2+e*-n8MgnQn$Ex5>OY4lrTI#0~s zFF!X`Rlp#)< znX0M>2^Y~e@l6~l&)wx#>Iys?cGtTx6}EiL)dtr%|3Tt3G>xcUh_N?i9V52sN69@5 zn#OAF`O1R4e_T^sWnJCiFFx)1`Fq^X<&zzF`#IeJ1cv#Nv>m;^YaULPig31S2XVEE z5kO5lUw4(uFr%g$BmD?anpvJu+oQ$SX;QV;iL((7iCZp=r_)HZXri@%Go)PkXn*UOOI9gt<(&RpnvZWP)QJ`*&rO3Mc1MZVF~wt! 
z*wEbK7^t78>nyJDxxj~wWFquLV7)-|maLB1w@p?)ZX@@L?ssAGA` z=4l9!^4v#|LMUOg1^(ALL*AFQbbMuGBm7!(IQnGG0yrDq=qnEMC^O5AHJ3qdm?^6pMtZg9m zs1zGs0IYS+HVwI^o?KWkV=Mp}j&)EmDK|*PgHywh{mJ8gH0&iFeVJY>j%yK2Y=B|< zd$yyVej0_0d5h>-&|C{jmmCy5GN%6V+!)mhMagJy1N|tAOrp9MPu2sr0!0Gvz#>^@ zO3MyB^Z@vd(X&nRVuf+}Yt1|wi%`C}fIf=;yhlkwu>e6zTakeaF?WC;If=rMSdnT8 z8veJ^7K5qIeV?L%l4Mz!z7vUvS{v2KO$)v%!V(;>W{(`LEUB_&VUATotzXMwIy6pQ3Ie`=vuM~*q{8m1$)okda7hy^C zymOx9-wCKgbs&(Oauq-xZjnndzZzMtOhxOG@ZQO?NR(6xSx8{+OSkE;2yaybRlOAh zw1gup+2&w%A51-NG+k=D`jzNfi;5Yp*g7rkujg(=2E+!YKmfiayCYA`3v6mPE!`zr z)x5X`(D6DX=pWq{5DWSF* zGgxNX)xXlizOZ4C$*0j0?aHpv0%>h)ybW8{D8nbECD`1iX}fAnTmEKK*<-ajTzth?88k2;CzvlHl$Ea2H#j@nG&QZ=&!g zBVx9ik&ictKl{v7pgFzg9-5+Xqlx(tFD&&ANf-SJ4jr}Q8|N!uF>XK%Y=y`W?wzoK z=OsI5rRben2jQM7g^wQle93dB2PLcf17!f@TXo?{-qC70A}z4xo{`hflI{jtUxtJa>gs%DKjOZ&iy z=sV6Z#dqgC*ALdP_upC4$S zQ$sRXZ-9-HofrZi@Zq+}CIlju+%-_0aVn?UEe~BC;3zN(6@7_w*;Gp8v=x1(Biq$IOf6CzNjYxWu zYs_>dSzc*d@%{O7qKePw*&|q!wXq5_@NSi9y`heCOoTuI)!i9KSQJd(u>f9*`{N(33a17tJeDS=6j(!i*PUQw|DgV98$!$31Zd#xX+ ztesbb-~k?of3%Tz;iOHZyuzb&E^SOEoGQ-WTkzY9Aeux84y&q;ldjAP_7eY>iqWF7 z*I(3#|L+%dez?8=J04$irY_zM;PDy%4UfGU4f|2@%K~3MR3=X_tUq*stf zrfWXRmNPr%#u3pa)TzsmXPsj}q?twT5Or&4#2y*RE$;UA+n$#>CX=eU>e)1r5x_V{ z7d@t3AL9U_S4l)KX=ua=wrM4^3~E0YP{+zYQ)FMmEfya6;S>d}M-~YUq>LJ4C%9qR zIshV&f$?yTTko`r&@87)3C++*rMrr!r~@9HzRd z{b99`aYH_rx6rdfq$yr}Bv|Dx5pjvCp>yWAvZcV{@Y`*&g*j0UHWV%kJvOsEg5n!H z7SB1h%Lf|?#|KoTDQBg%+4}Zp@S2#fP3Q|5_6u*TY;~)n7ECZimmXd5G$Da_qUL^7A1=X4-cr=NWM;FO7x!u0 z7c;C?6ArQyCaFM_TTD^{)oi-LWvBTfG=I~jj`Q#N@Y%-0hFF?VRVI9cHe|_ zLxtG&&FELq(H1?Q5t#8x1z1nvVOl!sumer=-esrRai(#;XBbB^o#;y611&idWMAt=FDRoIpr15+jJY{7=w9q8X{3AZd5t1T` z+C!0+YEvD$xo|s5hNSG4mLSrCe9-Xe%|b|FFGvFdO;JRPo&NJ$x20nZg;eUfx9Tcm zP0end>}b|o>3%qClXVW0?nS=*nKF+UfIjuy$wr8ZBRz{VGCV zcMyJASYT~N6&h%;xr780HR%rvIu5$=k(Oscy$L>0A=D(!@j+!BPW^(XP*{iaIj$}R z4c*gOC$5N*R-<{>)IOE{4lkPuAG%^hS;tG=#8KmOzo05^Ajrxk2)WrbiKk`Oh?cyb zb}{nsp_ulXl}g;Mqyq7rNxwCb2!jL#jWEN zDXwDt!unSb=M!DwaO@7ZYZHp`PmL?#d2dwG!rZ4}tNihDod_jW`DESk(VMewV%hTP zpfOl8Zk(0x*4LkJM59^h&xd5o-++W_#tipqVG$2vo*0Ty=T-&gvwCX5s33t>9V4ZL zdqNzB?@&MmHxZTJF25Wnz8E$e{ShbBnObHc2zM~_?a*Z`;bw7zNmEt^ja>1P9{x&_ z;hG(xVa2=qbrwQZ@Ek&re!oqawM5^b8?FfIMxZl*ks3-2wuNw+X5B5!M!@Q|Pe2u&)tLVjo#9_)i zXnPlAlfKD3(8m z2&p|9LhuKB>|&99Rsh@k1P`geBDsA+GIl3YX1&W=&0YV8J3Bcz-}v6aU$NaQ3u_a| zZM}@8Ne$tbe`JPfEOA_9kp6UO_2E?RdaG*!^5sWt@|$hvax`t^U{G!)?sNTD>|uuQ zIZ5T}If3X>jw+$0rxKr0vr6Lqn;Tf*mPH+ezRoeg1Sovq#U8Lu^VMSKYG^v5)Csfu z7CrP7MIAq~=9sD#Kgr(6dS&E~@Obgb=YQ~K|7Kdy50n1C7UUe~1k&yx$ z**XuI>QuQoY;-l4!{H02M}GUpnIi}=3PT|7Jjrp6irrWGPQF`=r_r!niu3!8^Sf|E z?eFPL#|Xlp(rvI6rD`l_Zfy3|>>ZP$s#UAAbXuf4LU#@Cy_l<%H~N8<#->sPI`b@y zftEYYiGW&E)mR~4atf|6fwb4mN^dh5Uzvewm!v}s=-0>+s^0@iOL__~d3FUPJpip4 zL=WKChIRmlG;%GPR7{s#`tQ@?8Df1YT72IEJy-=4jcMl z2yVVGc-bcylm76WH#&4m_EzGEnzVry5DVSB6Ap`X>hG2Xs8*IJAheqb2ndtwNs*-S zEBYNhhc=!N7Cnd%KAW_*F;vOaS_XS@_X)7u`Ohm>+;9%m*EQfCeKJcwTophE$ptoq zn|gyd=t0UVWhYfFbz>Jan>z`+diaekvRZi3k=7eV5~OWPek>RZo$UQb$8V%KU_5v< ztyr-hw*J=K6EnVqM?uc8b{PB9}-wzSURrcReKsO&yg>9#DeWX8^T7tlHv zVX;O{yeLGYBXj$7a-?HX17}zlC7F9zT?L2N?Jz0i3rqJLhHm0hme0fp=uBcohWE+& zq+DoJQbl?9$%RfEF5#a_jd#JaH8|SEGiTYJ?jmZwtCAKdPFW_^)uXW8O{s#Fn5j;e zzc|5As$$L7iSP06;Ho2=!_5LsvB#+%2`eP>6@YktK0@~35?GJ7SD*L>ZI3~<+rpF* z{IOcSH)k>00;S6(lTGcE8eO*cRVY)OBiT3M5E1{N-L*X>zF0T--Imq+OqbA;D^v8b z2$Cv0iaaH-ntcI;8Q&nvR(W`<&m)8Wm|Y<5aU`Tv>Sgn9zwIx%wa015E zV|ez6&)E%G^)IT7j5a>9uni2-ckq&CSVul6j=ixTBL{u(H6PBna45)N$|!z`uHXt! 
z^6^9LtkCBSfL9AguamEcS-;@xq7wU6jr26RoL`$+-d4X=4DP!mcZ&8{hj( z8tm$wMVQk*Nf+KbZ#lap$}6UzE2fW8&>N`VIgFAxj^VZ}J9-WuAC@pezq}q_vH$!$ z*$(B${+VY=@bmNhOWM@Xz@^$4g0utetq(;vb&%tZDt0Ak(H>eK z+OLN{F*7^V06D1I^bJbvwPP{vU%^JRfn}M#4f!Kqo&;XVfLzf8d7A}gCe4&cV z=R-D&L5MtdX}d@-zO$0w2DbkdFqOFwjO1%>7eL_tBHFIYYCOAqm zW^mT3;$1md<%D`{kHwA!k!M8zve9eimBOd10@ciD$inyob#smEpsiOle$-C^5|GX3 zh7{G$vQw?(Abh8H#NTzB%O=DQlIx*y+59!e3c!k_^se8a`nNE^;kEH{LgWCkx3M9w zwPIL2ocA$n=SZ5nVn8WYO-;}Ix=xt6E*i*uiAe79U%=6XR0JP|bB6xDHd{e?>2 z`MU)K+UpnAxOK~?k96g_*oIfR(cqzvYC7g$FXXFxSK_2RddE(w9HuRIr*sGOf(z|j zlyIfhJ!Q!|e;MfE3CJe>?{@i9ZqHxsQW+A-eEDzfvc*e|*u#JbqYeG-Pj(6Zr(HH^ z%&FlJro6@0CQ(I{OGDrs3{W(JNcS}gJsi!4HTTyZnJt6b2j(rcRykFEWOJ=$SX#2k z9Cz;hH|(-5iPXQWP&LBk(=X-3dM@$ymjQP90GO)&&MtrG~Dd`qwoMVtb<-t!wT;nizODt<3uW_ceaV5SVa{wFGjLu( zYzQ)LjC#{T9MvFNd7^lB9gJ(Vkft#&86zCgqaUM4Qg*M|VXL;kS&&7peNqTVjpQ3o z=bJi)P6-E$@0#vzpt;7NZ8EqO#q{FV>k-&NK9=eXOsvwH;Ej?MO?7WCo9L8{#SU== zM5pluSMW3iqYRM6{EWNa7yC0o#pE82bqz2B!Nn6`a{Bs=5ht+~2`dlj=4TS6UVi1# z^_(yCt0Ztvi$EDsCzfrTF2DCQ&r4wtWLJaf73$i8c2w5v`1E$j@(4Tj4TE)|$#7@_ z%~}ccZvH(4&m;K<)|xB&(`632ZZjLwrnf4(c+5fQ9}9F5`!4k-on1ips}9q2(?HIQ z$dm%4{CgN5`$}>?Y6RLhp`Si#e>S&ai`#rH`Hku}DGJB=nZ63dd*wLE zIK!0G0EWf6kQq^zkhsM!U5AQ^T{Admk#JWBt*@ zMX{65#);al-nqofiM7SRECx*{9S)k6=NpAV8_6E;)`#p3-+P;bcfYPNo3vPa{qG*& zr__<3Jph{dF_Hf!$s>@MgLQy4vV+3cl}}Ut0XtqV-?hD5V?h7m0jU4W0~GzyGoStF znGpd!GhvueZE+O`KOOsQ>yMuK-0_c|`Gy1JN6-9r8Vb-eD*<|DF6EQb470(c}bgHtRvGQ{OBspW$HG>sU}rf zOUWSc=ZGlKngu4)Anh?t1Y|y5v|L^}4oHcxq6K4;v>*t3!HX-YlxOp?_t%J)qYEpX z@qc4YSmqua3df>9)ftr%DX|Wlvm`aWs&Lq6FKdKRPxXj{^q*z9Ys+T-U?adi#Oj~1 z6~RrX*${n~vxD~-UpXc>hMl{@S(#l)0bGmr`-?- z98w7!bx=m(wfuhj5;%OL5uaTcvL*<1g)4y7rAGYhLG8&Pg6 z-d?k(yhK`Netv@E+$J)jA%^MD$y@!@NbU4~v`BFDqFY?t#xW|IMSrZ-cAQUM~FhAA4!7*CxehmD^|K-a%C+l+;O?w_u8(b%C2EG zL{}c!nc?>!Rw zlW&O$$HBH>^&06-S6?oxO(z=j4qd5Hs@d#h&P;4!R#;QLYKUTn>S(iYORq0hNLG>u zLHnY+En8_HrN7w5po=Z(XVZ^o4&bN}s(8g2)-xDj-$hHjgPT^uMBTz4BzTb~`0U-SmS(0-BJwWWp(!k=?}*I}oF+_|^HyosCA^+Ij;{g=u$ z|DNCaDOKY?-){}egmdx(MlLmgvZ258TR&vS6)J1C*o<&K8+rz7qtIi{N&1Es zb8{p`#``Lfon#WAdrn-|#H3+)KK|JS4>WCvnDY@iln=76QdM#@1MO3IH8x^Q3&b1C zJwdjDp+4==I>^S0&?BP;y|z!Ssa5?5E2)sAOZS8?$6cz4(*2r0?SQ5+d!fH{enT}Y4WI7H(6WH9q@-;ON`;gg;0UwI!5 zFkUKH<8)z6(qOfCt9{<)jt!Ca zW18wgGc#_Yi$H$kPUnfZ}-Bucz1_OThPq!>x@TqwLOryL*ntq$38+!t0?V># zGwFls65ZabPFY?EIF>kG-ko6GS`NG{c;j!vDnZ^Nw2pB+#Utj@pb5uGAhwpQEd8FU zRkb?|#)A)3b)5%iD-DG?VKVPF=GCGVR~s%Ag62BYkAAVUe{Y8HlivMjJNr+ZA@FCn zQ7i%MjPcFy?d*pbcTa5tFhfA}KB(dlDTP6vHD7kpsr%qBy8;^-7HzsqCTlrtA9?!W z^t=1GY|lVT4sRqww^aK5**l+u^KJY=@)g0ZVDnz7)SHliY6R&*9v0NTgUP?0e<`$>A(c`**UlDX7ULVspA6&}cmnC>0YF6NT&?>CnnmuEw^J+-6<{@=M zJ^lzSoKN&tdbiEz8OFu8rTt7D1($3KL6(dzU7?rVgoU_W@PiyUF}ah3lbB+DsADq} zTxlF0;qVFen6mAl&HW0?i;k-3o4WD*wX+bFSvK#mwOSkKDz%bLwL{a=&5Jp&mc}(w z77f`J(hQNr%NY=iXP-t4wL#UmIRhA+4P5-)Mn?E7ga)qYL*Lsll$GRlesJ9REZNO0 z*6njR=Rn6|RV?n$fD=WQK-S7_lOI&(SOR&0z^$d;V#pA_l~`b$BiYO3%1h6DJMf^1 zKh^gTxgsr5_H8FT=8PJTw{yFff#+ki5BUQ}q@8NZ%&KQqu{XpVW9?*KQvo}a#rw{@{8Tkm4u|#+=>LJLZOgWnk@NLgGIe>vMW;SwpBlfN9e^UbQ@ro({#O!(b>JFk+X@pE1AqHer&r8*b3@q z;=R*>Ij8)#wHFP0$)36N%WV1I>u7(Hr~m8#{u3t%%VcU9;eZK(AfWum?>gEaI@>k% zl|L4+0i`hEz&O0^DP&qDJ;uOU}tnLvani4(SgdcnK!gynX2(Hb$dxvVclvfO{qkUn0320n{IJh$Ykjq zozKbN6%}NH2qFB|F@(LOVyPxHBz>x;Tl&;>5JlYHD%*aX9S1i_u5>jy*VLtyJWW{a zP;nsTE1iEfcv1i!MILnFkBujb-nk84A3No$M>vzX+m4P%&e(A~)W?4eXWMt0^Yy{( zBIh9OGjHC^yb~q;Zf5Q3A$j5X!U^feW6RTU6ZK z$?N2b0FOvOu)x+_#otRDM?`67(YeN}Namo$eN`k> zKmMTloVw=jTbDkeuJH~d zDDiR^aNU1be#&Im3y!g0<9_XIx)p-W`qt0+K7v`mm65hMJG>yfc$cIAYn7?CTV#R> zs!6ig=KHHDQLhq_DWC7q0}otE3=$_S#ugD~w8V>x%Q<^m~sSwCX+UukYx 
z#3Ne+#q>^&n`#_wVI=h{u$_RCpelLNgPYF6jgKQeTp4lJ?FCF3`p0Mby1Xcr7^_q< z4%VT-TJLW5dq?+^!N_qb=~P|Y0BdB1??U%)dO#s|eqmE8hbLGq3T8{B8-d2zKLulI zFkZTd2tDWPc!;GrVaMAzamH&sheKfIDaM3Nx-)xT>O>xWpj_wmUTqybe@QI66aN0$ z8j?N^x0G&$Y*=_IBj~)T=~RyZ<5v;ozvr=jlFI+*d#rQ{BIXEy$Kv@LkM%<*y7#BY zQs!6(dV6B1;hb(3U=pxfAQ9mHgbD^*DZC^s5l%{}B^&265gYHawnn0!9TrvSJ!u^r zV!97!>-|`chBL4HR&~@9GbWc%USpOtVbt?}uzysnmJn0QNWzuY{N1V|v#0azj1nt* z8<-W%So&`Fh$bt7kWzUufr&CT)s*U-CSpXDJ63X<6{7M5Ls%{iYT$6TZPnR~uDL~V zT_4fZgx}$l@2pBc%wsMcwx2p~XyIx=8k!OrNAFNLIQ`mMF|169NBoJJChe@&>6q=j zEo(Is!Zr{I5(|QVHT#W?!NL$B8`SUp`|{O!7#@52n^)9L5$Xuw27$y@{S4 z=*X=Hx{HNOLua$w7=BcH&%KZl@kDcy5_uh+epJGFwKuO(brC&f#nMd_t8RcdQmls^ zaq8YQK8)4$8MbwAR!Q~yG?t|1OEkwms3@!BO_jC{kLsAl5yd@5dmGmQ_I%)h0BY2r z#=KbL0s7fyi+T(EbAKi+jfUdKFI#R(D$2D8bsuEsTW{ZZ7PlhyW(?^iyL)OK@2*JU zicJ|Md&``xKXev2rE@)&%Z=mK(-pJSv+;AHNFz&)DlN`F%B6>78_{8R1ih)HMpQP_ zO!cWScGnucbeVKul9*XqsKY6m`H+SqD}Ajdbki8i+ym9dJU)c{Rdp&H_2bxe%T%O= zs}Q@FydF+YVX-L39?JNZ5&sR78x)iivMIvW`NYtZPBRdk;gH{EDe+M#MbO@DEkP-{ zGXWPQGz;8T5Wd2TJcx`0G=D(eZU-(+$|<2+6m@=5p()HD0|4svK1k5qpYkK`X(yIno2{ z#zfdQOM9$Xb|o13Nm0dIA`oF^SnuhHT`@Xey8}4}Wf)s_Z1By>qO*nfO{Ah=uZCFE zR8qb|e~ICNzr~V`Bg7+DC6#|HqKV;n z+%UVC0Cx@u%z@;bp0hxm#`vVIO?F*w5Fqdr2$an8%Yu45-~~ ze$Brqp0xJ6(-wR^0A!y0ygvHxxvZbG-v9Y7D~WVe#0@aZ>iL`X(H{!Q3YBh)w~T+N z1cTd~f|hG^pt}3uSqjW^tfe#fZ3Q&O+(qi*Cx@allsXEs=IM{bEeCUC2xUCuRMS%L zMCG{i@^)0->xcQ()$WXEFPsA{Pg7bS?2Pw644xT4Qb?BN+oKO=uqqz7UL6m`3{h+TLs3~;k$g)R8*LZ~5*5`AZk88VXE1x;SG~c97&Y6l*!NV__v9PMHZm0U z4gxt7WV=){<-R*9BKWgrtS?!BVg)Z6$~z5CW~#B>2~fkwn$ej}Q*Z2=hZP5G{xW+; zimnleK@mc>;HXb#0aqdS#Gc=~f;>;|*>CJ%y(;qrFjtxDD&vJZo37HjxCUfP>Xh&B zXnoSW-q@dFQIQoJeo|x#58167qpDozu9qcI<2jJwWI{Eyw`QRC#CE{8XNqu$wqx+e zcjan7^lsp?Fpn0lDPY{4GOeMlS;TwU0BKu+vD>!V9rZ7XBb^_ofMf7X#nG&Art^?D zObF;X<~EgUg!5iJH+}Zl5v5Irs$^7W;XgEvMM6{VW|ij-?@9`};Nr}n&7Df7-X9}Y z{LXj2O7AX8XQiR1%F`1S&H5!kb$g`RE}KP7*aD;uqXkxQyFpPDqh-oTZRi;>pl&vq z7^OY%pxN9@5<5YaV|kPk*n90I5XBs>I}Zc_o?ukU)_{R0P#rSySOkWTeQwRCaU$;^ z9+FPZ0T@*gtL}iBd{YZzCm*>_YL)o?_XKz7$7!lf7xUT#w`p{jm&w~}-kR|y6nkqn zZ=_3LP;vh<5fy{UM(u$_uB`;@tVLcr!@KoFGS8&%R;eWq;1QUzSIGx#)kmh>7;#yf zzb5Sddts2DG`&9?=zn4u#N@0()fh0QXhHx2LjPH{@nfR}#vh`;H8tx$)=9p1=-H!e za)BbB^j992sI-g;)f#GHe4%8}$AlB*S`ws;C#D=ctA6bu288ss;a?_~yPo|LU0o)4 z2U`<{G3eRfgs@sDNyQ=TD6oK`2GLl4{t{warJErWqECZDx}&K7ZZ#LHgXeWSXatVl zFEVIY9k0AxMFTU0zP)B=NE3#6Od{%VM80w@oGDs*Sj?}VQE9c@L1%0>5>TuTCaTLa$Y?1&qWb|d=ib%&Td4)wiE z-X=?h(@~!&oe3g_XG~F@fwZnJt(atqLb{l)K)}O_B(9u}cfl$16DZQ$gD&93%-_?? zvu)yC@xM0!Z5*HH!&)ghQ(mo!t+E*Bb|@^`A4<_8Z!b*EZJvq@=a&z1qsZ&9rd|4G zml2O*_x`|nbV^Cp`w)y&wG%noO-?}u_ru;;0xlrrs!NuhiUt6nUuuJ0? 
z|C6lLN8{6+!*vUViRbv-69*UHjw0m8h`WbhwhsI2aN(!yc}M!c%Bg&6?>OQSL)jvjc4{!4;*UJHQKC;IGG9K@9F7rNgyOQ64_H8O)RsVk~y+tKXqa z8d^{(2AC|$d%7-MNRX0HYI6%P%;}q;R@-B}Ho_7$>ms&y$|JsgE5X!OM&Le7fYpbf zhV*!yUd*R}LMXb~^$`c&gz_lS$Tf}Iuu`&AKSsAsVD0f75?DG?9+gg#SxaTId{^|9 zsM$v)tV~5ui_w%Pq@Ln<8!@XqNC!#|U-M0uk_lU917v(pf~fAbwM+}glR=*zOzzBC zuVvde0Vhtz{JD8K*(`LE>rc4)Hy2D8_vcW?aur$nStUj7i;@qJkhe_|c{$p!PRU9c zFZrdg9p_r+cgI%Ud)8De)X(hha%J?*aK(A&vK5_s36y$&K zDSxZcREGRd^eIg*S6@;9bC6&_WuxB?aeipu`c=nl07~PIb4oD1X%L-s=Pr3L8JP05 zaNtvP7Ru^RIg63B%E@4ovfG)rG#+)l!xT$4kBzxn%wLI?$Q~|d$Q5p;E?W|`0k>*Qn-qGlHD9)W+Lph{=%{Z2Pb%(OX!GEZ)nnF^p+l*|>^+*ux8tYG#ngyhGC zL~u!MBhSeiaj6@>XV)&1r_SKKyUMAvIkte_>c1zPwA+y?rpZDXoUrg)cg z(Z`un%|x84MksH#<`!SY_g>v-6EE}ooAI%f8idx&`rGvVf8XMZe zutr0^m5@_Vr*<_!3g3yeqxZ7`|8PHVakUmw|w9bGehDkPmd91Ka7=6h@S06MQ35;K~rJZV!g*cCP zng~ZVtFGgnk8wEoKp$-#|GwKw%RCYF6QvS^SQS}>Qtug59pa@xks~HXSVhK!m%!H@gF)0z;K!pH81gM^Kfem}|OZ;c?m$lhPV2W6Qg|QTv^;Wh2T2wg|nMcsvpfos#Zx9<%*2?Z^>ZsDr7JrY7GeJy#Q; z8F)~l%~9oT-??N2Xj%9>;89tSJm^1vTxBa=-Igwpl4C0HxF2%wY5A-L14qBC=MTq@ znW9O(Q`}(JD;*3mV6hL=;Oi($_Y`Ct^FHgSicH%n+s1W!Ak=5&GUxOA;{3enZt-d` zM^eRYZr2P>9-VTfhE1so!u};SMcZ@e=i(XY`!fC0H3n0`>@U@jS!OW`IU>e66|~ZC zVY{z9JckFPBwm@l3ha&S##PMy%&k$SuJELrx=>k|b=zG@6eT`;!bUZ^-U;JjN{^C! zwY;dH_OLm!46OVJjOYEm1oOOUlp1Z>Za?;s>{yYEX3|)d=DuUmNMecZ` zt3WAs(zB#7RjneCoChObCMYXPK=s}_vsf@ih1@&BP;X}%Ujj>Wk#ntc`PACHvgoul znldY3`r_h*J-wbf#hTMA23E&S`jVCr-cpoGBb(bNEZScyx3&1_neO64%9EP)eRP;o zF!~Xm^ym^?nV1S!H>u5RF;g1v2KBE5Gyt4&Y8E(58Ln*fd?X~?#ii<|yM7x!c~9ArpiInh{QFI{$LaUX5|3m#=u{H>(1 z#s1+!`F2k=H%NsD4_*|R{a8~Zm(L5bkM(5khc61*6Mi34e2Z&#IWx=JN}$WY87V-D zE1DXnBAG@paxZG5%*^H7PoOx zXdWw}YRKFT-me}~G+67g1`ozR4M7b4q{CP%s9rohXO++s56fku&e4lg*)aII@iGrO z4?dqe*p<9{BgnNAWm{T8ECj-_+o3#>VzTT0)o81KUw=A)NCyr)nUBKVLvC|siemGe zoqG;FI?e(=Ko<&J&(Q}|z#LOEgiDId*5Jw@L7Jcn+s#ssk9fDUCbQ6PN*!!vwXHh} z!diK1mxMe;_CcMWqQHZM=VZE0<_$#H=mvY!PUTginFS4p3T$;cor%zbLMq1NV&D}9FjnNh>wf=Bk|+fjKO?T)p$4+q_$@jpW<$ZP0H@Vy+?e*suFwia z84*1j1UrsSkW)%LBotC^4y?!Ka$K*sZNYK!Xj$~5`;t^(IYoQCz=IN75z4S0 zYfr4Z$Z<#Vu!F13cNNb7IgJm{2ix$M>US?q`o`2W-`T_-knCr3ek%t>xs{Aad#diE zqL1AGm#?@=@P3u4Y-U*+**M)kM}&$zcn9`~3UvlzII0NKRGm8yTm~*651qI_t-uC9 z(x;4OPsT3fdvA=Ma#(2NZ~k;`=r4_)_m$&)uqCFkM#r*x>RyG?;}Cwc8iMpteXx&E zR(X!;G`0TV+u->`$b}fyRb*y1{Xu!n(kPKpd{cJQ4twAcv0^J+#nmtP@iSahot~Bv zWbvzkoG-heBHqVu!Ogd}V|y-k%qv(2O+F5a;4ntfHGLO$BQ-^%VV4!(uFR<5Q0oWHG6H)Pz9Jnp5>#vD zROslNTb;gg9A=!&c|NJ{t=vgbJol}oh#11s=R)$xb!y8uwa_(^xw%gdb^`x@8ZrML z6E|vR!&l^i0|D&-qWTzssJ@ZydwB<2J4gBtW=__IHU{QabQX5Trl+cwb|@;S?^-?B zDXqs4DQHn4ds>;g@@WOBLCDOP3lrJ$X{+pmB@$#%(#Q;4W|Czy;vV^5h!Hk?Xu#2M zO>va#vZE`AvhzSg!C~OuBh}wy^UkE=3p z6)CAe#SHdy@FZ~1T(PcIS6#xLD`=3U<^rl)$;gS#Vg+IWH=0r~VWZLeJe_?Ci9Wy& zu`3unLmDLQ>_MuZr`=Sk!vz9|i!dR5`nr(3g-(J2ln~5un5-=mss-%YNlgN%dC6=> zZxJUHTJJ(Mj?5^~CmQ}%SzFM)Jv@avT{8rnaZKdkifb)&1!%4k8TUCK%RBu>W2AXqUb0|JaK(qy` zX{FRf3g_(o>UykI%zoD!h-Z}~3+Y@G5|KQ^l=k|)V{Im}0U70k_#`~N650`4gomWs z(B?R6!$k#*J)95W^GvMp%>-b_xr`4}Hm1-Or!;-A`jk6h!2a?v?XVzrSa6%ZObk1i zAV|D;7k>D(AnxFG_aC{lk*rJHoP+YV!SV!BdBsh4s7g4GVNCPV35E$p6+_Q?6eac( zh!o)n2%Op>_v(>wto%32optS3q_6F|Har2HS`2 z7ON4n5zD8oaD=j~&wAh40pG5*X>IjDf7885_e{u!{tn_!cUpJt-m~vccj~Q@6mN+I zt*@TxeAJxGbkgC#N#P+C%H;>n7EhgK8*KN1fje5ab|cz%>V973r6miBZcc~a1R`?z z6{&E1>le~r53QJ@cTrYYiyA^R_b`Rt&n22k>rYBApjuwBqM;66%}5LBG2YIX$!45D zzyy-0){Li)l;oKvjoYYtB16?oQq4rnBg0jodA-qHlGkS$XZ?7gM2cwzyikq>QM6$z zuN^OdN+Mu|zFO<+4LL{sm@D+q!MFs8%;v_}^{jU>&h@dn0uHo|4tV#A?BdmhEY3F~ z{yQ8sb*t$02e@l+-(5-?c^~7nOB*h)geAVn ziK8C8quz7LBVhgo^bj)lKumgw-CUH;-h#Yxm`c6TUTT@-I(l6pTIhfbV?F}6ce>V5 zG-f!1$w_lvXU24c*VIPgk~bOI(p;mtBDb)?=7?!kH%04$#nr2flyh};x~1VXCj{T^ 
z!^`^^zR%{@N^*SHUmYhpC+nS7r06?qUgj3oJx@+E&f8Yn0h0*)^9Q5zj+ZR&+r6!- z$7mehc^}Ws8$d_?a`x0+%a`71ce$`Z_ws!(%SS;+*|fZ1Hb_KYU63QUparub2>B*3 zE4B*DKr`2=5{vV*zK&+H+fFiWdoLLp*LYMZ210?UTqZBW9#2vsMJuZT?yHoBwz}Yg z{m|}3*AXW#O(3<}Rd#|G18#6|j)55_c5v`3hw$Sopf>{sK?Owx_;4s7rnXS+Xbec1 zTR_!>RTLl~z|VglzJSkwfV6~c0p&|?P{857GSAfem>|cQZ|GWhl(E*ZU z|NHO-CI@`-kENv_`T4(|7=ik{;|4f!1@LbG0os2bzQFQ;1)KlwVcy$182|3NnUm3O z=YZ3#0hRv%DS7{W_yXSpKKbv{4eacGcXqWsYoagU!Vd62Kz}sGKM!AEG{6+n&h7)C zQj4)8;LZ&`nA_U?4k1nfsL>s8{b9gOF{&DZW;{3KPw9~~VUSF1|#?6aa3%0r30tw+J2}h5mtHU}$Bm^uf)_*ztFF zP)M80dIq2{P4|!5-v4Y+U*O&U0{EdGkTGxsRMYvdf*${P^IJYxqn8hF01Yw#c$j}| zocqti7Z?d}kALv_3GOcopnn1V?URxUlUM5kG;;-@VTt|;3wZSfHpBZ1@;^7|{c)MU z?t=bn%l)-_=O1O0{&@iK0Pz3K^q+WtVft6p-%|Z`oAN)QEr@>u{a32LdqjU_`g5Dk zkpBkmUzz>^`dgyE?(+L5(OHV$ApeEvkHGZ*)TBQX^##2B-<$N$X!ozkzwYq)Cs9AZ zv-=n1zYzTqZ~GPQ?^~3Z={Ioy(xQI`@qR`Ab)UOGiIOn?2Kg^Uf5aMoh5LJ=mh8WQ z`xl~r1~7g_{&gd%KZ#m#{08|iM1Rb%euevcqS3s+f%_Mte@@$eMgDa=lRt^Zy!#FE zUx@xlbNLnS?}=^-{08n{i2j*7^DFYNyCnQcbW89z$bTXFL*4!>+}{(05d96@zYzUX zGXE>`uZ7zGBnm0^8|1$b{h?|774Gke-b?)k?q7)hDU|&c`Pcfwe-eF={tfb9i2l&_ z{R;Q@L_aJ12JT;o{wWyz75UeCntu|lQTz?^Ux@xcg`B-g13?sp-9lf%(x1{&()t88 zR$7>#f(i)=iYO>(R)q)&2obc>2e9@T?0pG)AHc$KhUMOKyvLAYNwFVaW|w=iJ2SVe z%4424#cb-Y2fRO~g&LN8?v-b%*Bb>*o$|uYc`VE=z>GSz>4belFh+3KDMNg;0pWhF zt#4o&w{tSudk1LwT7+rl-x1F@=ci-(gRl(Qjaa)ZBSXWznK+8g5I%%QBE+CwcP9)whZ$h&@pk<*jh4dvG9st4#>Ez*! z#U|801X>mf(?*Zb=m-!_NGA+$bTpy*7-(52OcgD(ata71q?3gAF`3Zv8PKv&m>yc_ z>l_eHNGAsGQZS*<3!r78FeUUoG`<9c6Vl1RtFldKa0Rq16sLh7qJ;*Kj&(J60}N;M zPXh4D%!Zhrz;6Lpl)}e6t#mQ~hF8+nniqDNQuhvUMQLKW#!+ewf#H>Oz2>DVrnEEy zTu}-Ow!f87Y33dnUP)JMo@+OyZ^RX)uw-kcx3N^xHJfJ?P3iRka7Afi(Z*5Qe*}gf kC0(_7#>AA`Pww>;&Y!HUPN#PuX$yJ!yrlOit>@owKR-4{djJ3c From 69a3e450223e74f606553843b9868ee7b783cd30 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 8 Apr 2026 23:51:41 +0000 Subject: [PATCH 33/38] save work --- docs/softmax_lowering.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md index e0c5a0e8..f462eb9b 100644 --- a/docs/softmax_lowering.md +++ b/docs/softmax_lowering.md @@ -3,7 +3,7 @@ ## Overview **Assumptions:** -Softmax dimension size is small (64 in this example). +Softmax dimension size is small (64 in this example). No tiling in reduction dim. The lowering process consists of seven stages: 1. **initial** - High-level tensor operations @@ -302,7 +302,7 @@ xegpu.store_nd %11, %12[%0, 0] ``` --- -## What is reused in between matmul and softmax? +## What is reused form the lighthouse project? The softmax implementation leverages several reusable lighthouse infrastructure components: @@ -326,7 +326,6 @@ The workload-specific code is minimal (payload generation + tiling/decomposition ### Payload Generation (`lighthouse.ingress.mlir_gen`) - **`get_mlir_elem_type`**: Type conversion utilities for constructing MLIR types -**Key Insight**: --- From c5e5d374abec1e989290a0b298369fbd35c7a540 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 9 Apr 2026 00:08:36 +0000 Subject: [PATCH 34/38] save work --- docs/softmax_lowering.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md index f462eb9b..e3f6518a 100644 --- a/docs/softmax_lowering.md +++ b/docs/softmax_lowering.md @@ -8,7 +8,7 @@ Softmax dimension size is small (64 in this example). No tiling in reduction dim The lowering process consists of seven stages: 1. **initial** - High-level tensor operations 2. 
From c5e5d374abec1e989290a0b298369fbd35c7a540 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 9 Apr 2026 00:08:36 +0000
Subject: [PATCH 34/38] save work

---
 docs/softmax_lowering.md | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index f462eb9b..e3f6518a 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -8,7 +8,7 @@ Softmax dimension size is small (64 in this example). No tiling in the reduction
 
 The lowering process consists of seven stages:
 1. **initial** - High-level tensor operations
 2. **tiled-softmax** - Tiled softmax operations
-3. **decomposed** - Decomposition into constituent operations
+3. **decomposed** - Decomposition into generic operations
 4. **vectorized** - Vector operations
 5. **bufferized** - Memory-based representation
 6. **xegpu-initial** - GPU kernel with XeGPU operations
@@ -65,7 +65,7 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
 ## Stage 3: Decomposed
 
 **Notes**
-- Softmax decomposed into 4 constituent `linalg.generic` ops : max, sub+exp, sum, divide
+- Softmax decomposed into 4 `linalg.generic` ops: max, sub+exp, sum, divide
 - Uses `structured.structured_decompose_interface` implemented by `linalg.softmax`
 
 **Code:**
@@ -331,9 +331,9 @@ The workload-specific code is minimal (payload generation + tiling/decomposition
 
 # Supporting larger Softmax dimension sizes
 
-Previsouly we tiled all ops in the parallel dimension only (i.e. non softmax dim). Handling a larger softmax dimension require tiling the softmax contituent ops in that dimension.
+Previously we tiled all ops in the parallel dimension only (i.e. the non-softmax dim). Handling a larger softmax reduction dimension requires tiling the softmax constituent ops in the reduction dimension.
 
-**Approach:** Tile reductions along dimension 1 (using appropriate step size) and fuse producers into consumers to enable streaming computation.
+**Approach:** Tile reductions along dimension 1 (using an appropriate step size) and fuse producers into consumers to avoid extra memory buffers.
 
 ---
@@ -357,8 +357,8 @@ scf.forall ... {
 
 // After: Division tiled into 64x16 chunks
 scf.forall ... {
+  // Tiled Max, Center+Exp, Sum ops ...
   %11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) {
-    // Max, Center+Exp, Sum ops ...
     %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) outs(%extracted_slice_5 : tensor<64x16xf32>) {
     ^bb0(%in: f32, %in_6: f32, %out: f32):
       %13 = arith.divf %in, %in_6 : f32
@@ -380,10 +380,11 @@ scf.forall ... {
 
 **Key Changes:**
 ```mlir
+// Original tiled div loop.
 %11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) {
   %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1]
 
-  // Fused: sub+exp computed per 16-element chunk
+  // Fused recomputation: sub+exp computed per 16-element chunk
   %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) outs(%extracted_slice_5 : tensor<64x16xf32>) {
   ^bb0(%in: f32, %in_8: f32, %out: f32):
@@ -443,10 +444,11 @@ scf.forall ... {
 
 **Key Changes:**
 ```mlir
+// Original tiled sum loop.
 %12 = scf.for %arg4 = %c0_3 to %c64 step %c16 iter_args(%arg5 = %11) -> (tensor<64x16xf32>) {
   %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, %arg4] [64, 16] [1, 1]
 
-  // Fused: sub+exp
+  // Fused recomputation: sub+exp
   %14 = linalg.generic {...} ins(%extracted_slice_7, %extracted_slice_8 : tensor<64x16xf32>, tensor<64xf32>) outs(%extracted_slice_9 : tensor<64x16xf32>) {
   ^bb0(%in: f32, %in_11: f32, %out: f32):
@@ -608,7 +610,7 @@ func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
 
 **Notes:**
 - Convert `memref.alloc()` to `memref.alloca()` for stack allocation
-- Reduces memory allocation overhead
+- **Why?** SLM can only be allocated inside the GPU kernel.
 
 **Code:**
 ```mlir
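The reduction-dimension tiling and producer fusion in the patch above amount to a three-loop streaming softmax: a max loop, a sum loop that recomputes sub+exp per chunk, and a normalization loop that recomputes it again. Here is a minimal NumPy sketch of that schedule, assuming a 16-wide reduction tile; the helper name and NumPy itself are illustrative only:

```python
import numpy as np

TILE = 16  # reduction-dim step size, matching the scf.for step above

def softmax_streamed(x: np.ndarray, tile: int = TILE) -> np.ndarray:
    m, n = x.shape
    out = np.empty_like(x)
    # Loop 1: streaming row-max over tile-wide chunks of the reduction dim.
    row_max = np.full(m, -np.inf, dtype=x.dtype)
    for j in range(0, n, tile):
        row_max = np.maximum(row_max, x[:, j:j + tile].max(axis=1))
    # Loop 2: streaming row-sum; sub+exp is recomputed per chunk instead of
    # materializing a full m x n intermediate (the fused producer).
    row_sum = np.zeros(m, dtype=x.dtype)
    for j in range(0, n, tile):
        row_sum += np.exp(x[:, j:j + tile] - row_max[:, None]).sum(axis=1)
    # Loop 3: normalization, again recomputing sub+exp chunk by chunk.
    for j in range(0, n, tile):
        out[:, j:j + tile] = np.exp(x[:, j:j + tile] - row_max[:, None]) / row_sum[:, None]
    return out
```

The trade-off is deliberate: each input chunk is re-read (and sub+exp recomputed) in the later loops, buying a working set of one tile per row instead of a full-width intermediate buffer.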
From c001e19e3a74846985295b1f6bea94c969986405 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 9 Apr 2026 00:15:15 +0000
Subject: [PATCH 35/38] save work

---
 docs/softmax_lowering.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index e3f6518a..37a5b659 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -512,15 +512,19 @@ scf.forall ... {
 - Convert tiled linalg operations to vector operations
 - `scf.for` loops remain but operate on vectors
 - Vector size: 64x16 for tiled operations
+- The same buffer is used for the max and sum streaming reductions.
 
 **Code:**
 ```mlir
 func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
-  // ...
+  // ...
+  %cst = arith.constant dense<0.000000e+00> : vector<64x16xf32> // 0.0
+  %cst_1 = arith.constant dense<0xFFC00000> : vector<64x16xf32> // NaN: identity for maxnumf
   %3 = scf.forall (%arg2) in (16) shared_outs(%arg3 = %2) -> (tensor<1024x64xf32>) {
     // ...
 
     // Vectorized max reduction loop
+    %5 = tensor.empty() : tensor<64x16xf32>
     %6 = vector.transfer_write %cst_1, %5[%c0, %c0] : vector<64x16xf32>, tensor<64x16xf32>
     %7 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %6) -> (tensor<64x16xf32>) {
       %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32>
       ...
     }
     %8 = vector.transfer_read %7[%c0, %c0], %0 : tensor<64x16xf32>, vector<64x16xf32>
     %9 = vector.multi_reduction <maxnumf>, %8, %cst_2 [1] : vector<64x16xf32> to vector<64xf32>
-
+    %10 = vector.transfer_write %cst, %5[%c0, %c0] {in_bounds = [true, true]} : vector<64x16xf32>, tensor<64x16xf32>
     // Vectorized sum reduction loop with fused sub+exp
     %11 = scf.for %arg4 = %c0 to %c64 step %c16 iter_args(%arg5 = %10) -> (tensor<64x16xf32>) {
       %15 = vector.transfer_read %1[%4, %arg4], %0 : tensor<1024x64xf32>, vector<64x16xf32>

From a60dbe940e7e1a39e4de0568db706afe1834ce89 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 9 Apr 2026 00:18:04 +0000
Subject: [PATCH 36/38] save work

---
 docs/softmax_lowering.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index 37a5b659..d769b08c 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -337,7 +337,7 @@ Previously we tiled all ops in the parallel dimension only (i.e. the non-softmax
 
 ---
 
-## Decomposed → Tiled: Stage A - Tile div op
+## Stage A - Tile div op
 
 **Notes:**
 - Tile the division operation with step size 16 along dimension 1
@@ -357,7 +357,7 @@ scf.forall ... {
 
 // After: Division tiled into 64x16 chunks
 scf.forall ... {
-  // Tiled Max, Center+Exp, Sum ops ...
+  // Max, Center+Exp, Sum ops ...
   %11 = scf.for %arg4 = %c0_2 to %c64 step %c16 iter_args(%arg5 = %extracted_slice_0) -> (tensor<64x64xf32>) {
     %12 = linalg.generic {...} ins(%extracted_slice_3, %extracted_slice_4 : tensor<64x16xf32>, tensor<64xf32>) outs(%extracted_slice_5 : tensor<64x16xf32>) {
     ^bb0(%in: f32, %in_6: f32, %out: f32):
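The buffer-reuse pattern in the vectorized stage above (initialize a 64x16 accumulator, fold chunks in element-wise, then collapse across the [1] axis with `vector.multi_reduction`) can be sketched in NumPy as follows. The `-inf` initializer stands in for the NaN constant `0xFFC00000`, which acts as the identity under a maxnum-style reduction; shapes and names mirror the document's 64x16 tiles but are otherwise illustrative assumptions:

```python
import numpy as np

M, N, TILE = 64, 64, 16  # workgroup tile shape and reduction step from the doc

def rowwise_max_via_accumulator(x: np.ndarray) -> np.ndarray:
    # The (M, TILE) accumulator plays the role of the reused 64x16 buffer:
    # each loop iteration folds one chunk in element-wise ...
    acc = np.full((M, TILE), -np.inf, dtype=x.dtype)
    for j in range(0, N, TILE):
        acc = np.maximum(acc, x[:, j:j + TILE])
    # ... and one final reduction over axis 1 (the multi_reduction step)
    # collapses the accumulator to one value per row.
    return acc.max(axis=1)
```

The sum loop then re-initializes the same buffer to zero and accumulates with addition instead of maximum.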
From 996a91168244217e00bd0ec6202a88343c6a673a Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 9 Apr 2026 00:22:15 +0000
Subject: [PATCH 37/38] save work

---
 docs/softmax_lowering.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index d769b08c..b56a80f6 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -676,3 +676,8 @@ gpu.module @payload_kernel {
 ```
 
 **Summary:** At this stage, the kernel processes 64x16 chunks in streaming fashion through three sequential loops, minimizing memory footprint.
+
+## XeGPU improvements needed for e2e support (WIP)
+
+* Change the `memref.alloca()` address space to SLM.
+* Support lowering SLM loads/stores to `xegpu.load/store_matrix` instead of `xegpu.load/store_nd`.

From 14176ac5acbd928bfd893244d4b0ef46125a8d7b Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 9 Apr 2026 14:48:46 +0000
Subject: [PATCH 38/38] save work

---
 docs/softmax_lowering.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/softmax_lowering.md b/docs/softmax_lowering.md
index b56a80f6..d15d2e85 100644
--- a/docs/softmax_lowering.md
+++ b/docs/softmax_lowering.md
@@ -290,7 +290,7 @@ gpu.module @payload_kernel [#xevm.target] {
 
 **Notes**
 - Sets the layout for anchor xegpu ops. Each Wg consists of [8, 1] subgroups, each computing an 8x64 softmax slice.
-- Only sets the layotu for `store_nd`. Layout propagation does the rest.
+- Only sets the layout for `store_nd`. Layout propagation does the rest.
 
 **Code (differences from xegpu-initial):**
 ```mlir
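For context on the `[8, 1]` layout in the final patch, here is a small Python sketch of how a 64x64 workgroup tile splits so that each subgroup covers an 8x64 softmax slice; the row-major linearization of subgroup ids is an assumption made for illustration only:

```python
WG_TILE = (64, 64)   # rows x cols handled by one workgroup
SG_LAYOUT = (8, 1)   # subgroup grid, from the layout set on store_nd
SG_TILE = (WG_TILE[0] // SG_LAYOUT[0], WG_TILE[1] // SG_LAYOUT[1])  # (8, 64)

# Each subgroup owns a contiguous block of 8 full rows, so the row-wise
# softmax reductions never cross a subgroup boundary.
for sg_id in range(SG_LAYOUT[0] * SG_LAYOUT[1]):
    r = (sg_id // SG_LAYOUT[1]) * SG_TILE[0]
    c = (sg_id % SG_LAYOUT[1]) * SG_TILE[1]
    print(f"subgroup {sg_id}: rows {r}..{r + SG_TILE[0] - 1}, "
          f"cols {c}..{c + SG_TILE[1] - 1}")
```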