diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index e4abf423..00000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/.gitignore b/.gitignore
index 0950c051..c7686a89 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,8 @@
+
+# Generated files
+samples/benchmarks/plots/**/*.csv
+samples/benchmarks/plots/**/*.pdf
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -8,7 +13,8 @@ __pycache__/
 
 
 # DS STore
-*.DS_STORE
+.DS_Store
+**/.DS_Store
 
 #
 .idea
@@ -33,6 +39,10 @@ share/python-wheels/
 *.egg
 MANIFEST
 
+# SDK
+/tests/csl_runtime/cerebras-sdk/
+/tests/csl_runtime/cerebras-sdk.tar.gz
+
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
diff --git a/LICENSE b/LICENSE
index 22f678fa..e956fbce 100644
--- a/LICENSE
+++ b/LICENSE
@@ -2,6 +2,7 @@ BSD 3-Clause License
 
 Copyright (c) 2026, Lawrence Livermore National Security, LLC
 Copyright (c) 2026, SPCL, ETH Zurich
+Copyright (c) 2026, Noéda AG
 
 All rights reserved.
 
diff --git a/README.md b/README.md
index 7a11e478..b40ff77a 100644
--- a/README.md
+++ b/README.md
@@ -29,8 +29,8 @@ For full details, see the paper:
 Clone the repository and install the package:
 
 ```bash
-git clone https://github.com/glukas/spatialstencil.git
-cd spatialstencil
+git clone https://github.com/glukas/spada.git
+cd spada
 pip install -e .
 ```
 
@@ -64,7 +64,7 @@ Key options:
 To compile a GT4Py stencil file to SPADA IR (`.spst` and `.sptl`):
 
 ```bash
-python -m spatialstencil.cli.gt4py_to_spatial samples/stencils.py 128,128,80 output/ --function-name laplacian
+python -m spada.cli.gt4py_to_spatial samples/stencils.py 128,128,80 output/ --function-name laplacian
 ```
 
 Arguments in order: `input_file`, `domain_size` (comma-separated `x,y,z`), `output_dir`. Omitting `--function-name` compiles all stencils in the file.
@@ -76,13 +76,13 @@ The resulting `.sptl` file can then be passed to `sptlc`.
 After compiling with `cslc` (invoked automatically by `sptlc` unless `--generate-only` is set), run the kernel via the Cerebras `cs_python` launcher:
 
 ```bash
-cs_python spatialstencil/runtime/runtime.py output/ in_field.npy
+cs_python spada/runtime/runtime.py output/ in_field.npy
 ```
 
 Alternatively, use the `Program` class directly from Python (must be run with `cs_python`):
 
 ```python
-from spatialstencil.runtime.runtime import Program
+from spada.runtime.runtime import Program
 import numpy as np
 
 program = Program("output/")
@@ -183,23 +183,26 @@ brew install lima qemu lima-additional-guestagents   # one-time
 tests/csl_runtime/run-in-lima.sh --sdk-url <url>
 ```
 
-This creates the Lima VM on first use (~5–10 min), downloads and extracts the SDK to `tests/csl_runtime/cerebras-sdk/`, installs Python dependencies inside the VM, and runs the full test suite. Other modes:
+This creates the Lima VM on first use (~5–10 min), downloads and extracts the SDK to `tests/csl_runtime/cerebras-sdk/`, installs Python dependencies inside the VM, and runs the full test suite.
+If the SDK tarball is already downloaded or extracted, use `--sdk /path/to/cs_sdk` instead of `--sdk-url`.
+
+Other modes:
 
 ```bash
 # Run a single test
-tests/csl_runtime/run-in-lima.sh --sdk-url <url> --test test_add.sh
+tests/csl_runtime/run-in-lima.sh --sdk <dir> --test test_add.sh
 
 # Verify the SDK toolchain only
-tests/csl_runtime/run-in-lima.sh --sdk-url <url> --check
+tests/csl_runtime/run-in-lima.sh --sdk <dir> --check
 
 # Run the Cerebras SDK smoke test
-tests/csl_runtime/run-in-lima.sh --sdk-url <url> --smoke /path/to/csl-extras-*
+tests/csl_runtime/run-in-lima.sh --sdk <dir> --smoke /path/to/csl-extras-*
 
 # Drop into an interactive shell inside the VM
-tests/csl_runtime/run-in-lima.sh --sdk-url <url> --shell
+tests/csl_runtime/run-in-lima.sh --sdk <dir> --shell
 ```
 
-If the SDK tarball is already downloaded or extracted, use `--sdk /path/to/cs_sdk` instead of `--sdk-url`. The repository must reside under `$HOME` (Lima mounts the Mac home directory by default). The Lima configuration is in `tests/csl_runtime/lima-ubuntu-x86_64.yaml`.
+The repository must reside under `$HOME` (Lima mounts the Mac home directory by default). The Lima configuration is in `tests/csl_runtime/lima-ubuntu-x86_64.yaml`.
 
 **Cleanup** generated test artifacts:
 
@@ -214,7 +217,7 @@ make -C tests/csl_runtime clean-sdk  # also remove the downloaded SDK
 
 Questions, discussions, and feedback are welcome via GitHub Issues:
 
-- **Bug reports and feature requests**: [GitHub Issues](https://github.com/glukas/spatialstencil/issues)
+- **Bug reports and feature requests**: [GitHub Issues](https://github.com/glukas/spada/issues)
 
 ---
 
@@ -227,9 +230,9 @@ Contributions are welcome. Please follow these steps:
 3. **Write tests** for any new functionality. Tests live in `tests/` and are organized by subsystem (`stencil_ir/`, `spatial_ir/`, `placement/`, `gt4py/`, `csl_runtime/`).
 4. **Format** your code with `black` and `isort`, and verify with `flake8`:
    ```bash
-   black spatialstencil tests
-   isort spatialstencil tests
-   flake8 spatialstencil tests
+   black spada tests
+   isort spada tests
+   flake8 spada tests
    ```
 5. **Run tests**: see the [Testing](#testing) section for Python unit tests and CSL runtime tests.
 6. **Open a pull request** against `main` with a clear description of the change and its motivation.
diff --git a/irspec/docs/index.md b/irspec/docs/index.md
index 000ea345..32c5b651 100644
--- a/irspec/docs/index.md
+++ b/irspec/docs/index.md
@@ -1,17 +1,17 @@
-# Welcome to MkDocs
+# SPADA — Multi-Level Spatial IR Specification
 
-For full documentation visit [mkdocs.org](https://www.mkdocs.org).
+SPADA is a programming language and compiler for spatial dataflow architectures such as the [Cerebras Wafer-Scale Engine](https://www.cerebras.net/). It provides precise control over data placement, communication streams, and asynchronous execution while abstracting architecture-specific routing details.
 
-## Commands
+This site documents the three intermediate representations (IRs) used in the SPADA compilation pipeline:
 
-* `mkdocs new [dir-name]` - Create a new project.
-* `mkdocs serve` - Start the live-reloading docs server.
-* `mkdocs build` - Build the documentation site.
-* `mkdocs -h` - Print help message and exit.
+| IR | Input | Output |
+|---|---|---|
+| **Stencil IR** | GT4Py stencil definitions | Spatial IR |
+| **Spatial IR** | Stencil IR / hand-written SPADA kernels | Dataflow Task IR |
+| **Dataflow Task IR** | Spatial IR | Cerebras CSL |
+
+For full details on the SPADA language, compiler, and hardware results, see:
+
+> Lukas Gianinazzi, Tal Ben-Nun, Torsten Hoefler. *SPADA: A Spatial Dataflow Architecture Programming Language.* arXiv:2511.09447, 2025.
 
-## Project layout
 
-    mkdocs.yml    # The configuration file.
-    docs/
-        index.md  # The documentation homepage.
-        ...       # Other markdown pages, images and other files.
diff --git a/irspec/mkdocs.yml b/irspec/mkdocs.yml
index 76ffdc4c..9df7e611 100644
--- a/irspec/mkdocs.yml
+++ b/irspec/mkdocs.yml
@@ -1,4 +1,4 @@
-site_name: Multi-Level Spatial IR
+site_name: Spatial Dataflow Abstraction (SPADA)
 site_url: http://localhost:8000
 theme: material
 nav:
diff --git a/samples/benchmarks/bench_hardware.sh b/samples/benchmarks/bench_hardware.sh
index e98ef780..bfad55d5 100755
--- a/samples/benchmarks/bench_hardware.sh
+++ b/samples/benchmarks/bench_hardware.sh
@@ -9,7 +9,7 @@ BLUE='\033[0;34m'
 NC='\033[0m'
 
 BENCHMARK_DIR="samples/benchmarks"
-RUNTIME="spatialstencil/runtime/runtime.py"
+RUNTIME="spada/runtime/runtime.py"
 OUTPUT_DIR="benchmark_results"
 
 mkdir $OUTPUT_DIR
diff --git a/samples/benchmarks/laplacian_4_4_4_test.sptl b/samples/benchmarks/laplacian_4_4_4_test.sptl
deleted file mode 100644
index 738adef8..00000000
--- a/samples/benchmarks/laplacian_4_4_4_test.sptl
+++ /dev/null
@@ -1,1600 +0,0 @@
-kernel @laplacian<>(stream<f32, 4>[6, 6] readonly _in_field, stream<f32, 4>[4, 4] writeonly __kernel_out_0) {
-  place u16 i#3, u16 j#3 in [0:1:2 , 1:5:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#3, u16 j#3 in [0:1:2 , 2:5:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#1, u16 j#1 in [0:1:2 , 0:1:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#1, u16 j#1 in [5:6:2 , 0:1:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#1, u16 j#1 in [1:5:2 , 0:1:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#1, u16 j#1 in [2:5:2 , 0:1:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#2, u16 j#2 in [0:1:2 , 5:6:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#2, u16 j#2 in [5:6:2 , 5:6:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#2, u16 j#2 in [1:5:2 , 5:6:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#2, u16 j#2 in [2:5:2 , 5:6:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#4, u16 j#4 in [5:6:2 , 4:5:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [4:5:2 , 4:5:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#4, u16 j#4 in [5:6:2 , 2:4:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#4, u16 j#4 in [5:6:2 , 3:4:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i#4, u16 j#4 in [5:6:2 , 1:2:2] {
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [1:2:2 , 2:4:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [1:2:2 , 3:4:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [4:5:2 , 1:2:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [2:4:2 , 4:5:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [3:4:2 , 4:5:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [1:2:2 , 4:5:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [2:4:2 , 2:4:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [2:4:2 , 3:4:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [3:4:2 , 2:4:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [3:4:2 , 3:4:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [4:5:2 , 2:4:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [4:5:2 , 3:4:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [2:4:2 , 1:2:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [3:4:2 , 1:2:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  place u16 i, u16 j in [1:2:2 , 1:2:2] {
-    f32[4] out_field_0_0_0
-    f32[4] in_field_0_0_0
-    f32[4] out_field_0_0_0#1
-    f32[4] _temp_0_0_0
-    f32[4] _temp_0_0_0#1
-    f32[4] _temp_0_0_0#2
-    f32[4] _temp_0_0_0#3
-    f32[4] _temp_0_0_0#4
-  }
-  dataflow u16 i#11, u16 j#11 in [0:1:2 , 1:5:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-  }
-  dataflow u16 i#11, u16 j#11 in [0:1:2 , 2:5:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-  }
-  dataflow u16 i#13, u16 j#13 in [1:5:2 , 0:1:2] {
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-  }
-  dataflow u16 i#13, u16 j#13 in [2:5:2 , 0:1:2] {
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-  }
-  dataflow u16 i#14, u16 j#14 in [1:5:2 , 5:6:2] {
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-  }
-  dataflow u16 i#14, u16 j#14 in [2:5:2 , 5:6:2] {
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-  }
-  dataflow u16 i#15, u16 j#15 in [5:6:2 , 4:5:2] {
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [4:5:2 , 4:5:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#15, u16 j#15 in [5:6:2 , 2:4:2] {
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#15, u16 j#15 in [5:6:2 , 3:4:2] {
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#15, u16 j#15 in [5:6:2 , 1:2:2] {
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [1:2:2 , 2:4:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [1:2:2 , 3:4:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [4:5:2 , 1:2:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [2:4:2 , 4:5:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [3:4:2 , 4:5:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [1:2:2 , 4:5:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [2:4:2 , 2:4:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [2:4:2 , 3:4:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [3:4:2 , 2:4:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [3:4:2 , 3:4:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [4:5:2 , 2:4:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [4:5:2 , 3:4:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [2:4:2 , 1:2:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [3:4:2 , 1:2:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  dataflow u16 i#12, u16 j#12 in [1:2:2 , 1:2:2] {
-    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
-      hops = [(1, 0)], 
-      channel = 1
-    }
-    stream<f32> _stream_in_field#6 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 2
-    }
-    stream<f32> _stream_in_field#7 = relative_stream(0, 1) {
-      hops = [(0, 1)], 
-      channel = 3
-    }
-    stream<f32> _stream_in_field#8 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 4
-    }
-    stream<f32> _stream_in_field#9 = relative_stream(0, -1) {
-      hops = [(0, -1)], 
-      channel = 5
-    }
-    stream<f32> _stream_in_field#10 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 6
-    }
-    stream<f32> _stream_in_field#11 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 7
-    }
-  }
-  compute u16 i#8, u16 j#8 in [0:1:2 , 1:5:2] {
-    await receive(in_field_0_0_0, _in_field[i#8, j#8])
-    awaitall
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#4)
-    await _send_comp#1
-  }
-  compute u16 i#8, u16 j#8 in [0:1:2 , 2:5:2] {
-    await receive(in_field_0_0_0, _in_field[i#8, j#8])
-    awaitall
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#4)
-    await _send_comp#1
-  }
-  compute u16 i#6, u16 j#6 in [0:1:2 , 0:1:2] {
-    await receive(in_field_0_0_0, _in_field[i#6, j#6])
-    awaitall
-  }
-  compute u16 i#6, u16 j#6 in [5:6:2 , 0:1:2] {
-    await receive(in_field_0_0_0, _in_field[i#6, j#6])
-    awaitall
-  }
-  compute u16 i#6, u16 j#6 in [1:5:2 , 0:1:2] {
-    await receive(in_field_0_0_0, _in_field[i#6, j#6])
-    awaitall
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#6)
-    await _send_comp#3
-  }
-  compute u16 i#6, u16 j#6 in [2:5:2 , 0:1:2] {
-    await receive(in_field_0_0_0, _in_field[i#6, j#6])
-    awaitall
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#6)
-    await _send_comp#3
-  }
-  compute u16 i#7, u16 j#7 in [0:1:2 , 5:6:2] {
-    await receive(in_field_0_0_0, _in_field[i#7, j#7])
-    awaitall
-  }
-  compute u16 i#7, u16 j#7 in [5:6:2 , 5:6:2] {
-    await receive(in_field_0_0_0, _in_field[i#7, j#7])
-    awaitall
-  }
-  compute u16 i#7, u16 j#7 in [1:5:2 , 5:6:2] {
-    await receive(in_field_0_0_0, _in_field[i#7, j#7])
-    awaitall
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#8)
-    await _send_comp#2
-  }
-  compute u16 i#7, u16 j#7 in [2:5:2 , 5:6:2] {
-    await receive(in_field_0_0_0, _in_field[i#7, j#7])
-    awaitall
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#8)
-    await _send_comp#2
-  }
-  compute u16 i#9, u16 j#9 in [5:6:2 , 4:5:2] {
-    await receive(in_field_0_0_0, _in_field[i#9, j#9])
-    awaitall
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#10)
-    await _send_comp
-  }
-  compute u16 i#5, u16 j#5 in [4:5:2 , 4:5:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#10) {
-      _temp_0_0_0[k] = x
-    }
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#11)
-    await _send_comp
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#5) {
-      _temp_0_0_0#1[k#1] = x#1
-    }
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#8) {
-      _temp_0_0_0#2[k#2] = (_temp_0_0_0#1 + x#2)
-    }
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#9)
-    await _send_comp#2
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#7) {
-      _temp_0_0_0#3[k#3] = x#3
-    }
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#9, u16 j#9 in [5:6:2 , 2:4:2] {
-    await receive(in_field_0_0_0, _in_field[i#9, j#9])
-    awaitall
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#10)
-    await _send_comp
-  }
-  compute u16 i#9, u16 j#9 in [5:6:2 , 3:4:2] {
-    await receive(in_field_0_0_0, _in_field[i#9, j#9])
-    awaitall
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#10)
-    await _send_comp
-  }
-  compute u16 i#9, u16 j#9 in [5:6:2 , 1:2:2] {
-    await receive(in_field_0_0_0, _in_field[i#9, j#9])
-    awaitall
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#10)
-    await _send_comp
-  }
-  compute u16 i#5, u16 j#5 in [1:2:2 , 2:4:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#11) {
-      _temp_0_0_0[k] = x
-    }
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#4) {
-      _temp_0_0_0#1[k#1] = (_temp_0_0_0 + x#1)
-    }
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#5)
-    await _send_comp#1
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#8) {
-      _temp_0_0_0#2[k#2] = (_temp_0_0_0#1 + x#2)
-    }
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#9)
-    await _send_comp#2
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#7) {
-      _temp_0_0_0#3[k#3] = (_temp_0_0_0#2 + x#3)
-    }
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#6)
-    await _send_comp#3
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [1:2:2 , 3:4:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#11) {
-      _temp_0_0_0[k] = x
-    }
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#4) {
-      _temp_0_0_0#1[k#1] = (_temp_0_0_0 + x#1)
-    }
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#5)
-    await _send_comp#1
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#9) {
-      _temp_0_0_0#2[k#2] = (_temp_0_0_0#1 + x#2)
-    }
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#8)
-    await _send_comp#2
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#6) {
-      _temp_0_0_0#3[k#3] = (_temp_0_0_0#2 + x#3)
-    }
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#7)
-    await _send_comp#3
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [4:5:2 , 1:2:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#10) {
-      _temp_0_0_0[k] = x
-    }
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#11)
-    await _send_comp
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#5) {
-      _temp_0_0_0#1[k#1] = x#1
-    }
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#9) {
-      _temp_0_0_0#2[k#2] = x#2
-    }
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#6) {
-      _temp_0_0_0#3[k#3] = (_temp_0_0_0#2 + x#3)
-    }
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#7)
-    await _send_comp#3
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [2:4:2 , 4:5:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#10) {
-      _temp_0_0_0[k] = x
-    }
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#11)
-    await _send_comp
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#5) {
-      _temp_0_0_0#1[k#1] = (_temp_0_0_0 + x#1)
-    }
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#4)
-    await _send_comp#1
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#8) {
-      _temp_0_0_0#2[k#2] = (_temp_0_0_0#1 + x#2)
-    }
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#9)
-    await _send_comp#2
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#7) {
-      _temp_0_0_0#3[k#3] = x#3
-    }
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [3:4:2 , 4:5:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#11) {
-      _temp_0_0_0[k] = x
-    }
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#10)
-    await _send_comp
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#4) {
-      _temp_0_0_0#1[k#1] = (_temp_0_0_0 + x#1)
-    }
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#5)
-    await _send_comp#1
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#8) {
-      _temp_0_0_0#2[k#2] = (_temp_0_0_0#1 + x#2)
-    }
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#9)
-    await _send_comp#2
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#7) {
-      _temp_0_0_0#3[k#3] = x#3
-    }
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [1:2:2 , 4:5:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#11) {
-      _temp_0_0_0[k] = x
-    }
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#4) {
-      _temp_0_0_0#1[k#1] = (_temp_0_0_0 + x#1)
-    }
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#5)
-    await _send_comp#1
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#8) {
-      _temp_0_0_0#2[k#2] = (_temp_0_0_0#1 + x#2)
-    }
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#9)
-    await _send_comp#2
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#7) {
-      _temp_0_0_0#3[k#3] = x#3
-    }
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [2:4:2 , 2:4:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#10) {
-      _temp_0_0_0[k] = x
-    }
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#11)
-    await _send_comp
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#5) {
-      _temp_0_0_0#1[k#1] = (_temp_0_0_0 + x#1)
-    }
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#4)
-    await _send_comp#1
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#8) {
-      _temp_0_0_0#2[k#2] = (_temp_0_0_0#1 + x#2)
-    }
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#9)
-    await _send_comp#2
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#7) {
-      _temp_0_0_0#3[k#3] = (_temp_0_0_0#2 + x#3)
-    }
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#6)
-    await _send_comp#3
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [2:4:2 , 3:4:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#10) {
-      _temp_0_0_0[k] = x
-    }
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#11)
-    await _send_comp
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#5) {
-      _temp_0_0_0#1[k#1] = (_temp_0_0_0 + x#1)
-    }
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#4)
-    await _send_comp#1
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#9) {
-      _temp_0_0_0#2[k#2] = (_temp_0_0_0#1 + x#2)
-    }
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#8)
-    await _send_comp#2
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#6) {
-      _temp_0_0_0#3[k#3] = (_temp_0_0_0#2 + x#3)
-    }
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#7)
-    await _send_comp#3
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [3:4:2 , 2:4:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#11) {
-      _temp_0_0_0[k] = x
-    }
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#10)
-    await _send_comp
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#4) {
-      _temp_0_0_0#1[k#1] = (_temp_0_0_0 + x#1)
-    }
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#5)
-    await _send_comp#1
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#8) {
-      _temp_0_0_0#2[k#2] = (_temp_0_0_0#1 + x#2)
-    }
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#9)
-    await _send_comp#2
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#7) {
-      _temp_0_0_0#3[k#3] = (_temp_0_0_0#2 + x#3)
-    }
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#6)
-    await _send_comp#3
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [3:4:2 , 3:4:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#11) {
-      _temp_0_0_0[k] = x
-    }
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#10)
-    await _send_comp
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#4) {
-      _temp_0_0_0#1[k#1] = (_temp_0_0_0 + x#1)
-    }
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#5)
-    await _send_comp#1
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#9) {
-      _temp_0_0_0#2[k#2] = (_temp_0_0_0#1 + x#2)
-    }
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#8)
-    await _send_comp#2
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#6) {
-      _temp_0_0_0#3[k#3] = (_temp_0_0_0#2 + x#3)
-    }
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#7)
-    await _send_comp#3
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [4:5:2 , 2:4:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#10) {
-      _temp_0_0_0[k] = x
-    }
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#11)
-    await _send_comp
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#5) {
-      _temp_0_0_0#1[k#1] = x#1
-    }
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#8) {
-      _temp_0_0_0#2[k#2] = (_temp_0_0_0#1 + x#2)
-    }
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#9)
-    await _send_comp#2
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#7) {
-      _temp_0_0_0#3[k#3] = (_temp_0_0_0#2 + x#3)
-    }
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#6)
-    await _send_comp#3
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [4:5:2 , 3:4:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#10) {
-      _temp_0_0_0[k] = x
-    }
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#11)
-    await _send_comp
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#5) {
-      _temp_0_0_0#1[k#1] = x#1
-    }
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#9) {
-      _temp_0_0_0#2[k#2] = (_temp_0_0_0#1 + x#2)
-    }
-    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#8)
-    await _send_comp#2
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#6) {
-      _temp_0_0_0#3[k#3] = (_temp_0_0_0#2 + x#3)
-    }
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#7)
-    await _send_comp#3
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [2:4:2 , 1:2:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#10) {
-      _temp_0_0_0[k] = x
-    }
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#11)
-    await _send_comp
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#5) {
-      _temp_0_0_0#1[k#1] = (_temp_0_0_0 + x#1)
-    }
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#4)
-    await _send_comp#1
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#9) {
-      _temp_0_0_0#2[k#2] = x#2
-    }
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#6) {
-      _temp_0_0_0#3[k#3] = (_temp_0_0_0#2 + x#3)
-    }
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#7)
-    await _send_comp#3
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [3:4:2 , 1:2:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#11) {
-      _temp_0_0_0[k] = x
-    }
-    completion _send_comp = send(in_field_0_0_0, _stream_in_field#10)
-    await _send_comp
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#4) {
-      _temp_0_0_0#1[k#1] = (_temp_0_0_0 + x#1)
-    }
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#5)
-    await _send_comp#1
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#9) {
-      _temp_0_0_0#2[k#2] = x#2
-    }
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#6) {
-      _temp_0_0_0#3[k#3] = (_temp_0_0_0#2 + x#3)
-    }
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#7)
-    await _send_comp#3
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-  compute u16 i#5, u16 j#5 in [1:2:2 , 1:2:2] {
-    await receive(in_field_0_0_0, _in_field[i#5, j#5])
-    awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:4:1], receive(_stream_in_field#11) {
-      _temp_0_0_0[k] = x
-    }
-    await _recv_comp
-    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:4:1], receive(_stream_in_field#4) {
-      _temp_0_0_0#1[k#1] = (_temp_0_0_0 + x#1)
-    }
-    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#5)
-    await _send_comp#1
-    await _recv_comp#1
-    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:4:1], receive(_stream_in_field#9) {
-      _temp_0_0_0#2[k#2] = x#2
-    }
-    await _recv_comp#2
-    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:4:1], receive(_stream_in_field#6) {
-      _temp_0_0_0#3[k#3] = (_temp_0_0_0#2 + x#3)
-    }
-    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#7)
-    await _send_comp#3
-    await _recv_comp#3
-    await map i32 k#4 in [0:4:1] {
-      _temp_0_0_0#4[k#4] = (-4.0 * in_field_0_0_0[k#4])
-    }
-    await map i32 k#5 in [0:4:1] {
-      out_field_0_0_0#1[k#5] = (_temp_0_0_0#4[k#5] + _temp_0_0_0#3[k#5])
-    }
-    await map i32 k#6 in [0:4:1] {
-      out_field_0_0_0[k#6] = out_field_0_0_0#1[k#6]
-    }
-    awaitall
-    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
-  }
-}
\ No newline at end of file
diff --git a/samples/benchmarks/laplacian_746_990_320.sptl b/samples/benchmarks/laplacian_746_990_320.sptl
new file mode 100644
index 00000000..9ef12acc
--- /dev/null
+++ b/samples/benchmarks/laplacian_746_990_320.sptl
@@ -0,0 +1,1600 @@
+kernel @laplacian<>(stream<f32, 320>[748, 992] readonly _in_field, stream<f32, 320>[746, 990] writeonly __kernel_out_0) {
+  place u16 i#3, u16 j#3 in [0:1:2 , 1:991:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#1
+    f32[320] _temp_0_0_0
+    f32[320] _temp_0_0_0#1
+    f32[320] _temp_0_0_0#2
+    f32[320] _temp_0_0_0#3
+    f32[320] _temp_0_0_0#4
+  }
+  place u16 i#3, u16 j#3 in [0:1:2 , 2:991:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#1
+    f32[320] _temp_0_0_0
+    f32[320] _temp_0_0_0#1
+    f32[320] _temp_0_0_0#2
+    f32[320] _temp_0_0_0#3
+    f32[320] _temp_0_0_0#4
+  }
+  place u16 i#1, u16 j#1 in [0:1:2 , 0:1:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#2
+    f32[320] _temp_0_0_0#5
+    f32[320] _temp_0_0_0#6
+    f32[320] _temp_0_0_0#7
+    f32[320] _temp_0_0_0#8
+    f32[320] _temp_0_0_0#9
+  }
+  place u16 i#2, u16 j#2 in [0:1:2 , 991:992:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#3
+    f32[320] _temp_0_0_0#10
+    f32[320] _temp_0_0_0#11
+    f32[320] _temp_0_0_0#12
+    f32[320] _temp_0_0_0#13
+    f32[320] _temp_0_0_0#14
+  }
+  place u16 i#1, u16 j#1 in [747:748:2 , 0:1:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#5
+    f32[320] _temp_0_0_0#20
+    f32[320] _temp_0_0_0#21
+    f32[320] _temp_0_0_0#22
+    f32[320] _temp_0_0_0#23
+    f32[320] _temp_0_0_0#24
+  }
+  place u16 i#1, u16 j#1 in [1:747:2 , 0:1:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#6
+    f32[320] _temp_0_0_0#25
+    f32[320] _temp_0_0_0#26
+    f32[320] _temp_0_0_0#27
+    f32[320] _temp_0_0_0#28
+    f32[320] _temp_0_0_0#29
+  }
+  place u16 i#1, u16 j#1 in [2:747:2 , 0:1:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#6
+    f32[320] _temp_0_0_0#25
+    f32[320] _temp_0_0_0#26
+    f32[320] _temp_0_0_0#27
+    f32[320] _temp_0_0_0#28
+    f32[320] _temp_0_0_0#29
+  }
+  place u16 i#2, u16 j#2 in [747:748:2 , 991:992:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#7
+    f32[320] _temp_0_0_0#30
+    f32[320] _temp_0_0_0#31
+    f32[320] _temp_0_0_0#32
+    f32[320] _temp_0_0_0#33
+    f32[320] _temp_0_0_0#34
+  }
+  place u16 i#2, u16 j#2 in [1:747:2 , 991:992:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#8
+    f32[320] _temp_0_0_0#35
+    f32[320] _temp_0_0_0#36
+    f32[320] _temp_0_0_0#37
+    f32[320] _temp_0_0_0#38
+    f32[320] _temp_0_0_0#39
+  }
+  place u16 i#2, u16 j#2 in [2:747:2 , 991:992:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#8
+    f32[320] _temp_0_0_0#35
+    f32[320] _temp_0_0_0#36
+    f32[320] _temp_0_0_0#37
+    f32[320] _temp_0_0_0#38
+    f32[320] _temp_0_0_0#39
+  }
+  place u16 i#4, u16 j#4 in [747:748:2 , 2:991:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#4
+    f32[320] _temp_0_0_0#15
+    f32[320] _temp_0_0_0#16
+    f32[320] _temp_0_0_0#17
+    f32[320] _temp_0_0_0#18
+    f32[320] _temp_0_0_0#19
+  }
+  place u16 i#4, u16 j#4 in [747:748:2 , 3:991:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#4
+    f32[320] _temp_0_0_0#15
+    f32[320] _temp_0_0_0#16
+    f32[320] _temp_0_0_0#17
+    f32[320] _temp_0_0_0#18
+    f32[320] _temp_0_0_0#19
+  }
+  place u16 i, u16 j in [1:2:2 , 1:2:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#9
+    f32[320] _temp_0_0_0#40
+    f32[320] _temp_0_0_0#41
+    f32[320] _temp_0_0_0#42
+    f32[320] _temp_0_0_0#43
+    f32[320] _temp_0_0_0#44
+  }
+  place u16 i#4, u16 j#4 in [747:748:2 , 1:2:2] {
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#10
+    f32[320] _temp_0_0_0#45
+    f32[320] _temp_0_0_0#46
+    f32[320] _temp_0_0_0#47
+    f32[320] _temp_0_0_0#48
+    f32[320] _temp_0_0_0#49
+  }
+  place u16 i, u16 j in [1:2:2 , 990:991:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#14
+    f32[320] _temp_0_0_0#65
+    f32[320] _temp_0_0_0#66
+    f32[320] _temp_0_0_0#67
+    f32[320] _temp_0_0_0#68
+    f32[320] _temp_0_0_0#69
+  }
+  place u16 i, u16 j in [746:747:2 , 2:990:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#13
+    f32[320] _temp_0_0_0#60
+    f32[320] _temp_0_0_0#61
+    f32[320] _temp_0_0_0#62
+    f32[320] _temp_0_0_0#63
+    f32[320] _temp_0_0_0#64
+  }
+  place u16 i, u16 j in [746:747:2 , 3:990:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#13
+    f32[320] _temp_0_0_0#60
+    f32[320] _temp_0_0_0#61
+    f32[320] _temp_0_0_0#62
+    f32[320] _temp_0_0_0#63
+    f32[320] _temp_0_0_0#64
+  }
+  place u16 i, u16 j in [2:746:2 , 1:2:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#12
+    f32[320] _temp_0_0_0#55
+    f32[320] _temp_0_0_0#56
+    f32[320] _temp_0_0_0#57
+    f32[320] _temp_0_0_0#58
+    f32[320] _temp_0_0_0#59
+  }
+  place u16 i, u16 j in [3:746:2 , 1:2:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#12
+    f32[320] _temp_0_0_0#55
+    f32[320] _temp_0_0_0#56
+    f32[320] _temp_0_0_0#57
+    f32[320] _temp_0_0_0#58
+    f32[320] _temp_0_0_0#59
+  }
+  place u16 i, u16 j in [746:747:2 , 1:2:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#11
+    f32[320] _temp_0_0_0#50
+    f32[320] _temp_0_0_0#51
+    f32[320] _temp_0_0_0#52
+    f32[320] _temp_0_0_0#53
+    f32[320] _temp_0_0_0#54
+  }
+  place u16 i, u16 j in [746:747:2 , 990:991:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#18
+    f32[320] _temp_0_0_0#85
+    f32[320] _temp_0_0_0#86
+    f32[320] _temp_0_0_0#87
+    f32[320] _temp_0_0_0#88
+    f32[320] _temp_0_0_0#89
+  }
+  place u16 i, u16 j in [2:746:2 , 990:991:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#17
+    f32[320] _temp_0_0_0#80
+    f32[320] _temp_0_0_0#81
+    f32[320] _temp_0_0_0#82
+    f32[320] _temp_0_0_0#83
+    f32[320] _temp_0_0_0#84
+  }
+  place u16 i, u16 j in [3:746:2 , 990:991:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#17
+    f32[320] _temp_0_0_0#80
+    f32[320] _temp_0_0_0#81
+    f32[320] _temp_0_0_0#82
+    f32[320] _temp_0_0_0#83
+    f32[320] _temp_0_0_0#84
+  }
+  place u16 i, u16 j in [2:746:2 , 2:990:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#16
+    f32[320] _temp_0_0_0#75
+    f32[320] _temp_0_0_0#76
+    f32[320] _temp_0_0_0#77
+    f32[320] _temp_0_0_0#78
+    f32[320] _temp_0_0_0#79
+  }
+  place u16 i, u16 j in [2:746:2 , 3:990:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#16
+    f32[320] _temp_0_0_0#75
+    f32[320] _temp_0_0_0#76
+    f32[320] _temp_0_0_0#77
+    f32[320] _temp_0_0_0#78
+    f32[320] _temp_0_0_0#79
+  }
+  place u16 i, u16 j in [3:746:2 , 2:990:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#16
+    f32[320] _temp_0_0_0#75
+    f32[320] _temp_0_0_0#76
+    f32[320] _temp_0_0_0#77
+    f32[320] _temp_0_0_0#78
+    f32[320] _temp_0_0_0#79
+  }
+  place u16 i, u16 j in [3:746:2 , 3:990:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#16
+    f32[320] _temp_0_0_0#75
+    f32[320] _temp_0_0_0#76
+    f32[320] _temp_0_0_0#77
+    f32[320] _temp_0_0_0#78
+    f32[320] _temp_0_0_0#79
+  }
+  place u16 i, u16 j in [1:2:2 , 2:990:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#15
+    f32[320] _temp_0_0_0#70
+    f32[320] _temp_0_0_0#71
+    f32[320] _temp_0_0_0#72
+    f32[320] _temp_0_0_0#73
+    f32[320] _temp_0_0_0#74
+  }
+  place u16 i, u16 j in [1:2:2 , 3:990:2] {
+    f32[320] out_field_0_0_0
+    f32[320] in_field_0_0_0
+    f32[320] out_field_0_0_0#15
+    f32[320] _temp_0_0_0#70
+    f32[320] _temp_0_0_0#71
+    f32[320] _temp_0_0_0#72
+    f32[320] _temp_0_0_0#73
+    f32[320] _temp_0_0_0#74
+  }
+  dataflow u16 i#11, u16 j#11 in [0:1:2 , 1:991:2] {
+    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 0
+}
+    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 1
+}
+  }
+  dataflow u16 i#11, u16 j#11 in [0:1:2 , 2:991:2] {
+    stream<f32> _stream_in_field#4 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 0
+}
+    stream<f32> _stream_in_field#5 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 1
+}
+  }
+  dataflow u16 i#15, u16 j#15 in [747:748:2 , 2:991:2] {
+    stream<f32> _stream_in_field#6 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 2
+}
+    stream<f32> _stream_in_field#7 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 3
+}
+  }
+  dataflow u16 i#15, u16 j#15 in [747:748:2 , 3:991:2] {
+    stream<f32> _stream_in_field#6 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 2
+}
+    stream<f32> _stream_in_field#7 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 3
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [1:2:2 , 1:2:2] {
+    stream<f32> _stream_in_field#8 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 4
+}
+    stream<f32> _stream_in_field#9 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 5
+}
+    stream<f32> _stream_in_field#10 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 6
+}
+    stream<f32> _stream_in_field#11 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 7
+}
+    stream<f32> _stream_in_field#12 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 8
+}
+    stream<f32> _stream_in_field#13 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 9
+}
+    stream<f32> _stream_in_field#14 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 10
+}
+    stream<f32> _stream_in_field#15 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 11
+}
+  }
+  dataflow u16 i#13, u16 j#13 in [1:747:2 , 0:1:2] {
+    stream<f32> _stream_in_field#16 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 12
+}
+    stream<f32> _stream_in_field#17 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 13
+}
+  }
+  dataflow u16 i#13, u16 j#13 in [2:747:2 , 0:1:2] {
+    stream<f32> _stream_in_field#16 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 12
+}
+    stream<f32> _stream_in_field#17 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 13
+}
+  }
+  dataflow u16 i#14, u16 j#14 in [1:747:2 , 991:992:2] {
+    stream<f32> _stream_in_field#18 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 14
+}
+    stream<f32> _stream_in_field#19 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 15
+}
+  }
+  dataflow u16 i#14, u16 j#14 in [2:747:2 , 991:992:2] {
+    stream<f32> _stream_in_field#18 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 14
+}
+    stream<f32> _stream_in_field#19 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 15
+}
+  }
+  dataflow u16 i#15, u16 j#15 in [747:748:2 , 1:2:2] {
+    stream<f32> _stream_in_field#20 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 16
+}
+    stream<f32> _stream_in_field#21 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 17
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [1:2:2 , 990:991:2] {
+    stream<f32> _stream_in_field#22 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 18
+}
+    stream<f32> _stream_in_field#23 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 19
+}
+    stream<f32> _stream_in_field#24 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 20
+}
+    stream<f32> _stream_in_field#25 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 21
+}
+    stream<f32> _stream_in_field#26 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 22
+}
+    stream<f32> _stream_in_field#27 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 23
+}
+    stream<f32> _stream_in_field#28 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 24
+}
+    stream<f32> _stream_in_field#29 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 25
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [746:747:2 , 2:990:2] {
+    stream<f32> _stream_in_field#30 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 26
+}
+    stream<f32> _stream_in_field#31 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 27
+}
+    stream<f32> _stream_in_field#32 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 28
+}
+    stream<f32> _stream_in_field#33 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 29
+}
+    stream<f32> _stream_in_field#34 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 30
+}
+    stream<f32> _stream_in_field#35 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 31
+}
+    stream<f32> _stream_in_field#36 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 32
+}
+    stream<f32> _stream_in_field#37 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 33
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [746:747:2 , 3:990:2] {
+    stream<f32> _stream_in_field#30 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 26
+}
+    stream<f32> _stream_in_field#31 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 27
+}
+    stream<f32> _stream_in_field#32 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 28
+}
+    stream<f32> _stream_in_field#33 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 29
+}
+    stream<f32> _stream_in_field#34 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 30
+}
+    stream<f32> _stream_in_field#35 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 31
+}
+    stream<f32> _stream_in_field#36 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 32
+}
+    stream<f32> _stream_in_field#37 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 33
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [2:746:2 , 1:2:2] {
+    stream<f32> _stream_in_field#38 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 34
+}
+    stream<f32> _stream_in_field#39 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 35
+}
+    stream<f32> _stream_in_field#40 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 36
+}
+    stream<f32> _stream_in_field#41 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 37
+}
+    stream<f32> _stream_in_field#42 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 38
+}
+    stream<f32> _stream_in_field#43 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 39
+}
+    stream<f32> _stream_in_field#44 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 40
+}
+    stream<f32> _stream_in_field#45 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 41
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [3:746:2 , 1:2:2] {
+    stream<f32> _stream_in_field#38 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 34
+}
+    stream<f32> _stream_in_field#39 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 35
+}
+    stream<f32> _stream_in_field#40 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 36
+}
+    stream<f32> _stream_in_field#41 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 37
+}
+    stream<f32> _stream_in_field#42 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 38
+}
+    stream<f32> _stream_in_field#43 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 39
+}
+    stream<f32> _stream_in_field#44 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 40
+}
+    stream<f32> _stream_in_field#45 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 41
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [746:747:2 , 1:2:2] {
+    stream<f32> _stream_in_field#46 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 42
+}
+    stream<f32> _stream_in_field#47 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 43
+}
+    stream<f32> _stream_in_field#48 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 44
+}
+    stream<f32> _stream_in_field#49 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 45
+}
+    stream<f32> _stream_in_field#50 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 46
+}
+    stream<f32> _stream_in_field#51 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 47
+}
+    stream<f32> _stream_in_field#52 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 48
+}
+    stream<f32> _stream_in_field#53 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 49
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [746:747:2 , 990:991:2] {
+    stream<f32> _stream_in_field#54 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 50
+}
+    stream<f32> _stream_in_field#55 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 51
+}
+    stream<f32> _stream_in_field#56 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 52
+}
+    stream<f32> _stream_in_field#57 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 53
+}
+    stream<f32> _stream_in_field#58 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 54
+}
+    stream<f32> _stream_in_field#59 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 55
+}
+    stream<f32> _stream_in_field#60 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 56
+}
+    stream<f32> _stream_in_field#61 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 57
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [2:746:2 , 990:991:2] {
+    stream<f32> _stream_in_field#62 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 58
+}
+    stream<f32> _stream_in_field#63 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 59
+}
+    stream<f32> _stream_in_field#64 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 60
+}
+    stream<f32> _stream_in_field#65 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 61
+}
+    stream<f32> _stream_in_field#66 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 62
+}
+    stream<f32> _stream_in_field#67 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 63
+}
+    stream<f32> _stream_in_field#68 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 64
+}
+    stream<f32> _stream_in_field#69 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 65
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [3:746:2 , 990:991:2] {
+    stream<f32> _stream_in_field#62 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 58
+}
+    stream<f32> _stream_in_field#63 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 59
+}
+    stream<f32> _stream_in_field#64 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 60
+}
+    stream<f32> _stream_in_field#65 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 61
+}
+    stream<f32> _stream_in_field#66 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 62
+}
+    stream<f32> _stream_in_field#67 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 63
+}
+    stream<f32> _stream_in_field#68 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 64
+}
+    stream<f32> _stream_in_field#69 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 65
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [2:746:2 , 2:990:2] {
+    stream<f32> _stream_in_field#70 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 66
+}
+    stream<f32> _stream_in_field#71 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 67
+}
+    stream<f32> _stream_in_field#72 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 68
+}
+    stream<f32> _stream_in_field#73 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 69
+}
+    stream<f32> _stream_in_field#74 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 70
+}
+    stream<f32> _stream_in_field#75 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 71
+}
+    stream<f32> _stream_in_field#76 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 72
+}
+    stream<f32> _stream_in_field#77 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 73
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [2:746:2 , 3:990:2] {
+    stream<f32> _stream_in_field#70 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 66
+}
+    stream<f32> _stream_in_field#71 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 67
+}
+    stream<f32> _stream_in_field#72 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 68
+}
+    stream<f32> _stream_in_field#73 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 69
+}
+    stream<f32> _stream_in_field#74 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 70
+}
+    stream<f32> _stream_in_field#75 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 71
+}
+    stream<f32> _stream_in_field#76 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 72
+}
+    stream<f32> _stream_in_field#77 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 73
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [3:746:2 , 2:990:2] {
+    stream<f32> _stream_in_field#70 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 66
+}
+    stream<f32> _stream_in_field#71 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 67
+}
+    stream<f32> _stream_in_field#72 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 68
+}
+    stream<f32> _stream_in_field#73 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 69
+}
+    stream<f32> _stream_in_field#74 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 70
+}
+    stream<f32> _stream_in_field#75 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 71
+}
+    stream<f32> _stream_in_field#76 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 72
+}
+    stream<f32> _stream_in_field#77 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 73
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [3:746:2 , 3:990:2] {
+    stream<f32> _stream_in_field#70 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 66
+}
+    stream<f32> _stream_in_field#71 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 67
+}
+    stream<f32> _stream_in_field#72 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 68
+}
+    stream<f32> _stream_in_field#73 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 69
+}
+    stream<f32> _stream_in_field#74 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 70
+}
+    stream<f32> _stream_in_field#75 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 71
+}
+    stream<f32> _stream_in_field#76 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 72
+}
+    stream<f32> _stream_in_field#77 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 73
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [1:2:2 , 2:990:2] {
+    stream<f32> _stream_in_field#78 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 74
+}
+    stream<f32> _stream_in_field#79 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 75
+}
+    stream<f32> _stream_in_field#80 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 76
+}
+    stream<f32> _stream_in_field#81 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 77
+}
+    stream<f32> _stream_in_field#82 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 78
+}
+    stream<f32> _stream_in_field#83 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 79
+}
+    stream<f32> _stream_in_field#84 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 80
+}
+    stream<f32> _stream_in_field#85 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 81
+}
+  }
+  dataflow u16 i#12, u16 j#12 in [1:2:2 , 3:990:2] {
+    stream<f32> _stream_in_field#78 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 74
+}
+    stream<f32> _stream_in_field#79 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 75
+}
+    stream<f32> _stream_in_field#80 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 76
+}
+    stream<f32> _stream_in_field#81 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 77
+}
+    stream<f32> _stream_in_field#82 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 78
+}
+    stream<f32> _stream_in_field#83 = relative_stream(0, -1) {
+  hops = [(0, -1)], 
+  channel = 79
+}
+    stream<f32> _stream_in_field#84 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 80
+}
+    stream<f32> _stream_in_field#85 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 81
+}
+  }
+  compute u16 i#8, u16 j#8 in [0:1:2 , 1:991:2] {
+    await receive(in_field_0_0_0, _in_field[i#8, j#8])
+    awaitall
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#4)
+    await _send_comp#1
+    awaitall
+  }
+  compute u16 i#8, u16 j#8 in [0:1:2 , 2:991:2] {
+    await receive(in_field_0_0_0, _in_field[i#8, j#8])
+    awaitall
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#4)
+    await _send_comp#1
+    awaitall
+  }
+  compute u16 i#6, u16 j#6 in [0:1:2 , 0:1:2] {
+    await receive(in_field_0_0_0, _in_field[i#6, j#6])
+    awaitall
+  }
+  compute u16 i#7, u16 j#7 in [0:1:2 , 991:992:2] {
+    await receive(in_field_0_0_0, _in_field[i#7, j#7])
+    awaitall
+  }
+  compute u16 i#6, u16 j#6 in [747:748:2 , 0:1:2] {
+    await receive(in_field_0_0_0, _in_field[i#6, j#6])
+    awaitall
+  }
+  compute u16 i#6, u16 j#6 in [1:747:2 , 0:1:2] {
+    await receive(in_field_0_0_0, _in_field[i#6, j#6])
+    awaitall
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#16)
+    await _send_comp#3
+    awaitall
+  }
+  compute u16 i#6, u16 j#6 in [2:747:2 , 0:1:2] {
+    await receive(in_field_0_0_0, _in_field[i#6, j#6])
+    awaitall
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#16)
+    await _send_comp#3
+    awaitall
+  }
+  compute u16 i#7, u16 j#7 in [747:748:2 , 991:992:2] {
+    await receive(in_field_0_0_0, _in_field[i#7, j#7])
+    awaitall
+  }
+  compute u16 i#7, u16 j#7 in [1:747:2 , 991:992:2] {
+    await receive(in_field_0_0_0, _in_field[i#7, j#7])
+    awaitall
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#18)
+    await _send_comp#2
+    awaitall
+  }
+  compute u16 i#7, u16 j#7 in [2:747:2 , 991:992:2] {
+    await receive(in_field_0_0_0, _in_field[i#7, j#7])
+    awaitall
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#18)
+    await _send_comp#2
+    awaitall
+  }
+  compute u16 i#9, u16 j#9 in [747:748:2 , 2:991:2] {
+    await receive(in_field_0_0_0, _in_field[i#9, j#9])
+    awaitall
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#6)
+    await _send_comp
+    awaitall
+  }
+  compute u16 i#9, u16 j#9 in [747:748:2 , 3:991:2] {
+    await receive(in_field_0_0_0, _in_field[i#9, j#9])
+    awaitall
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#6)
+    await _send_comp
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [1:2:2 , 1:2:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#15) {
+      _temp_0_0_0#40[k] = x
+    }
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#8) {
+      _temp_0_0_0#41[k#1] = (_temp_0_0_0#40[k#1] + x#1)
+    }
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#9)
+    await _send_comp#1
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#13) {
+      _temp_0_0_0#42[k#2] = (_temp_0_0_0#41[k#2] + x#2)
+    }
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#10) {
+      _temp_0_0_0#43[k#3] = (_temp_0_0_0#42[k#3] + x#3)
+    }
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#11)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#44[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#9[k#5] = (_temp_0_0_0#44[k#5] + _temp_0_0_0#43[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#9[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#9, u16 j#9 in [747:748:2 , 1:2:2] {
+    await receive(in_field_0_0_0, _in_field[i#9, j#9])
+    awaitall
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#20)
+    await _send_comp
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [1:2:2 , 990:991:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#29) {
+      _temp_0_0_0#65[k] = x
+    }
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#22) {
+      _temp_0_0_0#66[k#1] = (_temp_0_0_0#65[k#1] + x#1)
+    }
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#23)
+    await _send_comp#1
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#26) {
+      _temp_0_0_0#67[k#2] = (_temp_0_0_0#66[k#2] + x#2)
+    }
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#27)
+    await _send_comp#2
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#25) {
+      _temp_0_0_0#68[k#3] = (_temp_0_0_0#67[k#3] + x#3)
+    }
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#69[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#14[k#5] = (_temp_0_0_0#69[k#5] + _temp_0_0_0#68[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#14[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [746:747:2 , 2:990:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#36) {
+      _temp_0_0_0#60[k] = x
+    }
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#37)
+    await _send_comp
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#31) {
+      _temp_0_0_0#61[k#1] = (_temp_0_0_0#60[k#1] + x#1)
+    }
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#34) {
+      _temp_0_0_0#62[k#2] = (_temp_0_0_0#61[k#2] + x#2)
+    }
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#35)
+    await _send_comp#2
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#33) {
+      _temp_0_0_0#63[k#3] = (_temp_0_0_0#62[k#3] + x#3)
+    }
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#32)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#64[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#13[k#5] = (_temp_0_0_0#64[k#5] + _temp_0_0_0#63[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#13[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [746:747:2 , 3:990:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#36) {
+      _temp_0_0_0#60[k] = x
+    }
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#37)
+    await _send_comp
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#31) {
+      _temp_0_0_0#61[k#1] = (_temp_0_0_0#60[k#1] + x#1)
+    }
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#35) {
+      _temp_0_0_0#62[k#2] = (_temp_0_0_0#61[k#2] + x#2)
+    }
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#34)
+    await _send_comp#2
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#32) {
+      _temp_0_0_0#63[k#3] = (_temp_0_0_0#62[k#3] + x#3)
+    }
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#33)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#64[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#13[k#5] = (_temp_0_0_0#64[k#5] + _temp_0_0_0#63[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#13[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [2:746:2 , 1:2:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#44) {
+      _temp_0_0_0#55[k] = x
+    }
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#45)
+    await _send_comp
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#39) {
+      _temp_0_0_0#56[k#1] = (_temp_0_0_0#55[k#1] + x#1)
+    }
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#38)
+    await _send_comp#1
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#43) {
+      _temp_0_0_0#57[k#2] = (_temp_0_0_0#56[k#2] + x#2)
+    }
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#40) {
+      _temp_0_0_0#58[k#3] = (_temp_0_0_0#57[k#3] + x#3)
+    }
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#41)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#59[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#12[k#5] = (_temp_0_0_0#59[k#5] + _temp_0_0_0#58[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#12[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [3:746:2 , 1:2:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#45) {
+      _temp_0_0_0#55[k] = x
+    }
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#44)
+    await _send_comp
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#38) {
+      _temp_0_0_0#56[k#1] = (_temp_0_0_0#55[k#1] + x#1)
+    }
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#39)
+    await _send_comp#1
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#43) {
+      _temp_0_0_0#57[k#2] = (_temp_0_0_0#56[k#2] + x#2)
+    }
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#40) {
+      _temp_0_0_0#58[k#3] = (_temp_0_0_0#57[k#3] + x#3)
+    }
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#41)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#59[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#12[k#5] = (_temp_0_0_0#59[k#5] + _temp_0_0_0#58[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#12[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [746:747:2 , 1:2:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#52) {
+      _temp_0_0_0#50[k] = x
+    }
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#53)
+    await _send_comp
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#47) {
+      _temp_0_0_0#51[k#1] = (_temp_0_0_0#50[k#1] + x#1)
+    }
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#51) {
+      _temp_0_0_0#52[k#2] = (_temp_0_0_0#51[k#2] + x#2)
+    }
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#48) {
+      _temp_0_0_0#53[k#3] = (_temp_0_0_0#52[k#3] + x#3)
+    }
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#49)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#54[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#11[k#5] = (_temp_0_0_0#54[k#5] + _temp_0_0_0#53[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#11[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [746:747:2 , 990:991:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#60) {
+      _temp_0_0_0#85[k] = x
+    }
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#61)
+    await _send_comp
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#55) {
+      _temp_0_0_0#86[k#1] = (_temp_0_0_0#85[k#1] + x#1)
+    }
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#58) {
+      _temp_0_0_0#87[k#2] = (_temp_0_0_0#86[k#2] + x#2)
+    }
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#59)
+    await _send_comp#2
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#57) {
+      _temp_0_0_0#88[k#3] = (_temp_0_0_0#87[k#3] + x#3)
+    }
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#89[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#18[k#5] = (_temp_0_0_0#89[k#5] + _temp_0_0_0#88[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#18[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [2:746:2 , 990:991:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#68) {
+      _temp_0_0_0#80[k] = x
+    }
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#69)
+    await _send_comp
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#63) {
+      _temp_0_0_0#81[k#1] = (_temp_0_0_0#80[k#1] + x#1)
+    }
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#62)
+    await _send_comp#1
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#66) {
+      _temp_0_0_0#82[k#2] = (_temp_0_0_0#81[k#2] + x#2)
+    }
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#67)
+    await _send_comp#2
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#65) {
+      _temp_0_0_0#83[k#3] = (_temp_0_0_0#82[k#3] + x#3)
+    }
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#84[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#17[k#5] = (_temp_0_0_0#84[k#5] + _temp_0_0_0#83[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#17[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [3:746:2 , 990:991:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#69) {
+      _temp_0_0_0#80[k] = x
+    }
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#68)
+    await _send_comp
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#62) {
+      _temp_0_0_0#81[k#1] = (_temp_0_0_0#80[k#1] + x#1)
+    }
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#63)
+    await _send_comp#1
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#66) {
+      _temp_0_0_0#82[k#2] = (_temp_0_0_0#81[k#2] + x#2)
+    }
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#67)
+    await _send_comp#2
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#65) {
+      _temp_0_0_0#83[k#3] = (_temp_0_0_0#82[k#3] + x#3)
+    }
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#84[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#17[k#5] = (_temp_0_0_0#84[k#5] + _temp_0_0_0#83[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#17[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [2:746:2 , 2:990:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#76) {
+      _temp_0_0_0#75[k] = x
+    }
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#77)
+    await _send_comp
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#71) {
+      _temp_0_0_0#76[k#1] = (_temp_0_0_0#75[k#1] + x#1)
+    }
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#70)
+    await _send_comp#1
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#74) {
+      _temp_0_0_0#77[k#2] = (_temp_0_0_0#76[k#2] + x#2)
+    }
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#75)
+    await _send_comp#2
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#73) {
+      _temp_0_0_0#78[k#3] = (_temp_0_0_0#77[k#3] + x#3)
+    }
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#72)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#79[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#16[k#5] = (_temp_0_0_0#79[k#5] + _temp_0_0_0#78[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#16[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [2:746:2 , 3:990:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#76) {
+      _temp_0_0_0#75[k] = x
+    }
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#77)
+    await _send_comp
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#71) {
+      _temp_0_0_0#76[k#1] = (_temp_0_0_0#75[k#1] + x#1)
+    }
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#70)
+    await _send_comp#1
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#75) {
+      _temp_0_0_0#77[k#2] = (_temp_0_0_0#76[k#2] + x#2)
+    }
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#74)
+    await _send_comp#2
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#72) {
+      _temp_0_0_0#78[k#3] = (_temp_0_0_0#77[k#3] + x#3)
+    }
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#73)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#79[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#16[k#5] = (_temp_0_0_0#79[k#5] + _temp_0_0_0#78[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#16[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [3:746:2 , 2:990:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#77) {
+      _temp_0_0_0#75[k] = x
+    }
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#76)
+    await _send_comp
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#70) {
+      _temp_0_0_0#76[k#1] = (_temp_0_0_0#75[k#1] + x#1)
+    }
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#71)
+    await _send_comp#1
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#74) {
+      _temp_0_0_0#77[k#2] = (_temp_0_0_0#76[k#2] + x#2)
+    }
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#75)
+    await _send_comp#2
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#73) {
+      _temp_0_0_0#78[k#3] = (_temp_0_0_0#77[k#3] + x#3)
+    }
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#72)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#79[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#16[k#5] = (_temp_0_0_0#79[k#5] + _temp_0_0_0#78[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#16[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [3:746:2 , 3:990:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#77) {
+      _temp_0_0_0#75[k] = x
+    }
+    completion _send_comp = send(in_field_0_0_0, _stream_in_field#76)
+    await _send_comp
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#70) {
+      _temp_0_0_0#76[k#1] = (_temp_0_0_0#75[k#1] + x#1)
+    }
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#71)
+    await _send_comp#1
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#75) {
+      _temp_0_0_0#77[k#2] = (_temp_0_0_0#76[k#2] + x#2)
+    }
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#74)
+    await _send_comp#2
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#72) {
+      _temp_0_0_0#78[k#3] = (_temp_0_0_0#77[k#3] + x#3)
+    }
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#73)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#79[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#16[k#5] = (_temp_0_0_0#79[k#5] + _temp_0_0_0#78[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#16[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [1:2:2 , 2:990:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#85) {
+      _temp_0_0_0#70[k] = x
+    }
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#78) {
+      _temp_0_0_0#71[k#1] = (_temp_0_0_0#70[k#1] + x#1)
+    }
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#79)
+    await _send_comp#1
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#82) {
+      _temp_0_0_0#72[k#2] = (_temp_0_0_0#71[k#2] + x#2)
+    }
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#83)
+    await _send_comp#2
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#81) {
+      _temp_0_0_0#73[k#3] = (_temp_0_0_0#72[k#3] + x#3)
+    }
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#80)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#74[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#15[k#5] = (_temp_0_0_0#74[k#5] + _temp_0_0_0#73[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#15[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [1:2:2 , 3:990:2] {
+    await receive(in_field_0_0_0, _in_field[i#5, j#5])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_in_field#85) {
+      _temp_0_0_0#70[k] = x
+    }
+    await _recv_comp
+    completion _recv_comp#1 = foreach i32 k#1, f32 x#1 in [0:320:1], receive(_stream_in_field#78) {
+      _temp_0_0_0#71[k#1] = (_temp_0_0_0#70[k#1] + x#1)
+    }
+    completion _send_comp#1 = send(in_field_0_0_0, _stream_in_field#79)
+    await _send_comp#1
+    await _recv_comp#1
+    completion _recv_comp#2 = foreach i32 k#2, f32 x#2 in [0:320:1], receive(_stream_in_field#83) {
+      _temp_0_0_0#72[k#2] = (_temp_0_0_0#71[k#2] + x#2)
+    }
+    completion _send_comp#2 = send(in_field_0_0_0, _stream_in_field#82)
+    await _send_comp#2
+    await _recv_comp#2
+    completion _recv_comp#3 = foreach i32 k#3, f32 x#3 in [0:320:1], receive(_stream_in_field#80) {
+      _temp_0_0_0#73[k#3] = (_temp_0_0_0#72[k#3] + x#3)
+    }
+    completion _send_comp#3 = send(in_field_0_0_0, _stream_in_field#81)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#4 in [0:320:1] {
+      _temp_0_0_0#74[k#4] = (-4.0 * in_field_0_0_0[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      out_field_0_0_0#15[k#5] = (_temp_0_0_0#74[k#5] + _temp_0_0_0#73[k#5])
+    }
+    await map i32 k#6 in [0:320:1] {
+      out_field_0_0_0[k#6] = out_field_0_0_0#15[k#6]
+    }
+    awaitall
+    await send(out_field_0_0_0, __kernel_out_0[(i#5 - 1), (j#5 - 1)])
+    awaitall
+  }
+}
\ No newline at end of file
diff --git a/samples/benchmarks/sweep_hardware.sh b/samples/benchmarks/sweep_hardware.sh
index e0987f2f..8b222a99 100755
--- a/samples/benchmarks/sweep_hardware.sh
+++ b/samples/benchmarks/sweep_hardware.sh
@@ -7,7 +7,7 @@ BLUE='\033[0;34m'
 NC='\033[0m'
 
 BENCHMARK_DIR="samples/benchmarks"
-RUNTIME="spatialstencil/runtime/runtime.py"
+RUNTIME="spada/runtime/runtime.py"
 OUTPUT_DIR="${BM_OUTPUT_DIR:-benchmark_results}"
 
 mkdir $OUTPUT_DIR
diff --git a/samples/benchmarks/uvbke_746_990_320.sptl b/samples/benchmarks/uvbke_746_990_320.sptl
new file mode 100644
index 00000000..bd0ff33b
--- /dev/null
+++ b/samples/benchmarks/uvbke_746_990_320.sptl
@@ -0,0 +1,1260 @@
+kernel @uvbke<>(stream<f32, 320>[746, 991] readonly _arg0, stream<f32, 320>[747, 990] readonly _arg1, stream<f32, 320>[746, 990] readonly _arg2, stream<f32, 320>[746, 990] readonly _arg3, stream<f32, 320>[746, 990] writeonly __kernel_out_0, stream<f32, 320>[746, 990] writeonly __kernel_out_1) {
+  place u16 i#2, u16 j#2 in [0:1:2 , 990:991:2] {
+    f32[320] arg1_0_0_0
+  }
+  place u16 i#2, u16 j#2 in [0:1:2 , 1:990:2] {
+    f32[320] arg1_0_0_0
+  }
+  place u16 i#2, u16 j#2 in [0:1:2 , 2:990:2] {
+    f32[320] arg1_0_0_0
+  }
+  place u16 i, u16 j in [1:746:2 , 990:991:2] {
+    f32[320] arg2_0_0_0
+    f32[320] arg3_0_0_0
+    f32[320] arg4_0_0_0
+    f32[320] arg5_0_0_0
+    f32[320] arg0_0_0_0
+    f32[320] arg1_0_0_0
+    f32[320] i21_0_0_0#1
+    f32[320] _temp_0_0_0#3
+    f32[320] arg5_0_0_0#1
+    f32[320] i16_0_0_0#2
+    f32[320] _temp_0_0_0#1
+    f32[320] i19_0_0_0#2
+    f32[320] i21_0_0_0
+    f32[320] _temp_0_0_0#4
+    f32[320] arg4_0_0_0#1
+    f32[320] i16_0_0_0#3
+    f32[320] _temp_0_0_0#5
+    f32[320] i19_0_0_0#3
+  }
+  place u16 i, u16 j in [2:746:2 , 990:991:2] {
+    f32[320] arg2_0_0_0
+    f32[320] arg3_0_0_0
+    f32[320] arg4_0_0_0
+    f32[320] arg5_0_0_0
+    f32[320] arg0_0_0_0
+    f32[320] arg1_0_0_0
+    f32[320] i21_0_0_0#1
+    f32[320] _temp_0_0_0#3
+    f32[320] arg5_0_0_0#1
+    f32[320] i16_0_0_0#2
+    f32[320] _temp_0_0_0#1
+    f32[320] i19_0_0_0#2
+    f32[320] i21_0_0_0
+    f32[320] _temp_0_0_0#4
+    f32[320] arg4_0_0_0#1
+    f32[320] i16_0_0_0#3
+    f32[320] _temp_0_0_0#5
+    f32[320] i19_0_0_0#3
+  }
+  place u16 i, u16 j in [746:747:2 , 990:991:2] {
+    f32[320] arg2_0_0_0
+    f32[320] arg3_0_0_0
+    f32[320] arg4_0_0_0
+    f32[320] arg5_0_0_0
+    f32[320] arg0_0_0_0
+    f32[320] arg1_0_0_0
+    f32[320] i21_0_0_0#2
+    f32[320] _temp_0_0_0#6
+    f32[320] arg5_0_0_0#2
+    f32[320] i16_0_0_0#4
+    f32[320] _temp_0_0_0#7
+    f32[320] i19_0_0_0#4
+    f32[320] i21_0_0_0#3
+    f32[320] _temp_0_0_0#8
+    f32[320] arg4_0_0_0#2
+    f32[320] i16_0_0_0#5
+    f32[320] _temp_0_0_0#9
+    f32[320] i19_0_0_0#5
+  }
+  place u16 i, u16 j in [1:746:2 , 1:990:2] {
+    f32[320] arg2_0_0_0
+    f32[320] arg3_0_0_0
+    f32[320] arg4_0_0_0
+    f32[320] arg5_0_0_0
+    f32[320] arg0_0_0_0
+    f32[320] arg1_0_0_0
+    f32[320] i21_0_0_0#4
+    f32[320] _temp_0_0_0#10
+    f32[320] arg5_0_0_0#3
+    f32[320] i16_0_0_0#6
+    f32[320] _temp_0_0_0#11
+    f32[320] i19_0_0_0#6
+    f32[320] i21_0_0_0#5
+    f32[320] _temp_0_0_0#12
+    f32[320] arg4_0_0_0#3
+    f32[320] i16_0_0_0#7
+    f32[320] _temp_0_0_0#13
+    f32[320] i19_0_0_0#7
+  }
+  place u16 i, u16 j in [1:746:2 , 2:990:2] {
+    f32[320] arg2_0_0_0
+    f32[320] arg3_0_0_0
+    f32[320] arg4_0_0_0
+    f32[320] arg5_0_0_0
+    f32[320] arg0_0_0_0
+    f32[320] arg1_0_0_0
+    f32[320] i21_0_0_0#4
+    f32[320] _temp_0_0_0#10
+    f32[320] arg5_0_0_0#3
+    f32[320] i16_0_0_0#6
+    f32[320] _temp_0_0_0#11
+    f32[320] i19_0_0_0#6
+    f32[320] i21_0_0_0#5
+    f32[320] _temp_0_0_0#12
+    f32[320] arg4_0_0_0#3
+    f32[320] i16_0_0_0#7
+    f32[320] _temp_0_0_0#13
+    f32[320] i19_0_0_0#7
+  }
+  place u16 i, u16 j in [2:746:2 , 1:990:2] {
+    f32[320] arg2_0_0_0
+    f32[320] arg3_0_0_0
+    f32[320] arg4_0_0_0
+    f32[320] arg5_0_0_0
+    f32[320] arg0_0_0_0
+    f32[320] arg1_0_0_0
+    f32[320] i21_0_0_0#4
+    f32[320] _temp_0_0_0#10
+    f32[320] arg5_0_0_0#3
+    f32[320] i16_0_0_0#6
+    f32[320] _temp_0_0_0#11
+    f32[320] i19_0_0_0#6
+    f32[320] i21_0_0_0#5
+    f32[320] _temp_0_0_0#12
+    f32[320] arg4_0_0_0#3
+    f32[320] i16_0_0_0#7
+    f32[320] _temp_0_0_0#13
+    f32[320] i19_0_0_0#7
+  }
+  place u16 i, u16 j in [2:746:2 , 2:990:2] {
+    f32[320] arg2_0_0_0
+    f32[320] arg3_0_0_0
+    f32[320] arg4_0_0_0
+    f32[320] arg5_0_0_0
+    f32[320] arg0_0_0_0
+    f32[320] arg1_0_0_0
+    f32[320] i21_0_0_0#4
+    f32[320] _temp_0_0_0#10
+    f32[320] arg5_0_0_0#3
+    f32[320] i16_0_0_0#6
+    f32[320] _temp_0_0_0#11
+    f32[320] i19_0_0_0#6
+    f32[320] i21_0_0_0#5
+    f32[320] _temp_0_0_0#12
+    f32[320] arg4_0_0_0#3
+    f32[320] i16_0_0_0#7
+    f32[320] _temp_0_0_0#13
+    f32[320] i19_0_0_0#7
+  }
+  place u16 i, u16 j in [746:747:2 , 1:990:2] {
+    f32[320] arg2_0_0_0
+    f32[320] arg3_0_0_0
+    f32[320] arg4_0_0_0
+    f32[320] arg5_0_0_0
+    f32[320] arg0_0_0_0
+    f32[320] arg1_0_0_0
+    f32[320] i21_0_0_0#6
+    f32[320] _temp_0_0_0#14
+    f32[320] arg5_0_0_0#4
+    f32[320] i16_0_0_0#8
+    f32[320] _temp_0_0_0#15
+    f32[320] i19_0_0_0#8
+    f32[320] i21_0_0_0#7
+    f32[320] _temp_0_0_0#16
+    f32[320] arg4_0_0_0#4
+    f32[320] i16_0_0_0#9
+    f32[320] _temp_0_0_0#17
+    f32[320] i19_0_0_0#9
+  }
+  place u16 i, u16 j in [746:747:2 , 2:990:2] {
+    f32[320] arg2_0_0_0
+    f32[320] arg3_0_0_0
+    f32[320] arg4_0_0_0
+    f32[320] arg5_0_0_0
+    f32[320] arg0_0_0_0
+    f32[320] arg1_0_0_0
+    f32[320] i21_0_0_0#6
+    f32[320] _temp_0_0_0#14
+    f32[320] arg5_0_0_0#4
+    f32[320] i16_0_0_0#8
+    f32[320] _temp_0_0_0#15
+    f32[320] i19_0_0_0#8
+    f32[320] i21_0_0_0#7
+    f32[320] _temp_0_0_0#16
+    f32[320] arg4_0_0_0#4
+    f32[320] i16_0_0_0#9
+    f32[320] _temp_0_0_0#17
+    f32[320] i19_0_0_0#9
+  }
+  place u16 i#1, u16 j#1 in [1:747:2 , 0:1:2] {
+    f32[320] arg0_0_0_0
+  }
+  place u16 i#1, u16 j#1 in [2:747:2 , 0:1:2] {
+    f32[320] arg0_0_0_0
+  }
+  place u16 i#7, u16 j#7 in [747:748:2 , 990:991:2] {
+    f32[320] i16_0_0_0
+    f32[320] _temp_0_0_0
+    f32[320] i19_0_0_0#1
+  }
+  place u16 i#8, u16 j#8 in [1:747:2 , 991:992:2] {
+    f32[320] i19_0_0_0
+    f32[320] i16_0_0_0#1
+    f32[320] _temp_0_0_0#2
+  }
+  place u16 i#8, u16 j#8 in [2:747:2 , 991:992:2] {
+    f32[320] i19_0_0_0
+    f32[320] i16_0_0_0#1
+    f32[320] _temp_0_0_0#2
+  }
+  place u16 i#7, u16 j#7 in [747:748:2 , 1:990:2] {
+    f32[320] i16_0_0_0
+    f32[320] _temp_0_0_0
+    f32[320] i19_0_0_0#1
+  }
+  place u16 i#7, u16 j#7 in [747:748:2 , 2:990:2] {
+    f32[320] i16_0_0_0
+    f32[320] _temp_0_0_0
+    f32[320] i19_0_0_0#1
+  }
+  dataflow u16 i#9, u16 j#9 in [0:1:2 , 990:991:2] {
+    stream<f32> _stream_arg1#2 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 0
+}
+    stream<f32> _stream_arg1#3 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 1
+}
+    stream<f32> _stream_arg1#4 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 2
+}
+    stream<f32> _stream_arg1#5 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 3
+}
+  }
+  dataflow u16 i#9, u16 j#9 in [0:1:2 , 1:990:2] {
+    stream<f32> _stream_arg1#6 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 4
+}
+    stream<f32> _stream_arg1#7 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 5
+}
+    stream<f32> _stream_arg1#8 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 6
+}
+    stream<f32> _stream_arg1#9 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 7
+}
+  }
+  dataflow u16 i#9, u16 j#9 in [0:1:2 , 2:990:2] {
+    stream<f32> _stream_arg1#6 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 4
+}
+    stream<f32> _stream_arg1#7 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 5
+}
+    stream<f32> _stream_arg1#8 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 6
+}
+    stream<f32> _stream_arg1#9 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 7
+}
+  }
+  dataflow u16 i#10, u16 j#10 in [1:746:2 , 990:991:2] {
+    stream<f32> _stream_arg1#10 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 8
+}
+    stream<f32> _stream_arg1#11 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 9
+}
+    stream<f32> _stream_arg0#2 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 10
+}
+    stream<f32> _stream_arg0#3 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 11
+}
+    stream<f32> _stream_arg0#4 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 12
+}
+    stream<f32> _stream_arg0#5 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 13
+}
+    stream<f32> _stream_arg1#12 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 14
+}
+    stream<f32> _stream_arg1#13 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 15
+}
+  }
+  dataflow u16 i#10, u16 j#10 in [2:746:2 , 990:991:2] {
+    stream<f32> _stream_arg1#10 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 8
+}
+    stream<f32> _stream_arg1#11 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 9
+}
+    stream<f32> _stream_arg0#2 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 10
+}
+    stream<f32> _stream_arg0#3 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 11
+}
+    stream<f32> _stream_arg0#4 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 12
+}
+    stream<f32> _stream_arg0#5 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 13
+}
+    stream<f32> _stream_arg1#12 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 14
+}
+    stream<f32> _stream_arg1#13 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 15
+}
+  }
+  dataflow u16 i#10, u16 j#10 in [746:747:2 , 990:991:2] {
+    stream<f32> _stream_arg1#14 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 16
+}
+    stream<f32> _stream_arg1#15 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 17
+}
+    stream<f32> _stream_arg0#6 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 18
+}
+    stream<f32> _stream_arg0#7 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 19
+}
+    stream<f32> _stream_arg0#8 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 20
+}
+    stream<f32> _stream_arg0#9 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 21
+}
+    stream<f32> _stream_arg1#16 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 22
+}
+    stream<f32> _stream_arg1#17 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 23
+}
+  }
+  dataflow u16 i#10, u16 j#10 in [1:746:2 , 1:990:2] {
+    stream<f32> _stream_arg1#18 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 24
+}
+    stream<f32> _stream_arg1#19 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 25
+}
+    stream<f32> _stream_arg0#10 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 26
+}
+    stream<f32> _stream_arg0#11 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 27
+}
+    stream<f32> _stream_arg0#12 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 28
+}
+    stream<f32> _stream_arg0#13 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 29
+}
+    stream<f32> _stream_arg1#20 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 30
+}
+    stream<f32> _stream_arg1#21 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 31
+}
+  }
+  dataflow u16 i#10, u16 j#10 in [1:746:2 , 2:990:2] {
+    stream<f32> _stream_arg1#18 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 24
+}
+    stream<f32> _stream_arg1#19 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 25
+}
+    stream<f32> _stream_arg0#10 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 26
+}
+    stream<f32> _stream_arg0#11 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 27
+}
+    stream<f32> _stream_arg0#12 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 28
+}
+    stream<f32> _stream_arg0#13 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 29
+}
+    stream<f32> _stream_arg1#20 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 30
+}
+    stream<f32> _stream_arg1#21 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 31
+}
+  }
+  dataflow u16 i#10, u16 j#10 in [2:746:2 , 1:990:2] {
+    stream<f32> _stream_arg1#18 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 24
+}
+    stream<f32> _stream_arg1#19 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 25
+}
+    stream<f32> _stream_arg0#10 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 26
+}
+    stream<f32> _stream_arg0#11 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 27
+}
+    stream<f32> _stream_arg0#12 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 28
+}
+    stream<f32> _stream_arg0#13 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 29
+}
+    stream<f32> _stream_arg1#20 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 30
+}
+    stream<f32> _stream_arg1#21 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 31
+}
+  }
+  dataflow u16 i#10, u16 j#10 in [2:746:2 , 2:990:2] {
+    stream<f32> _stream_arg1#18 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 24
+}
+    stream<f32> _stream_arg1#19 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 25
+}
+    stream<f32> _stream_arg0#10 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 26
+}
+    stream<f32> _stream_arg0#11 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 27
+}
+    stream<f32> _stream_arg0#12 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 28
+}
+    stream<f32> _stream_arg0#13 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 29
+}
+    stream<f32> _stream_arg1#20 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 30
+}
+    stream<f32> _stream_arg1#21 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 31
+}
+  }
+  dataflow u16 i#10, u16 j#10 in [746:747:2 , 1:990:2] {
+    stream<f32> _stream_arg1#22 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 32
+}
+    stream<f32> _stream_arg1#23 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 33
+}
+    stream<f32> _stream_arg0#14 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 34
+}
+    stream<f32> _stream_arg0#15 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 35
+}
+    stream<f32> _stream_arg0#16 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 36
+}
+    stream<f32> _stream_arg0#17 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 37
+}
+    stream<f32> _stream_arg1#24 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 38
+}
+    stream<f32> _stream_arg1#25 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 39
+}
+  }
+  dataflow u16 i#10, u16 j#10 in [746:747:2 , 2:990:2] {
+    stream<f32> _stream_arg1#22 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 32
+}
+    stream<f32> _stream_arg1#23 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 33
+}
+    stream<f32> _stream_arg0#14 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 34
+}
+    stream<f32> _stream_arg0#15 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 35
+}
+    stream<f32> _stream_arg0#16 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 36
+}
+    stream<f32> _stream_arg0#17 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 37
+}
+    stream<f32> _stream_arg1#24 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 38
+}
+    stream<f32> _stream_arg1#25 = relative_stream(1, 0) {
+  hops = [(1, 0)], 
+  channel = 39
+}
+  }
+  dataflow u16 i#11, u16 j#11 in [1:747:2 , 0:1:2] {
+    stream<f32> _stream_arg0#18 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 40
+}
+    stream<f32> _stream_arg0#19 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 41
+}
+    stream<f32> _stream_arg0#20 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 42
+}
+    stream<f32> _stream_arg0#21 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 43
+}
+  }
+  dataflow u16 i#11, u16 j#11 in [2:747:2 , 0:1:2] {
+    stream<f32> _stream_arg0#18 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 40
+}
+    stream<f32> _stream_arg0#19 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 41
+}
+    stream<f32> _stream_arg0#20 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 42
+}
+    stream<f32> _stream_arg0#21 = relative_stream(0, 1) {
+  hops = [(0, 1)], 
+  channel = 43
+}
+  }
+  compute u16 i#5, u16 j#5 in [0:1:2 , 990:991:2] {
+    await receive(arg1_0_0_0, _arg1[i#5, j#5])
+    awaitall
+    completion _send_comp = send(arg1_0_0_0, _stream_arg1#2)
+    await _send_comp
+    completion _send_comp#3 = send(arg1_0_0_0, _stream_arg1#4)
+    await _send_comp#3
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [0:1:2 , 1:990:2] {
+    await receive(arg1_0_0_0, _arg1[i#5, j#5])
+    awaitall
+    completion _send_comp = send(arg1_0_0_0, _stream_arg1#6)
+    await _send_comp
+    completion _send_comp#3 = send(arg1_0_0_0, _stream_arg1#8)
+    await _send_comp#3
+    awaitall
+  }
+  compute u16 i#5, u16 j#5 in [0:1:2 , 2:990:2] {
+    await receive(arg1_0_0_0, _arg1[i#5, j#5])
+    awaitall
+    completion _send_comp = send(arg1_0_0_0, _stream_arg1#6)
+    await _send_comp
+    completion _send_comp#3 = send(arg1_0_0_0, _stream_arg1#8)
+    await _send_comp#3
+    awaitall
+  }
+  compute u16 i#3, u16 j#3 in [1:746:2 , 990:991:2] {
+    await receive(arg2_0_0_0, _arg2[i#3, j#3])
+    await receive(arg3_0_0_0, _arg3[i#3, j#3])
+    await receive(arg0_0_0_0, _arg0[i#3, j#3])
+    await receive(arg1_0_0_0, _arg1[i#3, j#3])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_arg1#10) {
+      _temp_0_0_0#1[k] = (arg1_0_0_0[k] + x)
+    }
+    completion _send_comp = send(arg1_0_0_0, _stream_arg1#11)
+    await _send_comp
+    await _recv_comp
+    await map i32 k#1 in [0:320:1] {
+      i16_0_0_0#2[k#1] = (_temp_0_0_0#1[k#1] * arg2_0_0_0[k#1])
+    }
+    completion _recv_comp#1 = foreach i32 k#2, f32 x#1 in [0:320:1], receive(_stream_arg0#3) {
+      i19_0_0_0#2[k#2] = (arg0_0_0_0[k#2] + x#1)
+    }
+    await _recv_comp#1
+    await map i32 k#3 in [0:320:1] {
+      _temp_0_0_0#4[k#3] = (i19_0_0_0#2[k#3] - i16_0_0_0#2[k#3])
+    }
+    await map i32 k#4 in [0:320:1] {
+      i21_0_0_0[k#4] = (112.5 * _temp_0_0_0#4[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      arg4_0_0_0#1[k#5] = (arg3_0_0_0[k#5] * i21_0_0_0[k#5])
+    }
+    completion _recv_comp#2 = foreach i32 k#6, f32 x#2 in [0:320:1], receive(_stream_arg0#5) {
+      _temp_0_0_0#5[k#6] = (arg0_0_0_0[k#6] + x#2)
+    }
+    await _recv_comp#2
+    await map i32 k#7 in [0:320:1] {
+      i16_0_0_0#3[k#7] = (_temp_0_0_0#5[k#7] * arg2_0_0_0[k#7])
+    }
+    completion _recv_comp#3 = foreach i32 k#8, f32 x#3 in [0:320:1], receive(_stream_arg1#12) {
+      i19_0_0_0#3[k#8] = (arg1_0_0_0[k#8] + x#3)
+    }
+    completion _send_comp#3 = send(arg1_0_0_0, _stream_arg1#13)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#9 in [0:320:1] {
+      _temp_0_0_0#3[k#9] = (i19_0_0_0#3[k#9] - i16_0_0_0#3[k#9])
+    }
+    await map i32 k#10 in [0:320:1] {
+      i21_0_0_0#1[k#10] = (112.5 * _temp_0_0_0#3[k#10])
+    }
+    await map i32 k#11 in [0:320:1] {
+      arg5_0_0_0#1[k#11] = (arg3_0_0_0[k#11] * i21_0_0_0#1[k#11])
+    }
+    await map i32 k#12 in [0:320:1] {
+      arg4_0_0_0[k#12] = arg4_0_0_0#1[k#12]
+    }
+    await map i32 k#13 in [0:320:1] {
+      arg5_0_0_0[k#13] = arg5_0_0_0#1[k#13]
+    }
+    awaitall
+    await send(arg4_0_0_0, __kernel_out_0[(i#3 - 1), (j#3 - 1)])
+    awaitall
+    await send(arg5_0_0_0, __kernel_out_1[(i#3 - 1), (j#3 - 1)])
+    awaitall
+  }
+  compute u16 i#3, u16 j#3 in [2:746:2 , 990:991:2] {
+    await receive(arg2_0_0_0, _arg2[i#3, j#3])
+    await receive(arg3_0_0_0, _arg3[i#3, j#3])
+    await receive(arg0_0_0_0, _arg0[i#3, j#3])
+    await receive(arg1_0_0_0, _arg1[i#3, j#3])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_arg1#11) {
+      _temp_0_0_0#1[k] = (arg1_0_0_0[k] + x)
+    }
+    completion _send_comp = send(arg1_0_0_0, _stream_arg1#10)
+    await _send_comp
+    await _recv_comp
+    await map i32 k#1 in [0:320:1] {
+      i16_0_0_0#2[k#1] = (_temp_0_0_0#1[k#1] * arg2_0_0_0[k#1])
+    }
+    completion _recv_comp#1 = foreach i32 k#2, f32 x#1 in [0:320:1], receive(_stream_arg0#3) {
+      i19_0_0_0#2[k#2] = (arg0_0_0_0[k#2] + x#1)
+    }
+    await _recv_comp#1
+    await map i32 k#3 in [0:320:1] {
+      _temp_0_0_0#4[k#3] = (i19_0_0_0#2[k#3] - i16_0_0_0#2[k#3])
+    }
+    await map i32 k#4 in [0:320:1] {
+      i21_0_0_0[k#4] = (112.5 * _temp_0_0_0#4[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      arg4_0_0_0#1[k#5] = (arg3_0_0_0[k#5] * i21_0_0_0[k#5])
+    }
+    completion _recv_comp#2 = foreach i32 k#6, f32 x#2 in [0:320:1], receive(_stream_arg0#5) {
+      _temp_0_0_0#5[k#6] = (arg0_0_0_0[k#6] + x#2)
+    }
+    await _recv_comp#2
+    await map i32 k#7 in [0:320:1] {
+      i16_0_0_0#3[k#7] = (_temp_0_0_0#5[k#7] * arg2_0_0_0[k#7])
+    }
+    completion _recv_comp#3 = foreach i32 k#8, f32 x#3 in [0:320:1], receive(_stream_arg1#13) {
+      i19_0_0_0#3[k#8] = (arg1_0_0_0[k#8] + x#3)
+    }
+    completion _send_comp#3 = send(arg1_0_0_0, _stream_arg1#12)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#9 in [0:320:1] {
+      _temp_0_0_0#3[k#9] = (i19_0_0_0#3[k#9] - i16_0_0_0#3[k#9])
+    }
+    await map i32 k#10 in [0:320:1] {
+      i21_0_0_0#1[k#10] = (112.5 * _temp_0_0_0#3[k#10])
+    }
+    await map i32 k#11 in [0:320:1] {
+      arg5_0_0_0#1[k#11] = (arg3_0_0_0[k#11] * i21_0_0_0#1[k#11])
+    }
+    await map i32 k#12 in [0:320:1] {
+      arg4_0_0_0[k#12] = arg4_0_0_0#1[k#12]
+    }
+    await map i32 k#13 in [0:320:1] {
+      arg5_0_0_0[k#13] = arg5_0_0_0#1[k#13]
+    }
+    awaitall
+    await send(arg4_0_0_0, __kernel_out_0[(i#3 - 1), (j#3 - 1)])
+    awaitall
+    await send(arg5_0_0_0, __kernel_out_1[(i#3 - 1), (j#3 - 1)])
+    awaitall
+  }
+  compute u16 i#3, u16 j#3 in [746:747:2 , 990:991:2] {
+    await receive(arg2_0_0_0, _arg2[i#3, j#3])
+    await receive(arg3_0_0_0, _arg3[i#3, j#3])
+    await receive(arg0_0_0_0, _arg0[i#3, j#3])
+    await receive(arg1_0_0_0, _arg1[i#3, j#3])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_arg1#15) {
+      _temp_0_0_0#7[k] = (arg1_0_0_0[k] + x)
+    }
+    await _recv_comp
+    await map i32 k#1 in [0:320:1] {
+      i16_0_0_0#4[k#1] = (_temp_0_0_0#7[k#1] * arg2_0_0_0[k#1])
+    }
+    completion _recv_comp#1 = foreach i32 k#2, f32 x#1 in [0:320:1], receive(_stream_arg0#7) {
+      i19_0_0_0#4[k#2] = (arg0_0_0_0[k#2] + x#1)
+    }
+    await _recv_comp#1
+    await map i32 k#3 in [0:320:1] {
+      _temp_0_0_0#8[k#3] = (i19_0_0_0#4[k#3] - i16_0_0_0#4[k#3])
+    }
+    await map i32 k#4 in [0:320:1] {
+      i21_0_0_0#3[k#4] = (112.5 * _temp_0_0_0#8[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      arg4_0_0_0#2[k#5] = (arg3_0_0_0[k#5] * i21_0_0_0#3[k#5])
+    }
+    completion _recv_comp#2 = foreach i32 k#6, f32 x#2 in [0:320:1], receive(_stream_arg0#9) {
+      _temp_0_0_0#9[k#6] = (arg0_0_0_0[k#6] + x#2)
+    }
+    await _recv_comp#2
+    await map i32 k#7 in [0:320:1] {
+      i16_0_0_0#5[k#7] = (_temp_0_0_0#9[k#7] * arg2_0_0_0[k#7])
+    }
+    completion _recv_comp#3 = foreach i32 k#8, f32 x#3 in [0:320:1], receive(_stream_arg1#17) {
+      i19_0_0_0#5[k#8] = (arg1_0_0_0[k#8] + x#3)
+    }
+    await _recv_comp#3
+    await map i32 k#9 in [0:320:1] {
+      _temp_0_0_0#6[k#9] = (i19_0_0_0#5[k#9] - i16_0_0_0#5[k#9])
+    }
+    await map i32 k#10 in [0:320:1] {
+      i21_0_0_0#2[k#10] = (112.5 * _temp_0_0_0#6[k#10])
+    }
+    await map i32 k#11 in [0:320:1] {
+      arg5_0_0_0#2[k#11] = (arg3_0_0_0[k#11] * i21_0_0_0#2[k#11])
+    }
+    await map i32 k#12 in [0:320:1] {
+      arg4_0_0_0[k#12] = arg4_0_0_0#2[k#12]
+    }
+    await map i32 k#13 in [0:320:1] {
+      arg5_0_0_0[k#13] = arg5_0_0_0#2[k#13]
+    }
+    awaitall
+    await send(arg4_0_0_0, __kernel_out_0[(i#3 - 1), (j#3 - 1)])
+    awaitall
+    await send(arg5_0_0_0, __kernel_out_1[(i#3 - 1), (j#3 - 1)])
+    awaitall
+  }
+  compute u16 i#3, u16 j#3 in [1:746:2 , 1:990:2] {
+    await receive(arg2_0_0_0, _arg2[i#3, j#3])
+    await receive(arg3_0_0_0, _arg3[i#3, j#3])
+    await receive(arg0_0_0_0, _arg0[i#3, j#3])
+    await receive(arg1_0_0_0, _arg1[i#3, j#3])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_arg1#18) {
+      _temp_0_0_0#11[k] = (arg1_0_0_0[k] + x)
+    }
+    completion _send_comp = send(arg1_0_0_0, _stream_arg1#19)
+    await _send_comp
+    await _recv_comp
+    await map i32 k#1 in [0:320:1] {
+      i16_0_0_0#6[k#1] = (_temp_0_0_0#11[k#1] * arg2_0_0_0[k#1])
+    }
+    completion _recv_comp#1 = foreach i32 k#2, f32 x#1 in [0:320:1], receive(_stream_arg0#10) {
+      i19_0_0_0#6[k#2] = (arg0_0_0_0[k#2] + x#1)
+    }
+    completion _send_comp#1 = send(arg0_0_0_0, _stream_arg0#11)
+    await _send_comp#1
+    await _recv_comp#1
+    await map i32 k#3 in [0:320:1] {
+      _temp_0_0_0#12[k#3] = (i19_0_0_0#6[k#3] - i16_0_0_0#6[k#3])
+    }
+    await map i32 k#4 in [0:320:1] {
+      i21_0_0_0#5[k#4] = (112.5 * _temp_0_0_0#12[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      arg4_0_0_0#3[k#5] = (arg3_0_0_0[k#5] * i21_0_0_0#5[k#5])
+    }
+    completion _recv_comp#2 = foreach i32 k#6, f32 x#2 in [0:320:1], receive(_stream_arg0#12) {
+      _temp_0_0_0#13[k#6] = (arg0_0_0_0[k#6] + x#2)
+    }
+    completion _send_comp#2 = send(arg0_0_0_0, _stream_arg0#13)
+    await _send_comp#2
+    await _recv_comp#2
+    await map i32 k#7 in [0:320:1] {
+      i16_0_0_0#7[k#7] = (_temp_0_0_0#13[k#7] * arg2_0_0_0[k#7])
+    }
+    completion _recv_comp#3 = foreach i32 k#8, f32 x#3 in [0:320:1], receive(_stream_arg1#20) {
+      i19_0_0_0#7[k#8] = (arg1_0_0_0[k#8] + x#3)
+    }
+    completion _send_comp#3 = send(arg1_0_0_0, _stream_arg1#21)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#9 in [0:320:1] {
+      _temp_0_0_0#10[k#9] = (i19_0_0_0#7[k#9] - i16_0_0_0#7[k#9])
+    }
+    await map i32 k#10 in [0:320:1] {
+      i21_0_0_0#4[k#10] = (112.5 * _temp_0_0_0#10[k#10])
+    }
+    await map i32 k#11 in [0:320:1] {
+      arg5_0_0_0#3[k#11] = (arg3_0_0_0[k#11] * i21_0_0_0#4[k#11])
+    }
+    await map i32 k#12 in [0:320:1] {
+      arg4_0_0_0[k#12] = arg4_0_0_0#3[k#12]
+    }
+    await map i32 k#13 in [0:320:1] {
+      arg5_0_0_0[k#13] = arg5_0_0_0#3[k#13]
+    }
+    awaitall
+    await send(arg4_0_0_0, __kernel_out_0[(i#3 - 1), (j#3 - 1)])
+    awaitall
+    await send(arg5_0_0_0, __kernel_out_1[(i#3 - 1), (j#3 - 1)])
+    awaitall
+  }
+  compute u16 i#3, u16 j#3 in [1:746:2 , 2:990:2] {
+    await receive(arg2_0_0_0, _arg2[i#3, j#3])
+    await receive(arg3_0_0_0, _arg3[i#3, j#3])
+    await receive(arg0_0_0_0, _arg0[i#3, j#3])
+    await receive(arg1_0_0_0, _arg1[i#3, j#3])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_arg1#18) {
+      _temp_0_0_0#11[k] = (arg1_0_0_0[k] + x)
+    }
+    completion _send_comp = send(arg1_0_0_0, _stream_arg1#19)
+    await _send_comp
+    await _recv_comp
+    await map i32 k#1 in [0:320:1] {
+      i16_0_0_0#6[k#1] = (_temp_0_0_0#11[k#1] * arg2_0_0_0[k#1])
+    }
+    completion _recv_comp#1 = foreach i32 k#2, f32 x#1 in [0:320:1], receive(_stream_arg0#11) {
+      i19_0_0_0#6[k#2] = (arg0_0_0_0[k#2] + x#1)
+    }
+    completion _send_comp#1 = send(arg0_0_0_0, _stream_arg0#10)
+    await _send_comp#1
+    await _recv_comp#1
+    await map i32 k#3 in [0:320:1] {
+      _temp_0_0_0#12[k#3] = (i19_0_0_0#6[k#3] - i16_0_0_0#6[k#3])
+    }
+    await map i32 k#4 in [0:320:1] {
+      i21_0_0_0#5[k#4] = (112.5 * _temp_0_0_0#12[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      arg4_0_0_0#3[k#5] = (arg3_0_0_0[k#5] * i21_0_0_0#5[k#5])
+    }
+    completion _recv_comp#2 = foreach i32 k#6, f32 x#2 in [0:320:1], receive(_stream_arg0#13) {
+      _temp_0_0_0#13[k#6] = (arg0_0_0_0[k#6] + x#2)
+    }
+    completion _send_comp#2 = send(arg0_0_0_0, _stream_arg0#12)
+    await _send_comp#2
+    await _recv_comp#2
+    await map i32 k#7 in [0:320:1] {
+      i16_0_0_0#7[k#7] = (_temp_0_0_0#13[k#7] * arg2_0_0_0[k#7])
+    }
+    completion _recv_comp#3 = foreach i32 k#8, f32 x#3 in [0:320:1], receive(_stream_arg1#20) {
+      i19_0_0_0#7[k#8] = (arg1_0_0_0[k#8] + x#3)
+    }
+    completion _send_comp#3 = send(arg1_0_0_0, _stream_arg1#21)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#9 in [0:320:1] {
+      _temp_0_0_0#10[k#9] = (i19_0_0_0#7[k#9] - i16_0_0_0#7[k#9])
+    }
+    await map i32 k#10 in [0:320:1] {
+      i21_0_0_0#4[k#10] = (112.5 * _temp_0_0_0#10[k#10])
+    }
+    await map i32 k#11 in [0:320:1] {
+      arg5_0_0_0#3[k#11] = (arg3_0_0_0[k#11] * i21_0_0_0#4[k#11])
+    }
+    await map i32 k#12 in [0:320:1] {
+      arg4_0_0_0[k#12] = arg4_0_0_0#3[k#12]
+    }
+    await map i32 k#13 in [0:320:1] {
+      arg5_0_0_0[k#13] = arg5_0_0_0#3[k#13]
+    }
+    awaitall
+    await send(arg4_0_0_0, __kernel_out_0[(i#3 - 1), (j#3 - 1)])
+    awaitall
+    await send(arg5_0_0_0, __kernel_out_1[(i#3 - 1), (j#3 - 1)])
+    awaitall
+  }
+  compute u16 i#3, u16 j#3 in [2:746:2 , 1:990:2] {
+    await receive(arg2_0_0_0, _arg2[i#3, j#3])
+    await receive(arg3_0_0_0, _arg3[i#3, j#3])
+    await receive(arg0_0_0_0, _arg0[i#3, j#3])
+    await receive(arg1_0_0_0, _arg1[i#3, j#3])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_arg1#19) {
+      _temp_0_0_0#11[k] = (arg1_0_0_0[k] + x)
+    }
+    completion _send_comp = send(arg1_0_0_0, _stream_arg1#18)
+    await _send_comp
+    await _recv_comp
+    await map i32 k#1 in [0:320:1] {
+      i16_0_0_0#6[k#1] = (_temp_0_0_0#11[k#1] * arg2_0_0_0[k#1])
+    }
+    completion _recv_comp#1 = foreach i32 k#2, f32 x#1 in [0:320:1], receive(_stream_arg0#10) {
+      i19_0_0_0#6[k#2] = (arg0_0_0_0[k#2] + x#1)
+    }
+    completion _send_comp#1 = send(arg0_0_0_0, _stream_arg0#11)
+    await _send_comp#1
+    await _recv_comp#1
+    await map i32 k#3 in [0:320:1] {
+      _temp_0_0_0#12[k#3] = (i19_0_0_0#6[k#3] - i16_0_0_0#6[k#3])
+    }
+    await map i32 k#4 in [0:320:1] {
+      i21_0_0_0#5[k#4] = (112.5 * _temp_0_0_0#12[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      arg4_0_0_0#3[k#5] = (arg3_0_0_0[k#5] * i21_0_0_0#5[k#5])
+    }
+    completion _recv_comp#2 = foreach i32 k#6, f32 x#2 in [0:320:1], receive(_stream_arg0#12) {
+      _temp_0_0_0#13[k#6] = (arg0_0_0_0[k#6] + x#2)
+    }
+    completion _send_comp#2 = send(arg0_0_0_0, _stream_arg0#13)
+    await _send_comp#2
+    await _recv_comp#2
+    await map i32 k#7 in [0:320:1] {
+      i16_0_0_0#7[k#7] = (_temp_0_0_0#13[k#7] * arg2_0_0_0[k#7])
+    }
+    completion _recv_comp#3 = foreach i32 k#8, f32 x#3 in [0:320:1], receive(_stream_arg1#21) {
+      i19_0_0_0#7[k#8] = (arg1_0_0_0[k#8] + x#3)
+    }
+    completion _send_comp#3 = send(arg1_0_0_0, _stream_arg1#20)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#9 in [0:320:1] {
+      _temp_0_0_0#10[k#9] = (i19_0_0_0#7[k#9] - i16_0_0_0#7[k#9])
+    }
+    await map i32 k#10 in [0:320:1] {
+      i21_0_0_0#4[k#10] = (112.5 * _temp_0_0_0#10[k#10])
+    }
+    await map i32 k#11 in [0:320:1] {
+      arg5_0_0_0#3[k#11] = (arg3_0_0_0[k#11] * i21_0_0_0#4[k#11])
+    }
+    await map i32 k#12 in [0:320:1] {
+      arg4_0_0_0[k#12] = arg4_0_0_0#3[k#12]
+    }
+    await map i32 k#13 in [0:320:1] {
+      arg5_0_0_0[k#13] = arg5_0_0_0#3[k#13]
+    }
+    awaitall
+    await send(arg4_0_0_0, __kernel_out_0[(i#3 - 1), (j#3 - 1)])
+    awaitall
+    await send(arg5_0_0_0, __kernel_out_1[(i#3 - 1), (j#3 - 1)])
+    awaitall
+  }
+  compute u16 i#3, u16 j#3 in [2:746:2 , 2:990:2] {
+    await receive(arg2_0_0_0, _arg2[i#3, j#3])
+    await receive(arg3_0_0_0, _arg3[i#3, j#3])
+    await receive(arg0_0_0_0, _arg0[i#3, j#3])
+    await receive(arg1_0_0_0, _arg1[i#3, j#3])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_arg1#19) {
+      _temp_0_0_0#11[k] = (arg1_0_0_0[k] + x)
+    }
+    completion _send_comp = send(arg1_0_0_0, _stream_arg1#18)
+    await _send_comp
+    await _recv_comp
+    await map i32 k#1 in [0:320:1] {
+      i16_0_0_0#6[k#1] = (_temp_0_0_0#11[k#1] * arg2_0_0_0[k#1])
+    }
+    completion _recv_comp#1 = foreach i32 k#2, f32 x#1 in [0:320:1], receive(_stream_arg0#11) {
+      i19_0_0_0#6[k#2] = (arg0_0_0_0[k#2] + x#1)
+    }
+    completion _send_comp#1 = send(arg0_0_0_0, _stream_arg0#10)
+    await _send_comp#1
+    await _recv_comp#1
+    await map i32 k#3 in [0:320:1] {
+      _temp_0_0_0#12[k#3] = (i19_0_0_0#6[k#3] - i16_0_0_0#6[k#3])
+    }
+    await map i32 k#4 in [0:320:1] {
+      i21_0_0_0#5[k#4] = (112.5 * _temp_0_0_0#12[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      arg4_0_0_0#3[k#5] = (arg3_0_0_0[k#5] * i21_0_0_0#5[k#5])
+    }
+    completion _recv_comp#2 = foreach i32 k#6, f32 x#2 in [0:320:1], receive(_stream_arg0#13) {
+      _temp_0_0_0#13[k#6] = (arg0_0_0_0[k#6] + x#2)
+    }
+    completion _send_comp#2 = send(arg0_0_0_0, _stream_arg0#12)
+    await _send_comp#2
+    await _recv_comp#2
+    await map i32 k#7 in [0:320:1] {
+      i16_0_0_0#7[k#7] = (_temp_0_0_0#13[k#7] * arg2_0_0_0[k#7])
+    }
+    completion _recv_comp#3 = foreach i32 k#8, f32 x#3 in [0:320:1], receive(_stream_arg1#21) {
+      i19_0_0_0#7[k#8] = (arg1_0_0_0[k#8] + x#3)
+    }
+    completion _send_comp#3 = send(arg1_0_0_0, _stream_arg1#20)
+    await _send_comp#3
+    await _recv_comp#3
+    await map i32 k#9 in [0:320:1] {
+      _temp_0_0_0#10[k#9] = (i19_0_0_0#7[k#9] - i16_0_0_0#7[k#9])
+    }
+    await map i32 k#10 in [0:320:1] {
+      i21_0_0_0#4[k#10] = (112.5 * _temp_0_0_0#10[k#10])
+    }
+    await map i32 k#11 in [0:320:1] {
+      arg5_0_0_0#3[k#11] = (arg3_0_0_0[k#11] * i21_0_0_0#4[k#11])
+    }
+    await map i32 k#12 in [0:320:1] {
+      arg4_0_0_0[k#12] = arg4_0_0_0#3[k#12]
+    }
+    await map i32 k#13 in [0:320:1] {
+      arg5_0_0_0[k#13] = arg5_0_0_0#3[k#13]
+    }
+    awaitall
+    await send(arg4_0_0_0, __kernel_out_0[(i#3 - 1), (j#3 - 1)])
+    awaitall
+    await send(arg5_0_0_0, __kernel_out_1[(i#3 - 1), (j#3 - 1)])
+    awaitall
+  }
+  compute u16 i#3, u16 j#3 in [746:747:2 , 1:990:2] {
+    await receive(arg2_0_0_0, _arg2[i#3, j#3])
+    await receive(arg3_0_0_0, _arg3[i#3, j#3])
+    await receive(arg0_0_0_0, _arg0[i#3, j#3])
+    await receive(arg1_0_0_0, _arg1[i#3, j#3])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_arg1#23) {
+      _temp_0_0_0#15[k] = (arg1_0_0_0[k] + x)
+    }
+    await _recv_comp
+    await map i32 k#1 in [0:320:1] {
+      i16_0_0_0#8[k#1] = (_temp_0_0_0#15[k#1] * arg2_0_0_0[k#1])
+    }
+    completion _recv_comp#1 = foreach i32 k#2, f32 x#1 in [0:320:1], receive(_stream_arg0#14) {
+      i19_0_0_0#8[k#2] = (arg0_0_0_0[k#2] + x#1)
+    }
+    completion _send_comp#1 = send(arg0_0_0_0, _stream_arg0#15)
+    await _send_comp#1
+    await _recv_comp#1
+    await map i32 k#3 in [0:320:1] {
+      _temp_0_0_0#16[k#3] = (i19_0_0_0#8[k#3] - i16_0_0_0#8[k#3])
+    }
+    await map i32 k#4 in [0:320:1] {
+      i21_0_0_0#7[k#4] = (112.5 * _temp_0_0_0#16[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      arg4_0_0_0#4[k#5] = (arg3_0_0_0[k#5] * i21_0_0_0#7[k#5])
+    }
+    completion _recv_comp#2 = foreach i32 k#6, f32 x#2 in [0:320:1], receive(_stream_arg0#16) {
+      _temp_0_0_0#17[k#6] = (arg0_0_0_0[k#6] + x#2)
+    }
+    completion _send_comp#2 = send(arg0_0_0_0, _stream_arg0#17)
+    await _send_comp#2
+    await _recv_comp#2
+    await map i32 k#7 in [0:320:1] {
+      i16_0_0_0#9[k#7] = (_temp_0_0_0#17[k#7] * arg2_0_0_0[k#7])
+    }
+    completion _recv_comp#3 = foreach i32 k#8, f32 x#3 in [0:320:1], receive(_stream_arg1#25) {
+      i19_0_0_0#9[k#8] = (arg1_0_0_0[k#8] + x#3)
+    }
+    await _recv_comp#3
+    await map i32 k#9 in [0:320:1] {
+      _temp_0_0_0#14[k#9] = (i19_0_0_0#9[k#9] - i16_0_0_0#9[k#9])
+    }
+    await map i32 k#10 in [0:320:1] {
+      i21_0_0_0#6[k#10] = (112.5 * _temp_0_0_0#14[k#10])
+    }
+    await map i32 k#11 in [0:320:1] {
+      arg5_0_0_0#4[k#11] = (arg3_0_0_0[k#11] * i21_0_0_0#6[k#11])
+    }
+    await map i32 k#12 in [0:320:1] {
+      arg4_0_0_0[k#12] = arg4_0_0_0#4[k#12]
+    }
+    await map i32 k#13 in [0:320:1] {
+      arg5_0_0_0[k#13] = arg5_0_0_0#4[k#13]
+    }
+    awaitall
+    await send(arg4_0_0_0, __kernel_out_0[(i#3 - 1), (j#3 - 1)])
+    awaitall
+    await send(arg5_0_0_0, __kernel_out_1[(i#3 - 1), (j#3 - 1)])
+    awaitall
+  }
+  compute u16 i#3, u16 j#3 in [746:747:2 , 2:990:2] {
+    await receive(arg2_0_0_0, _arg2[i#3, j#3])
+    await receive(arg3_0_0_0, _arg3[i#3, j#3])
+    await receive(arg0_0_0_0, _arg0[i#3, j#3])
+    await receive(arg1_0_0_0, _arg1[i#3, j#3])
+    awaitall
+    completion _recv_comp = foreach i32 k, f32 x in [0:320:1], receive(_stream_arg1#23) {
+      _temp_0_0_0#15[k] = (arg1_0_0_0[k] + x)
+    }
+    await _recv_comp
+    await map i32 k#1 in [0:320:1] {
+      i16_0_0_0#8[k#1] = (_temp_0_0_0#15[k#1] * arg2_0_0_0[k#1])
+    }
+    completion _recv_comp#1 = foreach i32 k#2, f32 x#1 in [0:320:1], receive(_stream_arg0#15) {
+      i19_0_0_0#8[k#2] = (arg0_0_0_0[k#2] + x#1)
+    }
+    completion _send_comp#1 = send(arg0_0_0_0, _stream_arg0#14)
+    await _send_comp#1
+    await _recv_comp#1
+    await map i32 k#3 in [0:320:1] {
+      _temp_0_0_0#16[k#3] = (i19_0_0_0#8[k#3] - i16_0_0_0#8[k#3])
+    }
+    await map i32 k#4 in [0:320:1] {
+      i21_0_0_0#7[k#4] = (112.5 * _temp_0_0_0#16[k#4])
+    }
+    await map i32 k#5 in [0:320:1] {
+      arg4_0_0_0#4[k#5] = (arg3_0_0_0[k#5] * i21_0_0_0#7[k#5])
+    }
+    completion _recv_comp#2 = foreach i32 k#6, f32 x#2 in [0:320:1], receive(_stream_arg0#17) {
+      _temp_0_0_0#17[k#6] = (arg0_0_0_0[k#6] + x#2)
+    }
+    completion _send_comp#2 = send(arg0_0_0_0, _stream_arg0#16)
+    await _send_comp#2
+    await _recv_comp#2
+    await map i32 k#7 in [0:320:1] {
+      i16_0_0_0#9[k#7] = (_temp_0_0_0#17[k#7] * arg2_0_0_0[k#7])
+    }
+    completion _recv_comp#3 = foreach i32 k#8, f32 x#3 in [0:320:1], receive(_stream_arg1#25) {
+      i19_0_0_0#9[k#8] = (arg1_0_0_0[k#8] + x#3)
+    }
+    await _recv_comp#3
+    await map i32 k#9 in [0:320:1] {
+      _temp_0_0_0#14[k#9] = (i19_0_0_0#9[k#9] - i16_0_0_0#9[k#9])
+    }
+    await map i32 k#10 in [0:320:1] {
+      i21_0_0_0#6[k#10] = (112.5 * _temp_0_0_0#14[k#10])
+    }
+    await map i32 k#11 in [0:320:1] {
+      arg5_0_0_0#4[k#11] = (arg3_0_0_0[k#11] * i21_0_0_0#6[k#11])
+    }
+    await map i32 k#12 in [0:320:1] {
+      arg4_0_0_0[k#12] = arg4_0_0_0#4[k#12]
+    }
+    await map i32 k#13 in [0:320:1] {
+      arg5_0_0_0[k#13] = arg5_0_0_0#4[k#13]
+    }
+    awaitall
+    await send(arg4_0_0_0, __kernel_out_0[(i#3 - 1), (j#3 - 1)])
+    awaitall
+    await send(arg5_0_0_0, __kernel_out_1[(i#3 - 1), (j#3 - 1)])
+    awaitall
+  }
+  compute u16 i#4, u16 j#4 in [1:747:2 , 0:1:2] {
+    await receive(arg0_0_0_0, _arg0[i#4, j#4])
+    awaitall
+    completion _send_comp#1 = send(arg0_0_0_0, _stream_arg0#18)
+    await _send_comp#1
+    completion _send_comp#2 = send(arg0_0_0_0, _stream_arg0#20)
+    await _send_comp#2
+    awaitall
+  }
+  compute u16 i#4, u16 j#4 in [2:747:2 , 0:1:2] {
+    await receive(arg0_0_0_0, _arg0[i#4, j#4])
+    awaitall
+    completion _send_comp#1 = send(arg0_0_0_0, _stream_arg0#18)
+    await _send_comp#1
+    completion _send_comp#2 = send(arg0_0_0_0, _stream_arg0#20)
+    await _send_comp#2
+    awaitall
+  }
+  compute u16 i, u16 j in [747:748:2 , 990:991:2] {
+
+  }
+  compute u16 i, u16 j in [0:1:2 , 991:992:2] {
+
+  }
+  compute u16 i, u16 j in [747:748:2 , 991:992:2] {
+
+  }
+  compute u16 i, u16 j in [1:747:2 , 991:992:2] {
+
+  }
+  compute u16 i, u16 j in [2:747:2 , 991:992:2] {
+
+  }
+  compute u16 i, u16 j in [0:1:2 , 0:1:2] {
+
+  }
+  compute u16 i, u16 j in [747:748:2 , 0:1:2] {
+
+  }
+  compute u16 i, u16 j in [747:748:2 , 1:990:2] {
+
+  }
+  compute u16 i, u16 j in [747:748:2 , 2:990:2] {
+
+  }
+}
\ No newline at end of file
diff --git a/scripts/examples.py b/scripts/examples.py
index 195265d2..cfc8bd6c 100644
--- a/scripts/examples.py
+++ b/scripts/examples.py
@@ -1,8 +1,8 @@
 import numpy as np
 import igraph as ig
-from spatialstencil.placement.domain import FieldDomain
-from spatialstencil.placement.stencil import Stencil, StencilDirection
-from spatialstencil.placement.graph import StencilGraph
+from spada.placement.domain import FieldDomain
+from spada.placement.stencil import Stencil, StencilDirection
+from spada.placement.graph import StencilGraph
 
 def horizontal_diffusion():
     """
diff --git a/scripts/generate_benchmarks.sh b/scripts/generate_benchmarks.sh
index 979a51fb..df6c6291 100755
--- a/scripts/generate_benchmarks.sh
+++ b/scripts/generate_benchmarks.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
-#python ./spatialstencil/cli/gt4py_to_spatial.py ./samples/gt4py_test_instances.py 4,4,4 ./samples/tests
-python ./spatialstencil/cli/gt4py_to_spatial.py ./samples/stencils.py 4,4,4 ./samples/benchmarks
-python ./spatialstencil/cli/gt4py_to_spatial.py ./samples/stencils.py 16,16,4 ./samples/benchmarks
-python ./spatialstencil/cli/gt4py_to_spatial.py ./samples/stencils.py 128,128,80 ./samples/benchmarks
-python ./spatialstencil/cli/gt4py_to_spatial.py ./samples/stencils.py 512,512,80 ./samples/benchmarks
\ No newline at end of file
+#python ./spada/cli/gt4py_to_spatial.py ./samples/gt4py_test_instances.py 4,4,4 ./samples/tests
+python ./spada/cli/gt4py_to_spatial.py ./samples/stencils.py 4,4,4 ./samples/benchmarks
+python ./spada/cli/gt4py_to_spatial.py ./samples/stencils.py 16,16,4 ./samples/benchmarks
+python ./spada/cli/gt4py_to_spatial.py ./samples/stencils.py 128,128,80 ./samples/benchmarks
+python ./spada/cli/gt4py_to_spatial.py ./samples/stencils.py 512,512,80 ./samples/benchmarks
\ No newline at end of file
diff --git a/scripts/placement_demo.py b/scripts/placement_demo.py
index 44a99f84..5a334422 100644
--- a/scripts/placement_demo.py
+++ b/scripts/placement_demo.py
@@ -5,12 +5,12 @@
 from numpy.typing import NDArray
 
 from scripts import examples
-from spatialstencil.placement.graph import Stencil, StencilDirection, FieldDomain, StencilGraph
-from spatialstencil.placement.placed_graph import PlacedStencilGraph
-from spatialstencil.placement.mla import linearize_with_ck
-from spatialstencil.placement.model import CostModel, PlacementCost
-from spatialstencil.placement.optimizer import best_of_k_placement
-from spatialstencil.placement.partition import FieldPartition
+from spada.placement.graph import Stencil, StencilDirection, FieldDomain, StencilGraph
+from spada.placement.placed_graph import PlacedStencilGraph
+from spada.placement.mla import linearize_with_ck
+from spada.placement.model import CostModel, PlacementCost
+from spada.placement.optimizer import best_of_k_placement
+from spada.placement.partition import FieldPartition
 
 
 def demo_graph():
diff --git a/setup.py b/setup.py
index 7953d2cb..dc897eeb 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Setup script for spatialstencil package."""
+"""Setup script for spada package."""
 
 from setuptools import setup, find_packages
 import os
@@ -11,28 +11,28 @@ def read_readme():
     if os.path.exists(readme_path):
         with open(readme_path, 'r', encoding='utf-8') as f:
             return f.read()
-    return "A spatial stencil compiler for high-performance computing."
+    return "A SpaDA compiler for high-performance computing."
 
 
 # Read version from package
 def get_version():
-    """Get version from spatialstencil package."""
+    """Get version from spada package."""
     try:
-        import spatialstencil
-        return spatialstencil.__version__
+        import spada
+        return spada.__version__
     except (ImportError, AttributeError):
         return "0.1.0"
 
 
 setup(
-    name="spatialstencil",
+    name="spada",
     version=get_version(),
-    author="SpatialStencil Team",
+    author="SpaDA Team",
     author_email="",
-    description="A spatial stencil compiler for high-performance computing",
+    description="A SpaDA compiler for high-performance computing",
     long_description=read_readme(),
     long_description_content_type="text/markdown",
-    url="https://github.com/glukas/spatialstencil",
+    url="https://github.com/glukas/spada",
     packages=find_packages(),
     classifiers=[
         "Development Status :: 3 - Alpha",
@@ -86,11 +86,11 @@ def get_version():
         ],
     },
     entry_points={
-        "console_scripts": ["sptlc=spatialstencil.cli.compiler:compile_spatial_ir"],
+        "console_scripts": ["sptlc=spada.cli.compiler:compile_spatial_ir"],
     },
     include_package_data=True,
     package_data={
-        "spatialstencil": ["**/*.py", "assets/csl/sync/*.csl"],
+        "spada": ["**/*.py", "assets/csl/sync/*.csl"],
     },
     keywords=[
         "stencil",
diff --git a/spatialstencil/__init__.py b/spada/__init__.py
similarity index 100%
rename from spatialstencil/__init__.py
rename to spada/__init__.py
diff --git a/spatialstencil/cli/__init__.py b/spada/cli/__init__.py
similarity index 100%
rename from spatialstencil/cli/__init__.py
rename to spada/cli/__init__.py
diff --git a/spatialstencil/cli/compiler.py b/spada/cli/compiler.py
similarity index 93%
rename from spatialstencil/cli/compiler.py
rename to spada/cli/compiler.py
index 24c851b0..7bb19a37 100644
--- a/spatialstencil/cli/compiler.py
+++ b/spada/cli/compiler.py
@@ -1,10 +1,10 @@
 import click
 import itertools
 import os
-from spatialstencil.lowering import spatial_ir_to_csl as s2c
-from spatialstencil.syntax.spatial_ir import parser, passes, analysis, irnodes as spa, canonicalization
-from spatialstencil.syntax.csl import constants as csl
-from spatialstencil.syntax.common import serialization
+from spada.lowering import spatial_ir_to_csl as s2c
+from spada.syntax.spatial_ir import parser, passes, analysis, irnodes as spa, canonicalization
+from spada.syntax.csl import constants as csl
+from spada.syntax.common import serialization
 import subprocess
 
 
@@ -16,7 +16,6 @@
 @click.option('--offset-y', '-y', default=0, type=int, help='Offset for rectangular region in y direction')
 @click.option('--generate-only', '-g', is_flag=True, help='Only generate the output files without compiling them')
 @click.option('--disable-benchmarking', is_flag=True, help='Disable benchmarking code generation (and memory overhead)')
-@click.option('--sync-benchmarking', is_flag=True, help='Generate sync-assisted benchmarking support')
 @click.option('--disable-asynchronous', is_flag=True, help='Disable asynchronous task code generation')
 @click.option('--disable-dsd', is_flag=True, help='Disable DSD operation detection and code generation')
 @click.option('--disable-map', is_flag=True, help='Disable @map operation detection and code generation')
@@ -24,12 +23,9 @@
 @click.option('--disable-task-recycling', is_flag=True, help='Disable task ID recycling')
 @click.option('--disable-copy-elision', is_flag=True, help='Disable copy elimination optimization pass')
 def compile_spatial_ir(input_file: str, output_folder: str, param: list[str], offset_x: int, offset_y: int,
-                       generate_only: bool, disable_benchmarking: bool, sync_benchmarking: bool,
+                       generate_only: bool, disable_benchmarking: bool,
                        disable_asynchronous: bool, disable_dsd: bool, disable_map: bool,
                        disable_task_fusion: bool, disable_task_recycling: bool, disable_copy_elision: bool):
-    if disable_benchmarking and sync_benchmarking:
-        raise click.UsageError("--sync-benchmarking cannot be used with --disable-benchmarking")
-
     # Parse parameters into dictionary
     kernel_parameters = {}
     for p in param:
@@ -87,14 +83,13 @@ def compile_spatial_ir(input_file: str, output_folder: str, param: list[str], of
         using_memcpy_mode = False
 
     if disable_map:
-        from spatialstencil.syntax.csl import statements
+        from spada.syntax.csl import statements
         statements.DISABLE_MAPS = True
 
     # Lower the spatial IR to CSL
     csl_files = s2c.lower_spatial_ir_to_csl(
         kernel,
         disable_benchmarking=disable_benchmarking,
-        sync_benchmarking=sync_benchmarking,
         disable_asynchronous=disable_asynchronous,
         disable_dsd=disable_dsd,
         task_fusion=not disable_task_fusion,
diff --git a/spatialstencil/cli/count_flop.py b/spada/cli/count_flop.py
similarity index 96%
rename from spatialstencil/cli/count_flop.py
rename to spada/cli/count_flop.py
index 654c2f1d..eda576a4 100644
--- a/spatialstencil/cli/count_flop.py
+++ b/spada/cli/count_flop.py
@@ -1,8 +1,8 @@
 import sys
 import os
 from pathlib import Path
-from spatialstencil.syntax.stencil_ir.parser import Parser
-from spatialstencil.syntax.stencil_ir.flop_counter import FLOPCounter
+from spada.syntax.stencil_ir.parser import Parser
+from spada.syntax.stencil_ir.flop_counter import FLOPCounter
 
 
 def find_spst_files(directory: str) -> list[Path]:
@@ -60,7 +60,7 @@ def analyze_file(filepath: Path, parser: Parser, counter: FLOPCounter) -> tuple[
 def print_header():
     """Print a nice header for the analysis."""
     print("=" * 80)
-    print(" " * 20 + "FLOP Analysis for Spatial Stencil Programs")
+    print(" " * 20 + "FLOP Analysis for SpaDA Programs")
     print("=" * 80)
     print()
 
diff --git a/spatialstencil/cli/gt4py_to_spatial.py b/spada/cli/gt4py_to_spatial.py
similarity index 95%
rename from spatialstencil/cli/gt4py_to_spatial.py
rename to spada/cli/gt4py_to_spatial.py
index 4a7352d7..36279d53 100644
--- a/spatialstencil/cli/gt4py_to_spatial.py
+++ b/spada/cli/gt4py_to_spatial.py
@@ -3,10 +3,10 @@
 import sys
 from pathlib import Path
 import traceback
-from spatialstencil.syntax.gt4py import parser
-from spatialstencil.lowering import gt4py_to_stencil_ir
-from spatialstencil.lowering.stencil_to_spatial import lower_stencil_to_spatial
-from spatialstencil.syntax.stencil_ir import type_inference
+from spada.syntax.gt4py import parser
+from spada.lowering import gt4py_to_stencil_ir
+from spada.lowering.stencil_to_spatial import lower_stencil_to_spatial
+from spada.syntax.stencil_ir import type_inference
 
 def parse_domain_size(domain_str):
     """Parse domain_size string in format 'x,y,z' and return tuple of ints."""
diff --git a/spatialstencil/lowering/__init__.py b/spada/lowering/__init__.py
similarity index 100%
rename from spatialstencil/lowering/__init__.py
rename to spada/lowering/__init__.py
diff --git a/spatialstencil/lowering/gt4py_to_stencil_ir.py b/spada/lowering/gt4py_to_stencil_ir.py
similarity index 98%
rename from spatialstencil/lowering/gt4py_to_stencil_ir.py
rename to spada/lowering/gt4py_to_stencil_ir.py
index 184eb94e..c1a8c5d9 100644
--- a/spatialstencil/lowering/gt4py_to_stencil_ir.py
+++ b/spada/lowering/gt4py_to_stencil_ir.py
@@ -1,10 +1,10 @@
 import ast
 from collections import defaultdict
 import copy
-from spatialstencil.syntax.gt4py import astnodes as gtast
-from spatialstencil.syntax.common.find_and_replace import PyASTFindReplace
-from spatialstencil.syntax.stencil_ir import irnodes as sast, type_inference
-from spatialstencil.syntax.stencil_ir.ssa import SSAVisitor
+from spada.syntax.gt4py import astnodes as gtast
+from spada.syntax.common.find_and_replace import PyASTFindReplace
+from spada.syntax.stencil_ir import irnodes as sast, type_inference
+from spada.syntax.stencil_ir.ssa import SSAVisitor
 
 
 def lower_gt4py_to_stencil_ir(program: gtast.GTProgram,
@@ -532,10 +532,10 @@ def visit_Call(self, node: ast.Call) -> None:
 
 if __name__ == '__main__':
     import sys
-    from spatialstencil.syntax.gt4py import parser
+    from spada.syntax.gt4py import parser
 
     if len(sys.argv) not in (2, 3):
-        print('USAGE: python -m spatialstencil.lowering.gt4py_to_stencil_ir <PYTHON FILE> [FUNCTION NAME]')
+        print('USAGE: python -m spada.lowering.gt4py_to_stencil_ir <PYTHON FILE> [FUNCTION NAME]')
         exit(1)
 
     out = parser.parse_file(sys.argv[1])
diff --git a/spatialstencil/lowering/spatial_ir_to_csl.py b/spada/lowering/spatial_ir_to_csl.py
similarity index 89%
rename from spatialstencil/lowering/spatial_ir_to_csl.py
rename to spada/lowering/spatial_ir_to_csl.py
index 1cc7a194..3c36712c 100644
--- a/spatialstencil/lowering/spatial_ir_to_csl.py
+++ b/spada/lowering/spatial_ir_to_csl.py
@@ -3,22 +3,21 @@
 """
 
 from collections import defaultdict
-from contextlib import nullcontext
 import copy
 import functools
 from io import StringIO
 import textwrap
-from spatialstencil.syntax.common.types import BIT_WIDTH
-from spatialstencil.syntax.spatial_ir import irnodes as spir, canonicalization, analysis, passes
-from spatialstencil.syntax.spatial_ir import copy_elimination
-from spatialstencil.syntax.spatial_ir import canonical_subgrids
-from spatialstencil.syntax.spatial_ir.canonicalization import PEBlock, Rectangle
-from spatialstencil.syntax.csl import constants as csl, preprocessing, tasks as tdag, statements as cslstmt, dsd_ops
-from spatialstencil.syntax.csl import benchmarking as cslbench
-from spatialstencil.syntax.csl import structures as cslstruct
-from spatialstencil.syntax.csl import task_recycling, prune_unused_fields as csl_pruning
-from spatialstencil.syntax.csl.codefile import CodeFile
-from spatialstencil.syntax.csl.statements import name_to_csl, dtype_as_csl, expr_to_csl
+from spada.syntax.common.types import BIT_WIDTH
+from spada.syntax.spatial_ir import irnodes as spir, canonicalization, analysis, passes
+from spada.syntax.spatial_ir import copy_elimination
+from spada.syntax.spatial_ir import canonical_subgrids
+from spada.syntax.spatial_ir.canonicalization import PEBlock, Rectangle
+from spada.syntax.csl import constants as csl, preprocessing, tasks as tdag, statements as cslstmt, dsd_ops
+from spada.syntax.csl import benchmarking as cslbench
+from spada.syntax.csl import structures as cslstruct
+from spada.syntax.csl import task_recycling, prune_unused_fields as csl_pruning
+from spada.syntax.csl.codefile import CodeFile
+from spada.syntax.csl.statements import name_to_csl, dtype_as_csl, expr_to_csl
 
 UniqueDSDDict = dict[str, list[tuple[str, cslstruct.DataStructureDescriptor]]]
 
@@ -34,7 +33,7 @@ def canonicalize_kernel(kernel: spir.Kernel) -> spir.Kernel:
 
     :param kernel: A fully concretized Spatial IR kernel.
     :return: The transformed kernel, ready for
-             :func:`~spatialstencil.syntax.spatial_ir.canonicalization.consolidate_rectangles_to_equivalence_classes`.
+             :func:`~spada.syntax.spatial_ir.canonicalization.consolidate_rectangles_to_equivalence_classes`.
     """
     kernel = canonicalization.inline_metaprogramming(kernel)
     kernel = canonicalization.canonicalize_phases(kernel)
@@ -48,7 +47,6 @@ def canonicalize_kernel(kernel: spir.Kernel) -> spir.Kernel:
 def lower_spatial_ir_to_csl(kernel: spir.Kernel,
                             rect_offset: tuple[int, int] = (0, 0),
                             disable_benchmarking: bool = False,
-                            sync_benchmarking: bool = False,
                             disable_asynchronous: bool = False,
                             disable_dsd: bool = False,
                             task_fusion: bool = True,
@@ -62,7 +60,6 @@ def lower_spatial_ir_to_csl(kernel: spir.Kernel,
     :param rect_offset: The offset of the output rectangle to use.
     :param disable_benchmarking: If True, disables benchmarking code generation (and memory overhead).
                                  Use in memory-limited scenarios.
-    :param sync_benchmarking: If True, generate sync-assisted benchmarking support for more accurate cycle counts.
     :param disable_asynchronous: If True, disables asynchronous task code generation.
     :param disable_dsd: If True, disables DSD operation detection and code generation.
     :param task_fusion: If True, enables task fusion to reduce number of tasks.
@@ -86,9 +83,6 @@ def lower_spatial_ir_to_csl(kernel: spir.Kernel,
     # Run the shared canonicalization pipeline
     kernel = canonicalize_kernel(kernel)
 
-    if disable_benchmarking and sync_benchmarking:
-        raise ValueError("Sync benchmarking requires benchmarking support to be enabled.")
-
     # Check if we are streaming or using memcpy mode
     use_memcpy_mode = analysis.kernel_uses_memcpy_mode(kernel)
 
@@ -120,7 +114,7 @@ def lower_spatial_ir_to_csl(kernel: spir.Kernel,
 
     # Add benchmarking fields
     if not disable_benchmarking:
-        _add_benchmarking_fields(rectangles, sync_benchmarking)
+        _add_benchmarking_fields(rectangles)
 
     # Collect scalar argument types
     scalar_argument_types = []
@@ -135,170 +129,158 @@ def lower_spatial_ir_to_csl(kernel: spir.Kernel,
     routing_instructions: list[str] = []
     color_maps = []
 
-    resource_context = cslbench.reserve_codegen_resources(csl) if sync_benchmarking and not disable_benchmarking else nullcontext(None)
-    with resource_context as sync_resources:
-        channel_to_color = _collect_colors_globally(kernel, rectangles, use_memcpy_mode)
-
-        for rect in rectangles:
-            # Create a unique CSL code file based on rectangle offset
-            csl_name = f'code_{rect.x_range[0]}_{rect.y_range[0]}.csl'
-            rect_code, color_map = generate_rectangle(kernel, rect, routing_instructions, scalar_arguments, use_memcpy_mode,
-                                                    stream_rects, channel_to_color, disable_benchmarking, sync_benchmarking,
-                                                    disable_asynchronous, disable_dsd, task_fusion,
-                                                    task_id_recycling)
-            color_maps.append(color_map)
-            csl_codes.append(CodeFile(csl_name, rect_code))
-
-        # Prepare outputs
-        layout_code = StringIO()
-
-        if sync_resources is not None:
-            csl_codes.extend(cslbench.load_sync_assets())
+    channel_to_color = _collect_colors_globally(kernel, rectangles, use_memcpy_mode)
 
-        ###############################################
-        # Generate main layout file
-
-        # Compute the tight PE bounding box. kernel.get_grid_rect() now returns tight bounds
-        # (last-contained PE + 1) rather than canonicalized stops.
-        x0, x1, y0, y1 = kernel.get_grid_rect()
-        assert x0 == 0, "PE Grid must start at x=0"
-        assert y0 == 0, "PE Grid must start at y=0"
-        rect_size = x1 - x0, y1 - y0
-
-        # Collect unique routes for all rectangles
-        routes_per_rectangle = _collect_routes(rectangles, color_maps)
+    for rect in rectangles:
+        # Create a unique CSL code file based on rectangle offset
+        csl_name = f'code_{rect.x_range[0]}_{rect.y_range[0]}.csl'
+        rect_code, color_map = generate_rectangle(kernel, rect, routing_instructions, scalar_arguments, use_memcpy_mode,
+                                                  stream_rects, channel_to_color, disable_benchmarking,
+                                                  disable_asynchronous, disable_dsd, task_fusion,
+                                                  task_id_recycling)
+        color_maps.append(color_map)
+        csl_codes.append(CodeFile(csl_name, rect_code))
+
+    # Prepare outputs
+    layout_code = StringIO()
+
+    ###############################################
+    # Generate main layout file
+
+    # Compute the tight PE bounding box. kernel.get_grid_rect() now returns tight bounds
+    # (last-contained PE + 1) rather than canonicalized stops.
+    x0, x1, y0, y1 = kernel.get_grid_rect()
+    assert x0 == 0, "PE Grid must start at x=0"
+    assert y0 == 0, "PE Grid must start at y=0"
+    rect_size = x1 - x0, y1 - y0
+
+    # Collect unique routes for all rectangles
+    routes_per_rectangle = _collect_routes(rectangles, color_maps)
 
-        if use_memcpy_mode:
-            layout_code.write(f'''
+    if use_memcpy_mode:
+        layout_code.write(f'''
 // Memcpy setup
 const memcpy = @import_module("<memcpy/get_params>", .{{
 .width = {rect_size[0]},
 .height = {rect_size[1]},
 }});
 ''')
-        else:
-            input_args = []
-            output_args = []
-            for arg in kernel.arguments:
-                if arg.compiletime:
-                    continue
-                if arg.readonly:
-                    input_args.append(arg)
-                elif arg.writeonly:
-                    output_args.append(arg)
-                else:
-                    input_args.append(arg)
-                    output_args.append(arg)
+    else:
+        input_args = []
+        output_args = []
+        for arg in kernel.arguments:
+            if arg.compiletime:
+                continue
+            if arg.readonly:
+                input_args.append(arg)
+            elif arg.writeonly:
+                output_args.append(arg)
+            else:
+                input_args.append(arg)
+                output_args.append(arg)
 
-            # Only up to 4 streams in each direction are supported (4 input, 4 output streams)
-            if len(input_args) > 4 or len(output_args) > 4:
-                raise ValueError('Too many input/output streams: only 4 input and 4 output streams are supported in CSL')
+        # Only up to 4 streams in each direction are supported (4 input, 4 output streams)
+        if len(input_args) > 4 or len(output_args) > 4:
+            raise ValueError('Too many input/output streams: only 4 input and 4 output streams are supported in CSL')
 
-            # Generate streaming DATA_*_ID parameters for each input/output stream
-            layout_code.write('// Streaming copy setup\n')
-            for i, input_arg in enumerate(input_args):
-                layout_code.write(f'''param MEMCPYH2D_DATA_{i}_ID: i16;
+        # Generate streaming DATA_*_ID parameters for each input/output stream
+        layout_code.write('// Streaming copy setup\n')
+        for i, input_arg in enumerate(input_args):
+            layout_code.write(f'''param MEMCPYH2D_DATA_{i}_ID: i16;
 const MEMCPYH2D_DATA_{i}: color = @get_color(MEMCPYH2D_DATA_{i}_ID);
 ''')
-            for i, output_arg in enumerate(output_args):
-                layout_code.write(f'''param MEMCPYD2H_DATA_{i}_ID: i16;
+        for i, output_arg in enumerate(output_args):
+            layout_code.write(f'''param MEMCPYD2H_DATA_{i}_ID: i16;
 const MEMCPYD2H_DATA_{i}: color = @get_color(MEMCPYD2H_DATA_{i}_ID);
 ''')
 
-            layout_code.write(f'''
+        layout_code.write(f'''
 const memcpy = @import_module("<memcpy/get_params>", .{{
      .width = width,
      .height = height,
 ''')
-            for i, input_arg in enumerate(input_args):
-                layout_code.write(f'''    .MEMCPYH2D_{i} = MEMCPYH2D_DATA_{i}_ID,
+        for i, input_arg in enumerate(input_args):
+            layout_code.write(f'''    .MEMCPYH2D_{i} = MEMCPYH2D_DATA_{i}_ID,
 ''')
-            for i, output_arg in enumerate(output_args):
-                layout_code.write(f'''    .MEMCPYD2H_{i} = MEMCPYD2H_DATA_{i}_ID,
+        for i, output_arg in enumerate(output_args):
+            layout_code.write(f'''    .MEMCPYD2H_{i} = MEMCPYD2H_DATA_{i}_ID,
 ''')
-            layout_code.write(f'''
+        layout_code.write(f'''
 }});
 ''')
 
-        if sync_resources is not None:
-            layout_code.write(cslbench.generate_sync_layout_setup(rect_size[0], rect_size[1], sync_resources))
-
-        layout_code.write(f'''layout {{
+    layout_code.write(f'''layout {{
     // Rectangle and code setup
     @set_rectangle{rect_size};''')
 
-        # First pass: @set_tile_code for every PE.
-        # All tile codes must be established before any @set_color_config call,
-        # because multi-hop routing config may reference neighboring PEs that
-        # belong to a different rectangle (e.g. pass-through relays).
-        sync_tile_binding = cslbench.generate_sync_tile_binding() if sync_resources is not None else ''
-        for rect in rectangles:
-            xb_pre, xe_pre, xs, yb_pre, ye_pre, ys = *rect.x_range, *rect.y_range
-            code_filename = f'code_{xb_pre}_{yb_pre}.csl'
-            xb = xb_pre + rect_offset[0]
-            xe = xe_pre + rect_offset[0]
-            yb = yb_pre + rect_offset[1]
-            ye = ye_pre + rect_offset[1]
+    # First pass: @set_tile_code for every PE.
+    # All tile codes must be established before any @set_color_config call,
+    # because multi-hop routing config may reference neighboring PEs that
+    # belong to a different rectangle (e.g. pass-through relays).
+    for rect in rectangles:
+        xb_pre, xe_pre, xs, yb_pre, ye_pre, ys = *rect.x_range, *rect.y_range
+        code_filename = f'code_{xb_pre}_{yb_pre}.csl'
+        xb = xb_pre + rect_offset[0]
+        xe = xe_pre + rect_offset[0]
+        yb = yb_pre + rect_offset[1]
+        ye = ye_pre + rect_offset[1]
 
-            layout_code.write(f'''
+        layout_code.write(f'''
     for (@range(i16, {xb}, {xe}, {xs})) |pe_x| {{
         for (@range(i16, {yb}, {ye}, {ys})) |pe_y| {{
-            @set_tile_code(pe_x, pe_y, "{code_filename}", .{{ .memcpy_params = memcpy.get_params(pe_x), {sync_tile_binding}}});
+            @set_tile_code(pe_x, pe_y, "{code_filename}", .{{ .memcpy_params = memcpy.get_params(pe_x), }});
         }}
     }}\n''')
 
-        # Second pass: routing (@set_color_config).  By emitting these after all
-        # @set_tile_code calls, every PE referenced by a multi-hop offset is
-        # guaranteed to already have tile code assigned.
-        layout_code.write('\n    // Routes\n')
-        for rect in rectangles:
-            xb_pre, xe_pre, xs, yb_pre, ye_pre, ys = *rect.x_range, *rect.y_range
-            xb = xb_pre + rect_offset[0]
-            xe = xe_pre + rect_offset[0]
-            yb = yb_pre + rect_offset[1]
-            ye = ye_pre + rect_offset[1]
-            route_code = routes_per_rectangle.get((xb_pre, yb_pre), '')
-            if route_code.strip():
-                layout_code.write(f'''
+    # Second pass: routing (@set_color_config).  By emitting these after all
+    # @set_tile_code calls, every PE referenced by a multi-hop offset is
+    # guaranteed to already have tile code assigned.
+    layout_code.write('\n    // Routes\n')
+    for rect in rectangles:
+        xb_pre, xe_pre, xs, yb_pre, ye_pre, ys = *rect.x_range, *rect.y_range
+        xb = xb_pre + rect_offset[0]
+        xe = xe_pre + rect_offset[0]
+        yb = yb_pre + rect_offset[1]
+        ye = ye_pre + rect_offset[1]
+        route_code = routes_per_rectangle.get((xb_pre, yb_pre), '')
+        if route_code.strip():
+            layout_code.write(f'''
     for (@range(i16, {xb}, {xe}, {xs})) |pe_x| {{
         for (@range(i16, {yb}, {ye}, {ys})) |pe_y| {{
 {route_code}
         }}
     }}\n''')
 
-        for rinst in routing_instructions:
-            layout_code.write(rinst + '\n')
-
-        # Emit symbol names for arguments and kernel
-        layout_code.write('\n    // Extern fields\n')
-        # Gather extern fields from kernel arguments
-        extern_fields: list[spir.FieldDeclaration] = []
-        for rect in rectangles:
-            place_block = rect.metadata.place
-            for field in place_block.statements:
-                if field.is_extern:
-                    if any(field.field_name == ef.field_name for ef in extern_fields):
-                        continue
-                    extern_fields.append(field)
+    for rinst in routing_instructions:
+        layout_code.write(rinst + '\n')
 
-        for field in extern_fields:
-            dtype = field.dtype
-            if isinstance(field.dtype, spir.ArrayType) and isinstance(field.dtype.base_type, spir.StreamType):
-                pass
-            elif isinstance(field.dtype, spir.StreamType):
-                # Support scalar streams
-                dtype = spir.ArrayType(field.dtype, [1])
+    # Emit symbol names for arguments and kernel
+    layout_code.write('\n    // Extern fields\n')
+    # Gather extern fields from kernel arguments
+    extern_fields: list[spir.FieldDeclaration] = []
+    for rect in rectangles:
+        place_block = rect.metadata.place
+        for field in place_block.statements:
+            if field.is_extern:
+                if any(field.field_name == ef.field_name for ef in extern_fields):
+                    continue
+                extern_fields.append(field)
 
-            layout_code.write(f'    @export_name("{field.field_name.name}", {dtype_as_csl(dtype, export=True)}, true);\n')
+    for field in extern_fields:
+        dtype = field.dtype
+        if isinstance(field.dtype, spir.ArrayType) and isinstance(field.dtype.base_type, spir.StreamType):
+            pass
+        elif isinstance(field.dtype, spir.StreamType):
+            # Support scalar streams
+            dtype = spir.ArrayType(field.dtype, [1])
 
-        if sync_resources is not None:
-            layout_code.write(cslbench.generate_sync_layout_exports())
+        layout_code.write(f'    @export_name("{field.field_name.name}", {dtype_as_csl(dtype, export=True)}, true);\n')
 
-        layout_code.write(f'''
+    layout_code.write(f'''
     // Kernel
     @export_name("{kernel.name}", fn({", ".join(scalar_argument_types)})void);
 }}''')
-        csl_codes.append(CodeFile('layout.csl', layout_code.getvalue()))
+    csl_codes.append(CodeFile('layout.csl', layout_code.getvalue()))
 
     # Return all generated code files
     return csl_codes
@@ -312,7 +294,6 @@ def generate_rectangle(kernel: spir.Kernel,
                        stream_extents: analysis.StreamExtents,
                        channel_to_color: dict[int, int],
                        disable_benchmarking: bool = False,
-                       sync_benchmarking: bool = False,
                        disable_asynchronous: bool = False,
                        disable_dsd: bool = False,
                        task_fusion: bool = True,
@@ -351,7 +332,7 @@ def generate_rectangle(kernel: spir.Kernel,
         canonicalization.convert_foreach_data_tasks_to_loops(rect, dtypes)
         dtypes = _collect_identifier_types(rect.metadata, kernel.arguments)
 
-    benchmark_code = _generate_benchmarking_code(header, disable_benchmarking, sync_benchmarking)
+    benchmark_code = _generate_benchmarking_code(header, disable_benchmarking)
 
     # Convert compute block subgraphs into tasks:
     #    * Make task DAG out of computations
@@ -1603,24 +1584,22 @@ def _generate_task_code(rect: PEBlock,
                 current_code.write(f'{indent}@unblock({task_id});\n')
 
 
-def _generate_benchmarking_code(header: StringIO, disable_benchmarking: bool,
-                                sync_benchmarking: bool) -> cslbench.RectangleBenchmarkingCode:
+def _generate_benchmarking_code(header: StringIO, disable_benchmarking: bool) -> cslbench.RectangleBenchmarkingCode:
     """
     Generates benchmarking code in the header and current code.
-    
+
     :param header: A code generator stream for a file's header (where the declarations are).
     :return: Benchmarking code fragments to insert into the generated file.
     """
     if disable_benchmarking:
         return cslbench.RectangleBenchmarkingCode()
 
-    benchmark_code = (
-        cslbench.generate_sync_rectangle_code() if sync_benchmarking else cslbench.generate_basic_rectangle_code())
+    benchmark_code = cslbench.generate_basic_rectangle_code()
     header.write(benchmark_code.header)
     return benchmark_code
 
 
-def _add_benchmarking_fields(rectangles: list[Rectangle[PEBlock]], sync_benchmarking: bool = False):
+def _add_benchmarking_fields(rectangles: list[Rectangle[PEBlock]]):
     """
     Adds benchmarking variables to the code.
     :param rectangles: The rectangles to modify.
@@ -1638,13 +1617,6 @@ def _add_benchmarking_fields(rectangles: list[Rectangle[PEBlock]], sync_benchmar
                 dtype=spir.ArrayType(spir.ScalarType.u16, [3]),
                 is_extern=True,
             ))
-        if sync_benchmarking:
-            rect.metadata.place.statements.append(
-                spir.FieldDeclaration(
-                    field_name=spir.Identifier('__benchmark_refclock', 0),
-                    dtype=spir.ArrayType(spir.ScalarType.u16, [3]),
-                    is_extern=True,
-                ))
 
 
 def _collect_identifier_types(rect: PEBlock,
diff --git a/spatialstencil/lowering/stencil_to_spatial.py b/spada/lowering/stencil_to_spatial.py
similarity index 87%
rename from spatialstencil/lowering/stencil_to_spatial.py
rename to spada/lowering/stencil_to_spatial.py
index d3722ea2..89dfb80a 100644
--- a/spatialstencil/lowering/stencil_to_spatial.py
+++ b/spada/lowering/stencil_to_spatial.py
@@ -1,24 +1,24 @@
 import copy
 
-from spatialstencil.lowering.stencil_to_spatial_routing import ChannelStrategy, KernelRouting
-import spatialstencil.syntax.stencil_ir.irnodes as sast
-import spatialstencil.syntax.spatial_ir.irnodes as spa
-from spatialstencil.lowering.stencil_to_spatial_compute import ProgramCompute, AbstractStatement
-from spatialstencil.lowering.stencil_to_spatial_dataflow import ProgramDataflow
-from spatialstencil.lowering.stencil_to_spatial_place import ProgramPlacement
-
-from spatialstencil.lowering.versioning import Versioning
-from spatialstencil.syntax.common.types import ScalarType
-from spatialstencil.syntax.spatial_ir.canonical_subgrids import canonicalize_subgrids, fill_compute_rectangle
-from spatialstencil.syntax.spatial_ir.grid_geometry import split_rectangles
-
-from spatialstencil.syntax.stencil_ir.domain_collector import DomainCollector
-from spatialstencil.syntax.stencil_ir.canonicalize_expression import CanonicalizeExpression
-from spatialstencil.syntax.stencil_ir.refactor_forward_backward_stencils import RefactorForwardBackwardStencils
-from spatialstencil.syntax.stencil_ir.type_inference import infer_scalar_types, infer_types
-from spatialstencil.syntax.stencil_ir.ssa import SSAVisitor
-from spatialstencil.syntax.spatial_ir.passes import mark_readonly_writeonly_arguments
-from spatialstencil.syntax.spatial_ir.analysis import detect_undefined_array_access
+from spada.lowering.stencil_to_spatial_routing import ChannelStrategy, KernelRouting
+import spada.syntax.stencil_ir.irnodes as sast
+import spada.syntax.spatial_ir.irnodes as spa
+from spada.lowering.stencil_to_spatial_compute import ProgramCompute, AbstractStatement
+from spada.lowering.stencil_to_spatial_dataflow import ProgramDataflow
+from spada.lowering.stencil_to_spatial_place import ProgramPlacement
+
+from spada.lowering.versioning import Versioning
+from spada.syntax.common.types import ScalarType
+from spada.syntax.spatial_ir.canonical_subgrids import canonicalize_subgrids, fill_compute_rectangle
+from spada.syntax.spatial_ir.grid_geometry import split_rectangles
+
+from spada.syntax.stencil_ir.domain_collector import DomainCollector
+from spada.syntax.stencil_ir.canonicalize_expression import CanonicalizeExpression
+from spada.syntax.stencil_ir.refactor_forward_backward_stencils import RefactorForwardBackwardStencils
+from spada.syntax.stencil_ir.type_inference import infer_scalar_types, infer_types
+from spada.syntax.stencil_ir.ssa import SSAVisitor
+from spada.syntax.spatial_ir.passes import mark_readonly_writeonly_arguments
+from spada.syntax.spatial_ir.analysis import detect_undefined_array_access
 
 def lower_stencil_to_spatial(stencil: sast.Program, channel_strategy: ChannelStrategy = ChannelStrategy.TRIVIAL) -> spa.Kernel:
     """Lower a stencil to a spatial program.
diff --git a/spatialstencil/lowering/stencil_to_spatial_compute.py b/spada/lowering/stencil_to_spatial_compute.py
similarity index 97%
rename from spatialstencil/lowering/stencil_to_spatial_compute.py
rename to spada/lowering/stencil_to_spatial_compute.py
index 3843b133..a544f174 100644
--- a/spatialstencil/lowering/stencil_to_spatial_compute.py
+++ b/spada/lowering/stencil_to_spatial_compute.py
@@ -1,17 +1,17 @@
 import copy
 from dataclasses import dataclass
 
-from spatialstencil.lowering.stencil_to_spatial_compute_fwbw import ForwardBackwardComputeVisitor
-from spatialstencil.lowering.stencil_to_spatial_dataflow import ProgramDataflow
-from spatialstencil.lowering.stencil_to_spatial_place import ProgramPlacement
-from spatialstencil.lowering.versioning import Versioning
-from spatialstencil.syntax.common.basenode import Wildcard
-from spatialstencil.syntax.common.tree_matching import PatternTransformer
-from spatialstencil.syntax.common.types import ScalarType
-from spatialstencil.syntax.spatial_ir.grid_geometry import Rectangle, group_rectangles_by_domain, split_rectangles
-from spatialstencil.syntax.stencil_ir.domain_collector import DomainCollector
-import spatialstencil.syntax.spatial_ir.irnodes as spa
-import spatialstencil.syntax.stencil_ir.irnodes as sast
+from spada.lowering.stencil_to_spatial_compute_fwbw import ForwardBackwardComputeVisitor
+from spada.lowering.stencil_to_spatial_dataflow import ProgramDataflow
+from spada.lowering.stencil_to_spatial_place import ProgramPlacement
+from spada.lowering.versioning import Versioning
+from spada.syntax.common.basenode import Wildcard
+from spada.syntax.common.tree_matching import PatternTransformer
+from spada.syntax.common.types import ScalarType
+from spada.syntax.spatial_ir.grid_geometry import Rectangle, group_rectangles_by_domain, split_rectangles
+from spada.syntax.stencil_ir.domain_collector import DomainCollector
+import spada.syntax.spatial_ir.irnodes as spa
+import spada.syntax.stencil_ir.irnodes as sast
 
 AbstractStatement = Rectangle[tuple[int, spa.Statement]]
 
diff --git a/spatialstencil/lowering/stencil_to_spatial_compute_fwbw.py b/spada/lowering/stencil_to_spatial_compute_fwbw.py
similarity index 94%
rename from spatialstencil/lowering/stencil_to_spatial_compute_fwbw.py
rename to spada/lowering/stencil_to_spatial_compute_fwbw.py
index 86ef9ba8..876b09e0 100644
--- a/spatialstencil/lowering/stencil_to_spatial_compute_fwbw.py
+++ b/spada/lowering/stencil_to_spatial_compute_fwbw.py
@@ -1,12 +1,12 @@
 import copy
-from spatialstencil.lowering.stencil_to_spatial_dataflow import ProgramDataflow
-from spatialstencil.lowering.stencil_to_spatial_place import ProgramPlacement
-from spatialstencil.lowering.versioning import Versioning
-from spatialstencil.syntax.common.types import ScalarType
-from spatialstencil.syntax.spatial_ir.grid_geometry import Rectangle
-import spatialstencil.syntax.spatial_ir.irnodes as spa
-import spatialstencil.syntax.stencil_ir.irnodes as sast
-from spatialstencil.syntax.stencil_ir.irnodes import ComputationBlock
+from spada.lowering.stencil_to_spatial_dataflow import ProgramDataflow
+from spada.lowering.stencil_to_spatial_place import ProgramPlacement
+from spada.lowering.versioning import Versioning
+from spada.syntax.common.types import ScalarType
+from spada.syntax.spatial_ir.grid_geometry import Rectangle
+import spada.syntax.spatial_ir.irnodes as spa
+import spada.syntax.stencil_ir.irnodes as sast
+from spada.syntax.stencil_ir.irnodes import ComputationBlock
 
 AbstractStatement = Rectangle[tuple[int, spa.Statement]]
 
diff --git a/spatialstencil/lowering/stencil_to_spatial_dataflow.py b/spada/lowering/stencil_to_spatial_dataflow.py
similarity index 95%
rename from spatialstencil/lowering/stencil_to_spatial_dataflow.py
rename to spada/lowering/stencil_to_spatial_dataflow.py
index 79697683..da218e0a 100644
--- a/spatialstencil/lowering/stencil_to_spatial_dataflow.py
+++ b/spada/lowering/stencil_to_spatial_dataflow.py
@@ -2,15 +2,15 @@
 from dataclasses import dataclass
 from enum import Enum, auto
 
-import spatialstencil.syntax.stencil_ir.irnodes as sast
-import spatialstencil.syntax.spatial_ir.irnodes as spa
-from spatialstencil.lowering.stencil_to_spatial_place import ProgramPlacement
+import spada.syntax.stencil_ir.irnodes as sast
+import spada.syntax.spatial_ir.irnodes as spa
+from spada.lowering.stencil_to_spatial_place import ProgramPlacement
 
-from spatialstencil.lowering.versioning import Versioning
-from spatialstencil.syntax.common.types import ScalarType
-from spatialstencil.syntax.spatial_ir.grid_geometry import Rectangle, split_rectangles, group_rectangles_by_domain
+from spada.lowering.versioning import Versioning
+from spada.syntax.common.types import ScalarType
+from spada.syntax.spatial_ir.grid_geometry import Rectangle, split_rectangles, group_rectangles_by_domain
 
-from spatialstencil.syntax.stencil_ir.domain_collector import DomainCollector
+from spada.syntax.stencil_ir.domain_collector import DomainCollector
 
 
 @dataclass(frozen=True)
diff --git a/spatialstencil/lowering/stencil_to_spatial_place.py b/spada/lowering/stencil_to_spatial_place.py
similarity index 96%
rename from spatialstencil/lowering/stencil_to_spatial_place.py
rename to spada/lowering/stencil_to_spatial_place.py
index 8176a75f..99d4ea66 100644
--- a/spatialstencil/lowering/stencil_to_spatial_place.py
+++ b/spada/lowering/stencil_to_spatial_place.py
@@ -3,13 +3,13 @@
 from dataclasses import dataclass
 from typing import Mapping, Set
 
-import spatialstencil.syntax.stencil_ir.irnodes as sast
-import spatialstencil.syntax.spatial_ir.irnodes as spa
-from spatialstencil.lowering.versioning import Versioning
+import spada.syntax.stencil_ir.irnodes as sast
+import spada.syntax.spatial_ir.irnodes as spa
+from spada.lowering.versioning import Versioning
 
-from spatialstencil.syntax.common.types import ScalarType
-from spatialstencil.syntax.spatial_ir.grid_geometry import Rectangle, split_rectangles, group_rectangles_by_domain
-from spatialstencil.syntax.stencil_ir.domain_collector import DomainCollector
+from spada.syntax.common.types import ScalarType
+from spada.syntax.spatial_ir.grid_geometry import Rectangle, split_rectangles, group_rectangles_by_domain
+from spada.syntax.stencil_ir.domain_collector import DomainCollector
 
 AbstractFieldDeclaration = Rectangle[spa.FieldDeclaration]
 
diff --git a/spatialstencil/lowering/stencil_to_spatial_routing.py b/spada/lowering/stencil_to_spatial_routing.py
similarity index 98%
rename from spatialstencil/lowering/stencil_to_spatial_routing.py
rename to spada/lowering/stencil_to_spatial_routing.py
index 0801d921..7203c9f8 100644
--- a/spatialstencil/lowering/stencil_to_spatial_routing.py
+++ b/spada/lowering/stencil_to_spatial_routing.py
@@ -1,8 +1,8 @@
 import copy
 from enum import Enum, auto
-from spatialstencil.lowering.versioning import Versioning
-import spatialstencil.syntax.spatial_ir.irnodes as spa
-from spatialstencil.syntax.spatial_ir.canonicalization import canonicalize_phases, inline_phases
+from spada.lowering.versioning import Versioning
+import spada.syntax.spatial_ir.irnodes as spa
+from spada.syntax.spatial_ir.canonicalization import canonicalize_phases, inline_phases
 
 
 class ChannelStrategy(Enum):
diff --git a/spatialstencil/lowering/versioning.py b/spada/lowering/versioning.py
similarity index 100%
rename from spatialstencil/lowering/versioning.py
rename to spada/lowering/versioning.py
diff --git a/spatialstencil/placement/README.md b/spada/placement/README.md
similarity index 100%
rename from spatialstencil/placement/README.md
rename to spada/placement/README.md
diff --git a/spatialstencil/placement/__init__.py b/spada/placement/__init__.py
similarity index 100%
rename from spatialstencil/placement/__init__.py
rename to spada/placement/__init__.py
diff --git a/spatialstencil/placement/domain.py b/spada/placement/domain.py
similarity index 100%
rename from spatialstencil/placement/domain.py
rename to spada/placement/domain.py
diff --git a/spatialstencil/placement/graph.py b/spada/placement/graph.py
similarity index 98%
rename from spatialstencil/placement/graph.py
rename to spada/placement/graph.py
index 43fb59b6..3356db46 100644
--- a/spatialstencil/placement/graph.py
+++ b/spada/placement/graph.py
@@ -1,8 +1,8 @@
 from typing import Sequence, List, Dict, Tuple
 import igraph as ig
 
-from spatialstencil.placement.domain import FieldDomain
-from spatialstencil.placement.stencil import Stencil, StencilDirection
+from spada.placement.domain import FieldDomain
+from spada.placement.stencil import Stencil, StencilDirection
 
 
 class StencilGraph:
diff --git a/spatialstencil/placement/mla.py b/spada/placement/mla.py
similarity index 100%
rename from spatialstencil/placement/mla.py
rename to spada/placement/mla.py
diff --git a/spatialstencil/placement/model.py b/spada/placement/model.py
similarity index 98%
rename from spatialstencil/placement/model.py
rename to spada/placement/model.py
index d1d72b95..62b73036 100644
--- a/spatialstencil/placement/model.py
+++ b/spada/placement/model.py
@@ -8,8 +8,8 @@
 import numpy as np
 from numpy.typing import NDArray
 
-from spatialstencil.placement.graph import StencilGraph
-from spatialstencil.placement.placement import Placement
+from spada.placement.graph import StencilGraph
+from spada.placement.placement import Placement
 
 
 @dataclass
diff --git a/spatialstencil/placement/optimizer.py b/spada/placement/optimizer.py
similarity index 88%
rename from spatialstencil/placement/optimizer.py
rename to spada/placement/optimizer.py
index 7edb7922..83724ff0 100644
--- a/spatialstencil/placement/optimizer.py
+++ b/spada/placement/optimizer.py
@@ -2,10 +2,10 @@
 import igraph
 import numpy as np
 
-from spatialstencil.placement.graph import StencilGraph
-from spatialstencil.placement.placed_graph import PlacedStencilGraph
-from spatialstencil.placement.model import PlacementCost, CostModel
-from spatialstencil.placement.partition import FieldPartition
+from spada.placement.graph import StencilGraph
+from spada.placement.placed_graph import PlacedStencilGraph
+from spada.placement.model import PlacementCost, CostModel
+from spada.placement.partition import FieldPartition
 
 
 def color_graph(g: StencilGraph):
diff --git a/spatialstencil/placement/partition.py b/spada/placement/partition.py
similarity index 97%
rename from spatialstencil/placement/partition.py
rename to spada/placement/partition.py
index 5884dc6f..73c699ae 100644
--- a/spatialstencil/placement/partition.py
+++ b/spada/placement/partition.py
@@ -11,9 +11,9 @@
 import igraph
 from typing import Tuple
 
-from spatialstencil.placement.graph import FieldDomain
-from spatialstencil.placement.mla import linearize_with_random_forest
-from spatialstencil.placement.placement import Placement
+from spada.placement.graph import FieldDomain
+from spada.placement.mla import linearize_with_random_forest
+from spada.placement.placement import Placement
 
 
 
diff --git a/spatialstencil/placement/placed_graph.py b/spada/placement/placed_graph.py
similarity index 94%
rename from spatialstencil/placement/placed_graph.py
rename to spada/placement/placed_graph.py
index 2f14c4f6..e6edd334 100644
--- a/spatialstencil/placement/placed_graph.py
+++ b/spada/placement/placed_graph.py
@@ -2,9 +2,9 @@
 import igraph as ig
 import matplotlib.pyplot as plt
 
-from spatialstencil.placement.graph import StencilGraph
-from spatialstencil.placement.partition import Placement
-from spatialstencil.placement.stencil import StencilDirection
+from spada.placement.graph import StencilGraph
+from spada.placement.partition import Placement
+from spada.placement.stencil import StencilDirection
 
 
 class PlacedStencilGraph(StencilGraph):
diff --git a/spatialstencil/placement/placement.py b/spada/placement/placement.py
similarity index 99%
rename from spatialstencil/placement/placement.py
rename to spada/placement/placement.py
index 8495b5df..32d900fb 100644
--- a/spatialstencil/placement/placement.py
+++ b/spada/placement/placement.py
@@ -7,7 +7,7 @@
 import numpy as np
 import igraph
 
-from spatialstencil.placement.graph import StencilGraph
+from spada.placement.graph import StencilGraph
 
 
 @dataclass
diff --git a/spatialstencil/placement/stencil.py b/spada/placement/stencil.py
similarity index 100%
rename from spatialstencil/placement/stencil.py
rename to spada/placement/stencil.py
diff --git a/spatialstencil/runtime/__init__.py b/spada/runtime/__init__.py
similarity index 100%
rename from spatialstencil/runtime/__init__.py
rename to spada/runtime/__init__.py
diff --git a/spatialstencil/runtime/cerebras_runtime_stub.py b/spada/runtime/cerebras_runtime_stub.py
similarity index 100%
rename from spatialstencil/runtime/cerebras_runtime_stub.py
rename to spada/runtime/cerebras_runtime_stub.py
diff --git a/spatialstencil/runtime/runtime.py b/spada/runtime/runtime.py
similarity index 83%
rename from spatialstencil/runtime/runtime.py
rename to spada/runtime/runtime.py
index 8f3c01f7..9ef3adc6 100644
--- a/spatialstencil/runtime/runtime.py
+++ b/spada/runtime/runtime.py
@@ -8,10 +8,8 @@
 import numpy.typing as npt
 import time
 
-SYNC_REQUIRED_SYMBOLS = ("f_sync", "f_tic", "f_toc", "__benchmark_refclock")
-
 if TYPE_CHECKING:
-    from spatialstencil.runtime import cerebras_runtime_stub as crt
+    from spada.runtime import cerebras_runtime_stub as crt
 else:
     try:
         from cerebras.sdk.runtime import sdkruntimepybind as crt
@@ -226,52 +224,6 @@ def copy_back_benchmark_cycles(runtime: crt.SdkRuntime, metadata: ProgramMetadat
     return cycle_stop - cycle_start
 
 
-def copy_back_sync_buffer(runtime: crt.SdkRuntime, metadata: ProgramMetadata) -> np.ndarray:
-    """
-    Copy back the reference clock sync-benchmarking buffer.
-
-    :param runtime: The Cerebras SDK runtime object to perform the copy operation
-    :param metadata: Program metadata containing input/output information
-    :return: Numpy array containing the reference clock for each PE
-    """
-    cycle_ref = np.zeros(metadata.kernel_dims + [3], dtype=np.uint32)
-    runtime.memcpy_d2h(
-        cycle_ref.ravel(),
-        runtime.get_id("__benchmark_refclock"),
-        0,
-        0,
-        *cycle_ref.shape,
-        streaming=False,
-        data_type=crt.MemcpyDataType.MEMCPY_16BIT,
-        order=crt.MemcpyOrder.ROW_MAJOR,
-        nonblock=False,
-    )
-    return convert_timestamp(cycle_ref)
-
-
-def copy_back_sync_benchmark_data(runtime: crt.SdkRuntime, metadata: ProgramMetadata) -> np.ndarray:
-    """
-    Copy back sync-benchmarking data and reconstruct the corrected global cycle count.
-
-    :param runtime: The Cerebras SDK runtime object to perform the copy operation
-    :param metadata: Program metadata containing input/output information
-    :return: Numpy scalar containing the total number of cycles spent on the chip
-             for the benchmarked period.
-    """
-    # Compute propagation delay (one cycle per link) to synchronize reference clocks
-    width, height = metadata.kernel_dims
-    propagation_delay = np.arange(width, dtype=np.uint64)[:, None] + np.arange(height, dtype=np.uint64)[None, :]
-
-    time_start, time_end = copy_back_benchmark_data(runtime, metadata)
-    reference = copy_back_sync_buffer(runtime, metadata)
-    reference = reference - propagation_delay
-    time_start = time_start - reference
-    time_end = time_end - reference
-
-    # Return the total time spent on the chip
-    return time_end.max() - time_start.min()
-
-
 def print_cycle_counts(label: str, cycle_counts: np.ndarray) -> None:
     """
     Print benchmark data in a compact form for either scalar or per-PE cycle counts.
@@ -342,14 +294,9 @@ def __init__(
         self.inputs = self.metadata.inputs
         self.outputs = self.metadata.outputs
 
-        print("SYNC BENCHMARK?", self.has_sync_benchmarking())
-
     def has_symbol(self, symbol: str) -> bool:
         return self.runtime.get_id(symbol) is not None
 
-    def has_sync_benchmarking(self) -> bool:
-        return all(self.has_symbol(symbol) for symbol in SYNC_REQUIRED_SYMBOLS)
-
     def has_basic_benchmarking(self) -> bool:
         return self.has_symbol("__benchmark_start") and self.has_symbol("__benchmark_stop")
 
@@ -387,15 +334,8 @@ def __call__(self, *args, **kwargs) -> Dict[str, np.ndarray]:
             self.runtime.run()
             print("done.", flush=True)
 
-            sync_benchmarking = False
-            if self.benchmark:
-                sync_benchmarking = self.has_sync_benchmarking()
-                if not sync_benchmarking and not self.has_basic_benchmarking():
-                    raise ValueError("Benchmarking requested but not enabled in the program.")
-
-            if self.benchmark and sync_benchmarking and not self.metadata.memcpy_mode:
-                self.runtime.launch("f_sync", nonblock=False)
-                self.runtime.launch("f_tic", nonblock=False)
+            if self.benchmark and not self.has_basic_benchmarking():
+                raise ValueError("Benchmarking requested but not enabled in the program.")
 
             # Copy data to device
             for name, data in kwargs.items():
@@ -414,28 +354,17 @@ def __call__(self, *args, **kwargs) -> Dict[str, np.ndarray]:
                 # Use flatten_copy to copy data to device
                 flatten_copy(name, data, expected_shape, self.runtime, self.metadata, self.benchmark)
 
-            if self.benchmark and sync_benchmarking and self.metadata.memcpy_mode:
-                self.runtime.launch("f_sync", nonblock=False)
-
             # Run the program
             for i in range(self.repetitions):
                 if self.metadata.memcpy_mode:
                     if self.benchmark and not self.simulator and i == 0:
                         time.sleep(5.0)
                     print("Launching kernel...", flush=True, end="")
-                    if self.benchmark and sync_benchmarking:
-                        self.runtime.launch("f_tic", nonblock=False)
                     self.runtime.launch(self.metadata.kernel_name, *scalar_args, nonblock=False)
-                    if self.benchmark and sync_benchmarking:
-                        self.runtime.launch("f_toc", nonblock=False)
                     print("kernel launched.", flush=True)
 
                     if self.benchmark:
-                        cycle_counts = (
-                            copy_back_sync_benchmark_data(self.runtime, self.metadata)
-                            if sync_benchmarking
-                            else copy_back_benchmark_cycles(self.runtime, self.metadata)
-                        )
+                        cycle_counts = copy_back_benchmark_cycles(self.runtime, self.metadata)
                         num_digits = len(str(self.repetitions))
                         np.save(self.output_dir / f"perf_cycles_{i:0{num_digits}d}.npy", cycle_counts)
                         print_cycle_counts(f"Iteration {i} cycle count", cycle_counts)
@@ -460,11 +389,7 @@ def __call__(self, *args, **kwargs) -> Dict[str, np.ndarray]:
             print("Copy-back complete.", flush=True)
 
             if self.benchmark and not self.metadata.memcpy_mode:
-                cycle_counts = (
-                    copy_back_sync_benchmark_data(self.runtime, self.metadata)
-                    if sync_benchmarking
-                    else copy_back_benchmark_data(self.runtime, self.metadata)
-                )
+                cycle_counts = copy_back_benchmark_data(self.runtime, self.metadata)
                 np.save(self.output_dir / "perf_cycles.npy", cycle_counts)
                 print_cycle_counts("Cycle count", cycle_counts)
 
diff --git a/spatialstencil/syntax/__init__.py b/spada/syntax/__init__.py
similarity index 100%
rename from spatialstencil/syntax/__init__.py
rename to spada/syntax/__init__.py
diff --git a/spatialstencil/syntax/common/__init__.py b/spada/syntax/common/__init__.py
similarity index 100%
rename from spatialstencil/syntax/common/__init__.py
rename to spada/syntax/common/__init__.py
diff --git a/spatialstencil/syntax/common/basenode.py b/spada/syntax/common/basenode.py
similarity index 100%
rename from spatialstencil/syntax/common/basenode.py
rename to spada/syntax/common/basenode.py
diff --git a/spatialstencil/syntax/common/find_and_replace.py b/spada/syntax/common/find_and_replace.py
similarity index 100%
rename from spatialstencil/syntax/common/find_and_replace.py
rename to spada/syntax/common/find_and_replace.py
diff --git a/spatialstencil/syntax/common/match_tree.py b/spada/syntax/common/match_tree.py
similarity index 98%
rename from spatialstencil/syntax/common/match_tree.py
rename to spada/syntax/common/match_tree.py
index 47cb63fa..1c576c08 100644
--- a/spatialstencil/syntax/common/match_tree.py
+++ b/spada/syntax/common/match_tree.py
@@ -2,9 +2,9 @@
 from dataclasses import dataclass
 from typing import List, Union, Deque, TypeVar, Any, Generic
 
-from spatialstencil.syntax.common.basenode import BaseNode
+from spada.syntax.common.basenode import BaseNode
 
-import spatialstencil.syntax.common.basenode as syntax
+import spada.syntax.common.basenode as syntax
 
 V = TypeVar('V')
 
diff --git a/spatialstencil/syntax/common/serialization.py b/spada/syntax/common/serialization.py
similarity index 100%
rename from spatialstencil/syntax/common/serialization.py
rename to spada/syntax/common/serialization.py
diff --git a/spatialstencil/syntax/common/tree_matching.py b/spada/syntax/common/tree_matching.py
similarity index 97%
rename from spatialstencil/syntax/common/tree_matching.py
rename to spada/syntax/common/tree_matching.py
index 43ce3d62..2e811afc 100644
--- a/spatialstencil/syntax/common/tree_matching.py
+++ b/spada/syntax/common/tree_matching.py
@@ -3,9 +3,9 @@
 from dataclasses import dataclass
 from typing import TypeVar, Generic
 
-from spatialstencil.syntax.common.basenode import BaseNode, Wildcard
-from spatialstencil.syntax.common.match_tree import root_to_leaf_paths, TreeNode, Symbol, Index, Label, MatchingBaseNode
-from spatialstencil.syntax.common.trie import TrieBuilder, TrieNode, Trie
+from spada.syntax.common.basenode import BaseNode, Wildcard
+from spada.syntax.common.match_tree import root_to_leaf_paths, TreeNode, Symbol, Index, Label, MatchingBaseNode
+from spada.syntax.common.trie import TrieBuilder, TrieNode, Trie
 from collections import deque, defaultdict
 
 
diff --git a/spatialstencil/syntax/common/trie.py b/spada/syntax/common/trie.py
similarity index 100%
rename from spatialstencil/syntax/common/trie.py
rename to spada/syntax/common/trie.py
diff --git a/spatialstencil/syntax/common/types.py b/spada/syntax/common/types.py
similarity index 100%
rename from spatialstencil/syntax/common/types.py
rename to spada/syntax/common/types.py
diff --git a/spatialstencil/syntax/common/visitor.py b/spada/syntax/common/visitor.py
similarity index 99%
rename from spatialstencil/syntax/common/visitor.py
rename to spada/syntax/common/visitor.py
index 5f86c712..2514f981 100644
--- a/spatialstencil/syntax/common/visitor.py
+++ b/spada/syntax/common/visitor.py
@@ -5,7 +5,7 @@
 functionality such as IR language testing and dataclass support.
 """
 from typing import Generic, TypeVar, Sequence
-from spatialstencil.syntax.common.basenode import BaseNode
+from spada.syntax.common.basenode import BaseNode
 
 # Create a generic type T that extends the base node type
 BaseNodeT = TypeVar('BaseNodeT', bound=BaseNode)
diff --git a/spatialstencil/syntax/csl/__init__.py b/spada/syntax/csl/__init__.py
similarity index 100%
rename from spatialstencil/syntax/csl/__init__.py
rename to spada/syntax/csl/__init__.py
diff --git a/spatialstencil/syntax/csl/benchmarking.py b/spada/syntax/csl/benchmarking.py
similarity index 98%
rename from spatialstencil/syntax/csl/benchmarking.py
rename to spada/syntax/csl/benchmarking.py
index b28dae7e..3b19ea72 100644
--- a/spatialstencil/syntax/csl/benchmarking.py
+++ b/spada/syntax/csl/benchmarking.py
@@ -9,7 +9,7 @@
 from pathlib import Path
 from typing import Iterator, Sequence
 
-from spatialstencil.syntax.csl.codefile import CodeFile
+from spada.syntax.csl.codefile import CodeFile
 
 _SYNC_ASSET_DIR = Path(__file__).resolve().parents[2] / "assets" / "csl" / "sync"
 
diff --git a/spatialstencil/syntax/csl/codefile.py b/spada/syntax/csl/codefile.py
similarity index 100%
rename from spatialstencil/syntax/csl/codefile.py
rename to spada/syntax/csl/codefile.py
diff --git a/spatialstencil/syntax/csl/constants.py b/spada/syntax/csl/constants.py
similarity index 100%
rename from spatialstencil/syntax/csl/constants.py
rename to spada/syntax/csl/constants.py
diff --git a/spatialstencil/syntax/csl/dsd_ops.py b/spada/syntax/csl/dsd_ops.py
similarity index 98%
rename from spatialstencil/syntax/csl/dsd_ops.py
rename to spada/syntax/csl/dsd_ops.py
index fd168fb0..0123754c 100644
--- a/spatialstencil/syntax/csl/dsd_ops.py
+++ b/spada/syntax/csl/dsd_ops.py
@@ -4,8 +4,8 @@
 import copy
 from dataclasses import dataclass
 from typing import Literal, Optional
-from spatialstencil.syntax.spatial_ir import irnodes as spir
-from spatialstencil.syntax.csl import structures as cslstruct
+from spada.syntax.spatial_ir import irnodes as spir
+from spada.syntax.csl import structures as cslstruct
 
 UniqueDSDDict = dict[str, list[tuple[str, cslstruct.DataStructureDescriptor]]]
 
@@ -96,7 +96,7 @@ def _ident_or_const(expr: spir.SpatialNode) -> spir.Identifier | spir.ConstantLi
 
 
 def _dsd(dsds: UniqueDSDDict, expr: spir.SpatialNode, output: bool = False) -> str:
-    from spatialstencil.syntax.csl.statements import name_to_csl
+    from spada.syntax.csl.statements import name_to_csl
     if isinstance(expr, spir.Identifier):
         if expr.as_ir() not in dsds:
             return name_to_csl(expr)
@@ -124,7 +124,7 @@ def _dsd(dsds: UniqueDSDDict, expr: spir.SpatialNode, output: bool = False) -> s
 
 
 def _dsd_object(dsds: UniqueDSDDict, expr: spir.SpatialNode, output: bool = False) -> str:
-    from spatialstencil.syntax.csl.statements import name_to_csl
+    from spada.syntax.csl.statements import name_to_csl
     if isinstance(expr, spir.Identifier):
         if expr.as_ir() not in dsds:
             return name_to_csl(expr)
@@ -312,7 +312,7 @@ def _as_csl(self, statement: spir.AssignmentStatement | spir.SendStatement,
                 raise TypeError(f"Unsupported types for cast operation: {src_dtype}, {dtype}")
 
         if self.scalar_input:
-            from spatialstencil.syntax.csl.statements import emit_expression
+            from spada.syntax.csl.statements import emit_expression
             if isinstance(statement, spir.SendStatement):
                 # local_array may be an ArraySlice (e.g. a[k]) or a plain Identifier (e.g. x)
                 src_expr = emit_expression(spir.Expression(statement.local_array), dsds, dtypes)
diff --git a/spatialstencil/syntax/csl/preprocessing.py b/spada/syntax/csl/preprocessing.py
similarity index 80%
rename from spatialstencil/syntax/csl/preprocessing.py
rename to spada/syntax/csl/preprocessing.py
index 9c9a06a5..d2d97943 100644
--- a/spatialstencil/syntax/csl/preprocessing.py
+++ b/spada/syntax/csl/preprocessing.py
@@ -1,4 +1,4 @@
-from spatialstencil.syntax.spatial_ir.canonicalization import PEBlock
+from spada.syntax.spatial_ir.canonicalization import PEBlock
 
 
 def preprocess_rectangle(rect: PEBlock):
diff --git a/spatialstencil/syntax/csl/prune_unused_fields.py b/spada/syntax/csl/prune_unused_fields.py
similarity index 82%
rename from spatialstencil/syntax/csl/prune_unused_fields.py
rename to spada/syntax/csl/prune_unused_fields.py
index 9057ee6e..6e0ffbcc 100644
--- a/spatialstencil/syntax/csl/prune_unused_fields.py
+++ b/spada/syntax/csl/prune_unused_fields.py
@@ -2,10 +2,10 @@
 Support module for copy elimination in the CSL codegen backend.
 Shares logic with the more general copy elimination pass, but specializes for DSD operations.
 """
-from spatialstencil.syntax.csl import dsd_ops
-from spatialstencil.syntax.spatial_ir import irnodes as spir
-from spatialstencil.syntax.spatial_ir.canonicalization import PEBlock
-from spatialstencil.syntax.spatial_ir.copy_elimination import _FieldUseCollector
+from spada.syntax.csl import dsd_ops
+from spada.syntax.spatial_ir import irnodes as spir
+from spada.syntax.spatial_ir.canonicalization import PEBlock
+from spada.syntax.spatial_ir.copy_elimination import _FieldUseCollector
 
 
 def _effective_statement_for_csl_codegen(
diff --git a/spatialstencil/syntax/csl/statements.py b/spada/syntax/csl/statements.py
similarity index 99%
rename from spatialstencil/syntax/csl/statements.py
rename to spada/syntax/csl/statements.py
index e49d4902..f2eccc0e 100644
--- a/spatialstencil/syntax/csl/statements.py
+++ b/spada/syntax/csl/statements.py
@@ -1,8 +1,8 @@
 from io import StringIO
 from typing import Optional
-from spatialstencil.syntax.csl.structures import DataStructureDescriptor
-from spatialstencil.syntax.csl import dsd_ops
-from spatialstencil.syntax.spatial_ir import irnodes as spir
+from spada.syntax.csl.structures import DataStructureDescriptor
+from spada.syntax.csl import dsd_ops
+from spada.syntax.spatial_ir import irnodes as spir
 
 UniqueDSDDict = dict[str, list[tuple[str, DataStructureDescriptor]]]
 
diff --git a/spatialstencil/syntax/csl/structures.py b/spada/syntax/csl/structures.py
similarity index 100%
rename from spatialstencil/syntax/csl/structures.py
rename to spada/syntax/csl/structures.py
diff --git a/spatialstencil/syntax/csl/task_recycling.py b/spada/syntax/csl/task_recycling.py
similarity index 99%
rename from spatialstencil/syntax/csl/task_recycling.py
rename to spada/syntax/csl/task_recycling.py
index 66b74013..96dc6d78 100644
--- a/spatialstencil/syntax/csl/task_recycling.py
+++ b/spada/syntax/csl/task_recycling.py
@@ -1,7 +1,7 @@
 """
 This module plans how logical CSL local tasks can share a smaller set of
 hardware local-task IDs when the program contains more local tasks than the
-target architecture exposes in :mod:`spatialstencil.syntax.csl.constants`.
+target architecture exposes in :mod:`spada.syntax.csl.constants`.
 
 Terminology
 -----------
@@ -144,8 +144,8 @@
 import heapq
 from typing import Iterable
 
-from spatialstencil.syntax.csl import constants
-from spatialstencil.syntax.csl import tasks as tdag
+from spada.syntax.csl import constants
+from spada.syntax.csl import tasks as tdag
 
 
 @dataclass(frozen=True)
diff --git a/spatialstencil/syntax/csl/tasks.py b/spada/syntax/csl/tasks.py
similarity index 97%
rename from spatialstencil/syntax/csl/tasks.py
rename to spada/syntax/csl/tasks.py
index 6d9fecc1..838ae46b 100644
--- a/spatialstencil/syntax/csl/tasks.py
+++ b/spada/syntax/csl/tasks.py
@@ -7,8 +7,8 @@
 from enum import Enum, auto
 import networkx as nx  # TODO: Switch to igraph
 from typing import Any, Literal, Optional
-from spatialstencil.syntax.spatial_ir import irnodes as spir, analysis
-from spatialstencil.syntax.csl import constants, dsd_ops, structures as cslstruct
+from spada.syntax.spatial_ir import irnodes as spir, analysis
+from spada.syntax.csl import constants, dsd_ops, structures as cslstruct
 
 UniqueDSDDict = dict[str, list[tuple[str, cslstruct.DataStructureDescriptor]]]
 
diff --git a/spatialstencil/syntax/gt4py/__init__.py b/spada/syntax/gt4py/__init__.py
similarity index 100%
rename from spatialstencil/syntax/gt4py/__init__.py
rename to spada/syntax/gt4py/__init__.py
diff --git a/spatialstencil/syntax/gt4py/astnodes.py b/spada/syntax/gt4py/astnodes.py
similarity index 97%
rename from spatialstencil/syntax/gt4py/astnodes.py
rename to spada/syntax/gt4py/astnodes.py
index 809312c2..bf9cc679 100644
--- a/spatialstencil/syntax/gt4py/astnodes.py
+++ b/spada/syntax/gt4py/astnodes.py
@@ -6,8 +6,8 @@
 import enum
 from dataclasses import dataclass
 
-from spatialstencil.syntax.common.basenode import BaseNode
-from spatialstencil.syntax.common import visitor
+from spada.syntax.common.basenode import BaseNode
+from spada.syntax.common import visitor
 
 
 class ComputationType(enum.Enum):
diff --git a/spatialstencil/syntax/gt4py/parser.py b/spada/syntax/gt4py/parser.py
similarity index 93%
rename from spatialstencil/syntax/gt4py/parser.py
rename to spada/syntax/gt4py/parser.py
index 53c41584..f78c6abb 100644
--- a/spatialstencil/syntax/gt4py/parser.py
+++ b/spada/syntax/gt4py/parser.py
@@ -1,7 +1,7 @@
 import ast
 import sys
 from typing import TextIO
-from spatialstencil.syntax.gt4py.astnodes import *
+from spada.syntax.gt4py.astnodes import *
 
 
 class GTVisitor(ast.NodeVisitor):
@@ -105,10 +105,10 @@ def parse_function(func: ast.FunctionDef) -> GTProgram:
 
 def parse_string(code: str) -> dict[str, GTree]:
     """
-    Parses a string representing a spatial stencil program, returning the
+    Parses a string representing a SpaDA program, returning the
     top-level program AST node.
     
-    :param code: A code string in spatial stencil format.
+    :param code: A code string in SpaDA format.
     :return: A Program node representing the root of the AST.
     """
     module = ast.parse(code)
@@ -123,7 +123,7 @@ def parse_string(code: str) -> dict[str, GTree]:
 
 def parse_file(file_or_filename: TextIO | str) -> dict[str, ast.FunctionDef]:
     """
-    Parses a file representing a spatial stencil program, returning the
+    Parses a file representing a SpaDA program, returning the
     top-level program AST node.
     
     :param file_or_filename: A file path or handle to an open file to read.
@@ -137,7 +137,7 @@ def parse_file(file_or_filename: TextIO | str) -> dict[str, ast.FunctionDef]:
 
 if __name__ == '__main__':
     if len(sys.argv) not in (2, 3):
-        print('USAGE: python -m spatialstencil.syntax.gt4py.parser <PYTHON FILE> [FUNCTION NAME]')
+        print('USAGE: python -m spada.syntax.gt4py.parser <PYTHON FILE> [FUNCTION NAME]')
         exit(1)
 
     out = parse_file(sys.argv[1])
diff --git a/spatialstencil/syntax/spatial_ir/__init__.py b/spada/syntax/spatial_ir/__init__.py
similarity index 100%
rename from spatialstencil/syntax/spatial_ir/__init__.py
rename to spada/syntax/spatial_ir/__init__.py
diff --git a/spatialstencil/syntax/spatial_ir/analysis.py b/spada/syntax/spatial_ir/analysis.py
similarity index 99%
rename from spatialstencil/syntax/spatial_ir/analysis.py
rename to spada/syntax/spatial_ir/analysis.py
index 7ca6d695..b1b093cd 100644
--- a/spatialstencil/syntax/spatial_ir/analysis.py
+++ b/spada/syntax/spatial_ir/analysis.py
@@ -2,11 +2,11 @@
 Contains analysis functions for Spatial IR, such as statement dependency analysis.
 """
 from collections import defaultdict
-from spatialstencil.syntax.spatial_ir import irnodes as spir
+from spada.syntax.spatial_ir import irnodes as spir
 from dataclasses import dataclass
 from typing import Literal
 import networkx as nx  # TODO: Switch to igraph
-from spatialstencil.syntax.spatial_ir.grid_geometry import Rectangle
+from spada.syntax.spatial_ir.grid_geometry import Rectangle
 
 
 @dataclass(frozen=True)
diff --git a/spatialstencil/syntax/spatial_ir/canonical_subgrids.py b/spada/syntax/spatial_ir/canonical_subgrids.py
similarity index 93%
rename from spatialstencil/syntax/spatial_ir/canonical_subgrids.py
rename to spada/syntax/spatial_ir/canonical_subgrids.py
index af05eb53..9f1027ad 100644
--- a/spatialstencil/syntax/spatial_ir/canonical_subgrids.py
+++ b/spada/syntax/spatial_ir/canonical_subgrids.py
@@ -1,9 +1,9 @@
 import copy
 
-from spatialstencil.syntax.spatial_ir.grid_geometry import split_rectangles
-from spatialstencil.syntax.spatial_ir.irnodes import Kernel, SubgridExpression, DataflowBlock, PlaceBlock, ComputeBlock, \
+from spada.syntax.spatial_ir.grid_geometry import split_rectangles
+from spada.syntax.spatial_ir.irnodes import Kernel, SubgridExpression, DataflowBlock, PlaceBlock, ComputeBlock, \
     Phase
-import spatialstencil.syntax.spatial_ir.irnodes as spa
+import spada.syntax.spatial_ir.irnodes as spa
 
 
 def fill_compute_rectangle(kernel: spa.Kernel, block_variable_type: spa.ScalarType = spa.ScalarType.u16) -> spa.Kernel:
diff --git a/spatialstencil/syntax/spatial_ir/canonicalization.py b/spada/syntax/spatial_ir/canonicalization.py
similarity index 99%
rename from spatialstencil/syntax/spatial_ir/canonicalization.py
rename to spada/syntax/spatial_ir/canonicalization.py
index ab3133ff..998a88b7 100644
--- a/spatialstencil/syntax/spatial_ir/canonicalization.py
+++ b/spada/syntax/spatial_ir/canonicalization.py
@@ -5,8 +5,8 @@
 import copy
 from dataclasses import dataclass
 from itertools import product
-from spatialstencil.syntax.spatial_ir import irnodes as spir, analysis, passes
-from spatialstencil.syntax.spatial_ir.grid_geometry import Rectangle
+from spada.syntax.spatial_ir import irnodes as spir, analysis, passes
+from spada.syntax.spatial_ir.grid_geometry import Rectangle
 
 
 def inline_metaprogramming(kernel: spir.Kernel) -> spir.Kernel:
@@ -641,7 +641,7 @@ def __init__(self, dtypes: dict[spir.Identifier, spir.IRType]):
         self.dtypes = dtypes
 
     def visit_ForeachStatement(self, node: spir.ForeachStatement):
-        from spatialstencil.syntax.csl import dsd_ops
+        from spada.syntax.csl import dsd_ops
         if dsd_ops.get_dsd_op(self.dtypes, node) is not None:
             return self.generic_visit(node)
 
diff --git a/spatialstencil/syntax/spatial_ir/copy_elimination.py b/spada/syntax/spatial_ir/copy_elimination.py
similarity index 99%
rename from spatialstencil/syntax/spatial_ir/copy_elimination.py
rename to spada/syntax/spatial_ir/copy_elimination.py
index ce1fc97b..07da896d 100644
--- a/spatialstencil/syntax/spatial_ir/copy_elimination.py
+++ b/spada/syntax/spatial_ir/copy_elimination.py
@@ -21,8 +21,8 @@
 from collections import defaultdict
 from dataclasses import dataclass
 
-from spatialstencil.syntax.spatial_ir import irnodes as spir, passes
-from spatialstencil.syntax.spatial_ir.canonicalization import PEBlock, Rectangle
+from spada.syntax.spatial_ir import irnodes as spir, passes
+from spada.syntax.spatial_ir.canonicalization import PEBlock, Rectangle
 
 
 @dataclass(frozen=True)
diff --git a/spatialstencil/syntax/spatial_ir/grid_geometry.py b/spada/syntax/spatial_ir/grid_geometry.py
similarity index 100%
rename from spatialstencil/syntax/spatial_ir/grid_geometry.py
rename to spada/syntax/spatial_ir/grid_geometry.py
diff --git a/spatialstencil/syntax/spatial_ir/irnodes.py b/spada/syntax/spatial_ir/irnodes.py
similarity index 99%
rename from spatialstencil/syntax/spatial_ir/irnodes.py
rename to spada/syntax/spatial_ir/irnodes.py
index b6075fbe..28d029ef 100644
--- a/spatialstencil/syntax/spatial_ir/irnodes.py
+++ b/spada/syntax/spatial_ir/irnodes.py
@@ -3,10 +3,10 @@
 import copy
 from dataclasses import dataclass, field
 from typing import Union, Tuple, Optional, Literal
-from spatialstencil.syntax.common import visitor
-from spatialstencil.syntax.common.basenode import BaseNode
-from spatialstencil.syntax.common.types import ScalarType, IRType
-from spatialstencil.syntax.spatial_ir.grid_geometry import Rectangle
+from spada.syntax.common import visitor
+from spada.syntax.common.basenode import BaseNode
+from spada.syntax.common.types import ScalarType, IRType
+from spada.syntax.spatial_ir.grid_geometry import Rectangle
 
 
 @dataclass
diff --git a/spatialstencil/syntax/spatial_ir/language.lark b/spada/syntax/spatial_ir/language.lark
similarity index 100%
rename from spatialstencil/syntax/spatial_ir/language.lark
rename to spada/syntax/spatial_ir/language.lark
diff --git a/spatialstencil/syntax/spatial_ir/lark_to_ir.py b/spada/syntax/spatial_ir/lark_to_ir.py
similarity index 98%
rename from spatialstencil/syntax/spatial_ir/lark_to_ir.py
rename to spada/syntax/spatial_ir/lark_to_ir.py
index 40a482d7..da877497 100644
--- a/spatialstencil/syntax/spatial_ir/lark_to_ir.py
+++ b/spada/syntax/spatial_ir/lark_to_ir.py
@@ -1,8 +1,8 @@
 import lark
 
-from spatialstencil.syntax.common.types import ScalarType
-from spatialstencil.syntax.spatial_ir import irnodes
-from spatialstencil.syntax.spatial_ir.irnodes import StreamType, Identifier
+from spada.syntax.common.types import ScalarType
+from spada.syntax.spatial_ir import irnodes
+from spada.syntax.spatial_ir.irnodes import StreamType, Identifier
 
 
 class TreeToSpatialIR(lark.Transformer):
diff --git a/spatialstencil/syntax/spatial_ir/parser.py b/spada/syntax/spatial_ir/parser.py
similarity index 87%
rename from spatialstencil/syntax/spatial_ir/parser.py
rename to spada/syntax/spatial_ir/parser.py
index cd201cff..c566c286 100644
--- a/spatialstencil/syntax/spatial_ir/parser.py
+++ b/spada/syntax/spatial_ir/parser.py
@@ -3,8 +3,8 @@
 import sys
 from typing import TextIO
 
-from spatialstencil.syntax.spatial_ir import irnodes
-from spatialstencil.syntax.spatial_ir import lark_to_ir
+from spada.syntax.spatial_ir import irnodes
+from spada.syntax.spatial_ir import lark_to_ir
 
 
 class Parser:
@@ -29,7 +29,7 @@ def parse(self, code: str, name: str = None) -> irnodes.Kernel:
         Parses a string representing a spatial IR kernel, returning the
         top-level kernel IR node.
         
-        :param code: A code string in spatial stencil format.
+        :param code: A code string in SpaDA format.
         :param name: An optional name for the file, used for error messages.
         :return: A Kernel node representing the root of the spatial IR.
         """
@@ -44,7 +44,7 @@ def parse_string(code: str, name: str = None) -> irnodes.Kernel:
     Parses a string representing a spatial IR kernel, returning the
     top-level kernel IR node.
     
-    :param code: A code string in spatial stencil format.
+    :param code: A code string in SpaDA format.
     :param name: An optional name for the file, used for error messages.
     :return: A Kernel node representing the root of the spatial IR.
     """
@@ -68,7 +68,7 @@ def parse_file(file_or_filename: TextIO | str) -> irnodes.Kernel:
 
 if __name__ == '__main__':
     if len(sys.argv) != 2:
-        print('USAGE: python -m spatialstencil.syntax.spatial_ir.parser <STENCIL FILE>')
+        print('USAGE: python -m spada.syntax.spatial_ir.parser <STENCIL FILE>')
         exit(1)
 
     out = parse_file(sys.argv[1])
diff --git a/spatialstencil/syntax/spatial_ir/passes.py b/spada/syntax/spatial_ir/passes.py
similarity index 98%
rename from spatialstencil/syntax/spatial_ir/passes.py
rename to spada/syntax/spatial_ir/passes.py
index 98f2c3cb..243fe46f 100644
--- a/spatialstencil/syntax/spatial_ir/passes.py
+++ b/spada/syntax/spatial_ir/passes.py
@@ -4,8 +4,8 @@
 from collections.abc import Callable
 from dataclasses import dataclass, field, replace
 
-from spatialstencil.syntax.spatial_ir import irnodes as spa
-from spatialstencil.syntax.stencil_ir.type_inference import _result_type_of
+from spada.syntax.spatial_ir import irnodes as spa
+from spada.syntax.stencil_ir.type_inference import _result_type_of
 
 
 class Concretizer(spa.NodeTransformer):
diff --git a/spatialstencil/syntax/stencil_ir/analysis.py b/spada/syntax/stencil_ir/analysis.py
similarity index 98%
rename from spatialstencil/syntax/stencil_ir/analysis.py
rename to spada/syntax/stencil_ir/analysis.py
index c9a24905..75ffba72 100644
--- a/spatialstencil/syntax/stencil_ir/analysis.py
+++ b/spada/syntax/stencil_ir/analysis.py
@@ -2,7 +2,7 @@
 Analysis passes on the Stencil IR.
 """
 from collections import defaultdict
-from spatialstencil.syntax.stencil_ir import irnodes as sast
+from spada.syntax.stencil_ir import irnodes as sast
 from typing import Literal
 
 
diff --git a/spatialstencil/syntax/stencil_ir/canonicalization.py b/spada/syntax/stencil_ir/canonicalization.py
similarity index 93%
rename from spatialstencil/syntax/stencil_ir/canonicalization.py
rename to spada/syntax/stencil_ir/canonicalization.py
index f076fe30..798c0225 100644
--- a/spatialstencil/syntax/stencil_ir/canonicalization.py
+++ b/spada/syntax/stencil_ir/canonicalization.py
@@ -5,8 +5,8 @@
 from collections import defaultdict
 from typing import Literal
 
-from spatialstencil.syntax.stencil_ir import irnodes as sast
-from spatialstencil.syntax.stencil_ir import type_inference
+from spada.syntax.stencil_ir import irnodes as sast
+from spada.syntax.stencil_ir import type_inference
 
 
 def canonicalize(program: sast.Program) -> sast.Program:
diff --git a/spatialstencil/syntax/stencil_ir/canonicalize_expression.py b/spada/syntax/stencil_ir/canonicalize_expression.py
similarity index 95%
rename from spatialstencil/syntax/stencil_ir/canonicalize_expression.py
rename to spada/syntax/stencil_ir/canonicalize_expression.py
index b3d1fad2..0b8f6938 100644
--- a/spatialstencil/syntax/stencil_ir/canonicalize_expression.py
+++ b/spada/syntax/stencil_ir/canonicalize_expression.py
@@ -10,11 +10,11 @@
 The modifications are done in-place on the IR nodes.
 """
 
-from spatialstencil.lowering.versioning import Versioning
-from spatialstencil.syntax.common.basenode import Wildcard
-from spatialstencil.syntax.common.tree_matching import PatternTransformer
-from spatialstencil.syntax.stencil_ir.domain_collector import DomainCollector
-from spatialstencil.syntax.stencil_ir.irnodes import *
+from spada.lowering.versioning import Versioning
+from spada.syntax.common.basenode import Wildcard
+from spada.syntax.common.tree_matching import PatternTransformer
+from spada.syntax.stencil_ir.domain_collector import DomainCollector
+from spada.syntax.stencil_ir.irnodes import *
 
 
 class CanonicalizeExpression(NodeVisitor):
diff --git a/spatialstencil/syntax/stencil_ir/def_use_analysis.py b/spada/syntax/stencil_ir/def_use_analysis.py
similarity index 97%
rename from spatialstencil/syntax/stencil_ir/def_use_analysis.py
rename to spada/syntax/stencil_ir/def_use_analysis.py
index 0d10720f..195452ba 100644
--- a/spatialstencil/syntax/stencil_ir/def_use_analysis.py
+++ b/spada/syntax/stencil_ir/def_use_analysis.py
@@ -1,8 +1,8 @@
 from dataclasses import dataclass
 from typing import Sequence
 
-import spatialstencil.syntax.stencil_ir.irnodes as sast
-from spatialstencil.syntax.stencil_ir.irnodes import ComputationBlock
+import spada.syntax.stencil_ir.irnodes as sast
+from spada.syntax.stencil_ir.irnodes import ComputationBlock
 
 
 @dataclass
diff --git a/spatialstencil/syntax/stencil_ir/domain_collector.py b/spada/syntax/stencil_ir/domain_collector.py
similarity index 97%
rename from spatialstencil/syntax/stencil_ir/domain_collector.py
rename to spada/syntax/stencil_ir/domain_collector.py
index f20639a2..9da56905 100644
--- a/spatialstencil/syntax/stencil_ir/domain_collector.py
+++ b/spada/syntax/stencil_ir/domain_collector.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 
-import spatialstencil.syntax.stencil_ir.irnodes as sast
-from spatialstencil.syntax.stencil_ir.irnodes import ComputationBlock, Program
+import spada.syntax.stencil_ir.irnodes as sast
+from spada.syntax.stencil_ir.irnodes import ComputationBlock, Program
 
 
 @dataclass
diff --git a/spatialstencil/syntax/stencil_ir/domain_inference.py b/spada/syntax/stencil_ir/domain_inference.py
similarity index 98%
rename from spatialstencil/syntax/stencil_ir/domain_inference.py
rename to spada/syntax/stencil_ir/domain_inference.py
index b492a9ae..5ae04b6c 100644
--- a/spatialstencil/syntax/stencil_ir/domain_inference.py
+++ b/spada/syntax/stencil_ir/domain_inference.py
@@ -2,11 +2,11 @@
 import warnings
 from typing import Sequence, Collection
 
-import spatialstencil.syntax.stencil_ir.irnodes as sast
+import spada.syntax.stencil_ir.irnodes as sast
 import copy
 
-from spatialstencil.syntax.stencil_ir import def_use_analysis
-from spatialstencil.syntax.stencil_ir.def_use_analysis import ScopedUse, ScopedDefinition
+from spada.syntax.stencil_ir import def_use_analysis
+from spada.syntax.stencil_ir.def_use_analysis import ScopedUse, ScopedDefinition
 
 
 def infer_field_domains(program: sast.Program,
diff --git a/spatialstencil/syntax/stencil_ir/extent_inference.py b/spada/syntax/stencil_ir/extent_inference.py
similarity index 98%
rename from spatialstencil/syntax/stencil_ir/extent_inference.py
rename to spada/syntax/stencil_ir/extent_inference.py
index cbe5c0a6..31a8be8b 100644
--- a/spatialstencil/syntax/stencil_ir/extent_inference.py
+++ b/spada/syntax/stencil_ir/extent_inference.py
@@ -2,8 +2,8 @@
 from collections import defaultdict
 from typing import Sequence, Collection
 
-import spatialstencil.syntax.stencil_ir.irnodes as sast
-import spatialstencil.syntax.stencil_ir.def_use_analysis as def_use_analysis
+import spada.syntax.stencil_ir.irnodes as sast
+import spada.syntax.stencil_ir.def_use_analysis as def_use_analysis
 
 def infer_field_extents(program: sast.Program):
     """
diff --git a/spatialstencil/syntax/stencil_ir/flop_counter.py b/spada/syntax/stencil_ir/flop_counter.py
similarity index 96%
rename from spatialstencil/syntax/stencil_ir/flop_counter.py
rename to spada/syntax/stencil_ir/flop_counter.py
index 5bd4ad8f..27a2995c 100644
--- a/spatialstencil/syntax/stencil_ir/flop_counter.py
+++ b/spada/syntax/stencil_ir/flop_counter.py
@@ -1,10 +1,10 @@
 """
-FLOP Counter for spatial stencil IR computations.
+FLOP Counter for SpaDA IR computations.
 
 This visitor counts the total number of floating-point operations (FLOPs)
 in a stencil computation by analyzing statements and their execution domains.
 """
-from spatialstencil.syntax.stencil_ir.irnodes import (FieldType, NodeVisitor, Expression, Identifier, Subscript, 
+from spada.syntax.stencil_ir.irnodes import (FieldType, NodeVisitor, Expression, Identifier, Subscript, 
                      UnaryOperator, BinaryOperator, TernaryOperator, 
                      MathCall, StatementBlock, AssignOp, ReturnOp, 
                      ViewType, Cartesian, Program)
@@ -12,7 +12,7 @@
 
 class FLOPCounter(NodeVisitor):
     """
-    A visitor that counts FLOPs in a spatial stencil computation.
+    A visitor that counts FLOPs in a SpaDA computation.
     
     The count is calculated as:
     FLOPs = operations_per_statement × output_domain_size × num_output_extents
diff --git a/spatialstencil/syntax/stencil_ir/irnodes.py b/spada/syntax/stencil_ir/irnodes.py
similarity index 98%
rename from spatialstencil/syntax/stencil_ir/irnodes.py
rename to spada/syntax/stencil_ir/irnodes.py
index 444a3578..6d2d1b70 100644
--- a/spatialstencil/syntax/stencil_ir/irnodes.py
+++ b/spada/syntax/stencil_ir/irnodes.py
@@ -1,13 +1,13 @@
 """
-Native class definitions for the spatial stencil Intermediate Representation (IR).
+Native class definitions for the SpaDA Intermediate Representation (IR).
 """
 from dataclasses import dataclass, field
 import enum
 from typing import Literal, Sequence
 
-from spatialstencil.syntax.common.basenode import BaseNode
-from spatialstencil.syntax.common import visitor
-from spatialstencil.syntax.common.types import IRType, ScalarType
+from spada.syntax.common.basenode import BaseNode
+from spada.syntax.common import visitor
+from spada.syntax.common.types import IRType, ScalarType
 
 
 class ComputationType(enum.Enum):
@@ -19,7 +19,7 @@ class ComputationType(enum.Enum):
 
 class Node(BaseNode):
     """
-    Abstract class representing an IR node for spatial stencils.
+    Abstract class representing an IR node for SpaDA.
     """
 
     @classmethod
diff --git a/spatialstencil/syntax/stencil_ir/language.lark b/spada/syntax/stencil_ir/language.lark
similarity index 100%
rename from spatialstencil/syntax/stencil_ir/language.lark
rename to spada/syntax/stencil_ir/language.lark
diff --git a/spatialstencil/syntax/stencil_ir/lark_to_ast.py b/spada/syntax/stencil_ir/lark_to_ast.py
similarity index 99%
rename from spatialstencil/syntax/stencil_ir/lark_to_ast.py
rename to spada/syntax/stencil_ir/lark_to_ast.py
index 53955cfe..5cb8a6d2 100644
--- a/spatialstencil/syntax/stencil_ir/lark_to_ast.py
+++ b/spada/syntax/stencil_ir/lark_to_ast.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 import lark
 
-from spatialstencil.syntax.stencil_ir import irnodes
+from spada.syntax.stencil_ir import irnodes
 
 
 class TreeToAST(lark.Transformer):
diff --git a/spatialstencil/syntax/stencil_ir/parser.py b/spada/syntax/stencil_ir/parser.py
similarity index 72%
rename from spatialstencil/syntax/stencil_ir/parser.py
rename to spada/syntax/stencil_ir/parser.py
index 82fa8523..bef1ab42 100644
--- a/spatialstencil/syntax/stencil_ir/parser.py
+++ b/spada/syntax/stencil_ir/parser.py
@@ -3,13 +3,13 @@
 import sys
 from typing import TextIO
 
-from spatialstencil.syntax.stencil_ir import irnodes
-from spatialstencil.syntax.stencil_ir import lark_to_ast
+from spada.syntax.stencil_ir import irnodes
+from spada.syntax.stencil_ir import lark_to_ast
 
 
 class Parser:
     """
-    A spatial stencil language parser. Parses multiple strings faster than
+    A SpaDA language parser. Parses multiple strings faster than
     calling ``parser.parse_string`` multiple times.
     """
 
@@ -26,10 +26,10 @@ def __init__(self) -> None:
 
     def parse(self, code: str) -> irnodes.Program:
         """
-        Parses a string representing a spatial stencil program, returning the
+        Parses a string representing a SpaDA program, returning the
         top-level program AST node.
         
-        :param code: A code string in spatial stencil format.
+        :param code: A code string in SpaDA format.
         :return: A Program node representing the root of the AST.
         """
         tree = self.parser.parse(code)
@@ -39,10 +39,10 @@ def parse(self, code: str) -> irnodes.Program:
 
 def parse_string(code: str) -> irnodes.Program:
     """
-    Parses a string representing a spatial stencil program, returning the
+    Parses a string representing a SpaDA program, returning the
     top-level program AST node.
     
-    :param code: A code string in spatial stencil format.
+    :param code: A code string in SpaDA format.
     :return: A Program node representing the root of the AST.
     """
     parser = Parser()
@@ -51,7 +51,7 @@ def parse_string(code: str) -> irnodes.Program:
 
 def parse_file(file_or_filename: TextIO | str) -> irnodes.Program:
     """
-    Parses a file representing a spatial stencil program, returning the
+    Parses a file representing a SpaDA program, returning the
     top-level program AST node.
     
     :param file_or_filename: A file path or handle to an open file to read.
@@ -65,7 +65,7 @@ def parse_file(file_or_filename: TextIO | str) -> irnodes.Program:
 
 if __name__ == '__main__':
     if len(sys.argv) != 2:
-        print('USAGE: python -m spatialstencil.syntax.stencil_ir.parser <STENCIL FILE>')
+        print('USAGE: python -m spada.syntax.stencil_ir.parser <STENCIL FILE>')
         exit(1)
 
     out = parse_file(sys.argv[1])
diff --git a/spatialstencil/syntax/stencil_ir/refactor_forward_backward_stencils.py b/spada/syntax/stencil_ir/refactor_forward_backward_stencils.py
similarity index 98%
rename from spatialstencil/syntax/stencil_ir/refactor_forward_backward_stencils.py
rename to spada/syntax/stencil_ir/refactor_forward_backward_stencils.py
index c1e18138..adf81d1d 100644
--- a/spatialstencil/syntax/stencil_ir/refactor_forward_backward_stencils.py
+++ b/spada/syntax/stencil_ir/refactor_forward_backward_stencils.py
@@ -1,6 +1,6 @@
 import copy
 from collections import defaultdict
-from spatialstencil.syntax.stencil_ir.irnodes import *
+from spada.syntax.stencil_ir.irnodes import *
 
 
 class RefactorForwardBackwardStencils(ScopedNodeVisitor):
diff --git a/spatialstencil/syntax/stencil_ir/ssa.py b/spada/syntax/stencil_ir/ssa.py
similarity index 96%
rename from spatialstencil/syntax/stencil_ir/ssa.py
rename to spada/syntax/stencil_ir/ssa.py
index fb005701..1f36cb36 100644
--- a/spatialstencil/syntax/stencil_ir/ssa.py
+++ b/spada/syntax/stencil_ir/ssa.py
@@ -2,8 +2,8 @@
 from dataclasses import dataclass
 from typing import Mapping
 
-import spatialstencil.syntax.stencil_ir.irnodes as sast
-from spatialstencil.syntax.stencil_ir.irnodes import ComputationBlock, Program
+import spada.syntax.stencil_ir.irnodes as sast
+from spada.syntax.stencil_ir.irnodes import ComputationBlock, Program
 
 
 class SSAVisitor(sast.ScopedNodeVisitor):
diff --git a/spatialstencil/syntax/stencil_ir/type_inference.py b/spada/syntax/stencil_ir/type_inference.py
similarity index 98%
rename from spatialstencil/syntax/stencil_ir/type_inference.py
rename to spada/syntax/stencil_ir/type_inference.py
index 9d534692..7cb30f12 100644
--- a/spatialstencil/syntax/stencil_ir/type_inference.py
+++ b/spada/syntax/stencil_ir/type_inference.py
@@ -3,12 +3,12 @@
 """
 import copy
 
-from spatialstencil.syntax.common import types
-from spatialstencil.syntax.stencil_ir import irnodes as sast
-from spatialstencil.syntax.stencil_ir import analysis
+from spada.syntax.common import types
+from spada.syntax.stencil_ir import irnodes as sast
+from spada.syntax.stencil_ir import analysis
 
-from spatialstencil.syntax.stencil_ir.domain_inference import infer_field_domains
-from spatialstencil.syntax.stencil_ir.extent_inference import infer_field_extents
+from spada.syntax.stencil_ir.domain_inference import infer_field_domains
+from spada.syntax.stencil_ir.extent_inference import infer_field_extents
 
 
 def infer_types(program: sast.Program,
diff --git a/spatialstencil/assets/csl/sync/README.md b/spatialstencil/assets/csl/sync/README.md
deleted file mode 100644
index 03675e86..00000000
--- a/spatialstencil/assets/csl/sync/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# CSL PE Clock Synchronization Utility
-The files in this folder were copied from the Cerebras SDK 1.4.0 [bandwidth-test benchmark](https://github.com/Cerebras/sdk-examples/tree/rel-sdk-1.4.0/benchmarks/bandwidth-test/src/sync).
-They provide functionality for synchronizing the clocks of all PEs to measure
-communication operations (e.g., collectives) correctly. Use with `--sync-benchmarking`.
diff --git a/spatialstencil/assets/csl/sync/layout.csl b/spatialstencil/assets/csl/sync/layout.csl
deleted file mode 100644
index 2ce43c76..00000000
--- a/spatialstencil/assets/csl/sync/layout.csl
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright 2025 Cerebras Systems.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-param colors:[5]color;
-param entrypoints:[4]local_task_id;
-param width : i16 ;   // width of the core
-param height: i16 ;   // height of the core
-
-const C0 : color = colors[0];
-const C1 : color = colors[1];
-const C2 : color = colors[2];
-const C3 : color = colors[3];
-const C4 : color = colors[4];
-
-const STARTUP: local_task_id = entrypoints[0];
-const SYNC_Y: local_task_id = entrypoints[1];
-const SYNC_BCAST: local_task_id = entrypoints[2];
-const EXIT: local_task_id = entrypoints[3];
-
-fn get_params(px:i16, py:i16) comptime_struct {
-
-    var first_py: bool = (0 == py);
-    var last_py: bool = ((height-1) == py);
-    var is_py_even: bool = (0 == (py % 2));
-
-    var first_px: bool = (0 == px);
-    var last_px: bool = ((width-1) == px);
-    var is_px_even: bool = (0 == (px % 2));
-
-    var c_recv_px: color = C0;
-    var c_send_px: color = C1;
-    if (is_px_even){
-        c_recv_px = C0;
-        c_send_px = C1;
-    }else{
-        c_recv_px = C1;
-        c_send_px = C0;
-    }
-
-    var c_recv_py: color = C2;
-    var c_send_py: color = C3;
-    if (is_py_even){
-        c_recv_py = C2;
-        c_send_py = C3;
-    }else{
-        c_recv_py = C3;
-        c_send_py = C2;
-    }
-
-    return .{
-        .c_recv_px = c_recv_px,
-        .c_send_px = c_send_px,
-        .c_recv_py = c_recv_py,
-        .c_send_py = c_send_py,
-        .c_bcast = C4,
-
-        .STARTUP = STARTUP,
-        .SYNC_Y = SYNC_Y,
-        .SYNC_BCAST = SYNC_BCAST,
-        .EXIT = EXIT,
-
-        .first_px = first_px,
-        .last_px = last_px,
-        .first_py = first_py,
-        .last_py = last_py,
-    };
-}
diff --git a/spatialstencil/assets/csl/sync/pe.csl b/spatialstencil/assets/csl/sync/pe.csl
deleted file mode 100644
index 50fb29ef..00000000
--- a/spatialstencil/assets/csl/sync/pe.csl
+++ /dev/null
@@ -1,291 +0,0 @@
-// Copyright 2025 Cerebras Systems.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-param c_recv_px: color;
-param c_send_px: color;
-param c_recv_py: color;
-param c_send_py: color;
-param c_bcast: color;
-
-param STARTUP: local_task_id;
-param SYNC_Y: local_task_id;
-param SYNC_BCAST: local_task_id;
-param EXIT: local_task_id;
-
-param first_px: bool;
-param last_px: bool;
-param first_py: bool;
-param last_py: bool;
-
-// f_callback = sys_mod.unblock_cmd_stream, to continue next command
-param f_callback : fn ()void;
-
-// input_queues={2,3,4}
-// output_queues={2,3,4}
-param input_queues:[3]u16;
-param output_queues:[3]u16;
-
-const c_recv_px_iq = @get_input_queue(input_queues[0]);
-const c_send_px_oq = @get_output_queue(output_queues[0]);
-
-const c_recv_py_iq = @get_input_queue(input_queues[1]);
-const c_send_py_oq = @get_output_queue(output_queues[1]);
-
-const c_bcast_iq = @get_input_queue(input_queues[2]);
-const c_bcast_oq = @get_output_queue(input_queues[2]);
-
-const timestamp = @import_module("<time>");
-
-// tsc_size_words = 3
-var tscRefBuffer: *[timestamp.tsc_size_words]u16;
-
-////////////////////////////////////////////////////////////////////////////////
-// Main memory (48KB)
-////////////////////////////////////////////////////////////////////////////////
-
-var buf = @zeros([1]f32);
-
-////////////////////////////////////////////////////////////////////////////////
-// Tasks
-// syntax
-//     task_begin(name, entrypoint, color)
-////////////////////////////////////////////////////////////////////////////////
-
-const mem_buf_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{1} -> buf[i] });
-
-var fab_recv_data_px_wdsd =  @get_dsd(fabin_dsd, .{
-   .extent = 1,
-   .fabric_color = c_recv_px,
-   .input_queue = c_recv_px_iq
-});
-
-var fab_trans_data_px_wdsd = @get_dsd(fabout_dsd, .{
-    .extent = 1,
-    .fabric_color = c_send_px,
-    .output_queue = c_send_px_oq
-});
-
-var fab_recv_data_py_wdsd =  @get_dsd(fabin_dsd, .{
-   .extent = 1,
-   .fabric_color = c_recv_py,
-   .input_queue = c_recv_py_iq
-});
-
-var fab_trans_data_py_wdsd = @get_dsd(fabout_dsd, .{
-    .extent = 1,
-    .fabric_color = c_send_py,
-    .output_queue = c_send_py_oq
-});
-
-var fab_recv_data_bcast_wdsd =  @get_dsd(fabin_dsd, .{
-   .extent = 1,
-   .fabric_color = c_bcast,
-   .input_queue = c_bcast_iq
-});
-
-var fab_trans_data_bcast_wdsd = @get_dsd(fabout_dsd, .{
-    .extent = 1,
-    .fabric_color = c_bcast,
-    .output_queue = c_bcast_oq
-});
-
-
-
-// Each row performs a sync from the last PE to first PE
-fn f_sync(refClock: *[3]u16) void {
-    tscRefBuffer = refClock;
-
-    // sync a row
-    if (last_px){
-        // px = width-1: send sync signal
-        @mov32(fab_trans_data_px_wdsd, mem_buf_dsd, .{.async=true, .activate = f_sync_y });
-    }else{
-        if (first_px){
-            // px = 0: receive signal
-            @mov32(mem_buf_dsd, fab_recv_data_px_wdsd, .{.async=true, .activate = f_sync_y });
-        }else{
-            // 0 < px < width-1: receive signal and forward it
-            @mov32(fab_trans_data_px_wdsd, fab_recv_data_px_wdsd, .{.async=true, .activate = f_sync_y });
-        }
-    }
-}
-
-
-// prerequisite: row synchronization is done
-//   the first PE is the last one to receive the signal
-// The first column performs a sync from last PE to first PE
-// other PEs wait for bcast signal
-task f_sync_y() void {
-    if (first_px){
-        // 1st column performs a sync
-        if (last_py){
-            // py = height-1: send sync signal
-            @mov32(fab_trans_data_py_wdsd, mem_buf_dsd, .{.async=true, .activate = f_sync_bcast });
-        }else{
-            if (first_py){
-                // py = 0: receive signal
-                @mov32(mem_buf_dsd, fab_recv_data_py_wdsd, .{.async=true, .activate = f_sync_bcast });
-            }else{
-                // 0 < py < height-1: receive signal and forward it
-                @mov32(fab_trans_data_py_wdsd, fab_recv_data_py_wdsd, .{.async=true, .activate = f_sync_bcast });
-            }
-        }
-    }else{
-        // other PEs wait for bcast signal
-        @activate(SYNC_BCAST); // trigger f_sync_bcast
-    }
-}
-
-// prerequisite: sync is done, P0.0 is the last one to receive the sync
-// P0.0 broadcasts the signal, others wait for the bcast signal from P0.0
-task f_sync_bcast() void {
-
-    if ( first_px and first_py ){
-        // P0.0 sends the signal
-        @mov32(fab_trans_data_bcast_wdsd, mem_buf_dsd, .{.async=true, .activate = f_exit });
-    }else{
-        // others wait for bcast from P0.0
-        @mov32(mem_buf_dsd, fab_recv_data_bcast_wdsd, .{.async=true, .activate = f_exit });
-    }
-}
-
-// record reference clock T
-// T is regarded as clock 0 because all PEs sync with P0.0
-task f_exit() void {
-
-    timestamp.get_timestamp(tscRefBuffer);
-
-    //sys_mod.unblock_cmd_stream();
-    f_callback();
-}
-
-
-task f_startup() void {
-    timestamp.enable_tsc();
-}
-
-comptime {
-    @activate(STARTUP);
-
-    @bind_local_task(f_startup, STARTUP);
-    @bind_local_task(f_sync_y, SYNC_Y);
-    @bind_local_task(f_sync_bcast, SYNC_BCAST);
-    @bind_local_task(f_exit, EXIT);
-
-    // On WSE-3, we must explicitly initialize input and output queues
-    if (@is_arch("wse3")) {
-        @initialize_queue(c_recv_px_iq, .{ .color = c_recv_px });
-        @initialize_queue(c_send_px_oq, .{ .color = c_send_px });
-
-        @initialize_queue(c_recv_py_iq, .{ .color = c_recv_py });
-        @initialize_queue(c_send_py_oq, .{ .color = c_send_py });
-
-        @initialize_queue(c_bcast_iq, .{ .color = c_bcast });
-        @initialize_queue(c_bcast_oq, .{ .color = c_bcast });
-    }
-}
-
-
-// sync a row with C0 and C1
-//
-//     C0     C1     C0     C1
-// P0 <-- P1 <-- P2 <-- P3 <-- P4
-//
-//     C0     C1     C0     C1     C0
-// P0 <-- P1 <-- P2 <-- P3 <-- P4 <-- P5
-//
-// P0: recv C0
-// P_even: recv C0, send C1
-// P_odd: recv C1, send C0
-// P_last: send C0 if odd; send C1 if even
-comptime {
-    if (first_px){
-        // px = 0: receive from east
-        @set_local_color_config(c_recv_px, .{ .routes = .{ .rx = .{EAST}, .tx = .{RAMP} } } );
-    }else{
-        if (last_px){
-           // px = width-1: send to west
-           @set_local_color_config(c_send_px, .{ .routes = .{ .rx = .{RAMP}, .tx = .{WEST} } } );
-        }else{
-           // 0 < px < width-1: receive from east, send to west
-           @set_local_color_config(c_recv_px, .{ .routes = .{ .rx = .{EAST}, .tx = .{RAMP} } } );
-           @set_local_color_config(c_send_px, .{ .routes = .{ .rx = .{RAMP}, .tx = .{WEST} } } );
-        }
-    }
-}
-
-// sync a col with C2 and C3
-//     C2     C3     C2     C3
-// P0 <-- P1 <-- P2 <-- P3 <-- P4
-//
-//     C2     C3     C2     C3     C2
-// P0 <-- P1 <-- P2 <-- P3 <-- P4 <-- P5
-//
-// P0: recv C2
-// P_even: recv C2, send C3
-// P_odd: recv C3, send C2
-// P_last: send C2 if odd; send C3 if even
-comptime {
-    if (first_py){
-        // py = 0 (even): receive from south
-        @set_local_color_config(c_recv_py, .{ .routes = .{ .rx = .{SOUTH}, .tx = .{RAMP} } } );
-    }else{
-        if (last_py){
-           // py = height-1: send to north
-           @set_local_color_config(c_send_py, .{ .routes = .{ .rx = .{RAMP}, .tx = .{NORTH} } } );
-        }else{
-           // 0 < py < height-1: receive from south, send to north
-           @set_local_color_config(c_recv_py, .{ .routes = .{ .rx = .{SOUTH}, .tx = .{RAMP} } } );
-           @set_local_color_config(c_send_py, .{ .routes = .{ .rx = .{RAMP}, .tx = .{NORTH} } } );
-        }
-    }
-}
-
-
-// w > 1 and h > 1
-//  x --> x --> x
-//  |
-//  V
-//  x --> x --> x
-//  |
-//  V
-//  x --> x --> x
-//
-// WARNING: corner case for w=1 or h=1
-comptime {
-    if (first_px){
-        // px = 0
-        if (first_py){
-            // P0,0: send to east and south
-            @set_local_color_config(c_bcast, .{ .routes = .{ .rx = .{RAMP}, .tx = .{EAST, SOUTH} } } );
-        }else{
-            if (last_py){
-                // P0,h-1
-                @set_local_color_config(c_bcast, .{ .routes = .{ .rx = .{NORTH}, .tx = .{EAST, RAMP} } } );
-            }else{
-                // P0,py: 0 < py < height-1
-                @set_local_color_config(c_bcast, .{ .routes = .{ .rx = .{NORTH}, .tx = .{EAST, RAMP, SOUTH} } } );
-            }
-        }
-    }else{
-        if (last_px){
-            // px = width-1
-           @set_local_color_config(c_bcast, .{ .routes = .{ .rx = .{WEST}, .tx = .{RAMP} } } );
-        }else{
-            // 0 < px < width-1
-           @set_local_color_config(c_bcast, .{ .routes = .{ .rx = .{WEST}, .tx = .{EAST, RAMP} } } );
-        }
-    }
-}
diff --git a/tests/csl_runtime/_lib.sh b/tests/csl_runtime/_lib.sh
index a5e24a96..781e886b 100755
--- a/tests/csl_runtime/_lib.sh
+++ b/tests/csl_runtime/_lib.sh
@@ -4,7 +4,7 @@
 [ "$(basename "$0")" = "_lib.sh" ] && exit 0
 
 COLLECTIVES_DIR="$(cd "$(dirname "$0")/../../samples/spatial/collectives" && pwd)"
-RUNTIME_PY="$(cd "$(dirname "$0")/../.." && pwd)/spatialstencil/runtime/runtime.py"
+RUNTIME_PY="$(cd "$(dirname "$0")/../.." && pwd)/spada/runtime/runtime.py"
 
 # verify_reduce_sum
 #   Loads a_in.npy, computes np.sum(axis=0, keepdims=True), compares with OUT_out.npy.
diff --git a/tests/csl_runtime/test_add.sh b/tests/csl_runtime/test_add.sh
index cff01bc2..093eb736 100755
--- a/tests/csl_runtime/test_add.sh
+++ b/tests/csl_runtime/test_add.sh
@@ -4,7 +4,7 @@ set -e
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 
-# Compile the spatial stencil program
+# Compile the SpaDA program
 sptlc "$SCRIPT_DIR/../../samples/spatial/simple/add.sptl" add_sptl -p N=8
 
 python <<EOF
@@ -18,7 +18,7 @@ np.save('b.npy', b)
 EOF
 
 # Run the compiled program with the Python runtime and the simulator
-cs_python "$SCRIPT_DIR/../../spatialstencil/runtime/runtime.py" add_sptl a.npy b.npy --benchmark
+cs_python "$SCRIPT_DIR/../../spada/runtime/runtime.py" add_sptl a.npy b.npy --benchmark
 
 # Check if the output file matches the expected output
 python <<EOF
diff --git a/tests/csl_runtime/test_benchmarks.sh b/tests/csl_runtime/test_benchmarks.sh
index bf78196b..a52a07a0 100755
--- a/tests/csl_runtime/test_benchmarks.sh
+++ b/tests/csl_runtime/test_benchmarks.sh
@@ -10,7 +10,7 @@ NC='\033[0m'
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 BENCHMARK_DIR="$SCRIPT_DIR/../../samples/benchmarks"
-RUNTIME="$SCRIPT_DIR/../../spatialstencil/runtime/runtime.py"
+RUNTIME="$SCRIPT_DIR/../../spada/runtime/runtime.py"
 OUTPUT_DIR="$SCRIPT_DIR/benchmark"
 
 TOTAL=0
diff --git a/tests/csl_runtime/test_broadcast_1d_multicast.sh b/tests/csl_runtime/test_broadcast_1d_multicast.sh
index 20767a73..9f0eb563 100755
--- a/tests/csl_runtime/test_broadcast_1d_multicast.sh
+++ b/tests/csl_runtime/test_broadcast_1d_multicast.sh
@@ -34,4 +34,3 @@ run_broadcast_1d_mc 2 2
 run_broadcast_1d_mc 4 2
 run_broadcast_1d_mc 8 4
 run_broadcast_1d_mc 4 8
-run_broadcast_1d_mc 4 2 --sync-benchmarking
diff --git a/tests/csl_runtime/test_copy.sh b/tests/csl_runtime/test_copy.sh
index 68dd3782..c85b58fa 100755
--- a/tests/csl_runtime/test_copy.sh
+++ b/tests/csl_runtime/test_copy.sh
@@ -4,7 +4,7 @@ set -e
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 
-# Compile the spatial stencil program
+# Compile the SpaDA program
 FOLDER_NAME="copy_sptl"
 sptlc "$SCRIPT_DIR/../spatial_ir/samples/neighbor_copy.sptl" "$FOLDER_NAME" -p K=2
 
@@ -15,7 +15,7 @@ np.save('a.npy', a)
 EOF
 
 # Run the compiled program with the Python runtime and the simulator
-timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spatialstencil/runtime/runtime.py" "$FOLDER_NAME" a.npy --benchmark
+timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spada/runtime/runtime.py" "$FOLDER_NAME" a.npy --benchmark
 
 # Check if the output file matches the expected output
 python <<EOF
diff --git a/tests/csl_runtime/test_exchange.sh b/tests/csl_runtime/test_exchange.sh
index af73c6f2..74a47bcf 100755
--- a/tests/csl_runtime/test_exchange.sh
+++ b/tests/csl_runtime/test_exchange.sh
@@ -4,7 +4,7 @@ set -e
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 
-# Compile the spatial stencil program
+# Compile the SpaDA program
 FOLDER_NAME="xchg_sptl"
 sptlc "$SCRIPT_DIR/../spatial_ir/samples/neighbor_exchange.sptl" "$FOLDER_NAME" -p K=2
 
@@ -15,7 +15,7 @@ np.save('a.npy', a)
 EOF
 
 # Run the compiled program with the Python runtime and the simulator
-timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spatialstencil/runtime/runtime.py" "$FOLDER_NAME" a.npy --benchmark
+timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spada/runtime/runtime.py" "$FOLDER_NAME" a.npy --benchmark
 
 # Check if the output file matches the expected output
 python <<EOF
diff --git a/tests/csl_runtime/test_laplacian.sh b/tests/csl_runtime/test_laplacian.sh
index 625ae304..97edda15 100755
--- a/tests/csl_runtime/test_laplacian.sh
+++ b/tests/csl_runtime/test_laplacian.sh
@@ -4,7 +4,7 @@ set -e
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 
-# Compile the spatial stencil program
+# Compile the SpaDA program
 sptlc "$SCRIPT_DIR/../../samples/benchmarks/laplacian_4_4_4.sptl" lap_sptl
 
 python <<EOF
@@ -14,7 +14,7 @@ np.save('a.npy', a)
 EOF
 
 # Run the compiled program with the Python runtime and the simulator
-timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spatialstencil/runtime/runtime.py" lap_sptl a.npy --benchmark
+timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spada/runtime/runtime.py" lap_sptl a.npy --benchmark
 
 # Check if the output file matches the expected output
 python $SCRIPT_DIR/laplacian.py a.npy -o expected_out.npy
diff --git a/tests/csl_runtime/test_mult_scalar.sh b/tests/csl_runtime/test_mult_scalar.sh
index 1dc7b6c1..72eace2d 100755
--- a/tests/csl_runtime/test_mult_scalar.sh
+++ b/tests/csl_runtime/test_mult_scalar.sh
@@ -4,7 +4,7 @@ set -e
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 
-# Compile the spatial stencil program
+# Compile the SpaDA program
 sptlc "$SCRIPT_DIR/../../samples/spatial/simple/mult_scalar.sptl" mult_sptl -p N=8
 
 python <<EOF
@@ -18,7 +18,7 @@ np.save('coeff.npy', coeff)
 EOF
 
 # Run the compiled program with the Python runtime and the simulator
-cs_python "$SCRIPT_DIR/../../spatialstencil/runtime/runtime.py" mult_sptl a.npy coeff.npy --benchmark
+cs_python "$SCRIPT_DIR/../../spada/runtime/runtime.py" mult_sptl a.npy coeff.npy --benchmark
 
 # Check if the output file matches the expected output
 python <<EOF
diff --git a/tests/csl_runtime/test_reduce.sh b/tests/csl_runtime/test_reduce.sh
index 1511758f..0205fc18 100755
--- a/tests/csl_runtime/test_reduce.sh
+++ b/tests/csl_runtime/test_reduce.sh
@@ -4,7 +4,7 @@ set -e
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 
-# Compile the spatial stencil program
+# Compile the SpaDA program
 FOLDER_NAME="reduce_sptl"
 sptlc "$SCRIPT_DIR/../../samples/benchmarks/reduce.sptl" "$FOLDER_NAME" -p N=15 -p K=128
 
@@ -15,7 +15,7 @@ np.save('a.npy', a)
 EOF
 
 # Run the compiled program with the Python runtime and the simulator
-timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spatialstencil/runtime/runtime.py" "$FOLDER_NAME" a.npy --benchmark
+timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spada/runtime/runtime.py" "$FOLDER_NAME" a.npy --benchmark
 
 # Check if the output file matches the expected output
 python <<EOF
diff --git a/tests/csl_runtime/test_reduce_pipelined.sh b/tests/csl_runtime/test_reduce_pipelined.sh
index 7e25ea2e..032edfe6 100755
--- a/tests/csl_runtime/test_reduce_pipelined.sh
+++ b/tests/csl_runtime/test_reduce_pipelined.sh
@@ -4,7 +4,7 @@ set -e
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 
-# Compile the spatial stencil program
+# Compile the SpaDA program
 FOLDER_NAME="reduce_sptl"
 sptlc "$SCRIPT_DIR/../../samples/benchmarks/reduce_pipelined.sptl" "$FOLDER_NAME" -p N=15 -p K=128
 
@@ -15,7 +15,7 @@ np.save('a.npy', a)
 EOF
 
 # Run the compiled program with the Python runtime and the simulator
-timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spatialstencil/runtime/runtime.py" "$FOLDER_NAME" a.npy --benchmark
+timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spada/runtime/runtime.py" "$FOLDER_NAME" a.npy --benchmark
 
 # Check if the output file matches the expected output
 python <<EOF
diff --git a/tests/csl_runtime/test_runtime_with_mock_simulator.py b/tests/csl_runtime/test_runtime_with_mock_simulator.py
index 2609c640..5ba85091 100644
--- a/tests/csl_runtime/test_runtime_with_mock_simulator.py
+++ b/tests/csl_runtime/test_runtime_with_mock_simulator.py
@@ -6,9 +6,9 @@
 from pathlib import Path
 from unittest.mock import patch, MagicMock
 from enum import Enum
-from spatialstencil.runtime.cerebras_runtime_stub import MemcpyDataType, MemcpyOrder
+from spada.runtime.cerebras_runtime_stub import MemcpyDataType, MemcpyOrder
 
-# Mock the Cerebras SDK module BEFORE any imports from spatialstencil
+# Mock the Cerebras SDK module BEFORE any imports from spada
 # This needs to be done at the very beginning to prevent ImportError
 
 
@@ -139,7 +139,7 @@ def launch(self, symbol: str, nonblock: bool = False):
 # End of mocking the Cerebras SDK
 
 # Now we can safely import the runtime classes
-from spatialstencil.runtime.runtime import Program, ProgramMetadata, copy_back_sync_benchmark_data
+from spada.runtime.runtime import Program, ProgramMetadata
 
 
 def mock_kernel(a, b, out):
@@ -147,39 +147,6 @@ def mock_kernel(a, b, out):
     out[:] = a + b
 
 
-class MockSyncBenchmarkRuntime:
-    def __init__(self, time_start_hwe: np.ndarray, time_stop_hwe: np.ndarray, time_ref_hwe: np.ndarray):
-        self.buffer_names = {"__benchmark_start": 1, "__benchmark_stop": 2, "__benchmark_refclock": 3}
-        self.data_buffers = {
-            1: time_start_hwe.transpose(1, 0, 2).ravel(),
-            1: time_stop_hwe.transpose(1, 0, 2).ravel(),
-            2: time_ref_hwe.transpose(1, 0, 2).ravel(),
-        }
-
-    def launch(self, symbol: str, nonblock: bool = False):
-        return None
-
-    def get_id(self, symbol: str) -> int:
-        return self.buffer_names[symbol]
-
-    def memcpy_d2h(
-        self,
-        dest: np.ndarray,
-        src: int,
-        px: int,
-        py: int,
-        w: int,
-        h: int,
-        elem_per_pe: int,
-        *,
-        streaming: bool,
-        data_type,
-        order,
-        nonblock: bool
-    ):
-        dest[:] = self.data_buffers[src]
-
-
 class TestProgramWithMockRuntime(unittest.TestCase):
     """Test the Program class with a mock runtime."""
 
@@ -226,7 +193,7 @@ def tearDown(self):
 
         shutil.rmtree(self.temp_dir)
 
-    @patch("spatialstencil.runtime.runtime.crt.SdkRuntime")
+    @patch("spada.runtime.runtime.crt.SdkRuntime")
     def test_program_initialization(self, mock_sdk_runtime_class):
         """Test that Program initializes correctly with metadata."""
         mock_sdk_runtime_class.return_value = self.mock_runtime
@@ -244,7 +211,7 @@ def test_program_initialization(self, mock_sdk_runtime_class):
     def test_program_execution_with_positional_args(self):
         """Test program execution with positional arguments."""
         # Patch the SdkRuntime class directly in the runtime module
-        with patch("spatialstencil.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
+        with patch("spada.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
             program = Program(str(self.program_dir))
 
             # Create test input data
@@ -261,7 +228,7 @@ def test_program_execution_with_positional_args(self):
 
     def test_program_execution_with_keyword_args(self):
         """Test program execution with keyword arguments."""
-        with patch("spatialstencil.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
+        with patch("spada.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
             program = Program(str(self.program_dir))
 
             # Create test input data
@@ -278,7 +245,7 @@ def test_program_execution_with_keyword_args(self):
 
     def test_program_shape_validation(self):
         """Test that program validates input shapes correctly."""
-        with patch("spatialstencil.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
+        with patch("spada.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
             program = Program(str(self.program_dir))
 
             # Create test input data with wrong shape
@@ -291,7 +258,7 @@ def test_program_shape_validation(self):
 
     def test_program_missing_input(self):
         """Test that program raises error for missing inputs."""
-        with patch("spatialstencil.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
+        with patch("spada.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
             program = Program(str(self.program_dir))
 
             # Create test input data - only provide one input
@@ -303,7 +270,7 @@ def test_program_missing_input(self):
 
     def test_mock_kernel_execution_verification(self):
         """Test that our mock kernel is actually being executed with correct data."""
-        with patch("spatialstencil.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
+        with patch("spada.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
             program = Program(str(self.program_dir))
 
             # Create specific test input data to verify kernel execution
@@ -325,7 +292,7 @@ def test_mock_kernel_execution_verification(self):
 
     def test_program_unexpected_input(self):
         """Test that program raises error for unexpected inputs."""
-        with patch("spatialstencil.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
+        with patch("spada.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
             program = Program(str(self.program_dir))
 
             # Create test input data
@@ -338,7 +305,7 @@ def test_program_unexpected_input(self):
                 program(a=a, b=b, c=c)
 
     def test_benchmark_requires_symbols(self):
-        with patch("spatialstencil.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
+        with patch("spada.runtime.runtime.crt.SdkRuntime", return_value=self.mock_runtime):
             program = Program(str(self.program_dir), benchmark=True)
 
             a = np.ones((4, 4, 1), dtype=np.float32)
diff --git a/tests/csl_runtime/test_task_recycling.sh b/tests/csl_runtime/test_task_recycling.sh
index 50983c99..4c625c7c 100644
--- a/tests/csl_runtime/test_task_recycling.sh
+++ b/tests/csl_runtime/test_task_recycling.sh
@@ -13,7 +13,7 @@ input_data = np.arange(1.0, 15.0, dtype=np.float32).reshape(1, 1, 14)
 np.save('input.npy', input_data)
 EOF
 
-timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spatialstencil/runtime/runtime.py" "$FOLDER_NAME" input.npy --benchmark
+timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spada/runtime/runtime.py" "$FOLDER_NAME" input.npy --benchmark
 
 python <<EOF
 import numpy as np
diff --git a/tests/csl_runtime/test_task_recycling_chain.sh b/tests/csl_runtime/test_task_recycling_chain.sh
index f2b1d5d9..df4090ed 100755
--- a/tests/csl_runtime/test_task_recycling_chain.sh
+++ b/tests/csl_runtime/test_task_recycling_chain.sh
@@ -14,7 +14,7 @@ set -e
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 FOLDER="task_recycling_two_stage_sptl"
-RUNTIME_PY="$(cd "$SCRIPT_DIR/../.." && pwd)/spatialstencil/runtime/runtime.py"
+RUNTIME_PY="$(cd "$SCRIPT_DIR/../.." && pwd)/spada/runtime/runtime.py"
 
 sptlc "$SCRIPT_DIR/samples/task_recycling_two_stage.sptl" "$FOLDER" --disable-task-fusion
 
diff --git a/tests/csl_runtime/test_task_recycling_fork.sh b/tests/csl_runtime/test_task_recycling_fork.sh
index a350aa52..adbc6662 100755
--- a/tests/csl_runtime/test_task_recycling_fork.sh
+++ b/tests/csl_runtime/test_task_recycling_fork.sh
@@ -15,7 +15,7 @@ set -e
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 FOLDER="task_recycling_three_stage_sptl"
-RUNTIME_PY="$(cd "$SCRIPT_DIR/../.." && pwd)/spatialstencil/runtime/runtime.py"
+RUNTIME_PY="$(cd "$SCRIPT_DIR/../.." && pwd)/spada/runtime/runtime.py"
 
 sptlc "$SCRIPT_DIR/samples/task_recycling_three_stage.sptl" "$FOLDER" --disable-task-fusion
 
diff --git a/tests/gt4py/test_lowering_gt4py_to_stencil_ir.py b/tests/gt4py/test_lowering_gt4py_to_stencil_ir.py
index 6ccb8330..6f4a2e5c 100644
--- a/tests/gt4py/test_lowering_gt4py_to_stencil_ir.py
+++ b/tests/gt4py/test_lowering_gt4py_to_stencil_ir.py
@@ -1,7 +1,7 @@
 import unittest
-from spatialstencil.syntax.gt4py import parser
-from spatialstencil.syntax.stencil_ir import irnodes as sast, analysis
-from spatialstencil.lowering import gt4py_to_stencil_ir
+from spada.syntax.gt4py import parser
+from spada.syntax.stencil_ir import irnodes as sast, analysis
+from spada.lowering import gt4py_to_stencil_ir
 import numpy as np
 import os
 
diff --git a/tests/placement/test_model.py b/tests/placement/test_model.py
index 58b5c5fd..eb0b7554 100644
--- a/tests/placement/test_model.py
+++ b/tests/placement/test_model.py
@@ -1,10 +1,10 @@
 import unittest
 import numpy as np
-from spatialstencil.placement.domain import FieldDomain
-from spatialstencil.placement.model import CostModel
-from spatialstencil.placement.partition import FieldPartition
-from spatialstencil.placement.stencil import Stencil, StencilDirection
-from spatialstencil.placement.graph import StencilGraph
+from spada.placement.domain import FieldDomain
+from spada.placement.model import CostModel
+from spada.placement.partition import FieldPartition
+from spada.placement.stencil import Stencil, StencilDirection
+from spada.placement.graph import StencilGraph
 import igraph as ig
 
 
diff --git a/tests/placement/test_placement.py b/tests/placement/test_placement.py
index 8ba1e474..a6df6870 100644
--- a/tests/placement/test_placement.py
+++ b/tests/placement/test_placement.py
@@ -1,8 +1,8 @@
 import unittest
 import numpy as np
 
-from spatialstencil.placement.graph import FieldDomain
-from spatialstencil.placement.partition import FieldPartition
+from spada.placement.graph import FieldDomain
+from spada.placement.partition import FieldPartition
 
 
 class TestPlacement(unittest.TestCase):
diff --git a/tests/spatial_ir/test_argument_lowering.py b/tests/spatial_ir/test_argument_lowering.py
index 4fbe5fa9..88ae5da6 100644
--- a/tests/spatial_ir/test_argument_lowering.py
+++ b/tests/spatial_ir/test_argument_lowering.py
@@ -1,5 +1,5 @@
 import pytest
-from spatialstencil.syntax.spatial_ir import canonicalization, irnodes as spir, parser, passes
+from spada.syntax.spatial_ir import canonicalization, irnodes as spir, parser, passes
 
 
 @pytest.mark.parametrize("streaming", (False, True))
diff --git a/tests/spatial_ir/test_copy_elimination.py b/tests/spatial_ir/test_copy_elimination.py
index 76863e83..998c2a63 100644
--- a/tests/spatial_ir/test_copy_elimination.py
+++ b/tests/spatial_ir/test_copy_elimination.py
@@ -1,4 +1,4 @@
-from spatialstencil.syntax.spatial_ir import irnodes as spir, canonicalization, copy_elimination, parser, passes
+from spada.syntax.spatial_ir import irnodes as spir, canonicalization, copy_elimination, parser, passes
 
 
 def _optimize_kernel(kernel: spir.Kernel):
diff --git a/tests/spatial_ir/test_csl_tasks.py b/tests/spatial_ir/test_csl_tasks.py
index b8f3903b..f8a3ead8 100644
--- a/tests/spatial_ir/test_csl_tasks.py
+++ b/tests/spatial_ir/test_csl_tasks.py
@@ -1,8 +1,8 @@
 import pytest
-from spatialstencil.lowering import spatial_ir_to_csl as s2c
-from spatialstencil.syntax.spatial_ir import analysis, parser
-from spatialstencil.syntax.spatial_ir.canonicalization import PEBlock
-from spatialstencil.syntax.csl import tasks as tdag
+from spada.lowering import spatial_ir_to_csl as s2c
+from spada.syntax.spatial_ir import analysis, parser
+from spada.syntax.spatial_ir.canonicalization import PEBlock
+from spada.syntax.csl import tasks as tdag
 
 
 def _create_tasks(peblock: PEBlock):
diff --git a/tests/spatial_ir/test_dsd_ops.py b/tests/spatial_ir/test_dsd_ops.py
index fec02345..1e7c2a97 100644
--- a/tests/spatial_ir/test_dsd_ops.py
+++ b/tests/spatial_ir/test_dsd_ops.py
@@ -1,8 +1,8 @@
 import pytest
-from spatialstencil.lowering import spatial_ir_to_csl as s2c
-from spatialstencil.syntax.spatial_ir import parser, passes
-from spatialstencil.syntax.spatial_ir.canonicalization import PEBlock
-from spatialstencil.syntax.csl import dsd_ops
+from spada.lowering import spatial_ir_to_csl as s2c
+from spada.syntax.spatial_ir import parser, passes
+from spada.syntax.spatial_ir.canonicalization import PEBlock
+from spada.syntax.csl import dsd_ops
 
 
 def test_dsd_op_detection():
diff --git a/tests/spatial_ir/test_lowering_spatial_ir_to_csl.py b/tests/spatial_ir/test_lowering_spatial_ir_to_csl.py
index 750dbfce..75a769a8 100644
--- a/tests/spatial_ir/test_lowering_spatial_ir_to_csl.py
+++ b/tests/spatial_ir/test_lowering_spatial_ir_to_csl.py
@@ -1,6 +1,6 @@
 import os
-from spatialstencil.lowering.spatial_ir_to_csl import lower_spatial_ir_to_csl
-from spatialstencil.syntax.spatial_ir import parser, passes
+from spada.lowering.spatial_ir_to_csl import lower_spatial_ir_to_csl
+from spada.syntax.spatial_ir import parser, passes
 import pytest
 
 
diff --git a/tests/spatial_ir/test_lowering_statements_to_csl.py b/tests/spatial_ir/test_lowering_statements_to_csl.py
index 4e328b32..0cd810e2 100644
--- a/tests/spatial_ir/test_lowering_statements_to_csl.py
+++ b/tests/spatial_ir/test_lowering_statements_to_csl.py
@@ -1,6 +1,6 @@
 import pytest
-from spatialstencil.lowering.spatial_ir_to_csl import lower_spatial_ir_to_csl
-from spatialstencil.syntax.spatial_ir import parser, passes
+from spada.lowering.spatial_ir_to_csl import lower_spatial_ir_to_csl
+from spada.syntax.spatial_ir import parser, passes
 
 
 def create_inline_spatial_ir(code: str):
diff --git a/tests/spatial_ir/test_metaprogramming.py b/tests/spatial_ir/test_metaprogramming.py
index 60ee54e3..cea2930e 100644
--- a/tests/spatial_ir/test_metaprogramming.py
+++ b/tests/spatial_ir/test_metaprogramming.py
@@ -1,6 +1,6 @@
 import pytest
 
-from spatialstencil.syntax.spatial_ir import canonicalization, irnodes as spir, parser, passes
+from spada.syntax.spatial_ir import canonicalization, irnodes as spir, parser, passes
 
 
 class MetaForCounter(spir.NodeVisitor):
diff --git a/tests/spatial_ir/test_multicast.py b/tests/spatial_ir/test_multicast.py
index c11b70a5..adf802e0 100644
--- a/tests/spatial_ir/test_multicast.py
+++ b/tests/spatial_ir/test_multicast.py
@@ -9,8 +9,8 @@
 """
 import os
 import pytest
-from spatialstencil.syntax.spatial_ir import irnodes as spir, parser, passes, canonicalization
-from spatialstencil.lowering.spatial_ir_to_csl import lower_spatial_ir_to_csl
+from spada.syntax.spatial_ir import irnodes as spir, parser, passes, canonicalization
+from spada.lowering.spatial_ir_to_csl import lower_spatial_ir_to_csl
 
 _TESTING_DIR = os.path.join(os.path.dirname(__file__), 'samples')
 
diff --git a/tests/spatial_ir/test_optimization.py b/tests/spatial_ir/test_optimization.py
index a1d57add..b234d69c 100644
--- a/tests/spatial_ir/test_optimization.py
+++ b/tests/spatial_ir/test_optimization.py
@@ -1,5 +1,5 @@
-from spatialstencil.syntax.spatial_ir import irnodes as spa
-from spatialstencil.syntax.spatial_ir import parser, passes, canonicalization, copy_elimination
+from spada.syntax.spatial_ir import irnodes as spa
+from spada.syntax.spatial_ir import parser, passes, canonicalization, copy_elimination
 from typing import TypeVar
 import pytest
 
diff --git a/tests/spatial_ir/test_spatial_geometry.py b/tests/spatial_ir/test_spatial_geometry.py
index 8be1c8c2..227c5bfa 100644
--- a/tests/spatial_ir/test_spatial_geometry.py
+++ b/tests/spatial_ir/test_spatial_geometry.py
@@ -1,7 +1,7 @@
 import copy
 import unittest
 
-from spatialstencil.syntax.spatial_ir.grid_geometry import Rectangle, intersect_ranges, split_rectangle, split_rectangles, group_rectangles_by_domain
+from spada.syntax.spatial_ir.grid_geometry import Rectangle, intersect_ranges, split_rectangle, split_rectangles, group_rectangles_by_domain
 
 RectWithId = Rectangle[int]
 
diff --git a/tests/spatial_ir/test_spatial_ir_analysis.py b/tests/spatial_ir/test_spatial_ir_analysis.py
index c6701f7c..b86fc8e5 100644
--- a/tests/spatial_ir/test_spatial_ir_analysis.py
+++ b/tests/spatial_ir/test_spatial_ir_analysis.py
@@ -1,8 +1,8 @@
 import os
 import pytest
 import networkx as nx
-from spatialstencil.syntax.spatial_ir import irnodes as spa, analysis, parser, canonicalization, passes
-from spatialstencil.syntax.csl import tasks
+from spada.syntax.spatial_ir import irnodes as spa, analysis, parser, canonicalization, passes
+from spada.syntax.csl import tasks
 
 
 def test_completion_dag_simple():
diff --git a/tests/spatial_ir/test_spatial_ir_parser.py b/tests/spatial_ir/test_spatial_ir_parser.py
index acc6c054..9376bb30 100644
--- a/tests/spatial_ir/test_spatial_ir_parser.py
+++ b/tests/spatial_ir/test_spatial_ir_parser.py
@@ -1,4 +1,4 @@
-from spatialstencil.syntax.spatial_ir import irnodes as spast, parser
+from spada.syntax.spatial_ir import irnodes as spast, parser
 import os
 
 
diff --git a/tests/spatial_ir/test_spatial_ir_passes.py b/tests/spatial_ir/test_spatial_ir_passes.py
index 04c1c6b6..7895087c 100644
--- a/tests/spatial_ir/test_spatial_ir_passes.py
+++ b/tests/spatial_ir/test_spatial_ir_passes.py
@@ -1,5 +1,5 @@
 import pytest
-from spatialstencil.syntax.spatial_ir import irnodes as spir, canonicalization, parser, passes, copy_elimination
+from spada.syntax.spatial_ir import irnodes as spir, canonicalization, parser, passes, copy_elimination
 
 
 def test_canonicalize_nochange():
diff --git a/tests/spatial_ir/test_spatial_ir_schema.py b/tests/spatial_ir/test_spatial_ir_schema.py
index 0e7cd41e..0329040a 100644
--- a/tests/spatial_ir/test_spatial_ir_schema.py
+++ b/tests/spatial_ir/test_spatial_ir_schema.py
@@ -1,6 +1,6 @@
 import unittest
 
-from spatialstencil.syntax.spatial_ir.irnodes import Kernel
+from spada.syntax.spatial_ir.irnodes import Kernel
 
 
 class TestSpatialIR(unittest.TestCase):
diff --git a/tests/spatial_ir/test_task_recycling.py b/tests/spatial_ir/test_task_recycling.py
index 2fe2f9fe..9b026d8b 100644
--- a/tests/spatial_ir/test_task_recycling.py
+++ b/tests/spatial_ir/test_task_recycling.py
@@ -1,10 +1,10 @@
 import os
 import pytest
 
-from spatialstencil.lowering import spatial_ir_to_csl as s2c
-from spatialstencil.syntax.csl import constants, task_recycling, tasks as tdag
-from spatialstencil.syntax.spatial_ir import analysis, parser, passes
-from spatialstencil.syntax.spatial_ir.canonicalization import PEBlock
+from spada.lowering import spatial_ir_to_csl as s2c
+from spada.syntax.csl import constants, task_recycling, tasks as tdag
+from spada.syntax.spatial_ir import analysis, parser, passes
+from spada.syntax.spatial_ir.canonicalization import PEBlock
 
 
 def _load_sample_kernel():
diff --git a/tests/spatial_ir/test_task_recycling_codegen.py b/tests/spatial_ir/test_task_recycling_codegen.py
index 210e0f1e..dc023cf0 100644
--- a/tests/spatial_ir/test_task_recycling_codegen.py
+++ b/tests/spatial_ir/test_task_recycling_codegen.py
@@ -3,8 +3,8 @@
 
 import pytest
 
-from spatialstencil.lowering.spatial_ir_to_csl import lower_spatial_ir_to_csl
-from spatialstencil.syntax.spatial_ir import parser, passes
+from spada.lowering.spatial_ir_to_csl import lower_spatial_ir_to_csl
+from spada.syntax.spatial_ir import parser, passes
 
 _CSL_RUNTIME_TASK_RECYCLING_SAMPLES = os.path.join(
     os.path.dirname(__file__), '..', 'csl_runtime', 'samples')
diff --git a/tests/stencil_ir/test_lowering_stencil_to_spatial.py b/tests/stencil_ir/test_lowering_stencil_to_spatial.py
index d4b806ea..32de82d0 100644
--- a/tests/stencil_ir/test_lowering_stencil_to_spatial.py
+++ b/tests/stencil_ir/test_lowering_stencil_to_spatial.py
@@ -4,21 +4,21 @@
 
 import pytest
 
-from spatialstencil.cli.gt4py_to_spatial import lower_function, lower_gt4py_to_sptl
-from spatialstencil.lowering.stencil_to_spatial_routing import ChannelStrategy
-from spatialstencil.lowering.stencil_to_spatial_compute import HorizontalStencilTransformer
-from spatialstencil.lowering.stencil_to_spatial_dataflow import ProgramDataflow
-from spatialstencil.lowering.stencil_to_spatial_place import ProgramPlacement
-from spatialstencil.lowering.versioning import Versioning
-from spatialstencil.syntax.spatial_ir.grid_geometry import Rectangle
-from spatialstencil.syntax.stencil_ir import type_inference, parser
-from spatialstencil.syntax.stencil_ir.domain_collector import DomainCollector
+from spada.cli.gt4py_to_spatial import lower_function, lower_gt4py_to_sptl
+from spada.lowering.stencil_to_spatial_routing import ChannelStrategy
+from spada.lowering.stencil_to_spatial_compute import HorizontalStencilTransformer
+from spada.lowering.stencil_to_spatial_dataflow import ProgramDataflow
+from spada.lowering.stencil_to_spatial_place import ProgramPlacement
+from spada.lowering.versioning import Versioning
+from spada.syntax.spatial_ir.grid_geometry import Rectangle
+from spada.syntax.stencil_ir import type_inference, parser
+from spada.syntax.stencil_ir.domain_collector import DomainCollector
 
-from spatialstencil.syntax.stencil_ir.irnodes import *
-import spatialstencil.syntax.spatial_ir.irnodes as spa
+from spada.syntax.stencil_ir.irnodes import *
+import spada.syntax.spatial_ir.irnodes as spa
 
-from spatialstencil.lowering.stencil_to_spatial import lower_stencil_to_spatial
-from spatialstencil.syntax.stencil_ir.refactor_forward_backward_stencils import RefactorForwardBackwardStencils
+from spada.lowering.stencil_to_spatial import lower_stencil_to_spatial
+from spada.syntax.stencil_ir.refactor_forward_backward_stencils import RefactorForwardBackwardStencils
 
 
 class DummyProgramPlacement(ProgramPlacement):
@@ -229,7 +229,7 @@ def test_vadv():
 
 
 def test_gt4py_integration():
-    from spatialstencil.syntax.gt4py import parser as gt4py_parser
+    from spada.syntax.gt4py import parser as gt4py_parser
     
     gtfuncs = gt4py_parser.parse_file(str(Path(__file__).parent / Path('../../samples/gt4py_test_instances.py')))
 
diff --git a/tests/stencil_ir/test_stencil_ir_parser.py b/tests/stencil_ir/test_stencil_ir_parser.py
index 48a0d03a..d99c73cc 100644
--- a/tests/stencil_ir/test_stencil_ir_parser.py
+++ b/tests/stencil_ir/test_stencil_ir_parser.py
@@ -1,5 +1,5 @@
 import unittest
-from spatialstencil.syntax.stencil_ir import irnodes as sast, parser
+from spada.syntax.stencil_ir import irnodes as sast, parser
 import os
 
 
diff --git a/tests/stencil_ir/test_stencil_ir_passes.py b/tests/stencil_ir/test_stencil_ir_passes.py
index ee8683e7..3e072812 100644
--- a/tests/stencil_ir/test_stencil_ir_passes.py
+++ b/tests/stencil_ir/test_stencil_ir_passes.py
@@ -1,12 +1,12 @@
 import unittest
 from pathlib import Path
 
-from spatialstencil.syntax.stencil_ir import type_inference, parser, canonicalization, extent_inference, \
+from spada.syntax.stencil_ir import type_inference, parser, canonicalization, extent_inference, \
     domain_inference
-from spatialstencil.syntax.stencil_ir.irnodes import ScalarType, Program, Cartesian, Interval, Offset, Extent, \
+from spada.syntax.stencil_ir.irnodes import ScalarType, Program, Cartesian, Interval, Offset, Extent, \
     StatementBlock, MaterializeOp, ComputationBlock, ReturnOp
 
-from spatialstencil.syntax.stencil_ir.ssa import SSAVisitor
+from spada.syntax.stencil_ir.ssa import SSAVisitor
 
 class TestTypeInference(unittest.TestCase):
 
diff --git a/tests/stencil_ir/test_stencil_ir_schema.py b/tests/stencil_ir/test_stencil_ir_schema.py
index ebb2e586..ee88c6ee 100644
--- a/tests/stencil_ir/test_stencil_ir_schema.py
+++ b/tests/stencil_ir/test_stencil_ir_schema.py
@@ -1,5 +1,5 @@
 import unittest
-from spatialstencil.syntax.stencil_ir.irnodes import Program
+from spada.syntax.stencil_ir.irnodes import Program
 
 class TestStencilIR(unittest.TestCase):
     def test_validate_stencil_schema(self):
diff --git a/tests/utils/test_csl_benchmarking.py b/tests/utils/test_csl_benchmarking.py
index d75be712..79aa9fe2 100644
--- a/tests/utils/test_csl_benchmarking.py
+++ b/tests/utils/test_csl_benchmarking.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from spatialstencil.syntax.csl import benchmarking
+from spada.syntax.csl import benchmarking
 
 
 REPO_ROOT = Path(__file__).resolve().parents[2]
diff --git a/tests/utils/test_irnodes.py b/tests/utils/test_irnodes.py
index e02d5e9e..8547bf63 100644
--- a/tests/utils/test_irnodes.py
+++ b/tests/utils/test_irnodes.py
@@ -1,7 +1,7 @@
 import unittest
 from dataclasses import dataclass
-from spatialstencil.syntax.common.basenode import BaseNode
-from spatialstencil.syntax.common.visitor import IRNodeTransformer, IRNodeVisitor
+from spada.syntax.common.basenode import BaseNode
+from spada.syntax.common.visitor import IRNodeTransformer, IRNodeVisitor
 
 
 @dataclass
diff --git a/tests/utils/test_serialization.py b/tests/utils/test_serialization.py
index 29dded4d..92f565cd 100644
--- a/tests/utils/test_serialization.py
+++ b/tests/utils/test_serialization.py
@@ -8,7 +8,7 @@
 import enum
 from dataclasses import dataclass
 from typing import Union, List, Optional, Dict
-from spatialstencil.syntax.common.serialization import (DataclassEncoder, dataclass_decoder, save_to_json,
+from spada.syntax.common.serialization import (DataclassEncoder, dataclass_decoder, save_to_json,
                                                         load_from_json)
 
 
diff --git a/tests/utils/test_tree_matching.py b/tests/utils/test_tree_matching.py
index c82fca52..35ede2c0 100644
--- a/tests/utils/test_tree_matching.py
+++ b/tests/utils/test_tree_matching.py
@@ -3,19 +3,19 @@
 
 import unittest
 
-from spatialstencil.lowering.stencil_to_spatial_compute import HorizontalStencilTransformer
-from spatialstencil.lowering.stencil_to_spatial_dataflow import ProgramDataflow
-from spatialstencil.lowering.stencil_to_spatial_place import ProgramPlacement
-from spatialstencil.lowering.versioning import Versioning
-from spatialstencil.syntax.common.match_tree import TreeNode, TreeWildcard, MatchTree, MatchingBaseNode
-from spatialstencil.syntax.common.tree_matching import _match_pattern, PatternMatcher, PatternTransformer
+from spada.lowering.stencil_to_spatial_compute import HorizontalStencilTransformer
+from spada.lowering.stencil_to_spatial_dataflow import ProgramDataflow
+from spada.lowering.stencil_to_spatial_place import ProgramPlacement
+from spada.lowering.versioning import Versioning
+from spada.syntax.common.match_tree import TreeNode, TreeWildcard, MatchTree, MatchingBaseNode
+from spada.syntax.common.tree_matching import _match_pattern, PatternMatcher, PatternTransformer
 from typing import Tuple, List, TypeVar, Generic
 
-import spatialstencil.syntax.stencil_ir.irnodes as sast
-from spatialstencil.syntax.common.basenode import Wildcard
-from spatialstencil.syntax.common.types import ScalarType
-import spatialstencil.syntax.spatial_ir.irnodes as spa
-from spatialstencil.syntax.stencil_ir.domain_collector import DomainCollector
+import spada.syntax.stencil_ir.irnodes as sast
+from spada.syntax.common.basenode import Wildcard
+from spada.syntax.common.types import ScalarType
+import spada.syntax.spatial_ir.irnodes as spa
+from spada.syntax.stencil_ir.domain_collector import DomainCollector
 
 
 # Assume Tree, Node, Wildcard classes are already defined from previous translations.