spcl · tbennun · Oct 10, 2025 · Mar 29, 2026 · Mar 29, 2026
diff --git a/samples/benchmarks/vertical_advection_128_128_80.sptl b/samples/benchmarks/vertical_advection_128_128_80.sptl
diff --git a/samples/benchmarks/vertical_advection_16_16_4.sptl b/samples/benchmarks/vertical_advection_16_16_4.sptl
diff --git a/samples/benchmarks/vertical_advection_4_4_4.sptl b/samples/benchmarks/vertical_advection_4_4_4.sptl
diff --git a/spatialstencil/lowering/stencil_to_spatial_compute_fwbw.py b/spatialstencil/lowering/stencil_to_spatial_compute_fwbw.py
@@ -153,7 +153,18 @@ def visit_Subscript(self, node: sast.Subscript):
         assert node.subscript[0] == 0
         assert node.subscript[1] == 0
         z_offset = node.subscript[2]
-        array = self.placement.get_storage(node.value)
+        if z_offset != 0:
+            # For non-zero k-offsets in a FORWARD/BACKWARD stencil, the access
+            # targets the *accumulated* (final) field value at the neighbouring
+            # k-level — i.e. the value after every assignment at that level has
+            # completed.  Local SSA intermediates (e.g. fresh_d before Thomas
+            # elimination) only hold the correct value at the *current* k; using
+            # them at k±1 yields wrong (often uninitialized) results.
+            array = self.placement.get_accumulated_storage(node.value.name)
+            if array is None:
+                array = self.placement.get_storage(node.value)
+        else:
+            array = self.placement.get_storage(node.value)
         if isinstance(array[1], spa.ArrayType):
             if z_offset == 0:
                 access = self.iteration_variable.identifier

diff --git a/spatialstencil/lowering/stencil_to_spatial_place.py b/spatialstencil/lowering/stencil_to_spatial_place.py
@@ -161,6 +161,24 @@ def get_shift(self) -> tuple[int, int, int]:
         """
         return self.domain_shift
 
+    def get_accumulated_storage(self,
+                               name: str,
+                               offset: sast.Offset = sast.Offset.zero()) -> tuple[spa.Identifier, spa.ArrayType | spa.ScalarType] | None:
+        """Return program-scope (accumulated) storage for a variable by name.
+
+        Unlike get_storage(), this bypasses SSA-specific storage and resolves
+        through the program scope only, returning the storage associated with
+        the field's accumulated value — i.e. the final value after all
+        assignments at a given k-level have completed.  This is the correct
+        target for stencil accesses with a non-zero k-offset inside a
+        FORWARD or BACKWARD computation body.
+        """
+        if name in self._program_scope_fields:
+            identifier = self._program_scope_fields[name]
+            if offset in self._storage_map[identifier]:
+                return self._storage_map[identifier][offset]
+        return None
+
     def get_storage(self,
                     identifier: sast.Identifier,
                     offset: sast.Offset = sast.Offset.zero()) -> tuple[spa.Identifier, spa.ArrayType | spa.ScalarType] | None:

diff --git a/tests/csl_runtime/test_vadv.sh b/tests/csl_runtime/test_vadv.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+
+BNAME=vadv_sptl
+# Compile the spatial stencil program
+sptlc "$SCRIPT_DIR/../../samples/benchmarks/vertical_advection_4_4_4.sptl" $BNAME $*
+
+python <<EOF
+import numpy as np
+dtr_stage = np.random.rand(1).astype(np.float32)
+u_pos = np.random.rand(4, 4, 4).astype(np.float32)
+u_stage = np.random.rand(4, 4, 5).astype(np.float32)
+utens = np.random.rand(4, 4, 4).astype(np.float32)
+utens_stage = np.random.rand(4, 4, 4).astype(np.float32)
+wcon = np.random.rand(5, 4, 4).astype(np.float32)
+# Save the arrays to .npy files
+np.save('dtr_stage.npy', dtr_stage)
+np.save('u_pos.npy', u_pos)
+np.save('u_stage.npy', u_stage)
+np.save('utens.npy', utens)
+np.save('utens_stage.npy', utens_stage)
+np.save('wcon.npy', wcon)
+EOF
+
+# Run the compiled program with the Python runtime and the simulator
+timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spatialstencil/runtime/runtime.py" --benchmark $BNAME dtr_stage.npy u_pos.npy u_stage.npy utens.npy utens_stage.npy wcon.npy
+
+# Check if the output file matches the expected output
+python $SCRIPT_DIR/vertical_advection.py dtr_stage.npy u_pos.npy u_stage.npy utens.npy utens_stage.npy wcon.npy -o expected_out.npy
+python <<EOF
+import numpy as np
+# Load the arrays
+ref = np.load('expected_out.npy')
+output = np.load('OUT___kernel_out_0.npy')
+# Check if the output is correct
+if not np.allclose(output, ref, atol=1e-6, rtol=1e-5):
+    print("Test failed: Output does not match expected result.")
+    exit(1)
+else:
+    print("Test passed: Output matches expected result.")
+EOF
+
+# Clean up generated files
+rm -rf $BNAME
+rm -f OUT___kernel_out_0.npy expected_out.npy
+rm -f dtr_stage.npy u_pos.npy u_stage.npy utens.npy utens_stage.npy wcon.npy
diff --git a/tests/csl_runtime/vertical_advection.py b/tests/csl_runtime/vertical_advection.py
@@ -0,0 +1,94 @@
+import argparse
+import numpy as np
+import numpy.typing as npt
+
+# Adapted from NPBench: https://github.com/spcl/npbench/blob/main/npbench/benchmarks/weather_stencils/vadv/vadv_numpy.py
+
+# Sample constants
+BET_M = 0.5
+BET_P = 0.5
+
+
+# Adapted from https://github.com/GridTools/gt4py/blob/1caca893034a18d5df1522ed251486659f846589/tests/test_integration/stencil_definitions.py#L111
+def vadv(utens_stage, u_stage, wcon, u_pos, utens, dtr_stage):
+    I, J, K = utens_stage.shape[0], utens_stage.shape[1], utens_stage.shape[2]
+    ccol = np.ndarray((I, J, K), dtype=utens_stage.dtype)
+    dcol = np.ndarray((I, J, K), dtype=utens_stage.dtype)
+    data_col = np.ndarray((I, J), dtype=utens_stage.dtype)
+
+    for k in range(1):
+        gcv = 0.25 * (wcon[1:, :, k + 1] + wcon[:-1, :, k + 1])
+        cs = gcv * BET_M
+
+        ccol[:, :, k] = gcv * BET_P
+        bcol = dtr_stage - ccol[:, :, k]
+
+        # update the d column
+        correction_term = -cs * (u_stage[:, :, k + 1] - u_stage[:, :, k])
+        dcol[:, :, k] = (dtr_stage * u_pos[:, :, k] + utens[:, :, k] + utens_stage[:, :, k] + correction_term)
+
+        # Thomas forward
+        divided = 1.0 / bcol
+        ccol[:, :, k] = ccol[:, :, k] * divided
+        dcol[:, :, k] = dcol[:, :, k] * divided
+
+    for k in range(1, K - 1):
+        gav = -0.25 * (wcon[1:, :, k] + wcon[:-1, :, k])
+        gcv = 0.25 * (wcon[1:, :, k + 1] + wcon[:-1, :, k + 1])
+
+        as_ = gav * BET_M
+        cs = gcv * BET_M
+
+        acol = gav * BET_P
+        ccol[:, :, k] = gcv * BET_P
+        bcol = dtr_stage - acol - ccol[:, :, k]
+
+        # update the d column
+        correction_term = -as_ * (u_stage[:, :, k - 1] - u_stage[:, :, k]) - cs * (
+            u_stage[:, :, k + 1] - u_stage[:, :, k])
+        dcol[:, :, k] = (dtr_stage * u_pos[:, :, k] + utens[:, :, k] + utens_stage[:, :, k] + correction_term)
+
+        # Thomas forward
+        divided = 1.0 / (bcol - ccol[:, :, k - 1] * acol)
+        ccol[:, :, k] = ccol[:, :, k] * divided
+        dcol[:, :, k] = (dcol[:, :, k] - (dcol[:, :, k - 1]) * acol) * divided
+
+    for k in range(K - 1, K):
+        gav = -0.25 * (wcon[1:, :, k] + wcon[:-1, :, k])
+        as_ = gav * BET_M
+        acol = gav * BET_P
+        bcol = dtr_stage - acol
+
+        # update the d column
+        correction_term = -as_ * (u_stage[:, :, k - 1] - u_stage[:, :, k])
+        dcol[:, :, k] = (dtr_stage * u_pos[:, :, k] + utens[:, :, k] + utens_stage[:, :, k] + correction_term)
+
+        # Thomas forward
+        divided = 1.0 / (bcol - ccol[:, :, k - 1] * acol)
+        dcol[:, :, k] = (dcol[:, :, k] - (dcol[:, :, k - 1]) * acol) * divided
+
+    for k in range(K - 1, K - 2, -1):
+        datacol = dcol[:, :, k]
+        data_col[:] = datacol
+        utens_stage[:, :, k] = dtr_stage * (datacol - u_pos[:, :, k])
+
+    for k in range(K - 2, -1, -1):
+        datacol = dcol[:, :, k] - ccol[:, :, k] * data_col[:, :]
+        data_col[:] = datacol
+        utens_stage[:, :, k] = dtr_stage * (datacol - u_pos[:, :, k])
+
+    return utens_stage
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Vertical Advection Stencil")
+    parser.add_argument("inputs", nargs='+', type=str, help="Input file paths")
+    parser.add_argument("--output", "-o", type=str, default='vadv_out.npy', help="Output file path")
+    args = parser.parse_args()
+
+    inputs = [np.load(input_file) for input_file in args.inputs]
+    inputs[0] = inputs[0].item() # dtr_stage
+    dtr_stage, u_pos, u_stage, utens, utens_stage, wcon = inputs
+
+    result = vadv(utens_stage, u_stage, wcon, u_pos, utens, dtr_stage)
+    np.save(args.output, result)
diff --git a/tests/stencil_ir/test_lowering_stencil_to_spatial.py b/tests/stencil_ir/test_lowering_stencil_to_spatial.py
@@ -1,10 +1,12 @@
+import re
 import unittest
 from pathlib import Path
 from typing import Tuple
 
 import pytest
 
 from spatialstencil.cli.gt4py_to_spatial import lower_function, lower_gt4py_to_sptl
+from spatialstencil.lowering import gt4py_to_stencil_ir
 from spatialstencil.lowering.stencil_to_spatial_routing import ChannelStrategy
 from spatialstencil.lowering.stencil_to_spatial_compute import HorizontalStencilTransformer
 from spatialstencil.lowering.stencil_to_spatial_dataflow import ProgramDataflow
@@ -228,6 +230,39 @@ def test_vadv():
         assert subgrids_dont_overlap(spatial_program)
 
 
+def test_fwbw_koffset_uses_accumulated_storage():
+    """Regression: k±1 accesses in FORWARD/BACKWARD stencil bodies must use
+    accumulated (program-scope) storage, not SSA-versioned intermediates.
+
+    The Thomas forward sweep in vertical_advection needs dcol[k-1] equal to
+    the Thomas-eliminated value from the previous k-iteration.  The bug was
+    that visit_Subscript used the SSA-specific local array (e.g.
+    dcol_0_0_0#3) instead of the accumulated array (dcol_0_0_0), yielding
+    uninitialised reads for the first k in each interval.
+    """
+    from spatialstencil.syntax.gt4py import parser as gt4py_parser
+
+    gtfuncs = gt4py_parser.parse_file(str(
+        Path(__file__).parent / Path('../../samples/stencils.py')))
+    program = gtfuncs['vertical_advection']
+    irprogram = gt4py_to_stencil_ir.lower_gt4py_to_stencil_ir(program, domain=(4, 4, 4))
+    type_inference.infer_field_extents(irprogram)
+    type_inference.infer_field_domains(irprogram)
+    kernel = lower_stencil_to_spatial(irprogram)
+    ir = kernel.as_ir()
+
+    # Detect the bug: an SSA-versioned array name (name#N) directly followed
+    # by an array index containing a k-offset (e.g. [(k#3 - 1)]).
+    # Accumulated storage is printed without a version suffix, so any match
+    # here means a local intermediate is being read at a neighbouring k-level.
+    bad = re.findall(r'\w+#\d+\[\([^)]*-\s*\d+\)\]', ir)
+    assert not bad, (
+        "k-offset accesses to SSA-versioned (non-accumulated) storage found; "
+        "these should use the accumulated field instead:\n  "
+        + "\n  ".join(bad)
+    )
+
+
 def test_gt4py_integration():
     from spatialstencil.syntax.gt4py import parser as gt4py_parser
 
@@ -247,4 +282,5 @@ def test_gt4py_integration():
     test_vertical_stencil_finishes()
     test_scalar_arguments()
     test_vadv()
+    test_fwbw_koffset_uses_accumulated_storage()
     test_gt4py_integration()