diff --git a/samples/benchmarks/vertical_advection_128_128_80.sptl b/samples/benchmarks/vertical_advection_128_128_80.sptl
index 29a7ef02..ce174c05 100644
--- a/samples/benchmarks/vertical_advection_128_128_80.sptl
+++ b/samples/benchmarks/vertical_advection_128_128_80.sptl
@@ -10,86 +10,86 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 80>[128, 128] readonly
     f32[81] utens_0_0_0
     f32[81] utens_stage_0_0_0
     f32[81] wcon_0_0_0
-    f32[81] _refactored_wcon_1_0_0_0_0#1
-    f32[2] gcv_0_0_0
-    f32[2] _temp_0_0_0
-    f32[2] _temp_0_0_0#1
-    f32[2] cs_0_0_0
-    f32[2] ccol_0_0_0#1
-    f32[2] bcol_0_0_0
-    f32[2] correction_term_0_0_0
-    f32[2] _temp_0_0_0#2
-    f32[2] dcol_0_0_0#1
-    f32[2] _temp_0_0_0#3
-    f32[2] _temp_0_0_0#4
-    f32[2] _temp_0_0_0#5
-    f32[2] divided_0_0_0
-    f32[2] ccol_0_0_0#2
-    f32[2] dcol_0_0_0#2
-    f32[80] gav_0_0_m1
-    f32[80] gav_0_0_0
-    f32[80] _temp_0_0_0#6
-    f32[80] gcv_0_0_m1
-    f32[80] gcv_0_0_0#1
-    f32[80] _temp_0_0_0#7
-    f32[80] _temp_0_0_0#8
-    f32[80] as__0_0_m1
-    f32[80] as__0_0_0
-    f32[80] cs_0_0_m1
-    f32[80] cs_0_0_0#1
-    f32[80] acol_0_0_0
-    f32[80] ccol_0_0_m1
-    f32[80] ccol_0_0_0#3
-    f32[80] bcol_0_0_0#1
-    f32[80] _temp_0_0_0#9
-    f32[80] correction_term_0_0_m1
-    f32[80] correction_term_0_0_0#1
-    f32[80] _temp_0_0_0#10
-    f32[80] _temp_0_0_0#11
-    f32[80] _temp_0_0_0#12
-    f32[80] _temp_0_0_0#13
-    f32[80] dcol_0_0_m1
-    f32[80] dcol_0_0_0#3
-    f32[80] _temp_0_0_0#14
-    f32[80] _temp_0_0_0#15
-    f32[80] _temp_0_0_0#16
-    f32[80] divided_0_0_0#1
-    f32[80] _temp_0_0_0#17
-    f32[80] _temp_0_0_0#18
-    f32[80] ccol_0_0_0#4
-    f32[80] dcol_0_0_0#4
-    f32[80] _temp_0_0_0#19
-    f32[80] _temp_0_0_0#20
-    f32[81] gav_0_0_m1#1
-    f32[81] gav_0_0_0#1
-    f32[81] _temp_0_0_0#21
-    f32[81] as__0_0_m1#1
-    f32[81] as__0_0_0#1
-    f32[81] acol_0_0_0#1
-    f32[81] bcol_0_0_0#2
-    f32[81] correction_term_0_0_m1#1
-    f32[81] correction_term_0_0_0#2
-    f32[81] _temp_0_0_0#22
-    f32[81] dcol_0_0_m1#1
-    f32[81] dcol_0_0_0#5
-    f32[81] _temp_0_0_0#23
-    f32[81] _temp_0_0_0#24
-    f32[81] _temp_0_0_0#25
-    f32[81] divided_0_0_0#2
-    f32[81] _temp_0_0_0#26
-    f32[81] _temp_0_0_0#27
-    f32[81] dcol_0_0_0#6
-    f32[81] _temp_0_0_0#28
-    f32[81] _temp_0_0_0#29
-    f32[81] datacol_0_0_0
-    f32[81] data_col_0_0_0#1
-    f32[81] utens_stage_0_0_0#2
-    f32[81] _temp_0_0_0#30
-    f32[80] datacol_0_0_0#1
+    f32[81] _refactored_wcon_1_0_0_0_0#2
+    f32[2] gcv_0_0_0#1
+    f32[2] _temp_0_0_0#6
+    f32[2] _temp_0_0_0#7
+    f32[2] cs_0_0_0#1
+    f32[2] ccol_0_0_0#3
+    f32[2] bcol_0_0_0#1
+    f32[2] correction_term_0_0_0#1
+    f32[2] _temp_0_0_0#8
+    f32[2] dcol_0_0_0#3
+    f32[2] _temp_0_0_0#9
+    f32[2] _temp_0_0_0#10
+    f32[2] _temp_0_0_0#11
+    f32[2] divided_0_0_0#1
+    f32[2] ccol_0_0_0#4
+    f32[2] dcol_0_0_0#4
+    f32[80] gav_0_0_m1#1
+    f32[80] gav_0_0_0#1
+    f32[80] _temp_0_0_0#27
+    f32[80] gcv_0_0_m1#1
+    f32[80] gcv_0_0_0#3
+    f32[80] _temp_0_0_0#28
+    f32[80] _temp_0_0_0#29
+    f32[80] as__0_0_m1#1
+    f32[80] as__0_0_0#1
+    f32[80] cs_0_0_m1#1
+    f32[80] cs_0_0_0#3
+    f32[80] acol_0_0_0#1
+    f32[80] ccol_0_0_m1#1
+    f32[80] ccol_0_0_0#7
+    f32[80] bcol_0_0_0#3
+    f32[80] _temp_0_0_0#30
+    f32[80] correction_term_0_0_m1#1
+    f32[80] correction_term_0_0_0#3
     f32[80] _temp_0_0_0#31
-    f32[80] data_col_0_0_0#2
-    f32[80] utens_stage_0_0_0#3
     f32[80] _temp_0_0_0#32
+    f32[80] _temp_0_0_0#33
+    f32[80] _temp_0_0_0#34
+    f32[80] dcol_0_0_m1#1
+    f32[80] dcol_0_0_0#7
+    f32[80] _temp_0_0_0#35
+    f32[80] _temp_0_0_0#36
+    f32[80] _temp_0_0_0#37
+    f32[80] divided_0_0_0#3
+    f32[80] _temp_0_0_0#38
+    f32[80] _temp_0_0_0#39
+    f32[80] ccol_0_0_0#8
+    f32[80] dcol_0_0_0#8
+    f32[80] _temp_0_0_0#40
+    f32[80] _temp_0_0_0#41
+    f32[81] gav_0_0_m1#3
+    f32[81] gav_0_0_0#3
+    f32[81] _temp_0_0_0#51
+    f32[81] as__0_0_m1#3
+    f32[81] as__0_0_0#3
+    f32[81] acol_0_0_0#3
+    f32[81] bcol_0_0_0#5
+    f32[81] correction_term_0_0_m1#3
+    f32[81] correction_term_0_0_0#5
+    f32[81] _temp_0_0_0#52
+    f32[81] dcol_0_0_m1#3
+    f32[81] dcol_0_0_0#11
+    f32[81] _temp_0_0_0#53
+    f32[81] _temp_0_0_0#54
+    f32[81] _temp_0_0_0#55
+    f32[81] divided_0_0_0#5
+    f32[81] _temp_0_0_0#56
+    f32[81] _temp_0_0_0#57
+    f32[81] dcol_0_0_0#12
+    f32[81] _temp_0_0_0#58
+    f32[81] _temp_0_0_0#59
+    f32[81] datacol_0_0_0#1
+    f32[81] data_col_0_0_0#2
+    f32[81] utens_stage_0_0_0#3
+    f32[81] _temp_0_0_0#61
+    f32[80] datacol_0_0_0#3
+    f32[80] _temp_0_0_0#64
+    f32[80] data_col_0_0_0#4
+    f32[80] utens_stage_0_0_0#5
+    f32[80] _temp_0_0_0#65
   }
   place u16 i, u16 j in [2:129:2 , 0:128:1] {
     f32[81] utens_stage_0_0_0#1
@@ -102,7 +102,7 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 80>[128, 128] readonly
     f32[81] utens_0_0_0
     f32[81] utens_stage_0_0_0
     f32[81] wcon_0_0_0
-    f32[81] _refactored_wcon_1_0_0_0_0#1
+    f32[81] _refactored_wcon_1_0_0_0_0#3
     f32[2] gcv_0_0_0
     f32[2] _temp_0_0_0
     f32[2] _temp_0_0_0#1
@@ -120,68 +120,68 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 80>[128, 128] readonly
     f32[2] dcol_0_0_0#2
     f32[80] gav_0_0_m1
     f32[80] gav_0_0_0
-    f32[80] _temp_0_0_0#6
+    f32[80] _temp_0_0_0#12
     f32[80] gcv_0_0_m1
-    f32[80] gcv_0_0_0#1
-    f32[80] _temp_0_0_0#7
-    f32[80] _temp_0_0_0#8
+    f32[80] gcv_0_0_0#2
+    f32[80] _temp_0_0_0#13
+    f32[80] _temp_0_0_0#14
     f32[80] as__0_0_m1
     f32[80] as__0_0_0
     f32[80] cs_0_0_m1
-    f32[80] cs_0_0_0#1
+    f32[80] cs_0_0_0#2
     f32[80] acol_0_0_0
     f32[80] ccol_0_0_m1
-    f32[80] ccol_0_0_0#3
-    f32[80] bcol_0_0_0#1
-    f32[80] _temp_0_0_0#9
-    f32[80] correction_term_0_0_m1
-    f32[80] correction_term_0_0_0#1
-    f32[80] _temp_0_0_0#10
-    f32[80] _temp_0_0_0#11
-    f32[80] _temp_0_0_0#12
-    f32[80] _temp_0_0_0#13
-    f32[80] dcol_0_0_m1
-    f32[80] dcol_0_0_0#3
-    f32[80] _temp_0_0_0#14
+    f32[80] ccol_0_0_0#5
+    f32[80] bcol_0_0_0#2
     f32[80] _temp_0_0_0#15
+    f32[80] correction_term_0_0_m1
+    f32[80] correction_term_0_0_0#2
     f32[80] _temp_0_0_0#16
-    f32[80] divided_0_0_0#1
     f32[80] _temp_0_0_0#17
     f32[80] _temp_0_0_0#18
-    f32[80] ccol_0_0_0#4
-    f32[80] dcol_0_0_0#4
     f32[80] _temp_0_0_0#19
+    f32[80] dcol_0_0_m1
+    f32[80] dcol_0_0_0#5
     f32[80] _temp_0_0_0#20
-    f32[81] gav_0_0_m1#1
-    f32[81] gav_0_0_0#1
-    f32[81] _temp_0_0_0#21
-    f32[81] as__0_0_m1#1
-    f32[81] as__0_0_0#1
-    f32[81] acol_0_0_0#1
-    f32[81] bcol_0_0_0#2
-    f32[81] correction_term_0_0_m1#1
-    f32[81] correction_term_0_0_0#2
-    f32[81] _temp_0_0_0#22
-    f32[81] dcol_0_0_m1#1
-    f32[81] dcol_0_0_0#5
-    f32[81] _temp_0_0_0#23
-    f32[81] _temp_0_0_0#24
-    f32[81] _temp_0_0_0#25
-    f32[81] divided_0_0_0#2
-    f32[81] _temp_0_0_0#26
-    f32[81] _temp_0_0_0#27
-    f32[81] dcol_0_0_0#6
-    f32[81] _temp_0_0_0#28
-    f32[81] _temp_0_0_0#29
+    f32[80] _temp_0_0_0#21
+    f32[80] _temp_0_0_0#22
+    f32[80] divided_0_0_0#2
+    f32[80] _temp_0_0_0#23
+    f32[80] _temp_0_0_0#24
+    f32[80] ccol_0_0_0#6
+    f32[80] dcol_0_0_0#6
+    f32[80] _temp_0_0_0#25
+    f32[80] _temp_0_0_0#26
+    f32[81] gav_0_0_m1#2
+    f32[81] gav_0_0_0#2
+    f32[81] _temp_0_0_0#42
+    f32[81] as__0_0_m1#2
+    f32[81] as__0_0_0#2
+    f32[81] acol_0_0_0#2
+    f32[81] bcol_0_0_0#4
+    f32[81] correction_term_0_0_m1#2
+    f32[81] correction_term_0_0_0#4
+    f32[81] _temp_0_0_0#43
+    f32[81] dcol_0_0_m1#2
+    f32[81] dcol_0_0_0#9
+    f32[81] _temp_0_0_0#44
+    f32[81] _temp_0_0_0#45
+    f32[81] _temp_0_0_0#46
+    f32[81] divided_0_0_0#4
+    f32[81] _temp_0_0_0#47
+    f32[81] _temp_0_0_0#48
+    f32[81] dcol_0_0_0#10
+    f32[81] _temp_0_0_0#49
+    f32[81] _temp_0_0_0#50
     f32[81] datacol_0_0_0
     f32[81] data_col_0_0_0#1
     f32[81] utens_stage_0_0_0#2
-    f32[81] _temp_0_0_0#30
-    f32[80] datacol_0_0_0#1
-    f32[80] _temp_0_0_0#31
-    f32[80] data_col_0_0_0#2
-    f32[80] utens_stage_0_0_0#3
-    f32[80] _temp_0_0_0#32
+    f32[81] _temp_0_0_0#60
+    f32[80] datacol_0_0_0#2
+    f32[80] _temp_0_0_0#62
+    f32[80] data_col_0_0_0#3
+    f32[80] utens_stage_0_0_0#4
+    f32[80] _temp_0_0_0#63
   }
   place u16 i, u16 j in [3:129:2 , 0:128:1] {
     f32[81] utens_stage_0_0_0#1
@@ -194,7 +194,7 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 80>[128, 128] readonly
     f32[81] utens_0_0_0
     f32[81] utens_stage_0_0_0
     f32[81] wcon_0_0_0
-    f32[81] _refactored_wcon_1_0_0_0_0#1
+    f32[81] _refactored_wcon_1_0_0_0_0#3
     f32[2] gcv_0_0_0
     f32[2] _temp_0_0_0
     f32[2] _temp_0_0_0#1
@@ -212,68 +212,68 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 80>[128, 128] readonly
     f32[2] dcol_0_0_0#2
     f32[80] gav_0_0_m1
     f32[80] gav_0_0_0
-    f32[80] _temp_0_0_0#6
+    f32[80] _temp_0_0_0#12
     f32[80] gcv_0_0_m1
-    f32[80] gcv_0_0_0#1
-    f32[80] _temp_0_0_0#7
-    f32[80] _temp_0_0_0#8
+    f32[80] gcv_0_0_0#2
+    f32[80] _temp_0_0_0#13
+    f32[80] _temp_0_0_0#14
     f32[80] as__0_0_m1
     f32[80] as__0_0_0
     f32[80] cs_0_0_m1
-    f32[80] cs_0_0_0#1
+    f32[80] cs_0_0_0#2
     f32[80] acol_0_0_0
     f32[80] ccol_0_0_m1
-    f32[80] ccol_0_0_0#3
-    f32[80] bcol_0_0_0#1
-    f32[80] _temp_0_0_0#9
-    f32[80] correction_term_0_0_m1
-    f32[80] correction_term_0_0_0#1
-    f32[80] _temp_0_0_0#10
-    f32[80] _temp_0_0_0#11
-    f32[80] _temp_0_0_0#12
-    f32[80] _temp_0_0_0#13
-    f32[80] dcol_0_0_m1
-    f32[80] dcol_0_0_0#3
-    f32[80] _temp_0_0_0#14
+    f32[80] ccol_0_0_0#5
+    f32[80] bcol_0_0_0#2
     f32[80] _temp_0_0_0#15
+    f32[80] correction_term_0_0_m1
+    f32[80] correction_term_0_0_0#2
     f32[80] _temp_0_0_0#16
-    f32[80] divided_0_0_0#1
     f32[80] _temp_0_0_0#17
     f32[80] _temp_0_0_0#18
-    f32[80] ccol_0_0_0#4
-    f32[80] dcol_0_0_0#4
     f32[80] _temp_0_0_0#19
+    f32[80] dcol_0_0_m1
+    f32[80] dcol_0_0_0#5
     f32[80] _temp_0_0_0#20
-    f32[81] gav_0_0_m1#1
-    f32[81] gav_0_0_0#1
-    f32[81] _temp_0_0_0#21
-    f32[81] as__0_0_m1#1
-    f32[81] as__0_0_0#1
-    f32[81] acol_0_0_0#1
-    f32[81] bcol_0_0_0#2
-    f32[81] correction_term_0_0_m1#1
-    f32[81] correction_term_0_0_0#2
-    f32[81] _temp_0_0_0#22
-    f32[81] dcol_0_0_m1#1
-    f32[81] dcol_0_0_0#5
-    f32[81] _temp_0_0_0#23
-    f32[81] _temp_0_0_0#24
-    f32[81] _temp_0_0_0#25
-    f32[81] divided_0_0_0#2
-    f32[81] _temp_0_0_0#26
-    f32[81] _temp_0_0_0#27
-    f32[81] dcol_0_0_0#6
-    f32[81] _temp_0_0_0#28
-    f32[81] _temp_0_0_0#29
+    f32[80] _temp_0_0_0#21
+    f32[80] _temp_0_0_0#22
+    f32[80] divided_0_0_0#2
+    f32[80] _temp_0_0_0#23
+    f32[80] _temp_0_0_0#24
+    f32[80] ccol_0_0_0#6
+    f32[80] dcol_0_0_0#6
+    f32[80] _temp_0_0_0#25
+    f32[80] _temp_0_0_0#26
+    f32[81] gav_0_0_m1#2
+    f32[81] gav_0_0_0#2
+    f32[81] _temp_0_0_0#42
+    f32[81] as__0_0_m1#2
+    f32[81] as__0_0_0#2
+    f32[81] acol_0_0_0#2
+    f32[81] bcol_0_0_0#4
+    f32[81] correction_term_0_0_m1#2
+    f32[81] correction_term_0_0_0#4
+    f32[81] _temp_0_0_0#43
+    f32[81] dcol_0_0_m1#2
+    f32[81] dcol_0_0_0#9
+    f32[81] _temp_0_0_0#44
+    f32[81] _temp_0_0_0#45
+    f32[81] _temp_0_0_0#46
+    f32[81] divided_0_0_0#4
+    f32[81] _temp_0_0_0#47
+    f32[81] _temp_0_0_0#48
+    f32[81] dcol_0_0_0#10
+    f32[81] _temp_0_0_0#49
+    f32[81] _temp_0_0_0#50
     f32[81] datacol_0_0_0
     f32[81] data_col_0_0_0#1
     f32[81] utens_stage_0_0_0#2
-    f32[81] _temp_0_0_0#30
-    f32[80] datacol_0_0_0#1
-    f32[80] _temp_0_0_0#31
-    f32[80] data_col_0_0_0#2
-    f32[80] utens_stage_0_0_0#3
-    f32[80] _temp_0_0_0#32
+    f32[81] _temp_0_0_0#60
+    f32[80] datacol_0_0_0#2
+    f32[80] _temp_0_0_0#62
+    f32[80] data_col_0_0_0#3
+    f32[80] utens_stage_0_0_0#4
+    f32[80] _temp_0_0_0#63
   }
   place u16 i#1, u16 j#1 in [129:130:2 , 0:128:1] {
     f32[81] wcon_0_0_0
@@ -283,43 +283,43 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 80>[128, 128] readonly
   }
   dataflow u16 i#5, u16 j#5 in [129:130:2 , 0:128:1] {
     stream<f32> _stream_wcon#1 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 0
-    }
+  hops = [(-1, 0)], 
+  channel = 0
+}
     stream<f32> _stream_wcon#2 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 1
-    }
+  hops = [(-1, 0)], 
+  channel = 1
+}
   }
   dataflow u16 i#5, u16 j#5 in [2:129:2 , 0:128:1] {
-    stream<f32> _stream_wcon#1 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_wcon#2 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 1
-    }
+    stream<f32> _stream_wcon#3 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 2
+}
+    stream<f32> _stream_wcon#4 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 3
+}
   }
   dataflow u16 i#5, u16 j#5 in [3:129:2 , 0:128:1] {
-    stream<f32> _stream_wcon#1 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_wcon#2 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 1
-    }
+    stream<f32> _stream_wcon#3 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 2
+}
+    stream<f32> _stream_wcon#4 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 3
+}
   }
   dataflow u16 i#5, u16 j#5 in [1:2:2 , 0:128:1] {
-    stream<f32> _stream_wcon#1 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_wcon#2 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 1
-    }
+    stream<f32> _stream_wcon#5 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 4
+}
+    stream<f32> _stream_wcon#6 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 5
+}
   }
   compute u16 i#2, u16 j#2 in [1:2:2 , 0:128:1] {
     await receive(u_pos_0_0_0, _u_pos[i#2, j#2])
@@ -328,104 +328,104 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 80>[128, 128] readonly
     await receive(utens_stage_0_0_0, _utens_stage[i#2, j#2])
     await receive(wcon_0_0_0, _wcon[i#2, j#2])
     awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:81:1], receive(_stream_wcon#2) {
-      _refactored_wcon_1_0_0_0_0#1[k] = x
+    completion _recv_comp = foreach i32 k, f32 x in [0:81:1], receive(_stream_wcon#6) {
+      _refactored_wcon_1_0_0_0_0#2[k] = x
     }
     await _recv_comp
     await map i32 k#1 in [0:80:1] {
-      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#1[k#1]
+      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#2[k#1]
     }
     awaitall
     for i32 k#2 in [0:1:1] {
-      _temp_0_0_0[k#2] = _refactored_wcon_1_0_0_0_0[(k#2 + 1)]
-      _temp_0_0_0#1[k#2] = (_temp_0_0_0[k#2] + wcon_0_0_0[(k#2 + 1)])
-      gcv_0_0_0[k#2] = (0.25 * _temp_0_0_0#1[k#2])
-      cs_0_0_0[k#2] = (gcv_0_0_0[k#2] * 0.5)
-      ccol_0_0_0#1[k#2] = (gcv_0_0_0[k#2] * 0.5)
-      bcol_0_0_0[k#2] = (_dtr_stage - ccol_0_0_0#1[k#2])
-      _temp_0_0_0#2[k#2] = (u_stage_0_0_0[(k#2 + 1)] - u_stage_0_0_0[k#2])
-      correction_term_0_0_0[k#2] = (-cs_0_0_0[k#2] * _temp_0_0_0#2[k#2])
-      _temp_0_0_0#3[k#2] = (_dtr_stage * u_pos_0_0_0[k#2])
-      _temp_0_0_0#4[k#2] = (_temp_0_0_0#3[k#2] + utens_0_0_0[k#2])
-      _temp_0_0_0#5[k#2] = (_temp_0_0_0#4[k#2] + utens_stage_0_0_0[k#2])
-      dcol_0_0_0#1[k#2] = (_temp_0_0_0#5[k#2] + correction_term_0_0_0[k#2])
-      divided_0_0_0[k#2] = (1.0 / bcol_0_0_0[k#2])
-      ccol_0_0_0#2[k#2] = (ccol_0_0_0#1[k#2] * divided_0_0_0[k#2])
-      dcol_0_0_0#2[k#2] = (dcol_0_0_0#1[k#2] * divided_0_0_0[k#2])
-      ccol_0_0_0[k#2] = ccol_0_0_0#2[k#2]
-      dcol_0_0_0[k#2] = dcol_0_0_0#2[k#2]
+      _temp_0_0_0#6[k#2] = _refactored_wcon_1_0_0_0_0[(k#2 + 1)]
+      _temp_0_0_0#7[k#2] = (_temp_0_0_0#6[k#2] + wcon_0_0_0[(k#2 + 1)])
+      gcv_0_0_0#1[k#2] = (0.25 * _temp_0_0_0#7[k#2])
+      cs_0_0_0#1[k#2] = (gcv_0_0_0#1[k#2] * 0.5)
+      ccol_0_0_0#3[k#2] = (gcv_0_0_0#1[k#2] * 0.5)
+      bcol_0_0_0#1[k#2] = (_dtr_stage - ccol_0_0_0#3[k#2])
+      _temp_0_0_0#8[k#2] = (u_stage_0_0_0[(k#2 + 1)] - u_stage_0_0_0[k#2])
+      correction_term_0_0_0#1[k#2] = (-cs_0_0_0#1[k#2] * _temp_0_0_0#8[k#2])
+      _temp_0_0_0#9[k#2] = (_dtr_stage * u_pos_0_0_0[k#2])
+      _temp_0_0_0#10[k#2] = (_temp_0_0_0#9[k#2] + utens_0_0_0[k#2])
+      _temp_0_0_0#11[k#2] = (_temp_0_0_0#10[k#2] + utens_stage_0_0_0[k#2])
+      dcol_0_0_0#3[k#2] = (_temp_0_0_0#11[k#2] + correction_term_0_0_0#1[k#2])
+      divided_0_0_0#1[k#2] = (1.0 / bcol_0_0_0#1[k#2])
+      ccol_0_0_0#4[k#2] = (ccol_0_0_0#3[k#2] * divided_0_0_0#1[k#2])
+      dcol_0_0_0#4[k#2] = (dcol_0_0_0#3[k#2] * divided_0_0_0#1[k#2])
+      ccol_0_0_0[k#2] = ccol_0_0_0#4[k#2]
+      dcol_0_0_0[k#2] = dcol_0_0_0#4[k#2]
     }
     awaitall
     for i32 k#3 in [1:79:1] {
-      _temp_0_0_0#6[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
-      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#6[k#3])
-      _temp_0_0_0#7[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
-      _temp_0_0_0#8[k#3] = (_temp_0_0_0#7[k#3] + wcon_0_0_0[(k#3 + 1)])
-      gcv_0_0_0#1[k#3] = (0.25 * _temp_0_0_0#8[k#3])
-      as__0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      cs_0_0_0#1[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
-      acol_0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      ccol_0_0_0#3[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
-      _temp_0_0_0#9[k#3] = (_dtr_stage - acol_0_0_0[k#3])
-      bcol_0_0_0#1[k#3] = (_temp_0_0_0#9[k#3] - ccol_0_0_0#3[k#3])
-      _temp_0_0_0#10[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#11[k#3] = (cs_0_0_0#1[k#3] * _temp_0_0_0#10[k#3])
-      _temp_0_0_0#12[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#13[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#12[k#3])
-      correction_term_0_0_0#1[k#3] = (_temp_0_0_0#13[k#3] - _temp_0_0_0#11[k#3])
-      _temp_0_0_0#14[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
-      _temp_0_0_0#15[k#3] = (_temp_0_0_0#14[k#3] + utens_0_0_0[k#3])
-      _temp_0_0_0#16[k#3] = (_temp_0_0_0#15[k#3] + utens_stage_0_0_0[k#3])
-      dcol_0_0_0#3[k#3] = (_temp_0_0_0#16[k#3] + correction_term_0_0_0#1[k#3])
-      _temp_0_0_0#17[k#3] = (ccol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#18[k#3] = (bcol_0_0_0#1[k#3] - _temp_0_0_0#17[k#3])
-      divided_0_0_0#1[k#3] = (1.0 / _temp_0_0_0#18[k#3])
-      ccol_0_0_0#4[k#3] = (ccol_0_0_0#3[k#3] * divided_0_0_0#1[k#3])
-      _temp_0_0_0#19[k#3] = (dcol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#20[k#3] = (dcol_0_0_0#3[k#3] - _temp_0_0_0#19[k#3])
-      dcol_0_0_0#4[k#3] = (_temp_0_0_0#20[k#3] * divided_0_0_0#1[k#3])
-      ccol_0_0_0[k#3] = ccol_0_0_0#4[k#3]
-      dcol_0_0_0[k#3] = dcol_0_0_0#4[k#3]
+      _temp_0_0_0#27[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
+      gav_0_0_0#1[k#3] = (-0.25 * _temp_0_0_0#27[k#3])
+      _temp_0_0_0#28[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
+      _temp_0_0_0#29[k#3] = (_temp_0_0_0#28[k#3] + wcon_0_0_0[(k#3 + 1)])
+      gcv_0_0_0#3[k#3] = (0.25 * _temp_0_0_0#29[k#3])
+      as__0_0_0#1[k#3] = (gav_0_0_0#1[k#3] * 0.5)
+      cs_0_0_0#3[k#3] = (gcv_0_0_0#3[k#3] * 0.5)
+      acol_0_0_0#1[k#3] = (gav_0_0_0#1[k#3] * 0.5)
+      ccol_0_0_0#7[k#3] = (gcv_0_0_0#3[k#3] * 0.5)
+      _temp_0_0_0#30[k#3] = (_dtr_stage - acol_0_0_0#1[k#3])
+      bcol_0_0_0#3[k#3] = (_temp_0_0_0#30[k#3] - ccol_0_0_0#7[k#3])
+      _temp_0_0_0#31[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#32[k#3] = (cs_0_0_0#3[k#3] * _temp_0_0_0#31[k#3])
+      _temp_0_0_0#33[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#34[k#3] = (-as__0_0_0#1[k#3] * _temp_0_0_0#33[k#3])
+      correction_term_0_0_0#3[k#3] = (_temp_0_0_0#34[k#3] - _temp_0_0_0#32[k#3])
+      _temp_0_0_0#35[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
+      _temp_0_0_0#36[k#3] = (_temp_0_0_0#35[k#3] + utens_0_0_0[k#3])
+      _temp_0_0_0#37[k#3] = (_temp_0_0_0#36[k#3] + utens_stage_0_0_0[k#3])
+      dcol_0_0_0#7[k#3] = (_temp_0_0_0#37[k#3] + correction_term_0_0_0#3[k#3])
+      _temp_0_0_0#38[k#3] = (ccol_0_0_0[(k#3 - 1)] * acol_0_0_0#1[k#3])
+      _temp_0_0_0#39[k#3] = (bcol_0_0_0#3[k#3] - _temp_0_0_0#38[k#3])
+      divided_0_0_0#3[k#3] = (1.0 / _temp_0_0_0#39[k#3])
+      ccol_0_0_0#8[k#3] = (ccol_0_0_0#7[k#3] * divided_0_0_0#3[k#3])
+      _temp_0_0_0#40[k#3] = (dcol_0_0_0[(k#3 - 1)] * acol_0_0_0#1[k#3])
+      _temp_0_0_0#41[k#3] = (dcol_0_0_0#7[k#3] - _temp_0_0_0#40[k#3])
+      dcol_0_0_0#8[k#3] = (_temp_0_0_0#41[k#3] * divided_0_0_0#3[k#3])
+      ccol_0_0_0[k#3] = ccol_0_0_0#8[k#3]
+      dcol_0_0_0[k#3] = dcol_0_0_0#8[k#3]
     }
     awaitall
     for i32 k#4 in [79:80:1] {
-      _temp_0_0_0#21[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
-      gav_0_0_0#1[k#4] = (-0.25 * _temp_0_0_0#21[k#4])
-      as__0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      acol_0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      bcol_0_0_0#2[k#4] = (_dtr_stage - acol_0_0_0#1[k#4])
-      _temp_0_0_0#22[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
-      correction_term_0_0_0#2[k#4] = (-as__0_0_0#1[k#4] * _temp_0_0_0#22[k#4])
-      _temp_0_0_0#23[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
-      _temp_0_0_0#24[k#4] = (_temp_0_0_0#23[k#4] + utens_0_0_0[k#4])
-      _temp_0_0_0#25[k#4] = (_temp_0_0_0#24[k#4] + utens_stage_0_0_0[k#4])
-      dcol_0_0_0#5[k#4] = (_temp_0_0_0#25[k#4] + correction_term_0_0_0#2[k#4])
-      _temp_0_0_0#26[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#27[k#4] = (bcol_0_0_0#2[k#4] - _temp_0_0_0#26[k#4])
-      divided_0_0_0#2[k#4] = (1.0 / _temp_0_0_0#27[k#4])
-      _temp_0_0_0#28[k#4] = (dcol_0_0_0#5[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#29[k#4] = (dcol_0_0_0#5[k#4] - _temp_0_0_0#28[k#4])
-      dcol_0_0_0#6[k#4] = (_temp_0_0_0#29[k#4] * divided_0_0_0#2[k#4])
-      dcol_0_0_0[k#4] = dcol_0_0_0#6[k#4]
+      _temp_0_0_0#51[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
+      gav_0_0_0#3[k#4] = (-0.25 * _temp_0_0_0#51[k#4])
+      as__0_0_0#3[k#4] = (gav_0_0_0#3[k#4] * 0.5)
+      acol_0_0_0#3[k#4] = (gav_0_0_0#3[k#4] * 0.5)
+      bcol_0_0_0#5[k#4] = (_dtr_stage - acol_0_0_0#3[k#4])
+      _temp_0_0_0#52[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
+      correction_term_0_0_0#5[k#4] = (-as__0_0_0#3[k#4] * _temp_0_0_0#52[k#4])
+      _temp_0_0_0#53[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
+      _temp_0_0_0#54[k#4] = (_temp_0_0_0#53[k#4] + utens_0_0_0[k#4])
+      _temp_0_0_0#55[k#4] = (_temp_0_0_0#54[k#4] + utens_stage_0_0_0[k#4])
+      dcol_0_0_0#11[k#4] = (_temp_0_0_0#55[k#4] + correction_term_0_0_0#5[k#4])
+      _temp_0_0_0#56[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#3[k#4])
+      _temp_0_0_0#57[k#4] = (bcol_0_0_0#5[k#4] - _temp_0_0_0#56[k#4])
+      divided_0_0_0#5[k#4] = (1.0 / _temp_0_0_0#57[k#4])
+      _temp_0_0_0#58[k#4] = (dcol_0_0_0[(k#4 - 1)] * acol_0_0_0#3[k#4])
+      _temp_0_0_0#59[k#4] = (dcol_0_0_0#11[k#4] - _temp_0_0_0#58[k#4])
+      dcol_0_0_0#12[k#4] = (_temp_0_0_0#59[k#4] * divided_0_0_0#5[k#4])
+      dcol_0_0_0[k#4] = dcol_0_0_0#12[k#4]
     }
     awaitall
     for i32 k#5 in [79:78:-1] {
-      datacol_0_0_0[k#5] = dcol_0_0_0[k#5]
-      data_col_0_0_0#1[k#5] = datacol_0_0_0[k#5]
-      _temp_0_0_0#30[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
-      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#30[k#5])
-      data_col_0_0_0[k#5] = data_col_0_0_0#1[k#5]
-      utens_stage_0_0_0#1[k#5] = utens_stage_0_0_0#2[k#5]
+      datacol_0_0_0#1[k#5] = dcol_0_0_0[k#5]
+      data_col_0_0_0#2[k#5] = datacol_0_0_0#1[k#5]
+      _temp_0_0_0#61[k#5] = (datacol_0_0_0#1[k#5] - u_pos_0_0_0[k#5])
+      utens_stage_0_0_0#3[k#5] = (_dtr_stage * _temp_0_0_0#61[k#5])
+      data_col_0_0_0[k#5] = data_col_0_0_0#2[k#5]
+      utens_stage_0_0_0#1[k#5] = utens_stage_0_0_0#3[k#5]
     }
     awaitall
     for i32 k#6 in [78:-1:-1] {
-      _temp_0_0_0#31[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
-      datacol_0_0_0#1[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#31[k#6])
-      data_col_0_0_0#2[k#6] = datacol_0_0_0#1[k#6]
-      _temp_0_0_0#32[k#6] = (datacol_0_0_0#1[k#6] - u_pos_0_0_0[k#6])
-      utens_stage_0_0_0#3[k#6] = (_dtr_stage * _temp_0_0_0#32[k#6])
-      data_col_0_0_0[k#6] = data_col_0_0_0#2[k#6]
-      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#3[k#6]
+      _temp_0_0_0#64[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
+      datacol_0_0_0#3[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#64[k#6])
+      data_col_0_0_0#4[k#6] = datacol_0_0_0#3[k#6]
+      _temp_0_0_0#65[k#6] = (datacol_0_0_0#3[k#6] - u_pos_0_0_0[k#6])
+      utens_stage_0_0_0#5[k#6] = (_dtr_stage * _temp_0_0_0#65[k#6])
+      data_col_0_0_0[k#6] = data_col_0_0_0#4[k#6]
+      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#5[k#6]
     }
     awaitall
     await send(utens_stage_0_0_0#1, __kernel_out_0[(i#2 - 1), (j#2 - 0)])
@@ -438,14 +438,14 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 80>[128, 128] readonly
     await receive(utens_stage_0_0_0, _utens_stage[i#2, j#2])
     await receive(wcon_0_0_0, _wcon[i#2, j#2])
     awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:81:1], receive(_stream_wcon#1) {
-      _refactored_wcon_1_0_0_0_0#1[k] = x
+    completion _recv_comp = foreach i32 k, f32 x in [0:81:1], receive(_stream_wcon#3) {
+      _refactored_wcon_1_0_0_0_0#3[k] = x
     }
-    completion _send_comp = send(wcon_0_0_0, _stream_wcon#2)
+    completion _send_comp = send(wcon_0_0_0, _stream_wcon#4)
     await _send_comp
     await _recv_comp
     await map i32 k#1 in [0:80:1] {
-      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#1[k#1]
+      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#3[k#1]
     }
     awaitall
     for i32 k#2 in [0:1:1] {
@@ -469,75 +469,75 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 80>[128, 128] readonly
     }
     awaitall
     for i32 k#3 in [1:79:1] {
-      _temp_0_0_0#6[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
-      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#6[k#3])
-      _temp_0_0_0#7[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
-      _temp_0_0_0#8[k#3] = (_temp_0_0_0#7[k#3] + wcon_0_0_0[(k#3 + 1)])
-      gcv_0_0_0#1[k#3] = (0.25 * _temp_0_0_0#8[k#3])
+      _temp_0_0_0#12[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
+      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#12[k#3])
+      _temp_0_0_0#13[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
+      _temp_0_0_0#14[k#3] = (_temp_0_0_0#13[k#3] + wcon_0_0_0[(k#3 + 1)])
+      gcv_0_0_0#2[k#3] = (0.25 * _temp_0_0_0#14[k#3])
       as__0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      cs_0_0_0#1[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
+      cs_0_0_0#2[k#3] = (gcv_0_0_0#2[k#3] * 0.5)
       acol_0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      ccol_0_0_0#3[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
-      _temp_0_0_0#9[k#3] = (_dtr_stage - acol_0_0_0[k#3])
-      bcol_0_0_0#1[k#3] = (_temp_0_0_0#9[k#3] - ccol_0_0_0#3[k#3])
-      _temp_0_0_0#10[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#11[k#3] = (cs_0_0_0#1[k#3] * _temp_0_0_0#10[k#3])
-      _temp_0_0_0#12[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#13[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#12[k#3])
-      correction_term_0_0_0#1[k#3] = (_temp_0_0_0#13[k#3] - _temp_0_0_0#11[k#3])
-      _temp_0_0_0#14[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
-      _temp_0_0_0#15[k#3] = (_temp_0_0_0#14[k#3] + utens_0_0_0[k#3])
-      _temp_0_0_0#16[k#3] = (_temp_0_0_0#15[k#3] + utens_stage_0_0_0[k#3])
-      dcol_0_0_0#3[k#3] = (_temp_0_0_0#16[k#3] + correction_term_0_0_0#1[k#3])
-      _temp_0_0_0#17[k#3] = (ccol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#18[k#3] = (bcol_0_0_0#1[k#3] - _temp_0_0_0#17[k#3])
-      divided_0_0_0#1[k#3] = (1.0 / _temp_0_0_0#18[k#3])
-      ccol_0_0_0#4[k#3] = (ccol_0_0_0#3[k#3] * divided_0_0_0#1[k#3])
-      _temp_0_0_0#19[k#3] = (dcol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#20[k#3] = (dcol_0_0_0#3[k#3] - _temp_0_0_0#19[k#3])
-      dcol_0_0_0#4[k#3] = (_temp_0_0_0#20[k#3] * divided_0_0_0#1[k#3])
-      ccol_0_0_0[k#3] = ccol_0_0_0#4[k#3]
-      dcol_0_0_0[k#3] = dcol_0_0_0#4[k#3]
+      ccol_0_0_0#5[k#3] = (gcv_0_0_0#2[k#3] * 0.5)
+      _temp_0_0_0#15[k#3] = (_dtr_stage - acol_0_0_0[k#3])
+      bcol_0_0_0#2[k#3] = (_temp_0_0_0#15[k#3] - ccol_0_0_0#5[k#3])
+      _temp_0_0_0#16[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#17[k#3] = (cs_0_0_0#2[k#3] * _temp_0_0_0#16[k#3])
+      _temp_0_0_0#18[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#19[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#18[k#3])
+      correction_term_0_0_0#2[k#3] = (_temp_0_0_0#19[k#3] - _temp_0_0_0#17[k#3])
+      _temp_0_0_0#20[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
+      _temp_0_0_0#21[k#3] = (_temp_0_0_0#20[k#3] + utens_0_0_0[k#3])
+      _temp_0_0_0#22[k#3] = (_temp_0_0_0#21[k#3] + utens_stage_0_0_0[k#3])
+      dcol_0_0_0#5[k#3] = (_temp_0_0_0#22[k#3] + correction_term_0_0_0#2[k#3])
+      _temp_0_0_0#23[k#3] = (ccol_0_0_0[(k#3 - 1)] * acol_0_0_0[k#3])
+      _temp_0_0_0#24[k#3] = (bcol_0_0_0#2[k#3] - _temp_0_0_0#23[k#3])
+      divided_0_0_0#2[k#3] = (1.0 / _temp_0_0_0#24[k#3])
+      ccol_0_0_0#6[k#3] = (ccol_0_0_0#5[k#3] * divided_0_0_0#2[k#3])
+      _temp_0_0_0#25[k#3] = (dcol_0_0_0[(k#3 - 1)] * acol_0_0_0[k#3])
+      _temp_0_0_0#26[k#3] = (dcol_0_0_0#5[k#3] - _temp_0_0_0#25[k#3])
+      dcol_0_0_0#6[k#3] = (_temp_0_0_0#26[k#3] * divided_0_0_0#2[k#3])
+      ccol_0_0_0[k#3] = ccol_0_0_0#6[k#3]
+      dcol_0_0_0[k#3] = dcol_0_0_0#6[k#3]
     }
     awaitall
     for i32 k#4 in [79:80:1] {
-      _temp_0_0_0#21[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
-      gav_0_0_0#1[k#4] = (-0.25 * _temp_0_0_0#21[k#4])
-      as__0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      acol_0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      bcol_0_0_0#2[k#4] = (_dtr_stage - acol_0_0_0#1[k#4])
-      _temp_0_0_0#22[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
-      correction_term_0_0_0#2[k#4] = (-as__0_0_0#1[k#4] * _temp_0_0_0#22[k#4])
-      _temp_0_0_0#23[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
-      _temp_0_0_0#24[k#4] = (_temp_0_0_0#23[k#4] + utens_0_0_0[k#4])
-      _temp_0_0_0#25[k#4] = (_temp_0_0_0#24[k#4] + utens_stage_0_0_0[k#4])
-      dcol_0_0_0#5[k#4] = (_temp_0_0_0#25[k#4] + correction_term_0_0_0#2[k#4])
-      _temp_0_0_0#26[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#27[k#4] = (bcol_0_0_0#2[k#4] - _temp_0_0_0#26[k#4])
-      divided_0_0_0#2[k#4] = (1.0 / _temp_0_0_0#27[k#4])
-      _temp_0_0_0#28[k#4] = (dcol_0_0_0#5[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#29[k#4] = (dcol_0_0_0#5[k#4] - _temp_0_0_0#28[k#4])
-      dcol_0_0_0#6[k#4] = (_temp_0_0_0#29[k#4] * divided_0_0_0#2[k#4])
-      dcol_0_0_0[k#4] = dcol_0_0_0#6[k#4]
+      _temp_0_0_0#42[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
+      gav_0_0_0#2[k#4] = (-0.25 * _temp_0_0_0#42[k#4])
+      as__0_0_0#2[k#4] = (gav_0_0_0#2[k#4] * 0.5)
+      acol_0_0_0#2[k#4] = (gav_0_0_0#2[k#4] * 0.5)
+      bcol_0_0_0#4[k#4] = (_dtr_stage - acol_0_0_0#2[k#4])
+      _temp_0_0_0#43[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
+      correction_term_0_0_0#4[k#4] = (-as__0_0_0#2[k#4] * _temp_0_0_0#43[k#4])
+      _temp_0_0_0#44[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
+      _temp_0_0_0#45[k#4] = (_temp_0_0_0#44[k#4] + utens_0_0_0[k#4])
+      _temp_0_0_0#46[k#4] = (_temp_0_0_0#45[k#4] + utens_stage_0_0_0[k#4])
+      dcol_0_0_0#9[k#4] = (_temp_0_0_0#46[k#4] + correction_term_0_0_0#4[k#4])
+      _temp_0_0_0#47[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#2[k#4])
+      _temp_0_0_0#48[k#4] = (bcol_0_0_0#4[k#4] - _temp_0_0_0#47[k#4])
+      divided_0_0_0#4[k#4] = (1.0 / _temp_0_0_0#48[k#4])
+      _temp_0_0_0#49[k#4] = (dcol_0_0_0[(k#4 - 1)] * acol_0_0_0#2[k#4])
+      _temp_0_0_0#50[k#4] = (dcol_0_0_0#9[k#4] - _temp_0_0_0#49[k#4])
+      dcol_0_0_0#10[k#4] = (_temp_0_0_0#50[k#4] * divided_0_0_0#4[k#4])
+      dcol_0_0_0[k#4] = dcol_0_0_0#10[k#4]
     }
     awaitall
     for i32 k#5 in [79:78:-1] {
       datacol_0_0_0[k#5] = dcol_0_0_0[k#5]
       data_col_0_0_0#1[k#5] = datacol_0_0_0[k#5]
-      _temp_0_0_0#30[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
-      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#30[k#5])
+      _temp_0_0_0#60[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
+      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#60[k#5])
       data_col_0_0_0[k#5] = data_col_0_0_0#1[k#5]
       utens_stage_0_0_0#1[k#5] = utens_stage_0_0_0#2[k#5]
     }
     awaitall
     for i32 k#6 in [78:-1:-1] {
-      _temp_0_0_0#31[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
-      datacol_0_0_0#1[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#31[k#6])
-      data_col_0_0_0#2[k#6] = datacol_0_0_0#1[k#6]
-      _temp_0_0_0#32[k#6] = (datacol_0_0_0#1[k#6] - u_pos_0_0_0[k#6])
-      utens_stage_0_0_0#3[k#6] = (_dtr_stage * _temp_0_0_0#32[k#6])
-      data_col_0_0_0[k#6] = data_col_0_0_0#2[k#6]
-      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#3[k#6]
+      _temp_0_0_0#62[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
+      datacol_0_0_0#2[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#62[k#6])
+      data_col_0_0_0#3[k#6] = datacol_0_0_0#2[k#6]
+      _temp_0_0_0#63[k#6] = (datacol_0_0_0#2[k#6] - u_pos_0_0_0[k#6])
+      utens_stage_0_0_0#4[k#6] = (_dtr_stage * _temp_0_0_0#63[k#6])
+      data_col_0_0_0[k#6] = data_col_0_0_0#3[k#6]
+      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#4[k#6]
     }
     awaitall
     await send(utens_stage_0_0_0#1, __kernel_out_0[(i#2 - 1), (j#2 - 0)])
@@ -550,14 +550,14 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 80>[128, 128] readonly
     await receive(utens_stage_0_0_0, _utens_stage[i#2, j#2])
     await receive(wcon_0_0_0, _wcon[i#2, j#2])
     awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:81:1], receive(_stream_wcon#2) {
-      _refactored_wcon_1_0_0_0_0#1[k] = x
+    completion _recv_comp = foreach i32 k, f32 x in [0:81:1], receive(_stream_wcon#4) {
+      _refactored_wcon_1_0_0_0_0#3[k] = x
     }
-    completion _send_comp = send(wcon_0_0_0, _stream_wcon#1)
+    completion _send_comp = send(wcon_0_0_0, _stream_wcon#3)
     await _send_comp
     await _recv_comp
     await map i32 k#1 in [0:80:1] {
-      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#1[k#1]
+      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#3[k#1]
     }
     awaitall
     for i32 k#2 in [0:1:1] {
@@ -581,75 +581,75 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 80>[128, 128] readonly
     }
     awaitall
     for i32 k#3 in [1:79:1] {
-      _temp_0_0_0#6[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
-      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#6[k#3])
-      _temp_0_0_0#7[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
-      _temp_0_0_0#8[k#3] = (_temp_0_0_0#7[k#3] + wcon_0_0_0[(k#3 + 1)])
-      gcv_0_0_0#1[k#3] = (0.25 * _temp_0_0_0#8[k#3])
+      _temp_0_0_0#12[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
+      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#12[k#3])
+      _temp_0_0_0#13[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
+      _temp_0_0_0#14[k#3] = (_temp_0_0_0#13[k#3] + wcon_0_0_0[(k#3 + 1)])
+      gcv_0_0_0#2[k#3] = (0.25 * _temp_0_0_0#14[k#3])
       as__0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      cs_0_0_0#1[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
+      cs_0_0_0#2[k#3] = (gcv_0_0_0#2[k#3] * 0.5)
       acol_0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      ccol_0_0_0#3[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
-      _temp_0_0_0#9[k#3] = (_dtr_stage - acol_0_0_0[k#3])
-      bcol_0_0_0#1[k#3] = (_temp_0_0_0#9[k#3] - ccol_0_0_0#3[k#3])
-      _temp_0_0_0#10[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#11[k#3] = (cs_0_0_0#1[k#3] * _temp_0_0_0#10[k#3])
-      _temp_0_0_0#12[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#13[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#12[k#3])
-      correction_term_0_0_0#1[k#3] = (_temp_0_0_0#13[k#3] - _temp_0_0_0#11[k#3])
-      _temp_0_0_0#14[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
-      _temp_0_0_0#15[k#3] = (_temp_0_0_0#14[k#3] + utens_0_0_0[k#3])
-      _temp_0_0_0#16[k#3] = (_temp_0_0_0#15[k#3] + utens_stage_0_0_0[k#3])
-      dcol_0_0_0#3[k#3] = (_temp_0_0_0#16[k#3] + correction_term_0_0_0#1[k#3])
-      _temp_0_0_0#17[k#3] = (ccol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#18[k#3] = (bcol_0_0_0#1[k#3] - _temp_0_0_0#17[k#3])
-      divided_0_0_0#1[k#3] = (1.0 / _temp_0_0_0#18[k#3])
-      ccol_0_0_0#4[k#3] = (ccol_0_0_0#3[k#3] * divided_0_0_0#1[k#3])
-      _temp_0_0_0#19[k#3] = (dcol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#20[k#3] = (dcol_0_0_0#3[k#3] - _temp_0_0_0#19[k#3])
-      dcol_0_0_0#4[k#3] = (_temp_0_0_0#20[k#3] * divided_0_0_0#1[k#3])
-      ccol_0_0_0[k#3] = ccol_0_0_0#4[k#3]
-      dcol_0_0_0[k#3] = dcol_0_0_0#4[k#3]
+      ccol_0_0_0#5[k#3] = (gcv_0_0_0#2[k#3] * 0.5)
+      _temp_0_0_0#15[k#3] = (_dtr_stage - acol_0_0_0[k#3])
+      bcol_0_0_0#2[k#3] = (_temp_0_0_0#15[k#3] - ccol_0_0_0#5[k#3])
+      _temp_0_0_0#16[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#17[k#3] = (cs_0_0_0#2[k#3] * _temp_0_0_0#16[k#3])
+      _temp_0_0_0#18[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#19[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#18[k#3])
+      correction_term_0_0_0#2[k#3] = (_temp_0_0_0#19[k#3] - _temp_0_0_0#17[k#3])
+      _temp_0_0_0#20[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
+      _temp_0_0_0#21[k#3] = (_temp_0_0_0#20[k#3] + utens_0_0_0[k#3])
+      _temp_0_0_0#22[k#3] = (_temp_0_0_0#21[k#3] + utens_stage_0_0_0[k#3])
+      dcol_0_0_0#5[k#3] = (_temp_0_0_0#22[k#3] + correction_term_0_0_0#2[k#3])
+      _temp_0_0_0#23[k#3] = (ccol_0_0_0[(k#3 - 1)] * acol_0_0_0[k#3])
+      _temp_0_0_0#24[k#3] = (bcol_0_0_0#2[k#3] - _temp_0_0_0#23[k#3])
+      divided_0_0_0#2[k#3] = (1.0 / _temp_0_0_0#24[k#3])
+      ccol_0_0_0#6[k#3] = (ccol_0_0_0#5[k#3] * divided_0_0_0#2[k#3])
+      _temp_0_0_0#25[k#3] = (dcol_0_0_0[(k#3 - 1)] * acol_0_0_0[k#3])
+      _temp_0_0_0#26[k#3] = (dcol_0_0_0#5[k#3] - _temp_0_0_0#25[k#3])
+      dcol_0_0_0#6[k#3] = (_temp_0_0_0#26[k#3] * divided_0_0_0#2[k#3])
+      ccol_0_0_0[k#3] = ccol_0_0_0#6[k#3]
+      dcol_0_0_0[k#3] = dcol_0_0_0#6[k#3]
     }
     awaitall
     for i32 k#4 in [79:80:1] {
-      _temp_0_0_0#21[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
-      gav_0_0_0#1[k#4] = (-0.25 * _temp_0_0_0#21[k#4])
-      as__0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      acol_0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      bcol_0_0_0#2[k#4] = (_dtr_stage - acol_0_0_0#1[k#4])
-      _temp_0_0_0#22[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
-      correction_term_0_0_0#2[k#4] = (-as__0_0_0#1[k#4] * _temp_0_0_0#22[k#4])
-      _temp_0_0_0#23[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
-      _temp_0_0_0#24[k#4] = (_temp_0_0_0#23[k#4] + utens_0_0_0[k#4])
-      _temp_0_0_0#25[k#4] = (_temp_0_0_0#24[k#4] + utens_stage_0_0_0[k#4])
-      dcol_0_0_0#5[k#4] = (_temp_0_0_0#25[k#4] + correction_term_0_0_0#2[k#4])
-      _temp_0_0_0#26[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#27[k#4] = (bcol_0_0_0#2[k#4] - _temp_0_0_0#26[k#4])
-      divided_0_0_0#2[k#4] = (1.0 / _temp_0_0_0#27[k#4])
-      _temp_0_0_0#28[k#4] = (dcol_0_0_0#5[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#29[k#4] = (dcol_0_0_0#5[k#4] - _temp_0_0_0#28[k#4])
-      dcol_0_0_0#6[k#4] = (_temp_0_0_0#29[k#4] * divided_0_0_0#2[k#4])
-      dcol_0_0_0[k#4] = dcol_0_0_0#6[k#4]
+      _temp_0_0_0#42[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
+      gav_0_0_0#2[k#4] = (-0.25 * _temp_0_0_0#42[k#4])
+      as__0_0_0#2[k#4] = (gav_0_0_0#2[k#4] * 0.5)
+      acol_0_0_0#2[k#4] = (gav_0_0_0#2[k#4] * 0.5)
+      bcol_0_0_0#4[k#4] = (_dtr_stage - acol_0_0_0#2[k#4])
+      _temp_0_0_0#43[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
+      correction_term_0_0_0#4[k#4] = (-as__0_0_0#2[k#4] * _temp_0_0_0#43[k#4])
+      _temp_0_0_0#44[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
+      _temp_0_0_0#45[k#4] = (_temp_0_0_0#44[k#4] + utens_0_0_0[k#4])
+      _temp_0_0_0#46[k#4] = (_temp_0_0_0#45[k#4] + utens_stage_0_0_0[k#4])
+      dcol_0_0_0#9[k#4] = (_temp_0_0_0#46[k#4] + correction_term_0_0_0#4[k#4])
+      _temp_0_0_0#47[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#2[k#4])
+      _temp_0_0_0#48[k#4] = (bcol_0_0_0#4[k#4] - _temp_0_0_0#47[k#4])
+      divided_0_0_0#4[k#4] = (1.0 / _temp_0_0_0#48[k#4])
+      _temp_0_0_0#49[k#4] = (dcol_0_0_0[(k#4 - 1)] * acol_0_0_0#2[k#4])
+      _temp_0_0_0#50[k#4] = (dcol_0_0_0#9[k#4] - _temp_0_0_0#49[k#4])
+      dcol_0_0_0#10[k#4] = (_temp_0_0_0#50[k#4] * divided_0_0_0#4[k#4])
+      dcol_0_0_0[k#4] = dcol_0_0_0#10[k#4]
     }
     awaitall
     for i32 k#5 in [79:78:-1] {
       datacol_0_0_0[k#5] = dcol_0_0_0[k#5]
       data_col_0_0_0#1[k#5] = datacol_0_0_0[k#5]
-      _temp_0_0_0#30[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
-      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#30[k#5])
+      _temp_0_0_0#60[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
+      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#60[k#5])
       data_col_0_0_0[k#5] = data_col_0_0_0#1[k#5]
       utens_stage_0_0_0#1[k#5] = utens_stage_0_0_0#2[k#5]
     }
     awaitall
     for i32 k#6 in [78:-1:-1] {
-      _temp_0_0_0#31[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
-      datacol_0_0_0#1[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#31[k#6])
-      data_col_0_0_0#2[k#6] = datacol_0_0_0#1[k#6]
-      _temp_0_0_0#32[k#6] = (datacol_0_0_0#1[k#6] - u_pos_0_0_0[k#6])
-      utens_stage_0_0_0#3[k#6] = (_dtr_stage * _temp_0_0_0#32[k#6])
-      data_col_0_0_0[k#6] = data_col_0_0_0#2[k#6]
-      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#3[k#6]
+      _temp_0_0_0#62[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
+      datacol_0_0_0#2[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#62[k#6])
+      data_col_0_0_0#3[k#6] = datacol_0_0_0#2[k#6]
+      _temp_0_0_0#63[k#6] = (datacol_0_0_0#2[k#6] - u_pos_0_0_0[k#6])
+      utens_stage_0_0_0#4[k#6] = (_dtr_stage * _temp_0_0_0#63[k#6])
+      data_col_0_0_0[k#6] = data_col_0_0_0#3[k#6]
+      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#4[k#6]
     }
     awaitall
     await send(utens_stage_0_0_0#1, __kernel_out_0[(i#2 - 1), (j#2 - 0)])
diff --git a/samples/benchmarks/vertical_advection_16_16_4.sptl b/samples/benchmarks/vertical_advection_16_16_4.sptl
index ad017d65..61457414 100644
--- a/samples/benchmarks/vertical_advection_16_16_4.sptl
+++ b/samples/benchmarks/vertical_advection_16_16_4.sptl
@@ -10,86 +10,86 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[16, 16] readonly _u_
     f32[5] utens_0_0_0
     f32[5] utens_stage_0_0_0
     f32[5] wcon_0_0_0
-    f32[5] _refactored_wcon_1_0_0_0_0#1
-    f32[2] gcv_0_0_0
-    f32[2] _temp_0_0_0
-    f32[2] _temp_0_0_0#1
-    f32[2] cs_0_0_0
-    f32[2] ccol_0_0_0#1
-    f32[2] bcol_0_0_0
-    f32[2] correction_term_0_0_0
-    f32[2] _temp_0_0_0#2
-    f32[2] dcol_0_0_0#1
-    f32[2] _temp_0_0_0#3
-    f32[2] _temp_0_0_0#4
-    f32[2] _temp_0_0_0#5
-    f32[2] divided_0_0_0
-    f32[2] ccol_0_0_0#2
-    f32[2] dcol_0_0_0#2
-    f32[4] gav_0_0_m1
-    f32[4] gav_0_0_0
-    f32[4] _temp_0_0_0#6
-    f32[4] gcv_0_0_m1
-    f32[4] gcv_0_0_0#1
-    f32[4] _temp_0_0_0#7
-    f32[4] _temp_0_0_0#8
-    f32[4] as__0_0_m1
-    f32[4] as__0_0_0
-    f32[4] cs_0_0_m1
-    f32[4] cs_0_0_0#1
-    f32[4] acol_0_0_0
-    f32[4] ccol_0_0_m1
-    f32[4] ccol_0_0_0#3
-    f32[4] bcol_0_0_0#1
-    f32[4] _temp_0_0_0#9
-    f32[4] correction_term_0_0_m1
-    f32[4] correction_term_0_0_0#1
-    f32[4] _temp_0_0_0#10
-    f32[4] _temp_0_0_0#11
-    f32[4] _temp_0_0_0#12
-    f32[4] _temp_0_0_0#13
-    f32[4] dcol_0_0_m1
-    f32[4] dcol_0_0_0#3
-    f32[4] _temp_0_0_0#14
-    f32[4] _temp_0_0_0#15
-    f32[4] _temp_0_0_0#16
-    f32[4] divided_0_0_0#1
-    f32[4] _temp_0_0_0#17
-    f32[4] _temp_0_0_0#18
-    f32[4] ccol_0_0_0#4
-    f32[4] dcol_0_0_0#4
-    f32[4] _temp_0_0_0#19
-    f32[4] _temp_0_0_0#20
-    f32[5] gav_0_0_m1#1
-    f32[5] gav_0_0_0#1
-    f32[5] _temp_0_0_0#21
-    f32[5] as__0_0_m1#1
-    f32[5] as__0_0_0#1
-    f32[5] acol_0_0_0#1
-    f32[5] bcol_0_0_0#2
-    f32[5] correction_term_0_0_m1#1
-    f32[5] correction_term_0_0_0#2
-    f32[5] _temp_0_0_0#22
-    f32[5] dcol_0_0_m1#1
-    f32[5] dcol_0_0_0#5
-    f32[5] _temp_0_0_0#23
-    f32[5] _temp_0_0_0#24
-    f32[5] _temp_0_0_0#25
-    f32[5] divided_0_0_0#2
-    f32[5] _temp_0_0_0#26
-    f32[5] _temp_0_0_0#27
-    f32[5] dcol_0_0_0#6
-    f32[5] _temp_0_0_0#28
-    f32[5] _temp_0_0_0#29
-    f32[5] datacol_0_0_0
-    f32[5] data_col_0_0_0#1
-    f32[5] utens_stage_0_0_0#2
-    f32[5] _temp_0_0_0#30
-    f32[4] datacol_0_0_0#1
+    f32[5] _refactored_wcon_1_0_0_0_0#2
+    f32[2] gcv_0_0_0#1
+    f32[2] _temp_0_0_0#6
+    f32[2] _temp_0_0_0#7
+    f32[2] cs_0_0_0#1
+    f32[2] ccol_0_0_0#3
+    f32[2] bcol_0_0_0#1
+    f32[2] correction_term_0_0_0#1
+    f32[2] _temp_0_0_0#8
+    f32[2] dcol_0_0_0#3
+    f32[2] _temp_0_0_0#9
+    f32[2] _temp_0_0_0#10
+    f32[2] _temp_0_0_0#11
+    f32[2] divided_0_0_0#1
+    f32[2] ccol_0_0_0#4
+    f32[2] dcol_0_0_0#4
+    f32[4] gav_0_0_m1#1
+    f32[4] gav_0_0_0#1
+    f32[4] _temp_0_0_0#27
+    f32[4] gcv_0_0_m1#1
+    f32[4] gcv_0_0_0#3
+    f32[4] _temp_0_0_0#28
+    f32[4] _temp_0_0_0#29
+    f32[4] as__0_0_m1#1
+    f32[4] as__0_0_0#1
+    f32[4] cs_0_0_m1#1
+    f32[4] cs_0_0_0#3
+    f32[4] acol_0_0_0#1
+    f32[4] ccol_0_0_m1#1
+    f32[4] ccol_0_0_0#7
+    f32[4] bcol_0_0_0#3
+    f32[4] _temp_0_0_0#30
+    f32[4] correction_term_0_0_m1#1
+    f32[4] correction_term_0_0_0#3
     f32[4] _temp_0_0_0#31
-    f32[4] data_col_0_0_0#2
-    f32[4] utens_stage_0_0_0#3
     f32[4] _temp_0_0_0#32
+    f32[4] _temp_0_0_0#33
+    f32[4] _temp_0_0_0#34
+    f32[4] dcol_0_0_m1#1
+    f32[4] dcol_0_0_0#7
+    f32[4] _temp_0_0_0#35
+    f32[4] _temp_0_0_0#36
+    f32[4] _temp_0_0_0#37
+    f32[4] divided_0_0_0#3
+    f32[4] _temp_0_0_0#38
+    f32[4] _temp_0_0_0#39
+    f32[4] ccol_0_0_0#8
+    f32[4] dcol_0_0_0#8
+    f32[4] _temp_0_0_0#40
+    f32[4] _temp_0_0_0#41
+    f32[5] gav_0_0_m1#3
+    f32[5] gav_0_0_0#3
+    f32[5] _temp_0_0_0#51
+    f32[5] as__0_0_m1#3
+    f32[5] as__0_0_0#3
+    f32[5] acol_0_0_0#3
+    f32[5] bcol_0_0_0#5
+    f32[5] correction_term_0_0_m1#3
+    f32[5] correction_term_0_0_0#5
+    f32[5] _temp_0_0_0#52
+    f32[5] dcol_0_0_m1#3
+    f32[5] dcol_0_0_0#11
+    f32[5] _temp_0_0_0#53
+    f32[5] _temp_0_0_0#54
+    f32[5] _temp_0_0_0#55
+    f32[5] divided_0_0_0#5
+    f32[5] _temp_0_0_0#56
+    f32[5] _temp_0_0_0#57
+    f32[5] dcol_0_0_0#12
+    f32[5] _temp_0_0_0#58
+    f32[5] _temp_0_0_0#59
+    f32[5] datacol_0_0_0#1
+    f32[5] data_col_0_0_0#2
+    f32[5] utens_stage_0_0_0#3
+    f32[5] _temp_0_0_0#61
+    f32[4] datacol_0_0_0#3
+    f32[4] _temp_0_0_0#64
+    f32[4] data_col_0_0_0#4
+    f32[4] utens_stage_0_0_0#5
+    f32[4] _temp_0_0_0#65
   }
   place u16 i, u16 j in [2:17:2 , 0:16:1] {
     f32[5] utens_stage_0_0_0#1
@@ -102,7 +102,7 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[16, 16] readonly _u_
     f32[5] utens_0_0_0
     f32[5] utens_stage_0_0_0
     f32[5] wcon_0_0_0
-    f32[5] _refactored_wcon_1_0_0_0_0#1
+    f32[5] _refactored_wcon_1_0_0_0_0#3
     f32[2] gcv_0_0_0
     f32[2] _temp_0_0_0
     f32[2] _temp_0_0_0#1
@@ -120,68 +120,68 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[16, 16] readonly _u_
     f32[2] dcol_0_0_0#2
     f32[4] gav_0_0_m1
     f32[4] gav_0_0_0
-    f32[4] _temp_0_0_0#6
+    f32[4] _temp_0_0_0#12
     f32[4] gcv_0_0_m1
-    f32[4] gcv_0_0_0#1
-    f32[4] _temp_0_0_0#7
-    f32[4] _temp_0_0_0#8
+    f32[4] gcv_0_0_0#2
+    f32[4] _temp_0_0_0#13
+    f32[4] _temp_0_0_0#14
     f32[4] as__0_0_m1
     f32[4] as__0_0_0
     f32[4] cs_0_0_m1
-    f32[4] cs_0_0_0#1
+    f32[4] cs_0_0_0#2
     f32[4] acol_0_0_0
     f32[4] ccol_0_0_m1
-    f32[4] ccol_0_0_0#3
-    f32[4] bcol_0_0_0#1
-    f32[4] _temp_0_0_0#9
-    f32[4] correction_term_0_0_m1
-    f32[4] correction_term_0_0_0#1
-    f32[4] _temp_0_0_0#10
-    f32[4] _temp_0_0_0#11
-    f32[4] _temp_0_0_0#12
-    f32[4] _temp_0_0_0#13
-    f32[4] dcol_0_0_m1
-    f32[4] dcol_0_0_0#3
-    f32[4] _temp_0_0_0#14
+    f32[4] ccol_0_0_0#5
+    f32[4] bcol_0_0_0#2
     f32[4] _temp_0_0_0#15
+    f32[4] correction_term_0_0_m1
+    f32[4] correction_term_0_0_0#2
     f32[4] _temp_0_0_0#16
-    f32[4] divided_0_0_0#1
     f32[4] _temp_0_0_0#17
     f32[4] _temp_0_0_0#18
-    f32[4] ccol_0_0_0#4
-    f32[4] dcol_0_0_0#4
     f32[4] _temp_0_0_0#19
+    f32[4] dcol_0_0_m1
+    f32[4] dcol_0_0_0#5
     f32[4] _temp_0_0_0#20
-    f32[5] gav_0_0_m1#1
-    f32[5] gav_0_0_0#1
-    f32[5] _temp_0_0_0#21
-    f32[5] as__0_0_m1#1
-    f32[5] as__0_0_0#1
-    f32[5] acol_0_0_0#1
-    f32[5] bcol_0_0_0#2
-    f32[5] correction_term_0_0_m1#1
-    f32[5] correction_term_0_0_0#2
-    f32[5] _temp_0_0_0#22
-    f32[5] dcol_0_0_m1#1
-    f32[5] dcol_0_0_0#5
-    f32[5] _temp_0_0_0#23
-    f32[5] _temp_0_0_0#24
-    f32[5] _temp_0_0_0#25
-    f32[5] divided_0_0_0#2
-    f32[5] _temp_0_0_0#26
-    f32[5] _temp_0_0_0#27
-    f32[5] dcol_0_0_0#6
-    f32[5] _temp_0_0_0#28
-    f32[5] _temp_0_0_0#29
+    f32[4] _temp_0_0_0#21
+    f32[4] _temp_0_0_0#22
+    f32[4] divided_0_0_0#2
+    f32[4] _temp_0_0_0#23
+    f32[4] _temp_0_0_0#24
+    f32[4] ccol_0_0_0#6
+    f32[4] dcol_0_0_0#6
+    f32[4] _temp_0_0_0#25
+    f32[4] _temp_0_0_0#26
+    f32[5] gav_0_0_m1#2
+    f32[5] gav_0_0_0#2
+    f32[5] _temp_0_0_0#42
+    f32[5] as__0_0_m1#2
+    f32[5] as__0_0_0#2
+    f32[5] acol_0_0_0#2
+    f32[5] bcol_0_0_0#4
+    f32[5] correction_term_0_0_m1#2
+    f32[5] correction_term_0_0_0#4
+    f32[5] _temp_0_0_0#43
+    f32[5] dcol_0_0_m1#2
+    f32[5] dcol_0_0_0#9
+    f32[5] _temp_0_0_0#44
+    f32[5] _temp_0_0_0#45
+    f32[5] _temp_0_0_0#46
+    f32[5] divided_0_0_0#4
+    f32[5] _temp_0_0_0#47
+    f32[5] _temp_0_0_0#48
+    f32[5] dcol_0_0_0#10
+    f32[5] _temp_0_0_0#49
+    f32[5] _temp_0_0_0#50
     f32[5] datacol_0_0_0
     f32[5] data_col_0_0_0#1
     f32[5] utens_stage_0_0_0#2
-    f32[5] _temp_0_0_0#30
-    f32[4] datacol_0_0_0#1
-    f32[4] _temp_0_0_0#31
-    f32[4] data_col_0_0_0#2
-    f32[4] utens_stage_0_0_0#3
-    f32[4] _temp_0_0_0#32
+    f32[5] _temp_0_0_0#60
+    f32[4] datacol_0_0_0#2
+    f32[4] _temp_0_0_0#62
+    f32[4] data_col_0_0_0#3
+    f32[4] utens_stage_0_0_0#4
+    f32[4] _temp_0_0_0#63
   }
   place u16 i, u16 j in [3:17:2 , 0:16:1] {
     f32[5] utens_stage_0_0_0#1
@@ -194,7 +194,7 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[16, 16] readonly _u_
     f32[5] utens_0_0_0
     f32[5] utens_stage_0_0_0
     f32[5] wcon_0_0_0
-    f32[5] _refactored_wcon_1_0_0_0_0#1
+    f32[5] _refactored_wcon_1_0_0_0_0#3
     f32[2] gcv_0_0_0
     f32[2] _temp_0_0_0
     f32[2] _temp_0_0_0#1
@@ -212,68 +212,68 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[16, 16] readonly _u_
     f32[2] dcol_0_0_0#2
     f32[4] gav_0_0_m1
     f32[4] gav_0_0_0
-    f32[4] _temp_0_0_0#6
+    f32[4] _temp_0_0_0#12
     f32[4] gcv_0_0_m1
-    f32[4] gcv_0_0_0#1
-    f32[4] _temp_0_0_0#7
-    f32[4] _temp_0_0_0#8
+    f32[4] gcv_0_0_0#2
+    f32[4] _temp_0_0_0#13
+    f32[4] _temp_0_0_0#14
     f32[4] as__0_0_m1
     f32[4] as__0_0_0
     f32[4] cs_0_0_m1
-    f32[4] cs_0_0_0#1
+    f32[4] cs_0_0_0#2
     f32[4] acol_0_0_0
     f32[4] ccol_0_0_m1
-    f32[4] ccol_0_0_0#3
-    f32[4] bcol_0_0_0#1
-    f32[4] _temp_0_0_0#9
-    f32[4] correction_term_0_0_m1
-    f32[4] correction_term_0_0_0#1
-    f32[4] _temp_0_0_0#10
-    f32[4] _temp_0_0_0#11
-    f32[4] _temp_0_0_0#12
-    f32[4] _temp_0_0_0#13
-    f32[4] dcol_0_0_m1
-    f32[4] dcol_0_0_0#3
-    f32[4] _temp_0_0_0#14
+    f32[4] ccol_0_0_0#5
+    f32[4] bcol_0_0_0#2
     f32[4] _temp_0_0_0#15
+    f32[4] correction_term_0_0_m1
+    f32[4] correction_term_0_0_0#2
     f32[4] _temp_0_0_0#16
-    f32[4] divided_0_0_0#1
     f32[4] _temp_0_0_0#17
     f32[4] _temp_0_0_0#18
-    f32[4] ccol_0_0_0#4
-    f32[4] dcol_0_0_0#4
     f32[4] _temp_0_0_0#19
+    f32[4] dcol_0_0_m1
+    f32[4] dcol_0_0_0#5
     f32[4] _temp_0_0_0#20
-    f32[5] gav_0_0_m1#1
-    f32[5] gav_0_0_0#1
-    f32[5] _temp_0_0_0#21
-    f32[5] as__0_0_m1#1
-    f32[5] as__0_0_0#1
-    f32[5] acol_0_0_0#1
-    f32[5] bcol_0_0_0#2
-    f32[5] correction_term_0_0_m1#1
-    f32[5] correction_term_0_0_0#2
-    f32[5] _temp_0_0_0#22
-    f32[5] dcol_0_0_m1#1
-    f32[5] dcol_0_0_0#5
-    f32[5] _temp_0_0_0#23
-    f32[5] _temp_0_0_0#24
-    f32[5] _temp_0_0_0#25
-    f32[5] divided_0_0_0#2
-    f32[5] _temp_0_0_0#26
-    f32[5] _temp_0_0_0#27
-    f32[5] dcol_0_0_0#6
-    f32[5] _temp_0_0_0#28
-    f32[5] _temp_0_0_0#29
+    f32[4] _temp_0_0_0#21
+    f32[4] _temp_0_0_0#22
+    f32[4] divided_0_0_0#2
+    f32[4] _temp_0_0_0#23
+    f32[4] _temp_0_0_0#24
+    f32[4] ccol_0_0_0#6
+    f32[4] dcol_0_0_0#6
+    f32[4] _temp_0_0_0#25
+    f32[4] _temp_0_0_0#26
+    f32[5] gav_0_0_m1#2
+    f32[5] gav_0_0_0#2
+    f32[5] _temp_0_0_0#42
+    f32[5] as__0_0_m1#2
+    f32[5] as__0_0_0#2
+    f32[5] acol_0_0_0#2
+    f32[5] bcol_0_0_0#4
+    f32[5] correction_term_0_0_m1#2
+    f32[5] correction_term_0_0_0#4
+    f32[5] _temp_0_0_0#43
+    f32[5] dcol_0_0_m1#2
+    f32[5] dcol_0_0_0#9
+    f32[5] _temp_0_0_0#44
+    f32[5] _temp_0_0_0#45
+    f32[5] _temp_0_0_0#46
+    f32[5] divided_0_0_0#4
+    f32[5] _temp_0_0_0#47
+    f32[5] _temp_0_0_0#48
+    f32[5] dcol_0_0_0#10
+    f32[5] _temp_0_0_0#49
+    f32[5] _temp_0_0_0#50
     f32[5] datacol_0_0_0
     f32[5] data_col_0_0_0#1
     f32[5] utens_stage_0_0_0#2
-    f32[5] _temp_0_0_0#30
-    f32[4] datacol_0_0_0#1
-    f32[4] _temp_0_0_0#31
-    f32[4] data_col_0_0_0#2
-    f32[4] utens_stage_0_0_0#3
-    f32[4] _temp_0_0_0#32
+    f32[5] _temp_0_0_0#60
+    f32[4] datacol_0_0_0#2
+    f32[4] _temp_0_0_0#62
+    f32[4] data_col_0_0_0#3
+    f32[4] utens_stage_0_0_0#4
+    f32[4] _temp_0_0_0#63
   }
   place u16 i#1, u16 j#1 in [17:18:2 , 0:16:1] {
     f32[5] wcon_0_0_0
@@ -283,43 +283,43 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[16, 16] readonly _u_
   }
   dataflow u16 i#5, u16 j#5 in [17:18:2 , 0:16:1] {
     stream<f32> _stream_wcon#1 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 0
-    }
+  hops = [(-1, 0)], 
+  channel = 0
+}
     stream<f32> _stream_wcon#2 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 1
-    }
+  hops = [(-1, 0)], 
+  channel = 1
+}
   }
   dataflow u16 i#5, u16 j#5 in [2:17:2 , 0:16:1] {
-    stream<f32> _stream_wcon#1 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_wcon#2 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 1
-    }
+    stream<f32> _stream_wcon#3 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 2
+}
+    stream<f32> _stream_wcon#4 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 3
+}
   }
   dataflow u16 i#5, u16 j#5 in [3:17:2 , 0:16:1] {
-    stream<f32> _stream_wcon#1 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_wcon#2 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 1
-    }
+    stream<f32> _stream_wcon#3 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 2
+}
+    stream<f32> _stream_wcon#4 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 3
+}
   }
   dataflow u16 i#5, u16 j#5 in [1:2:2 , 0:16:1] {
-    stream<f32> _stream_wcon#1 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_wcon#2 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 1
-    }
+    stream<f32> _stream_wcon#5 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 4
+}
+    stream<f32> _stream_wcon#6 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 5
+}
   }
   compute u16 i#2, u16 j#2 in [1:2:2 , 0:16:1] {
     await receive(u_pos_0_0_0, _u_pos[i#2, j#2])
@@ -328,104 +328,104 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[16, 16] readonly _u_
     await receive(utens_stage_0_0_0, _utens_stage[i#2, j#2])
     await receive(wcon_0_0_0, _wcon[i#2, j#2])
     awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:5:1], receive(_stream_wcon#2) {
-      _refactored_wcon_1_0_0_0_0#1[k] = x
+    completion _recv_comp = foreach i32 k, f32 x in [0:5:1], receive(_stream_wcon#6) {
+      _refactored_wcon_1_0_0_0_0#2[k] = x
     }
     await _recv_comp
     await map i32 k#1 in [0:4:1] {
-      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#1[k#1]
+      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#2[k#1]
     }
     awaitall
     for i32 k#2 in [0:1:1] {
-      _temp_0_0_0[k#2] = _refactored_wcon_1_0_0_0_0[(k#2 + 1)]
-      _temp_0_0_0#1[k#2] = (_temp_0_0_0[k#2] + wcon_0_0_0[(k#2 + 1)])
-      gcv_0_0_0[k#2] = (0.25 * _temp_0_0_0#1[k#2])
-      cs_0_0_0[k#2] = (gcv_0_0_0[k#2] * 0.5)
-      ccol_0_0_0#1[k#2] = (gcv_0_0_0[k#2] * 0.5)
-      bcol_0_0_0[k#2] = (_dtr_stage - ccol_0_0_0#1[k#2])
-      _temp_0_0_0#2[k#2] = (u_stage_0_0_0[(k#2 + 1)] - u_stage_0_0_0[k#2])
-      correction_term_0_0_0[k#2] = (-cs_0_0_0[k#2] * _temp_0_0_0#2[k#2])
-      _temp_0_0_0#3[k#2] = (_dtr_stage * u_pos_0_0_0[k#2])
-      _temp_0_0_0#4[k#2] = (_temp_0_0_0#3[k#2] + utens_0_0_0[k#2])
-      _temp_0_0_0#5[k#2] = (_temp_0_0_0#4[k#2] + utens_stage_0_0_0[k#2])
-      dcol_0_0_0#1[k#2] = (_temp_0_0_0#5[k#2] + correction_term_0_0_0[k#2])
-      divided_0_0_0[k#2] = (1.0 / bcol_0_0_0[k#2])
-      ccol_0_0_0#2[k#2] = (ccol_0_0_0#1[k#2] * divided_0_0_0[k#2])
-      dcol_0_0_0#2[k#2] = (dcol_0_0_0#1[k#2] * divided_0_0_0[k#2])
-      ccol_0_0_0[k#2] = ccol_0_0_0#2[k#2]
-      dcol_0_0_0[k#2] = dcol_0_0_0#2[k#2]
+      _temp_0_0_0#6[k#2] = _refactored_wcon_1_0_0_0_0[(k#2 + 1)]
+      _temp_0_0_0#7[k#2] = (_temp_0_0_0#6[k#2] + wcon_0_0_0[(k#2 + 1)])
+      gcv_0_0_0#1[k#2] = (0.25 * _temp_0_0_0#7[k#2])
+      cs_0_0_0#1[k#2] = (gcv_0_0_0#1[k#2] * 0.5)
+      ccol_0_0_0#3[k#2] = (gcv_0_0_0#1[k#2] * 0.5)
+      bcol_0_0_0#1[k#2] = (_dtr_stage - ccol_0_0_0#3[k#2])
+      _temp_0_0_0#8[k#2] = (u_stage_0_0_0[(k#2 + 1)] - u_stage_0_0_0[k#2])
+      correction_term_0_0_0#1[k#2] = (-cs_0_0_0#1[k#2] * _temp_0_0_0#8[k#2])
+      _temp_0_0_0#9[k#2] = (_dtr_stage * u_pos_0_0_0[k#2])
+      _temp_0_0_0#10[k#2] = (_temp_0_0_0#9[k#2] + utens_0_0_0[k#2])
+      _temp_0_0_0#11[k#2] = (_temp_0_0_0#10[k#2] + utens_stage_0_0_0[k#2])
+      dcol_0_0_0#3[k#2] = (_temp_0_0_0#11[k#2] + correction_term_0_0_0#1[k#2])
+      divided_0_0_0#1[k#2] = (1.0 / bcol_0_0_0#1[k#2])
+      ccol_0_0_0#4[k#2] = (ccol_0_0_0#3[k#2] * divided_0_0_0#1[k#2])
+      dcol_0_0_0#4[k#2] = (dcol_0_0_0#3[k#2] * divided_0_0_0#1[k#2])
+      ccol_0_0_0[k#2] = ccol_0_0_0#4[k#2]
+      dcol_0_0_0[k#2] = dcol_0_0_0#4[k#2]
     }
     awaitall
     for i32 k#3 in [1:3:1] {
-      _temp_0_0_0#6[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
-      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#6[k#3])
-      _temp_0_0_0#7[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
-      _temp_0_0_0#8[k#3] = (_temp_0_0_0#7[k#3] + wcon_0_0_0[(k#3 + 1)])
-      gcv_0_0_0#1[k#3] = (0.25 * _temp_0_0_0#8[k#3])
-      as__0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      cs_0_0_0#1[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
-      acol_0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      ccol_0_0_0#3[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
-      _temp_0_0_0#9[k#3] = (_dtr_stage - acol_0_0_0[k#3])
-      bcol_0_0_0#1[k#3] = (_temp_0_0_0#9[k#3] - ccol_0_0_0#3[k#3])
-      _temp_0_0_0#10[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#11[k#3] = (cs_0_0_0#1[k#3] * _temp_0_0_0#10[k#3])
-      _temp_0_0_0#12[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#13[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#12[k#3])
-      correction_term_0_0_0#1[k#3] = (_temp_0_0_0#13[k#3] - _temp_0_0_0#11[k#3])
-      _temp_0_0_0#14[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
-      _temp_0_0_0#15[k#3] = (_temp_0_0_0#14[k#3] + utens_0_0_0[k#3])
-      _temp_0_0_0#16[k#3] = (_temp_0_0_0#15[k#3] + utens_stage_0_0_0[k#3])
-      dcol_0_0_0#3[k#3] = (_temp_0_0_0#16[k#3] + correction_term_0_0_0#1[k#3])
-      _temp_0_0_0#17[k#3] = (ccol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#18[k#3] = (bcol_0_0_0#1[k#3] - _temp_0_0_0#17[k#3])
-      divided_0_0_0#1[k#3] = (1.0 / _temp_0_0_0#18[k#3])
-      ccol_0_0_0#4[k#3] = (ccol_0_0_0#3[k#3] * divided_0_0_0#1[k#3])
-      _temp_0_0_0#19[k#3] = (dcol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#20[k#3] = (dcol_0_0_0#3[k#3] - _temp_0_0_0#19[k#3])
-      dcol_0_0_0#4[k#3] = (_temp_0_0_0#20[k#3] * divided_0_0_0#1[k#3])
-      ccol_0_0_0[k#3] = ccol_0_0_0#4[k#3]
-      dcol_0_0_0[k#3] = dcol_0_0_0#4[k#3]
+      _temp_0_0_0#27[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
+      gav_0_0_0#1[k#3] = (-0.25 * _temp_0_0_0#27[k#3])
+      _temp_0_0_0#28[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
+      _temp_0_0_0#29[k#3] = (_temp_0_0_0#28[k#3] + wcon_0_0_0[(k#3 + 1)])
+      gcv_0_0_0#3[k#3] = (0.25 * _temp_0_0_0#29[k#3])
+      as__0_0_0#1[k#3] = (gav_0_0_0#1[k#3] * 0.5)
+      cs_0_0_0#3[k#3] = (gcv_0_0_0#3[k#3] * 0.5)
+      acol_0_0_0#1[k#3] = (gav_0_0_0#1[k#3] * 0.5)
+      ccol_0_0_0#7[k#3] = (gcv_0_0_0#3[k#3] * 0.5)
+      _temp_0_0_0#30[k#3] = (_dtr_stage - acol_0_0_0#1[k#3])
+      bcol_0_0_0#3[k#3] = (_temp_0_0_0#30[k#3] - ccol_0_0_0#7[k#3])
+      _temp_0_0_0#31[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#32[k#3] = (cs_0_0_0#3[k#3] * _temp_0_0_0#31[k#3])
+      _temp_0_0_0#33[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#34[k#3] = (-as__0_0_0#1[k#3] * _temp_0_0_0#33[k#3])
+      correction_term_0_0_0#3[k#3] = (_temp_0_0_0#34[k#3] - _temp_0_0_0#32[k#3])
+      _temp_0_0_0#35[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
+      _temp_0_0_0#36[k#3] = (_temp_0_0_0#35[k#3] + utens_0_0_0[k#3])
+      _temp_0_0_0#37[k#3] = (_temp_0_0_0#36[k#3] + utens_stage_0_0_0[k#3])
+      dcol_0_0_0#7[k#3] = (_temp_0_0_0#37[k#3] + correction_term_0_0_0#3[k#3])
+      _temp_0_0_0#38[k#3] = (ccol_0_0_0[(k#3 - 1)] * acol_0_0_0#1[k#3])
+      _temp_0_0_0#39[k#3] = (bcol_0_0_0#3[k#3] - _temp_0_0_0#38[k#3])
+      divided_0_0_0#3[k#3] = (1.0 / _temp_0_0_0#39[k#3])
+      ccol_0_0_0#8[k#3] = (ccol_0_0_0#7[k#3] * divided_0_0_0#3[k#3])
+      _temp_0_0_0#40[k#3] = (dcol_0_0_0[(k#3 - 1)] * acol_0_0_0#1[k#3])
+      _temp_0_0_0#41[k#3] = (dcol_0_0_0#7[k#3] - _temp_0_0_0#40[k#3])
+      dcol_0_0_0#8[k#3] = (_temp_0_0_0#41[k#3] * divided_0_0_0#3[k#3])
+      ccol_0_0_0[k#3] = ccol_0_0_0#8[k#3]
+      dcol_0_0_0[k#3] = dcol_0_0_0#8[k#3]
     }
     awaitall
     for i32 k#4 in [3:4:1] {
-      _temp_0_0_0#21[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
-      gav_0_0_0#1[k#4] = (-0.25 * _temp_0_0_0#21[k#4])
-      as__0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      acol_0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      bcol_0_0_0#2[k#4] = (_dtr_stage - acol_0_0_0#1[k#4])
-      _temp_0_0_0#22[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
-      correction_term_0_0_0#2[k#4] = (-as__0_0_0#1[k#4] * _temp_0_0_0#22[k#4])
-      _temp_0_0_0#23[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
-      _temp_0_0_0#24[k#4] = (_temp_0_0_0#23[k#4] + utens_0_0_0[k#4])
-      _temp_0_0_0#25[k#4] = (_temp_0_0_0#24[k#4] + utens_stage_0_0_0[k#4])
-      dcol_0_0_0#5[k#4] = (_temp_0_0_0#25[k#4] + correction_term_0_0_0#2[k#4])
-      _temp_0_0_0#26[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#27[k#4] = (bcol_0_0_0#2[k#4] - _temp_0_0_0#26[k#4])
-      divided_0_0_0#2[k#4] = (1.0 / _temp_0_0_0#27[k#4])
-      _temp_0_0_0#28[k#4] = (dcol_0_0_0#5[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#29[k#4] = (dcol_0_0_0#5[k#4] - _temp_0_0_0#28[k#4])
-      dcol_0_0_0#6[k#4] = (_temp_0_0_0#29[k#4] * divided_0_0_0#2[k#4])
-      dcol_0_0_0[k#4] = dcol_0_0_0#6[k#4]
+      _temp_0_0_0#51[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
+      gav_0_0_0#3[k#4] = (-0.25 * _temp_0_0_0#51[k#4])
+      as__0_0_0#3[k#4] = (gav_0_0_0#3[k#4] * 0.5)
+      acol_0_0_0#3[k#4] = (gav_0_0_0#3[k#4] * 0.5)
+      bcol_0_0_0#5[k#4] = (_dtr_stage - acol_0_0_0#3[k#4])
+      _temp_0_0_0#52[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
+      correction_term_0_0_0#5[k#4] = (-as__0_0_0#3[k#4] * _temp_0_0_0#52[k#4])
+      _temp_0_0_0#53[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
+      _temp_0_0_0#54[k#4] = (_temp_0_0_0#53[k#4] + utens_0_0_0[k#4])
+      _temp_0_0_0#55[k#4] = (_temp_0_0_0#54[k#4] + utens_stage_0_0_0[k#4])
+      dcol_0_0_0#11[k#4] = (_temp_0_0_0#55[k#4] + correction_term_0_0_0#5[k#4])
+      _temp_0_0_0#56[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#3[k#4])
+      _temp_0_0_0#57[k#4] = (bcol_0_0_0#5[k#4] - _temp_0_0_0#56[k#4])
+      divided_0_0_0#5[k#4] = (1.0 / _temp_0_0_0#57[k#4])
+      _temp_0_0_0#58[k#4] = (dcol_0_0_0[(k#4 - 1)] * acol_0_0_0#3[k#4])
+      _temp_0_0_0#59[k#4] = (dcol_0_0_0#11[k#4] - _temp_0_0_0#58[k#4])
+      dcol_0_0_0#12[k#4] = (_temp_0_0_0#59[k#4] * divided_0_0_0#5[k#4])
+      dcol_0_0_0[k#4] = dcol_0_0_0#12[k#4]
     }
     awaitall
     for i32 k#5 in [3:2:-1] {
-      datacol_0_0_0[k#5] = dcol_0_0_0[k#5]
-      data_col_0_0_0#1[k#5] = datacol_0_0_0[k#5]
-      _temp_0_0_0#30[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
-      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#30[k#5])
-      data_col_0_0_0[k#5] = data_col_0_0_0#1[k#5]
-      utens_stage_0_0_0#1[k#5] = utens_stage_0_0_0#2[k#5]
+      datacol_0_0_0#1[k#5] = dcol_0_0_0[k#5]
+      data_col_0_0_0#2[k#5] = datacol_0_0_0#1[k#5]
+      _temp_0_0_0#61[k#5] = (datacol_0_0_0#1[k#5] - u_pos_0_0_0[k#5])
+      utens_stage_0_0_0#3[k#5] = (_dtr_stage * _temp_0_0_0#61[k#5])
+      data_col_0_0_0[k#5] = data_col_0_0_0#2[k#5]
+      utens_stage_0_0_0#1[k#5] = utens_stage_0_0_0#3[k#5]
     }
     awaitall
     for i32 k#6 in [2:-1:-1] {
-      _temp_0_0_0#31[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
-      datacol_0_0_0#1[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#31[k#6])
-      data_col_0_0_0#2[k#6] = datacol_0_0_0#1[k#6]
-      _temp_0_0_0#32[k#6] = (datacol_0_0_0#1[k#6] - u_pos_0_0_0[k#6])
-      utens_stage_0_0_0#3[k#6] = (_dtr_stage * _temp_0_0_0#32[k#6])
-      data_col_0_0_0[k#6] = data_col_0_0_0#2[k#6]
-      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#3[k#6]
+      _temp_0_0_0#64[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
+      datacol_0_0_0#3[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#64[k#6])
+      data_col_0_0_0#4[k#6] = datacol_0_0_0#3[k#6]
+      _temp_0_0_0#65[k#6] = (datacol_0_0_0#3[k#6] - u_pos_0_0_0[k#6])
+      utens_stage_0_0_0#5[k#6] = (_dtr_stage * _temp_0_0_0#65[k#6])
+      data_col_0_0_0[k#6] = data_col_0_0_0#4[k#6]
+      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#5[k#6]
     }
     awaitall
     await send(utens_stage_0_0_0#1, __kernel_out_0[(i#2 - 1), (j#2 - 0)])
@@ -438,14 +438,14 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[16, 16] readonly _u_
     await receive(utens_stage_0_0_0, _utens_stage[i#2, j#2])
     await receive(wcon_0_0_0, _wcon[i#2, j#2])
     awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:5:1], receive(_stream_wcon#1) {
-      _refactored_wcon_1_0_0_0_0#1[k] = x
+    completion _recv_comp = foreach i32 k, f32 x in [0:5:1], receive(_stream_wcon#3) {
+      _refactored_wcon_1_0_0_0_0#3[k] = x
     }
-    completion _send_comp = send(wcon_0_0_0, _stream_wcon#2)
+    completion _send_comp = send(wcon_0_0_0, _stream_wcon#4)
     await _send_comp
     await _recv_comp
     await map i32 k#1 in [0:4:1] {
-      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#1[k#1]
+      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#3[k#1]
     }
     awaitall
     for i32 k#2 in [0:1:1] {
@@ -469,75 +469,75 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[16, 16] readonly _u_
     }
     awaitall
     for i32 k#3 in [1:3:1] {
-      _temp_0_0_0#6[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
-      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#6[k#3])
-      _temp_0_0_0#7[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
-      _temp_0_0_0#8[k#3] = (_temp_0_0_0#7[k#3] + wcon_0_0_0[(k#3 + 1)])
-      gcv_0_0_0#1[k#3] = (0.25 * _temp_0_0_0#8[k#3])
+      _temp_0_0_0#12[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
+      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#12[k#3])
+      _temp_0_0_0#13[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
+      _temp_0_0_0#14[k#3] = (_temp_0_0_0#13[k#3] + wcon_0_0_0[(k#3 + 1)])
+      gcv_0_0_0#2[k#3] = (0.25 * _temp_0_0_0#14[k#3])
       as__0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      cs_0_0_0#1[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
+      cs_0_0_0#2[k#3] = (gcv_0_0_0#2[k#3] * 0.5)
       acol_0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      ccol_0_0_0#3[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
-      _temp_0_0_0#9[k#3] = (_dtr_stage - acol_0_0_0[k#3])
-      bcol_0_0_0#1[k#3] = (_temp_0_0_0#9[k#3] - ccol_0_0_0#3[k#3])
-      _temp_0_0_0#10[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#11[k#3] = (cs_0_0_0#1[k#3] * _temp_0_0_0#10[k#3])
-      _temp_0_0_0#12[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#13[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#12[k#3])
-      correction_term_0_0_0#1[k#3] = (_temp_0_0_0#13[k#3] - _temp_0_0_0#11[k#3])
-      _temp_0_0_0#14[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
-      _temp_0_0_0#15[k#3] = (_temp_0_0_0#14[k#3] + utens_0_0_0[k#3])
-      _temp_0_0_0#16[k#3] = (_temp_0_0_0#15[k#3] + utens_stage_0_0_0[k#3])
-      dcol_0_0_0#3[k#3] = (_temp_0_0_0#16[k#3] + correction_term_0_0_0#1[k#3])
-      _temp_0_0_0#17[k#3] = (ccol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#18[k#3] = (bcol_0_0_0#1[k#3] - _temp_0_0_0#17[k#3])
-      divided_0_0_0#1[k#3] = (1.0 / _temp_0_0_0#18[k#3])
-      ccol_0_0_0#4[k#3] = (ccol_0_0_0#3[k#3] * divided_0_0_0#1[k#3])
-      _temp_0_0_0#19[k#3] = (dcol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#20[k#3] = (dcol_0_0_0#3[k#3] - _temp_0_0_0#19[k#3])
-      dcol_0_0_0#4[k#3] = (_temp_0_0_0#20[k#3] * divided_0_0_0#1[k#3])
-      ccol_0_0_0[k#3] = ccol_0_0_0#4[k#3]
-      dcol_0_0_0[k#3] = dcol_0_0_0#4[k#3]
+      ccol_0_0_0#5[k#3] = (gcv_0_0_0#2[k#3] * 0.5)
+      _temp_0_0_0#15[k#3] = (_dtr_stage - acol_0_0_0[k#3])
+      bcol_0_0_0#2[k#3] = (_temp_0_0_0#15[k#3] - ccol_0_0_0#5[k#3])
+      _temp_0_0_0#16[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#17[k#3] = (cs_0_0_0#2[k#3] * _temp_0_0_0#16[k#3])
+      _temp_0_0_0#18[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#19[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#18[k#3])
+      correction_term_0_0_0#2[k#3] = (_temp_0_0_0#19[k#3] - _temp_0_0_0#17[k#3])
+      _temp_0_0_0#20[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
+      _temp_0_0_0#21[k#3] = (_temp_0_0_0#20[k#3] + utens_0_0_0[k#3])
+      _temp_0_0_0#22[k#3] = (_temp_0_0_0#21[k#3] + utens_stage_0_0_0[k#3])
+      dcol_0_0_0#5[k#3] = (_temp_0_0_0#22[k#3] + correction_term_0_0_0#2[k#3])
+      _temp_0_0_0#23[k#3] = (ccol_0_0_0[(k#3 - 1)] * acol_0_0_0[k#3])
+      _temp_0_0_0#24[k#3] = (bcol_0_0_0#2[k#3] - _temp_0_0_0#23[k#3])
+      divided_0_0_0#2[k#3] = (1.0 / _temp_0_0_0#24[k#3])
+      ccol_0_0_0#6[k#3] = (ccol_0_0_0#5[k#3] * divided_0_0_0#2[k#3])
+      _temp_0_0_0#25[k#3] = (dcol_0_0_0[(k#3 - 1)] * acol_0_0_0[k#3])
+      _temp_0_0_0#26[k#3] = (dcol_0_0_0#5[k#3] - _temp_0_0_0#25[k#3])
+      dcol_0_0_0#6[k#3] = (_temp_0_0_0#26[k#3] * divided_0_0_0#2[k#3])
+      ccol_0_0_0[k#3] = ccol_0_0_0#6[k#3]
+      dcol_0_0_0[k#3] = dcol_0_0_0#6[k#3]
     }
     awaitall
     for i32 k#4 in [3:4:1] {
-      _temp_0_0_0#21[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
-      gav_0_0_0#1[k#4] = (-0.25 * _temp_0_0_0#21[k#4])
-      as__0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      acol_0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      bcol_0_0_0#2[k#4] = (_dtr_stage - acol_0_0_0#1[k#4])
-      _temp_0_0_0#22[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
-      correction_term_0_0_0#2[k#4] = (-as__0_0_0#1[k#4] * _temp_0_0_0#22[k#4])
-      _temp_0_0_0#23[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
-      _temp_0_0_0#24[k#4] = (_temp_0_0_0#23[k#4] + utens_0_0_0[k#4])
-      _temp_0_0_0#25[k#4] = (_temp_0_0_0#24[k#4] + utens_stage_0_0_0[k#4])
-      dcol_0_0_0#5[k#4] = (_temp_0_0_0#25[k#4] + correction_term_0_0_0#2[k#4])
-      _temp_0_0_0#26[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#27[k#4] = (bcol_0_0_0#2[k#4] - _temp_0_0_0#26[k#4])
-      divided_0_0_0#2[k#4] = (1.0 / _temp_0_0_0#27[k#4])
-      _temp_0_0_0#28[k#4] = (dcol_0_0_0#5[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#29[k#4] = (dcol_0_0_0#5[k#4] - _temp_0_0_0#28[k#4])
-      dcol_0_0_0#6[k#4] = (_temp_0_0_0#29[k#4] * divided_0_0_0#2[k#4])
-      dcol_0_0_0[k#4] = dcol_0_0_0#6[k#4]
+      _temp_0_0_0#42[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
+      gav_0_0_0#2[k#4] = (-0.25 * _temp_0_0_0#42[k#4])
+      as__0_0_0#2[k#4] = (gav_0_0_0#2[k#4] * 0.5)
+      acol_0_0_0#2[k#4] = (gav_0_0_0#2[k#4] * 0.5)
+      bcol_0_0_0#4[k#4] = (_dtr_stage - acol_0_0_0#2[k#4])
+      _temp_0_0_0#43[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
+      correction_term_0_0_0#4[k#4] = (-as__0_0_0#2[k#4] * _temp_0_0_0#43[k#4])
+      _temp_0_0_0#44[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
+      _temp_0_0_0#45[k#4] = (_temp_0_0_0#44[k#4] + utens_0_0_0[k#4])
+      _temp_0_0_0#46[k#4] = (_temp_0_0_0#45[k#4] + utens_stage_0_0_0[k#4])
+      dcol_0_0_0#9[k#4] = (_temp_0_0_0#46[k#4] + correction_term_0_0_0#4[k#4])
+      _temp_0_0_0#47[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#2[k#4])
+      _temp_0_0_0#48[k#4] = (bcol_0_0_0#4[k#4] - _temp_0_0_0#47[k#4])
+      divided_0_0_0#4[k#4] = (1.0 / _temp_0_0_0#48[k#4])
+      _temp_0_0_0#49[k#4] = (dcol_0_0_0[(k#4 - 1)] * acol_0_0_0#2[k#4])
+      _temp_0_0_0#50[k#4] = (dcol_0_0_0#9[k#4] - _temp_0_0_0#49[k#4])
+      dcol_0_0_0#10[k#4] = (_temp_0_0_0#50[k#4] * divided_0_0_0#4[k#4])
+      dcol_0_0_0[k#4] = dcol_0_0_0#10[k#4]
     }
     awaitall
     for i32 k#5 in [3:2:-1] {
       datacol_0_0_0[k#5] = dcol_0_0_0[k#5]
       data_col_0_0_0#1[k#5] = datacol_0_0_0[k#5]
-      _temp_0_0_0#30[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
-      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#30[k#5])
+      _temp_0_0_0#60[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
+      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#60[k#5])
       data_col_0_0_0[k#5] = data_col_0_0_0#1[k#5]
       utens_stage_0_0_0#1[k#5] = utens_stage_0_0_0#2[k#5]
     }
     awaitall
     for i32 k#6 in [2:-1:-1] {
-      _temp_0_0_0#31[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
-      datacol_0_0_0#1[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#31[k#6])
-      data_col_0_0_0#2[k#6] = datacol_0_0_0#1[k#6]
-      _temp_0_0_0#32[k#6] = (datacol_0_0_0#1[k#6] - u_pos_0_0_0[k#6])
-      utens_stage_0_0_0#3[k#6] = (_dtr_stage * _temp_0_0_0#32[k#6])
-      data_col_0_0_0[k#6] = data_col_0_0_0#2[k#6]
-      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#3[k#6]
+      _temp_0_0_0#62[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
+      datacol_0_0_0#2[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#62[k#6])
+      data_col_0_0_0#3[k#6] = datacol_0_0_0#2[k#6]
+      _temp_0_0_0#63[k#6] = (datacol_0_0_0#2[k#6] - u_pos_0_0_0[k#6])
+      utens_stage_0_0_0#4[k#6] = (_dtr_stage * _temp_0_0_0#63[k#6])
+      data_col_0_0_0[k#6] = data_col_0_0_0#3[k#6]
+      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#4[k#6]
     }
     awaitall
     await send(utens_stage_0_0_0#1, __kernel_out_0[(i#2 - 1), (j#2 - 0)])
@@ -550,14 +550,14 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[16, 16] readonly _u_
     await receive(utens_stage_0_0_0, _utens_stage[i#2, j#2])
     await receive(wcon_0_0_0, _wcon[i#2, j#2])
     awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:5:1], receive(_stream_wcon#2) {
-      _refactored_wcon_1_0_0_0_0#1[k] = x
+    completion _recv_comp = foreach i32 k, f32 x in [0:5:1], receive(_stream_wcon#4) {
+      _refactored_wcon_1_0_0_0_0#3[k] = x
     }
-    completion _send_comp = send(wcon_0_0_0, _stream_wcon#1)
+    completion _send_comp = send(wcon_0_0_0, _stream_wcon#3)
     await _send_comp
     await _recv_comp
     await map i32 k#1 in [0:4:1] {
-      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#1[k#1]
+      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#3[k#1]
     }
     awaitall
     for i32 k#2 in [0:1:1] {
@@ -581,75 +581,75 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[16, 16] readonly _u_
     }
     awaitall
     for i32 k#3 in [1:3:1] {
-      _temp_0_0_0#6[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
-      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#6[k#3])
-      _temp_0_0_0#7[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
-      _temp_0_0_0#8[k#3] = (_temp_0_0_0#7[k#3] + wcon_0_0_0[(k#3 + 1)])
-      gcv_0_0_0#1[k#3] = (0.25 * _temp_0_0_0#8[k#3])
+      _temp_0_0_0#12[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
+      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#12[k#3])
+      _temp_0_0_0#13[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
+      _temp_0_0_0#14[k#3] = (_temp_0_0_0#13[k#3] + wcon_0_0_0[(k#3 + 1)])
+      gcv_0_0_0#2[k#3] = (0.25 * _temp_0_0_0#14[k#3])
       as__0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      cs_0_0_0#1[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
+      cs_0_0_0#2[k#3] = (gcv_0_0_0#2[k#3] * 0.5)
       acol_0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      ccol_0_0_0#3[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
-      _temp_0_0_0#9[k#3] = (_dtr_stage - acol_0_0_0[k#3])
-      bcol_0_0_0#1[k#3] = (_temp_0_0_0#9[k#3] - ccol_0_0_0#3[k#3])
-      _temp_0_0_0#10[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#11[k#3] = (cs_0_0_0#1[k#3] * _temp_0_0_0#10[k#3])
-      _temp_0_0_0#12[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#13[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#12[k#3])
-      correction_term_0_0_0#1[k#3] = (_temp_0_0_0#13[k#3] - _temp_0_0_0#11[k#3])
-      _temp_0_0_0#14[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
-      _temp_0_0_0#15[k#3] = (_temp_0_0_0#14[k#3] + utens_0_0_0[k#3])
-      _temp_0_0_0#16[k#3] = (_temp_0_0_0#15[k#3] + utens_stage_0_0_0[k#3])
-      dcol_0_0_0#3[k#3] = (_temp_0_0_0#16[k#3] + correction_term_0_0_0#1[k#3])
-      _temp_0_0_0#17[k#3] = (ccol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#18[k#3] = (bcol_0_0_0#1[k#3] - _temp_0_0_0#17[k#3])
-      divided_0_0_0#1[k#3] = (1.0 / _temp_0_0_0#18[k#3])
-      ccol_0_0_0#4[k#3] = (ccol_0_0_0#3[k#3] * divided_0_0_0#1[k#3])
-      _temp_0_0_0#19[k#3] = (dcol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#20[k#3] = (dcol_0_0_0#3[k#3] - _temp_0_0_0#19[k#3])
-      dcol_0_0_0#4[k#3] = (_temp_0_0_0#20[k#3] * divided_0_0_0#1[k#3])
-      ccol_0_0_0[k#3] = ccol_0_0_0#4[k#3]
-      dcol_0_0_0[k#3] = dcol_0_0_0#4[k#3]
+      ccol_0_0_0#5[k#3] = (gcv_0_0_0#2[k#3] * 0.5)
+      _temp_0_0_0#15[k#3] = (_dtr_stage - acol_0_0_0[k#3])
+      bcol_0_0_0#2[k#3] = (_temp_0_0_0#15[k#3] - ccol_0_0_0#5[k#3])
+      _temp_0_0_0#16[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#17[k#3] = (cs_0_0_0#2[k#3] * _temp_0_0_0#16[k#3])
+      _temp_0_0_0#18[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#19[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#18[k#3])
+      correction_term_0_0_0#2[k#3] = (_temp_0_0_0#19[k#3] - _temp_0_0_0#17[k#3])
+      _temp_0_0_0#20[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
+      _temp_0_0_0#21[k#3] = (_temp_0_0_0#20[k#3] + utens_0_0_0[k#3])
+      _temp_0_0_0#22[k#3] = (_temp_0_0_0#21[k#3] + utens_stage_0_0_0[k#3])
+      dcol_0_0_0#5[k#3] = (_temp_0_0_0#22[k#3] + correction_term_0_0_0#2[k#3])
+      _temp_0_0_0#23[k#3] = (ccol_0_0_0[(k#3 - 1)] * acol_0_0_0[k#3])
+      _temp_0_0_0#24[k#3] = (bcol_0_0_0#2[k#3] - _temp_0_0_0#23[k#3])
+      divided_0_0_0#2[k#3] = (1.0 / _temp_0_0_0#24[k#3])
+      ccol_0_0_0#6[k#3] = (ccol_0_0_0#5[k#3] * divided_0_0_0#2[k#3])
+      _temp_0_0_0#25[k#3] = (dcol_0_0_0[(k#3 - 1)] * acol_0_0_0[k#3])
+      _temp_0_0_0#26[k#3] = (dcol_0_0_0#5[k#3] - _temp_0_0_0#25[k#3])
+      dcol_0_0_0#6[k#3] = (_temp_0_0_0#26[k#3] * divided_0_0_0#2[k#3])
+      ccol_0_0_0[k#3] = ccol_0_0_0#6[k#3]
+      dcol_0_0_0[k#3] = dcol_0_0_0#6[k#3]
     }
     awaitall
     for i32 k#4 in [3:4:1] {
-      _temp_0_0_0#21[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
-      gav_0_0_0#1[k#4] = (-0.25 * _temp_0_0_0#21[k#4])
-      as__0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      acol_0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      bcol_0_0_0#2[k#4] = (_dtr_stage - acol_0_0_0#1[k#4])
-      _temp_0_0_0#22[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
-      correction_term_0_0_0#2[k#4] = (-as__0_0_0#1[k#4] * _temp_0_0_0#22[k#4])
-      _temp_0_0_0#23[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
-      _temp_0_0_0#24[k#4] = (_temp_0_0_0#23[k#4] + utens_0_0_0[k#4])
-      _temp_0_0_0#25[k#4] = (_temp_0_0_0#24[k#4] + utens_stage_0_0_0[k#4])
-      dcol_0_0_0#5[k#4] = (_temp_0_0_0#25[k#4] + correction_term_0_0_0#2[k#4])
-      _temp_0_0_0#26[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#27[k#4] = (bcol_0_0_0#2[k#4] - _temp_0_0_0#26[k#4])
-      divided_0_0_0#2[k#4] = (1.0 / _temp_0_0_0#27[k#4])
-      _temp_0_0_0#28[k#4] = (dcol_0_0_0#5[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#29[k#4] = (dcol_0_0_0#5[k#4] - _temp_0_0_0#28[k#4])
-      dcol_0_0_0#6[k#4] = (_temp_0_0_0#29[k#4] * divided_0_0_0#2[k#4])
-      dcol_0_0_0[k#4] = dcol_0_0_0#6[k#4]
+      _temp_0_0_0#42[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
+      gav_0_0_0#2[k#4] = (-0.25 * _temp_0_0_0#42[k#4])
+      as__0_0_0#2[k#4] = (gav_0_0_0#2[k#4] * 0.5)
+      acol_0_0_0#2[k#4] = (gav_0_0_0#2[k#4] * 0.5)
+      bcol_0_0_0#4[k#4] = (_dtr_stage - acol_0_0_0#2[k#4])
+      _temp_0_0_0#43[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
+      correction_term_0_0_0#4[k#4] = (-as__0_0_0#2[k#4] * _temp_0_0_0#43[k#4])
+      _temp_0_0_0#44[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
+      _temp_0_0_0#45[k#4] = (_temp_0_0_0#44[k#4] + utens_0_0_0[k#4])
+      _temp_0_0_0#46[k#4] = (_temp_0_0_0#45[k#4] + utens_stage_0_0_0[k#4])
+      dcol_0_0_0#9[k#4] = (_temp_0_0_0#46[k#4] + correction_term_0_0_0#4[k#4])
+      _temp_0_0_0#47[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#2[k#4])
+      _temp_0_0_0#48[k#4] = (bcol_0_0_0#4[k#4] - _temp_0_0_0#47[k#4])
+      divided_0_0_0#4[k#4] = (1.0 / _temp_0_0_0#48[k#4])
+      _temp_0_0_0#49[k#4] = (dcol_0_0_0[(k#4 - 1)] * acol_0_0_0#2[k#4])
+      _temp_0_0_0#50[k#4] = (dcol_0_0_0#9[k#4] - _temp_0_0_0#49[k#4])
+      dcol_0_0_0#10[k#4] = (_temp_0_0_0#50[k#4] * divided_0_0_0#4[k#4])
+      dcol_0_0_0[k#4] = dcol_0_0_0#10[k#4]
     }
     awaitall
     for i32 k#5 in [3:2:-1] {
       datacol_0_0_0[k#5] = dcol_0_0_0[k#5]
       data_col_0_0_0#1[k#5] = datacol_0_0_0[k#5]
-      _temp_0_0_0#30[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
-      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#30[k#5])
+      _temp_0_0_0#60[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
+      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#60[k#5])
       data_col_0_0_0[k#5] = data_col_0_0_0#1[k#5]
       utens_stage_0_0_0#1[k#5] = utens_stage_0_0_0#2[k#5]
     }
     awaitall
     for i32 k#6 in [2:-1:-1] {
-      _temp_0_0_0#31[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
-      datacol_0_0_0#1[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#31[k#6])
-      data_col_0_0_0#2[k#6] = datacol_0_0_0#1[k#6]
-      _temp_0_0_0#32[k#6] = (datacol_0_0_0#1[k#6] - u_pos_0_0_0[k#6])
-      utens_stage_0_0_0#3[k#6] = (_dtr_stage * _temp_0_0_0#32[k#6])
-      data_col_0_0_0[k#6] = data_col_0_0_0#2[k#6]
-      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#3[k#6]
+      _temp_0_0_0#62[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
+      datacol_0_0_0#2[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#62[k#6])
+      data_col_0_0_0#3[k#6] = datacol_0_0_0#2[k#6]
+      _temp_0_0_0#63[k#6] = (datacol_0_0_0#2[k#6] - u_pos_0_0_0[k#6])
+      utens_stage_0_0_0#4[k#6] = (_dtr_stage * _temp_0_0_0#63[k#6])
+      data_col_0_0_0[k#6] = data_col_0_0_0#3[k#6]
+      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#4[k#6]
     }
     awaitall
     await send(utens_stage_0_0_0#1, __kernel_out_0[(i#2 - 1), (j#2 - 0)])
diff --git a/samples/benchmarks/vertical_advection_4_4_4.sptl b/samples/benchmarks/vertical_advection_4_4_4.sptl
index 0902a275..6ec57b91 100644
--- a/samples/benchmarks/vertical_advection_4_4_4.sptl
+++ b/samples/benchmarks/vertical_advection_4_4_4.sptl
@@ -10,86 +10,86 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[4, 4] readonly _u_po
     f32[5] utens_0_0_0
     f32[5] utens_stage_0_0_0
     f32[5] wcon_0_0_0
-    f32[5] _refactored_wcon_1_0_0_0_0#1
-    f32[2] gcv_0_0_0
-    f32[2] _temp_0_0_0
-    f32[2] _temp_0_0_0#1
-    f32[2] cs_0_0_0
-    f32[2] ccol_0_0_0#1
-    f32[2] bcol_0_0_0
-    f32[2] correction_term_0_0_0
-    f32[2] _temp_0_0_0#2
-    f32[2] dcol_0_0_0#1
-    f32[2] _temp_0_0_0#3
-    f32[2] _temp_0_0_0#4
-    f32[2] _temp_0_0_0#5
-    f32[2] divided_0_0_0
-    f32[2] ccol_0_0_0#2
-    f32[2] dcol_0_0_0#2
-    f32[4] gav_0_0_m1
-    f32[4] gav_0_0_0
-    f32[4] _temp_0_0_0#6
-    f32[4] gcv_0_0_m1
-    f32[4] gcv_0_0_0#1
-    f32[4] _temp_0_0_0#7
-    f32[4] _temp_0_0_0#8
-    f32[4] as__0_0_m1
-    f32[4] as__0_0_0
-    f32[4] cs_0_0_m1
-    f32[4] cs_0_0_0#1
-    f32[4] acol_0_0_0
-    f32[4] ccol_0_0_m1
-    f32[4] ccol_0_0_0#3
-    f32[4] bcol_0_0_0#1
-    f32[4] _temp_0_0_0#9
-    f32[4] correction_term_0_0_m1
-    f32[4] correction_term_0_0_0#1
-    f32[4] _temp_0_0_0#10
-    f32[4] _temp_0_0_0#11
-    f32[4] _temp_0_0_0#12
-    f32[4] _temp_0_0_0#13
-    f32[4] dcol_0_0_m1
-    f32[4] dcol_0_0_0#3
-    f32[4] _temp_0_0_0#14
-    f32[4] _temp_0_0_0#15
-    f32[4] _temp_0_0_0#16
-    f32[4] divided_0_0_0#1
-    f32[4] _temp_0_0_0#17
-    f32[4] _temp_0_0_0#18
-    f32[4] ccol_0_0_0#4
-    f32[4] dcol_0_0_0#4
-    f32[4] _temp_0_0_0#19
-    f32[4] _temp_0_0_0#20
-    f32[5] gav_0_0_m1#1
-    f32[5] gav_0_0_0#1
-    f32[5] _temp_0_0_0#21
-    f32[5] as__0_0_m1#1
-    f32[5] as__0_0_0#1
-    f32[5] acol_0_0_0#1
-    f32[5] bcol_0_0_0#2
-    f32[5] correction_term_0_0_m1#1
-    f32[5] correction_term_0_0_0#2
-    f32[5] _temp_0_0_0#22
-    f32[5] dcol_0_0_m1#1
-    f32[5] dcol_0_0_0#5
-    f32[5] _temp_0_0_0#23
-    f32[5] _temp_0_0_0#24
-    f32[5] _temp_0_0_0#25
-    f32[5] divided_0_0_0#2
-    f32[5] _temp_0_0_0#26
-    f32[5] _temp_0_0_0#27
-    f32[5] dcol_0_0_0#6
-    f32[5] _temp_0_0_0#28
-    f32[5] _temp_0_0_0#29
-    f32[5] datacol_0_0_0
-    f32[5] data_col_0_0_0#1
-    f32[5] utens_stage_0_0_0#2
-    f32[5] _temp_0_0_0#30
-    f32[4] datacol_0_0_0#1
+    f32[5] _refactored_wcon_1_0_0_0_0#2
+    f32[2] gcv_0_0_0#1
+    f32[2] _temp_0_0_0#6
+    f32[2] _temp_0_0_0#7
+    f32[2] cs_0_0_0#1
+    f32[2] ccol_0_0_0#3
+    f32[2] bcol_0_0_0#1
+    f32[2] correction_term_0_0_0#1
+    f32[2] _temp_0_0_0#8
+    f32[2] dcol_0_0_0#3
+    f32[2] _temp_0_0_0#9
+    f32[2] _temp_0_0_0#10
+    f32[2] _temp_0_0_0#11
+    f32[2] divided_0_0_0#1
+    f32[2] ccol_0_0_0#4
+    f32[2] dcol_0_0_0#4
+    f32[4] gav_0_0_m1#1
+    f32[4] gav_0_0_0#1
+    f32[4] _temp_0_0_0#27
+    f32[4] gcv_0_0_m1#1
+    f32[4] gcv_0_0_0#3
+    f32[4] _temp_0_0_0#28
+    f32[4] _temp_0_0_0#29
+    f32[4] as__0_0_m1#1
+    f32[4] as__0_0_0#1
+    f32[4] cs_0_0_m1#1
+    f32[4] cs_0_0_0#3
+    f32[4] acol_0_0_0#1
+    f32[4] ccol_0_0_m1#1
+    f32[4] ccol_0_0_0#7
+    f32[4] bcol_0_0_0#3
+    f32[4] _temp_0_0_0#30
+    f32[4] correction_term_0_0_m1#1
+    f32[4] correction_term_0_0_0#3
     f32[4] _temp_0_0_0#31
-    f32[4] data_col_0_0_0#2
-    f32[4] utens_stage_0_0_0#3
     f32[4] _temp_0_0_0#32
+    f32[4] _temp_0_0_0#33
+    f32[4] _temp_0_0_0#34
+    f32[4] dcol_0_0_m1#1
+    f32[4] dcol_0_0_0#7
+    f32[4] _temp_0_0_0#35
+    f32[4] _temp_0_0_0#36
+    f32[4] _temp_0_0_0#37
+    f32[4] divided_0_0_0#3
+    f32[4] _temp_0_0_0#38
+    f32[4] _temp_0_0_0#39
+    f32[4] ccol_0_0_0#8
+    f32[4] dcol_0_0_0#8
+    f32[4] _temp_0_0_0#40
+    f32[4] _temp_0_0_0#41
+    f32[5] gav_0_0_m1#3
+    f32[5] gav_0_0_0#3
+    f32[5] _temp_0_0_0#51
+    f32[5] as__0_0_m1#3
+    f32[5] as__0_0_0#3
+    f32[5] acol_0_0_0#3
+    f32[5] bcol_0_0_0#5
+    f32[5] correction_term_0_0_m1#3
+    f32[5] correction_term_0_0_0#5
+    f32[5] _temp_0_0_0#52
+    f32[5] dcol_0_0_m1#3
+    f32[5] dcol_0_0_0#11
+    f32[5] _temp_0_0_0#53
+    f32[5] _temp_0_0_0#54
+    f32[5] _temp_0_0_0#55
+    f32[5] divided_0_0_0#5
+    f32[5] _temp_0_0_0#56
+    f32[5] _temp_0_0_0#57
+    f32[5] dcol_0_0_0#12
+    f32[5] _temp_0_0_0#58
+    f32[5] _temp_0_0_0#59
+    f32[5] datacol_0_0_0#1
+    f32[5] data_col_0_0_0#2
+    f32[5] utens_stage_0_0_0#3
+    f32[5] _temp_0_0_0#61
+    f32[4] datacol_0_0_0#3
+    f32[4] _temp_0_0_0#64
+    f32[4] data_col_0_0_0#4
+    f32[4] utens_stage_0_0_0#5
+    f32[4] _temp_0_0_0#65
   }
   place u16 i, u16 j in [2:5:2 , 0:4:1] {
     f32[5] utens_stage_0_0_0#1
@@ -102,7 +102,7 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[4, 4] readonly _u_po
     f32[5] utens_0_0_0
     f32[5] utens_stage_0_0_0
     f32[5] wcon_0_0_0
-    f32[5] _refactored_wcon_1_0_0_0_0#1
+    f32[5] _refactored_wcon_1_0_0_0_0#3
     f32[2] gcv_0_0_0
     f32[2] _temp_0_0_0
     f32[2] _temp_0_0_0#1
@@ -120,68 +120,68 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[4, 4] readonly _u_po
     f32[2] dcol_0_0_0#2
     f32[4] gav_0_0_m1
     f32[4] gav_0_0_0
-    f32[4] _temp_0_0_0#6
+    f32[4] _temp_0_0_0#12
     f32[4] gcv_0_0_m1
-    f32[4] gcv_0_0_0#1
-    f32[4] _temp_0_0_0#7
-    f32[4] _temp_0_0_0#8
+    f32[4] gcv_0_0_0#2
+    f32[4] _temp_0_0_0#13
+    f32[4] _temp_0_0_0#14
     f32[4] as__0_0_m1
     f32[4] as__0_0_0
     f32[4] cs_0_0_m1
-    f32[4] cs_0_0_0#1
+    f32[4] cs_0_0_0#2
     f32[4] acol_0_0_0
     f32[4] ccol_0_0_m1
-    f32[4] ccol_0_0_0#3
-    f32[4] bcol_0_0_0#1
-    f32[4] _temp_0_0_0#9
-    f32[4] correction_term_0_0_m1
-    f32[4] correction_term_0_0_0#1
-    f32[4] _temp_0_0_0#10
-    f32[4] _temp_0_0_0#11
-    f32[4] _temp_0_0_0#12
-    f32[4] _temp_0_0_0#13
-    f32[4] dcol_0_0_m1
-    f32[4] dcol_0_0_0#3
-    f32[4] _temp_0_0_0#14
+    f32[4] ccol_0_0_0#5
+    f32[4] bcol_0_0_0#2
     f32[4] _temp_0_0_0#15
+    f32[4] correction_term_0_0_m1
+    f32[4] correction_term_0_0_0#2
     f32[4] _temp_0_0_0#16
-    f32[4] divided_0_0_0#1
     f32[4] _temp_0_0_0#17
     f32[4] _temp_0_0_0#18
-    f32[4] ccol_0_0_0#4
-    f32[4] dcol_0_0_0#4
     f32[4] _temp_0_0_0#19
+    f32[4] dcol_0_0_m1
+    f32[4] dcol_0_0_0#5
     f32[4] _temp_0_0_0#20
-    f32[5] gav_0_0_m1#1
-    f32[5] gav_0_0_0#1
-    f32[5] _temp_0_0_0#21
-    f32[5] as__0_0_m1#1
-    f32[5] as__0_0_0#1
-    f32[5] acol_0_0_0#1
-    f32[5] bcol_0_0_0#2
-    f32[5] correction_term_0_0_m1#1
-    f32[5] correction_term_0_0_0#2
-    f32[5] _temp_0_0_0#22
-    f32[5] dcol_0_0_m1#1
-    f32[5] dcol_0_0_0#5
-    f32[5] _temp_0_0_0#23
-    f32[5] _temp_0_0_0#24
-    f32[5] _temp_0_0_0#25
-    f32[5] divided_0_0_0#2
-    f32[5] _temp_0_0_0#26
-    f32[5] _temp_0_0_0#27
-    f32[5] dcol_0_0_0#6
-    f32[5] _temp_0_0_0#28
-    f32[5] _temp_0_0_0#29
+    f32[4] _temp_0_0_0#21
+    f32[4] _temp_0_0_0#22
+    f32[4] divided_0_0_0#2
+    f32[4] _temp_0_0_0#23
+    f32[4] _temp_0_0_0#24
+    f32[4] ccol_0_0_0#6
+    f32[4] dcol_0_0_0#6
+    f32[4] _temp_0_0_0#25
+    f32[4] _temp_0_0_0#26
+    f32[5] gav_0_0_m1#2
+    f32[5] gav_0_0_0#2
+    f32[5] _temp_0_0_0#42
+    f32[5] as__0_0_m1#2
+    f32[5] as__0_0_0#2
+    f32[5] acol_0_0_0#2
+    f32[5] bcol_0_0_0#4
+    f32[5] correction_term_0_0_m1#2
+    f32[5] correction_term_0_0_0#4
+    f32[5] _temp_0_0_0#43
+    f32[5] dcol_0_0_m1#2
+    f32[5] dcol_0_0_0#9
+    f32[5] _temp_0_0_0#44
+    f32[5] _temp_0_0_0#45
+    f32[5] _temp_0_0_0#46
+    f32[5] divided_0_0_0#4
+    f32[5] _temp_0_0_0#47
+    f32[5] _temp_0_0_0#48
+    f32[5] dcol_0_0_0#10
+    f32[5] _temp_0_0_0#49
+    f32[5] _temp_0_0_0#50
     f32[5] datacol_0_0_0
     f32[5] data_col_0_0_0#1
     f32[5] utens_stage_0_0_0#2
-    f32[5] _temp_0_0_0#30
-    f32[4] datacol_0_0_0#1
-    f32[4] _temp_0_0_0#31
-    f32[4] data_col_0_0_0#2
-    f32[4] utens_stage_0_0_0#3
-    f32[4] _temp_0_0_0#32
+    f32[5] _temp_0_0_0#60
+    f32[4] datacol_0_0_0#2
+    f32[4] _temp_0_0_0#62
+    f32[4] data_col_0_0_0#3
+    f32[4] utens_stage_0_0_0#4
+    f32[4] _temp_0_0_0#63
   }
   place u16 i, u16 j in [3:5:2 , 0:4:1] {
     f32[5] utens_stage_0_0_0#1
@@ -194,7 +194,7 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[4, 4] readonly _u_po
     f32[5] utens_0_0_0
     f32[5] utens_stage_0_0_0
     f32[5] wcon_0_0_0
-    f32[5] _refactored_wcon_1_0_0_0_0#1
+    f32[5] _refactored_wcon_1_0_0_0_0#3
     f32[2] gcv_0_0_0
     f32[2] _temp_0_0_0
     f32[2] _temp_0_0_0#1
@@ -212,68 +212,68 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[4, 4] readonly _u_po
     f32[2] dcol_0_0_0#2
     f32[4] gav_0_0_m1
     f32[4] gav_0_0_0
-    f32[4] _temp_0_0_0#6
+    f32[4] _temp_0_0_0#12
     f32[4] gcv_0_0_m1
-    f32[4] gcv_0_0_0#1
-    f32[4] _temp_0_0_0#7
-    f32[4] _temp_0_0_0#8
+    f32[4] gcv_0_0_0#2
+    f32[4] _temp_0_0_0#13
+    f32[4] _temp_0_0_0#14
     f32[4] as__0_0_m1
     f32[4] as__0_0_0
     f32[4] cs_0_0_m1
-    f32[4] cs_0_0_0#1
+    f32[4] cs_0_0_0#2
     f32[4] acol_0_0_0
     f32[4] ccol_0_0_m1
-    f32[4] ccol_0_0_0#3
-    f32[4] bcol_0_0_0#1
-    f32[4] _temp_0_0_0#9
-    f32[4] correction_term_0_0_m1
-    f32[4] correction_term_0_0_0#1
-    f32[4] _temp_0_0_0#10
-    f32[4] _temp_0_0_0#11
-    f32[4] _temp_0_0_0#12
-    f32[4] _temp_0_0_0#13
-    f32[4] dcol_0_0_m1
-    f32[4] dcol_0_0_0#3
-    f32[4] _temp_0_0_0#14
+    f32[4] ccol_0_0_0#5
+    f32[4] bcol_0_0_0#2
     f32[4] _temp_0_0_0#15
+    f32[4] correction_term_0_0_m1
+    f32[4] correction_term_0_0_0#2
     f32[4] _temp_0_0_0#16
-    f32[4] divided_0_0_0#1
     f32[4] _temp_0_0_0#17
     f32[4] _temp_0_0_0#18
-    f32[4] ccol_0_0_0#4
-    f32[4] dcol_0_0_0#4
     f32[4] _temp_0_0_0#19
+    f32[4] dcol_0_0_m1
+    f32[4] dcol_0_0_0#5
     f32[4] _temp_0_0_0#20
-    f32[5] gav_0_0_m1#1
-    f32[5] gav_0_0_0#1
-    f32[5] _temp_0_0_0#21
-    f32[5] as__0_0_m1#1
-    f32[5] as__0_0_0#1
-    f32[5] acol_0_0_0#1
-    f32[5] bcol_0_0_0#2
-    f32[5] correction_term_0_0_m1#1
-    f32[5] correction_term_0_0_0#2
-    f32[5] _temp_0_0_0#22
-    f32[5] dcol_0_0_m1#1
-    f32[5] dcol_0_0_0#5
-    f32[5] _temp_0_0_0#23
-    f32[5] _temp_0_0_0#24
-    f32[5] _temp_0_0_0#25
-    f32[5] divided_0_0_0#2
-    f32[5] _temp_0_0_0#26
-    f32[5] _temp_0_0_0#27
-    f32[5] dcol_0_0_0#6
-    f32[5] _temp_0_0_0#28
-    f32[5] _temp_0_0_0#29
+    f32[4] _temp_0_0_0#21
+    f32[4] _temp_0_0_0#22
+    f32[4] divided_0_0_0#2
+    f32[4] _temp_0_0_0#23
+    f32[4] _temp_0_0_0#24
+    f32[4] ccol_0_0_0#6
+    f32[4] dcol_0_0_0#6
+    f32[4] _temp_0_0_0#25
+    f32[4] _temp_0_0_0#26
+    f32[5] gav_0_0_m1#2
+    f32[5] gav_0_0_0#2
+    f32[5] _temp_0_0_0#42
+    f32[5] as__0_0_m1#2
+    f32[5] as__0_0_0#2
+    f32[5] acol_0_0_0#2
+    f32[5] bcol_0_0_0#4
+    f32[5] correction_term_0_0_m1#2
+    f32[5] correction_term_0_0_0#4
+    f32[5] _temp_0_0_0#43
+    f32[5] dcol_0_0_m1#2
+    f32[5] dcol_0_0_0#9
+    f32[5] _temp_0_0_0#44
+    f32[5] _temp_0_0_0#45
+    f32[5] _temp_0_0_0#46
+    f32[5] divided_0_0_0#4
+    f32[5] _temp_0_0_0#47
+    f32[5] _temp_0_0_0#48
+    f32[5] dcol_0_0_0#10
+    f32[5] _temp_0_0_0#49
+    f32[5] _temp_0_0_0#50
     f32[5] datacol_0_0_0
     f32[5] data_col_0_0_0#1
     f32[5] utens_stage_0_0_0#2
-    f32[5] _temp_0_0_0#30
-    f32[4] datacol_0_0_0#1
-    f32[4] _temp_0_0_0#31
-    f32[4] data_col_0_0_0#2
-    f32[4] utens_stage_0_0_0#3
-    f32[4] _temp_0_0_0#32
+    f32[5] _temp_0_0_0#60
+    f32[4] datacol_0_0_0#2
+    f32[4] _temp_0_0_0#62
+    f32[4] data_col_0_0_0#3
+    f32[4] utens_stage_0_0_0#4
+    f32[4] _temp_0_0_0#63
   }
   place u16 i#1, u16 j#1 in [5:6:2 , 0:4:1] {
     f32[5] wcon_0_0_0
@@ -283,43 +283,43 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[4, 4] readonly _u_po
   }
   dataflow u16 i#5, u16 j#5 in [5:6:2 , 0:4:1] {
     stream<f32> _stream_wcon#1 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 0
-    }
+  hops = [(-1, 0)], 
+  channel = 0
+}
     stream<f32> _stream_wcon#2 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 1
-    }
+  hops = [(-1, 0)], 
+  channel = 1
+}
   }
   dataflow u16 i#5, u16 j#5 in [2:5:2 , 0:4:1] {
-    stream<f32> _stream_wcon#1 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_wcon#2 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 1
-    }
+    stream<f32> _stream_wcon#3 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 2
+}
+    stream<f32> _stream_wcon#4 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 3
+}
   }
   dataflow u16 i#5, u16 j#5 in [3:5:2 , 0:4:1] {
-    stream<f32> _stream_wcon#1 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_wcon#2 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 1
-    }
+    stream<f32> _stream_wcon#3 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 2
+}
+    stream<f32> _stream_wcon#4 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 3
+}
   }
   dataflow u16 i#5, u16 j#5 in [1:2:2 , 0:4:1] {
-    stream<f32> _stream_wcon#1 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 0
-    }
-    stream<f32> _stream_wcon#2 = relative_stream(-1, 0) {
-      hops = [(-1, 0)], 
-      channel = 1
-    }
+    stream<f32> _stream_wcon#5 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 4
+}
+    stream<f32> _stream_wcon#6 = relative_stream(-1, 0) {
+  hops = [(-1, 0)], 
+  channel = 5
+}
   }
   compute u16 i#2, u16 j#2 in [1:2:2 , 0:4:1] {
     await receive(u_pos_0_0_0, _u_pos[i#2, j#2])
@@ -328,104 +328,104 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[4, 4] readonly _u_po
     await receive(utens_stage_0_0_0, _utens_stage[i#2, j#2])
     await receive(wcon_0_0_0, _wcon[i#2, j#2])
     awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:5:1], receive(_stream_wcon#2) {
-      _refactored_wcon_1_0_0_0_0#1[k] = x
+    completion _recv_comp = foreach i32 k, f32 x in [0:5:1], receive(_stream_wcon#6) {
+      _refactored_wcon_1_0_0_0_0#2[k] = x
     }
     await _recv_comp
     await map i32 k#1 in [0:4:1] {
-      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#1[k#1]
+      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#2[k#1]
     }
     awaitall
     for i32 k#2 in [0:1:1] {
-      _temp_0_0_0[k#2] = _refactored_wcon_1_0_0_0_0[(k#2 + 1)]
-      _temp_0_0_0#1[k#2] = (_temp_0_0_0[k#2] + wcon_0_0_0[(k#2 + 1)])
-      gcv_0_0_0[k#2] = (0.25 * _temp_0_0_0#1[k#2])
-      cs_0_0_0[k#2] = (gcv_0_0_0[k#2] * 0.5)
-      ccol_0_0_0#1[k#2] = (gcv_0_0_0[k#2] * 0.5)
-      bcol_0_0_0[k#2] = (_dtr_stage - ccol_0_0_0#1[k#2])
-      _temp_0_0_0#2[k#2] = (u_stage_0_0_0[(k#2 + 1)] - u_stage_0_0_0[k#2])
-      correction_term_0_0_0[k#2] = (-cs_0_0_0[k#2] * _temp_0_0_0#2[k#2])
-      _temp_0_0_0#3[k#2] = (_dtr_stage * u_pos_0_0_0[k#2])
-      _temp_0_0_0#4[k#2] = (_temp_0_0_0#3[k#2] + utens_0_0_0[k#2])
-      _temp_0_0_0#5[k#2] = (_temp_0_0_0#4[k#2] + utens_stage_0_0_0[k#2])
-      dcol_0_0_0#1[k#2] = (_temp_0_0_0#5[k#2] + correction_term_0_0_0[k#2])
-      divided_0_0_0[k#2] = (1.0 / bcol_0_0_0[k#2])
-      ccol_0_0_0#2[k#2] = (ccol_0_0_0#1[k#2] * divided_0_0_0[k#2])
-      dcol_0_0_0#2[k#2] = (dcol_0_0_0#1[k#2] * divided_0_0_0[k#2])
-      ccol_0_0_0[k#2] = ccol_0_0_0#2[k#2]
-      dcol_0_0_0[k#2] = dcol_0_0_0#2[k#2]
+      _temp_0_0_0#6[k#2] = _refactored_wcon_1_0_0_0_0[(k#2 + 1)]
+      _temp_0_0_0#7[k#2] = (_temp_0_0_0#6[k#2] + wcon_0_0_0[(k#2 + 1)])
+      gcv_0_0_0#1[k#2] = (0.25 * _temp_0_0_0#7[k#2])
+      cs_0_0_0#1[k#2] = (gcv_0_0_0#1[k#2] * 0.5)
+      ccol_0_0_0#3[k#2] = (gcv_0_0_0#1[k#2] * 0.5)
+      bcol_0_0_0#1[k#2] = (_dtr_stage - ccol_0_0_0#3[k#2])
+      _temp_0_0_0#8[k#2] = (u_stage_0_0_0[(k#2 + 1)] - u_stage_0_0_0[k#2])
+      correction_term_0_0_0#1[k#2] = (-cs_0_0_0#1[k#2] * _temp_0_0_0#8[k#2])
+      _temp_0_0_0#9[k#2] = (_dtr_stage * u_pos_0_0_0[k#2])
+      _temp_0_0_0#10[k#2] = (_temp_0_0_0#9[k#2] + utens_0_0_0[k#2])
+      _temp_0_0_0#11[k#2] = (_temp_0_0_0#10[k#2] + utens_stage_0_0_0[k#2])
+      dcol_0_0_0#3[k#2] = (_temp_0_0_0#11[k#2] + correction_term_0_0_0#1[k#2])
+      divided_0_0_0#1[k#2] = (1.0 / bcol_0_0_0#1[k#2])
+      ccol_0_0_0#4[k#2] = (ccol_0_0_0#3[k#2] * divided_0_0_0#1[k#2])
+      dcol_0_0_0#4[k#2] = (dcol_0_0_0#3[k#2] * divided_0_0_0#1[k#2])
+      ccol_0_0_0[k#2] = ccol_0_0_0#4[k#2]
+      dcol_0_0_0[k#2] = dcol_0_0_0#4[k#2]
     }
     awaitall
     for i32 k#3 in [1:3:1] {
-      _temp_0_0_0#6[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
-      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#6[k#3])
-      _temp_0_0_0#7[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
-      _temp_0_0_0#8[k#3] = (_temp_0_0_0#7[k#3] + wcon_0_0_0[(k#3 + 1)])
-      gcv_0_0_0#1[k#3] = (0.25 * _temp_0_0_0#8[k#3])
-      as__0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      cs_0_0_0#1[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
-      acol_0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      ccol_0_0_0#3[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
-      _temp_0_0_0#9[k#3] = (_dtr_stage - acol_0_0_0[k#3])
-      bcol_0_0_0#1[k#3] = (_temp_0_0_0#9[k#3] - ccol_0_0_0#3[k#3])
-      _temp_0_0_0#10[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#11[k#3] = (cs_0_0_0#1[k#3] * _temp_0_0_0#10[k#3])
-      _temp_0_0_0#12[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#13[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#12[k#3])
-      correction_term_0_0_0#1[k#3] = (_temp_0_0_0#13[k#3] - _temp_0_0_0#11[k#3])
-      _temp_0_0_0#14[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
-      _temp_0_0_0#15[k#3] = (_temp_0_0_0#14[k#3] + utens_0_0_0[k#3])
-      _temp_0_0_0#16[k#3] = (_temp_0_0_0#15[k#3] + utens_stage_0_0_0[k#3])
-      dcol_0_0_0#3[k#3] = (_temp_0_0_0#16[k#3] + correction_term_0_0_0#1[k#3])
-      _temp_0_0_0#17[k#3] = (ccol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#18[k#3] = (bcol_0_0_0#1[k#3] - _temp_0_0_0#17[k#3])
-      divided_0_0_0#1[k#3] = (1.0 / _temp_0_0_0#18[k#3])
-      ccol_0_0_0#4[k#3] = (ccol_0_0_0#3[k#3] * divided_0_0_0#1[k#3])
-      _temp_0_0_0#19[k#3] = (dcol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#20[k#3] = (dcol_0_0_0#3[k#3] - _temp_0_0_0#19[k#3])
-      dcol_0_0_0#4[k#3] = (_temp_0_0_0#20[k#3] * divided_0_0_0#1[k#3])
-      ccol_0_0_0[k#3] = ccol_0_0_0#4[k#3]
-      dcol_0_0_0[k#3] = dcol_0_0_0#4[k#3]
+      _temp_0_0_0#27[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
+      gav_0_0_0#1[k#3] = (-0.25 * _temp_0_0_0#27[k#3])
+      _temp_0_0_0#28[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
+      _temp_0_0_0#29[k#3] = (_temp_0_0_0#28[k#3] + wcon_0_0_0[(k#3 + 1)])
+      gcv_0_0_0#3[k#3] = (0.25 * _temp_0_0_0#29[k#3])
+      as__0_0_0#1[k#3] = (gav_0_0_0#1[k#3] * 0.5)
+      cs_0_0_0#3[k#3] = (gcv_0_0_0#3[k#3] * 0.5)
+      acol_0_0_0#1[k#3] = (gav_0_0_0#1[k#3] * 0.5)
+      ccol_0_0_0#7[k#3] = (gcv_0_0_0#3[k#3] * 0.5)
+      _temp_0_0_0#30[k#3] = (_dtr_stage - acol_0_0_0#1[k#3])
+      bcol_0_0_0#3[k#3] = (_temp_0_0_0#30[k#3] - ccol_0_0_0#7[k#3])
+      _temp_0_0_0#31[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#32[k#3] = (cs_0_0_0#3[k#3] * _temp_0_0_0#31[k#3])
+      _temp_0_0_0#33[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#34[k#3] = (-as__0_0_0#1[k#3] * _temp_0_0_0#33[k#3])
+      correction_term_0_0_0#3[k#3] = (_temp_0_0_0#34[k#3] - _temp_0_0_0#32[k#3])
+      _temp_0_0_0#35[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
+      _temp_0_0_0#36[k#3] = (_temp_0_0_0#35[k#3] + utens_0_0_0[k#3])
+      _temp_0_0_0#37[k#3] = (_temp_0_0_0#36[k#3] + utens_stage_0_0_0[k#3])
+      dcol_0_0_0#7[k#3] = (_temp_0_0_0#37[k#3] + correction_term_0_0_0#3[k#3])
+      _temp_0_0_0#38[k#3] = (ccol_0_0_0[(k#3 - 1)] * acol_0_0_0#1[k#3])
+      _temp_0_0_0#39[k#3] = (bcol_0_0_0#3[k#3] - _temp_0_0_0#38[k#3])
+      divided_0_0_0#3[k#3] = (1.0 / _temp_0_0_0#39[k#3])
+      ccol_0_0_0#8[k#3] = (ccol_0_0_0#7[k#3] * divided_0_0_0#3[k#3])
+      _temp_0_0_0#40[k#3] = (dcol_0_0_0[(k#3 - 1)] * acol_0_0_0#1[k#3])
+      _temp_0_0_0#41[k#3] = (dcol_0_0_0#7[k#3] - _temp_0_0_0#40[k#3])
+      dcol_0_0_0#8[k#3] = (_temp_0_0_0#41[k#3] * divided_0_0_0#3[k#3])
+      ccol_0_0_0[k#3] = ccol_0_0_0#8[k#3]
+      dcol_0_0_0[k#3] = dcol_0_0_0#8[k#3]
     }
     awaitall
     for i32 k#4 in [3:4:1] {
-      _temp_0_0_0#21[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
-      gav_0_0_0#1[k#4] = (-0.25 * _temp_0_0_0#21[k#4])
-      as__0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      acol_0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      bcol_0_0_0#2[k#4] = (_dtr_stage - acol_0_0_0#1[k#4])
-      _temp_0_0_0#22[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
-      correction_term_0_0_0#2[k#4] = (-as__0_0_0#1[k#4] * _temp_0_0_0#22[k#4])
-      _temp_0_0_0#23[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
-      _temp_0_0_0#24[k#4] = (_temp_0_0_0#23[k#4] + utens_0_0_0[k#4])
-      _temp_0_0_0#25[k#4] = (_temp_0_0_0#24[k#4] + utens_stage_0_0_0[k#4])
-      dcol_0_0_0#5[k#4] = (_temp_0_0_0#25[k#4] + correction_term_0_0_0#2[k#4])
-      _temp_0_0_0#26[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#27[k#4] = (bcol_0_0_0#2[k#4] - _temp_0_0_0#26[k#4])
-      divided_0_0_0#2[k#4] = (1.0 / _temp_0_0_0#27[k#4])
-      _temp_0_0_0#28[k#4] = (dcol_0_0_0#5[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#29[k#4] = (dcol_0_0_0#5[k#4] - _temp_0_0_0#28[k#4])
-      dcol_0_0_0#6[k#4] = (_temp_0_0_0#29[k#4] * divided_0_0_0#2[k#4])
-      dcol_0_0_0[k#4] = dcol_0_0_0#6[k#4]
+      _temp_0_0_0#51[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
+      gav_0_0_0#3[k#4] = (-0.25 * _temp_0_0_0#51[k#4])
+      as__0_0_0#3[k#4] = (gav_0_0_0#3[k#4] * 0.5)
+      acol_0_0_0#3[k#4] = (gav_0_0_0#3[k#4] * 0.5)
+      bcol_0_0_0#5[k#4] = (_dtr_stage - acol_0_0_0#3[k#4])
+      _temp_0_0_0#52[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
+      correction_term_0_0_0#5[k#4] = (-as__0_0_0#3[k#4] * _temp_0_0_0#52[k#4])
+      _temp_0_0_0#53[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
+      _temp_0_0_0#54[k#4] = (_temp_0_0_0#53[k#4] + utens_0_0_0[k#4])
+      _temp_0_0_0#55[k#4] = (_temp_0_0_0#54[k#4] + utens_stage_0_0_0[k#4])
+      dcol_0_0_0#11[k#4] = (_temp_0_0_0#55[k#4] + correction_term_0_0_0#5[k#4])
+      _temp_0_0_0#56[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#3[k#4])
+      _temp_0_0_0#57[k#4] = (bcol_0_0_0#5[k#4] - _temp_0_0_0#56[k#4])
+      divided_0_0_0#5[k#4] = (1.0 / _temp_0_0_0#57[k#4])
+      _temp_0_0_0#58[k#4] = (dcol_0_0_0[(k#4 - 1)] * acol_0_0_0#3[k#4])
+      _temp_0_0_0#59[k#4] = (dcol_0_0_0#11[k#4] - _temp_0_0_0#58[k#4])
+      dcol_0_0_0#12[k#4] = (_temp_0_0_0#59[k#4] * divided_0_0_0#5[k#4])
+      dcol_0_0_0[k#4] = dcol_0_0_0#12[k#4]
     }
     awaitall
     for i32 k#5 in [3:2:-1] {
-      datacol_0_0_0[k#5] = dcol_0_0_0[k#5]
-      data_col_0_0_0#1[k#5] = datacol_0_0_0[k#5]
-      _temp_0_0_0#30[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
-      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#30[k#5])
-      data_col_0_0_0[k#5] = data_col_0_0_0#1[k#5]
-      utens_stage_0_0_0#1[k#5] = utens_stage_0_0_0#2[k#5]
+      datacol_0_0_0#1[k#5] = dcol_0_0_0[k#5]
+      data_col_0_0_0#2[k#5] = datacol_0_0_0#1[k#5]
+      _temp_0_0_0#61[k#5] = (datacol_0_0_0#1[k#5] - u_pos_0_0_0[k#5])
+      utens_stage_0_0_0#3[k#5] = (_dtr_stage * _temp_0_0_0#61[k#5])
+      data_col_0_0_0[k#5] = data_col_0_0_0#2[k#5]
+      utens_stage_0_0_0#1[k#5] = utens_stage_0_0_0#3[k#5]
     }
     awaitall
     for i32 k#6 in [2:-1:-1] {
-      _temp_0_0_0#31[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
-      datacol_0_0_0#1[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#31[k#6])
-      data_col_0_0_0#2[k#6] = datacol_0_0_0#1[k#6]
-      _temp_0_0_0#32[k#6] = (datacol_0_0_0#1[k#6] - u_pos_0_0_0[k#6])
-      utens_stage_0_0_0#3[k#6] = (_dtr_stage * _temp_0_0_0#32[k#6])
-      data_col_0_0_0[k#6] = data_col_0_0_0#2[k#6]
-      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#3[k#6]
+      _temp_0_0_0#64[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
+      datacol_0_0_0#3[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#64[k#6])
+      data_col_0_0_0#4[k#6] = datacol_0_0_0#3[k#6]
+      _temp_0_0_0#65[k#6] = (datacol_0_0_0#3[k#6] - u_pos_0_0_0[k#6])
+      utens_stage_0_0_0#5[k#6] = (_dtr_stage * _temp_0_0_0#65[k#6])
+      data_col_0_0_0[k#6] = data_col_0_0_0#4[k#6]
+      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#5[k#6]
     }
     awaitall
     await send(utens_stage_0_0_0#1, __kernel_out_0[(i#2 - 1), (j#2 - 0)])
@@ -438,14 +438,14 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[4, 4] readonly _u_po
     await receive(utens_stage_0_0_0, _utens_stage[i#2, j#2])
     await receive(wcon_0_0_0, _wcon[i#2, j#2])
     awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:5:1], receive(_stream_wcon#1) {
-      _refactored_wcon_1_0_0_0_0#1[k] = x
+    completion _recv_comp = foreach i32 k, f32 x in [0:5:1], receive(_stream_wcon#3) {
+      _refactored_wcon_1_0_0_0_0#3[k] = x
     }
-    completion _send_comp = send(wcon_0_0_0, _stream_wcon#2)
+    completion _send_comp = send(wcon_0_0_0, _stream_wcon#4)
     await _send_comp
     await _recv_comp
     await map i32 k#1 in [0:4:1] {
-      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#1[k#1]
+      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#3[k#1]
     }
     awaitall
     for i32 k#2 in [0:1:1] {
@@ -469,75 +469,75 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[4, 4] readonly _u_po
     }
     awaitall
     for i32 k#3 in [1:3:1] {
-      _temp_0_0_0#6[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
-      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#6[k#3])
-      _temp_0_0_0#7[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
-      _temp_0_0_0#8[k#3] = (_temp_0_0_0#7[k#3] + wcon_0_0_0[(k#3 + 1)])
-      gcv_0_0_0#1[k#3] = (0.25 * _temp_0_0_0#8[k#3])
+      _temp_0_0_0#12[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
+      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#12[k#3])
+      _temp_0_0_0#13[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
+      _temp_0_0_0#14[k#3] = (_temp_0_0_0#13[k#3] + wcon_0_0_0[(k#3 + 1)])
+      gcv_0_0_0#2[k#3] = (0.25 * _temp_0_0_0#14[k#3])
       as__0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      cs_0_0_0#1[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
+      cs_0_0_0#2[k#3] = (gcv_0_0_0#2[k#3] * 0.5)
       acol_0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      ccol_0_0_0#3[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
-      _temp_0_0_0#9[k#3] = (_dtr_stage - acol_0_0_0[k#3])
-      bcol_0_0_0#1[k#3] = (_temp_0_0_0#9[k#3] - ccol_0_0_0#3[k#3])
-      _temp_0_0_0#10[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#11[k#3] = (cs_0_0_0#1[k#3] * _temp_0_0_0#10[k#3])
-      _temp_0_0_0#12[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#13[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#12[k#3])
-      correction_term_0_0_0#1[k#3] = (_temp_0_0_0#13[k#3] - _temp_0_0_0#11[k#3])
-      _temp_0_0_0#14[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
-      _temp_0_0_0#15[k#3] = (_temp_0_0_0#14[k#3] + utens_0_0_0[k#3])
-      _temp_0_0_0#16[k#3] = (_temp_0_0_0#15[k#3] + utens_stage_0_0_0[k#3])
-      dcol_0_0_0#3[k#3] = (_temp_0_0_0#16[k#3] + correction_term_0_0_0#1[k#3])
-      _temp_0_0_0#17[k#3] = (ccol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#18[k#3] = (bcol_0_0_0#1[k#3] - _temp_0_0_0#17[k#3])
-      divided_0_0_0#1[k#3] = (1.0 / _temp_0_0_0#18[k#3])
-      ccol_0_0_0#4[k#3] = (ccol_0_0_0#3[k#3] * divided_0_0_0#1[k#3])
-      _temp_0_0_0#19[k#3] = (dcol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#20[k#3] = (dcol_0_0_0#3[k#3] - _temp_0_0_0#19[k#3])
-      dcol_0_0_0#4[k#3] = (_temp_0_0_0#20[k#3] * divided_0_0_0#1[k#3])
-      ccol_0_0_0[k#3] = ccol_0_0_0#4[k#3]
-      dcol_0_0_0[k#3] = dcol_0_0_0#4[k#3]
+      ccol_0_0_0#5[k#3] = (gcv_0_0_0#2[k#3] * 0.5)
+      _temp_0_0_0#15[k#3] = (_dtr_stage - acol_0_0_0[k#3])
+      bcol_0_0_0#2[k#3] = (_temp_0_0_0#15[k#3] - ccol_0_0_0#5[k#3])
+      _temp_0_0_0#16[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#17[k#3] = (cs_0_0_0#2[k#3] * _temp_0_0_0#16[k#3])
+      _temp_0_0_0#18[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#19[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#18[k#3])
+      correction_term_0_0_0#2[k#3] = (_temp_0_0_0#19[k#3] - _temp_0_0_0#17[k#3])
+      _temp_0_0_0#20[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
+      _temp_0_0_0#21[k#3] = (_temp_0_0_0#20[k#3] + utens_0_0_0[k#3])
+      _temp_0_0_0#22[k#3] = (_temp_0_0_0#21[k#3] + utens_stage_0_0_0[k#3])
+      dcol_0_0_0#5[k#3] = (_temp_0_0_0#22[k#3] + correction_term_0_0_0#2[k#3])
+      _temp_0_0_0#23[k#3] = (ccol_0_0_0[(k#3 - 1)] * acol_0_0_0[k#3])
+      _temp_0_0_0#24[k#3] = (bcol_0_0_0#2[k#3] - _temp_0_0_0#23[k#3])
+      divided_0_0_0#2[k#3] = (1.0 / _temp_0_0_0#24[k#3])
+      ccol_0_0_0#6[k#3] = (ccol_0_0_0#5[k#3] * divided_0_0_0#2[k#3])
+      _temp_0_0_0#25[k#3] = (dcol_0_0_0[(k#3 - 1)] * acol_0_0_0[k#3])
+      _temp_0_0_0#26[k#3] = (dcol_0_0_0#5[k#3] - _temp_0_0_0#25[k#3])
+      dcol_0_0_0#6[k#3] = (_temp_0_0_0#26[k#3] * divided_0_0_0#2[k#3])
+      ccol_0_0_0[k#3] = ccol_0_0_0#6[k#3]
+      dcol_0_0_0[k#3] = dcol_0_0_0#6[k#3]
     }
     awaitall
     for i32 k#4 in [3:4:1] {
-      _temp_0_0_0#21[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
-      gav_0_0_0#1[k#4] = (-0.25 * _temp_0_0_0#21[k#4])
-      as__0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      acol_0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      bcol_0_0_0#2[k#4] = (_dtr_stage - acol_0_0_0#1[k#4])
-      _temp_0_0_0#22[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
-      correction_term_0_0_0#2[k#4] = (-as__0_0_0#1[k#4] * _temp_0_0_0#22[k#4])
-      _temp_0_0_0#23[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
-      _temp_0_0_0#24[k#4] = (_temp_0_0_0#23[k#4] + utens_0_0_0[k#4])
-      _temp_0_0_0#25[k#4] = (_temp_0_0_0#24[k#4] + utens_stage_0_0_0[k#4])
-      dcol_0_0_0#5[k#4] = (_temp_0_0_0#25[k#4] + correction_term_0_0_0#2[k#4])
-      _temp_0_0_0#26[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#27[k#4] = (bcol_0_0_0#2[k#4] - _temp_0_0_0#26[k#4])
-      divided_0_0_0#2[k#4] = (1.0 / _temp_0_0_0#27[k#4])
-      _temp_0_0_0#28[k#4] = (dcol_0_0_0#5[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#29[k#4] = (dcol_0_0_0#5[k#4] - _temp_0_0_0#28[k#4])
-      dcol_0_0_0#6[k#4] = (_temp_0_0_0#29[k#4] * divided_0_0_0#2[k#4])
-      dcol_0_0_0[k#4] = dcol_0_0_0#6[k#4]
+      _temp_0_0_0#42[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
+      gav_0_0_0#2[k#4] = (-0.25 * _temp_0_0_0#42[k#4])
+      as__0_0_0#2[k#4] = (gav_0_0_0#2[k#4] * 0.5)
+      acol_0_0_0#2[k#4] = (gav_0_0_0#2[k#4] * 0.5)
+      bcol_0_0_0#4[k#4] = (_dtr_stage - acol_0_0_0#2[k#4])
+      _temp_0_0_0#43[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
+      correction_term_0_0_0#4[k#4] = (-as__0_0_0#2[k#4] * _temp_0_0_0#43[k#4])
+      _temp_0_0_0#44[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
+      _temp_0_0_0#45[k#4] = (_temp_0_0_0#44[k#4] + utens_0_0_0[k#4])
+      _temp_0_0_0#46[k#4] = (_temp_0_0_0#45[k#4] + utens_stage_0_0_0[k#4])
+      dcol_0_0_0#9[k#4] = (_temp_0_0_0#46[k#4] + correction_term_0_0_0#4[k#4])
+      _temp_0_0_0#47[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#2[k#4])
+      _temp_0_0_0#48[k#4] = (bcol_0_0_0#4[k#4] - _temp_0_0_0#47[k#4])
+      divided_0_0_0#4[k#4] = (1.0 / _temp_0_0_0#48[k#4])
+      _temp_0_0_0#49[k#4] = (dcol_0_0_0[(k#4 - 1)] * acol_0_0_0#2[k#4])
+      _temp_0_0_0#50[k#4] = (dcol_0_0_0#9[k#4] - _temp_0_0_0#49[k#4])
+      dcol_0_0_0#10[k#4] = (_temp_0_0_0#50[k#4] * divided_0_0_0#4[k#4])
+      dcol_0_0_0[k#4] = dcol_0_0_0#10[k#4]
     }
     awaitall
     for i32 k#5 in [3:2:-1] {
       datacol_0_0_0[k#5] = dcol_0_0_0[k#5]
       data_col_0_0_0#1[k#5] = datacol_0_0_0[k#5]
-      _temp_0_0_0#30[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
-      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#30[k#5])
+      _temp_0_0_0#60[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
+      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#60[k#5])
       data_col_0_0_0[k#5] = data_col_0_0_0#1[k#5]
       utens_stage_0_0_0#1[k#5] = utens_stage_0_0_0#2[k#5]
     }
     awaitall
     for i32 k#6 in [2:-1:-1] {
-      _temp_0_0_0#31[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
-      datacol_0_0_0#1[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#31[k#6])
-      data_col_0_0_0#2[k#6] = datacol_0_0_0#1[k#6]
-      _temp_0_0_0#32[k#6] = (datacol_0_0_0#1[k#6] - u_pos_0_0_0[k#6])
-      utens_stage_0_0_0#3[k#6] = (_dtr_stage * _temp_0_0_0#32[k#6])
-      data_col_0_0_0[k#6] = data_col_0_0_0#2[k#6]
-      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#3[k#6]
+      _temp_0_0_0#62[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
+      datacol_0_0_0#2[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#62[k#6])
+      data_col_0_0_0#3[k#6] = datacol_0_0_0#2[k#6]
+      _temp_0_0_0#63[k#6] = (datacol_0_0_0#2[k#6] - u_pos_0_0_0[k#6])
+      utens_stage_0_0_0#4[k#6] = (_dtr_stage * _temp_0_0_0#63[k#6])
+      data_col_0_0_0[k#6] = data_col_0_0_0#3[k#6]
+      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#4[k#6]
     }
     awaitall
     await send(utens_stage_0_0_0#1, __kernel_out_0[(i#2 - 1), (j#2 - 0)])
@@ -550,14 +550,14 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[4, 4] readonly _u_po
     await receive(utens_stage_0_0_0, _utens_stage[i#2, j#2])
     await receive(wcon_0_0_0, _wcon[i#2, j#2])
     awaitall
-    completion _recv_comp = foreach i32 k, f32 x in [0:5:1], receive(_stream_wcon#2) {
-      _refactored_wcon_1_0_0_0_0#1[k] = x
+    completion _recv_comp = foreach i32 k, f32 x in [0:5:1], receive(_stream_wcon#4) {
+      _refactored_wcon_1_0_0_0_0#3[k] = x
     }
-    completion _send_comp = send(wcon_0_0_0, _stream_wcon#1)
+    completion _send_comp = send(wcon_0_0_0, _stream_wcon#3)
     await _send_comp
     await _recv_comp
     await map i32 k#1 in [0:4:1] {
-      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#1[k#1]
+      _refactored_wcon_1_0_0_0_0[k#1] = _refactored_wcon_1_0_0_0_0#3[k#1]
     }
     awaitall
     for i32 k#2 in [0:1:1] {
@@ -581,75 +581,75 @@ kernel @vertical_advection<>(f32 _dtr_stage, stream<f32, 4>[4, 4] readonly _u_po
     }
     awaitall
     for i32 k#3 in [1:3:1] {
-      _temp_0_0_0#6[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
-      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#6[k#3])
-      _temp_0_0_0#7[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
-      _temp_0_0_0#8[k#3] = (_temp_0_0_0#7[k#3] + wcon_0_0_0[(k#3 + 1)])
-      gcv_0_0_0#1[k#3] = (0.25 * _temp_0_0_0#8[k#3])
+      _temp_0_0_0#12[k#3] = (_refactored_wcon_1_0_0_0_0[k#3] + wcon_0_0_0[k#3])
+      gav_0_0_0[k#3] = (-0.25 * _temp_0_0_0#12[k#3])
+      _temp_0_0_0#13[k#3] = _refactored_wcon_1_0_0_0_0[(k#3 + 1)]
+      _temp_0_0_0#14[k#3] = (_temp_0_0_0#13[k#3] + wcon_0_0_0[(k#3 + 1)])
+      gcv_0_0_0#2[k#3] = (0.25 * _temp_0_0_0#14[k#3])
       as__0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      cs_0_0_0#1[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
+      cs_0_0_0#2[k#3] = (gcv_0_0_0#2[k#3] * 0.5)
       acol_0_0_0[k#3] = (gav_0_0_0[k#3] * 0.5)
-      ccol_0_0_0#3[k#3] = (gcv_0_0_0#1[k#3] * 0.5)
-      _temp_0_0_0#9[k#3] = (_dtr_stage - acol_0_0_0[k#3])
-      bcol_0_0_0#1[k#3] = (_temp_0_0_0#9[k#3] - ccol_0_0_0#3[k#3])
-      _temp_0_0_0#10[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#11[k#3] = (cs_0_0_0#1[k#3] * _temp_0_0_0#10[k#3])
-      _temp_0_0_0#12[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
-      _temp_0_0_0#13[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#12[k#3])
-      correction_term_0_0_0#1[k#3] = (_temp_0_0_0#13[k#3] - _temp_0_0_0#11[k#3])
-      _temp_0_0_0#14[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
-      _temp_0_0_0#15[k#3] = (_temp_0_0_0#14[k#3] + utens_0_0_0[k#3])
-      _temp_0_0_0#16[k#3] = (_temp_0_0_0#15[k#3] + utens_stage_0_0_0[k#3])
-      dcol_0_0_0#3[k#3] = (_temp_0_0_0#16[k#3] + correction_term_0_0_0#1[k#3])
-      _temp_0_0_0#17[k#3] = (ccol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#18[k#3] = (bcol_0_0_0#1[k#3] - _temp_0_0_0#17[k#3])
-      divided_0_0_0#1[k#3] = (1.0 / _temp_0_0_0#18[k#3])
-      ccol_0_0_0#4[k#3] = (ccol_0_0_0#3[k#3] * divided_0_0_0#1[k#3])
-      _temp_0_0_0#19[k#3] = (dcol_0_0_0#3[(k#3 - 1)] * acol_0_0_0[k#3])
-      _temp_0_0_0#20[k#3] = (dcol_0_0_0#3[k#3] - _temp_0_0_0#19[k#3])
-      dcol_0_0_0#4[k#3] = (_temp_0_0_0#20[k#3] * divided_0_0_0#1[k#3])
-      ccol_0_0_0[k#3] = ccol_0_0_0#4[k#3]
-      dcol_0_0_0[k#3] = dcol_0_0_0#4[k#3]
+      ccol_0_0_0#5[k#3] = (gcv_0_0_0#2[k#3] * 0.5)
+      _temp_0_0_0#15[k#3] = (_dtr_stage - acol_0_0_0[k#3])
+      bcol_0_0_0#2[k#3] = (_temp_0_0_0#15[k#3] - ccol_0_0_0#5[k#3])
+      _temp_0_0_0#16[k#3] = (u_stage_0_0_0[(k#3 + 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#17[k#3] = (cs_0_0_0#2[k#3] * _temp_0_0_0#16[k#3])
+      _temp_0_0_0#18[k#3] = (u_stage_0_0_0[(k#3 - 1)] - u_stage_0_0_0[k#3])
+      _temp_0_0_0#19[k#3] = (-as__0_0_0[k#3] * _temp_0_0_0#18[k#3])
+      correction_term_0_0_0#2[k#3] = (_temp_0_0_0#19[k#3] - _temp_0_0_0#17[k#3])
+      _temp_0_0_0#20[k#3] = (_dtr_stage * u_pos_0_0_0[k#3])
+      _temp_0_0_0#21[k#3] = (_temp_0_0_0#20[k#3] + utens_0_0_0[k#3])
+      _temp_0_0_0#22[k#3] = (_temp_0_0_0#21[k#3] + utens_stage_0_0_0[k#3])
+      dcol_0_0_0#5[k#3] = (_temp_0_0_0#22[k#3] + correction_term_0_0_0#2[k#3])
+      _temp_0_0_0#23[k#3] = (ccol_0_0_0[(k#3 - 1)] * acol_0_0_0[k#3])
+      _temp_0_0_0#24[k#3] = (bcol_0_0_0#2[k#3] - _temp_0_0_0#23[k#3])
+      divided_0_0_0#2[k#3] = (1.0 / _temp_0_0_0#24[k#3])
+      ccol_0_0_0#6[k#3] = (ccol_0_0_0#5[k#3] * divided_0_0_0#2[k#3])
+      _temp_0_0_0#25[k#3] = (dcol_0_0_0[(k#3 - 1)] * acol_0_0_0[k#3])
+      _temp_0_0_0#26[k#3] = (dcol_0_0_0#5[k#3] - _temp_0_0_0#25[k#3])
+      dcol_0_0_0#6[k#3] = (_temp_0_0_0#26[k#3] * divided_0_0_0#2[k#3])
+      ccol_0_0_0[k#3] = ccol_0_0_0#6[k#3]
+      dcol_0_0_0[k#3] = dcol_0_0_0#6[k#3]
     }
     awaitall
     for i32 k#4 in [3:4:1] {
-      _temp_0_0_0#21[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
-      gav_0_0_0#1[k#4] = (-0.25 * _temp_0_0_0#21[k#4])
-      as__0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      acol_0_0_0#1[k#4] = (gav_0_0_0#1[k#4] * 0.5)
-      bcol_0_0_0#2[k#4] = (_dtr_stage - acol_0_0_0#1[k#4])
-      _temp_0_0_0#22[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
-      correction_term_0_0_0#2[k#4] = (-as__0_0_0#1[k#4] * _temp_0_0_0#22[k#4])
-      _temp_0_0_0#23[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
-      _temp_0_0_0#24[k#4] = (_temp_0_0_0#23[k#4] + utens_0_0_0[k#4])
-      _temp_0_0_0#25[k#4] = (_temp_0_0_0#24[k#4] + utens_stage_0_0_0[k#4])
-      dcol_0_0_0#5[k#4] = (_temp_0_0_0#25[k#4] + correction_term_0_0_0#2[k#4])
-      _temp_0_0_0#26[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#27[k#4] = (bcol_0_0_0#2[k#4] - _temp_0_0_0#26[k#4])
-      divided_0_0_0#2[k#4] = (1.0 / _temp_0_0_0#27[k#4])
-      _temp_0_0_0#28[k#4] = (dcol_0_0_0#5[(k#4 - 1)] * acol_0_0_0#1[k#4])
-      _temp_0_0_0#29[k#4] = (dcol_0_0_0#5[k#4] - _temp_0_0_0#28[k#4])
-      dcol_0_0_0#6[k#4] = (_temp_0_0_0#29[k#4] * divided_0_0_0#2[k#4])
-      dcol_0_0_0[k#4] = dcol_0_0_0#6[k#4]
+      _temp_0_0_0#42[k#4] = (_refactored_wcon_1_0_0_0_0[k#4] + wcon_0_0_0[k#4])
+      gav_0_0_0#2[k#4] = (-0.25 * _temp_0_0_0#42[k#4])
+      as__0_0_0#2[k#4] = (gav_0_0_0#2[k#4] * 0.5)
+      acol_0_0_0#2[k#4] = (gav_0_0_0#2[k#4] * 0.5)
+      bcol_0_0_0#4[k#4] = (_dtr_stage - acol_0_0_0#2[k#4])
+      _temp_0_0_0#43[k#4] = (u_stage_0_0_0[(k#4 - 1)] - u_stage_0_0_0[k#4])
+      correction_term_0_0_0#4[k#4] = (-as__0_0_0#2[k#4] * _temp_0_0_0#43[k#4])
+      _temp_0_0_0#44[k#4] = (_dtr_stage * u_pos_0_0_0[k#4])
+      _temp_0_0_0#45[k#4] = (_temp_0_0_0#44[k#4] + utens_0_0_0[k#4])
+      _temp_0_0_0#46[k#4] = (_temp_0_0_0#45[k#4] + utens_stage_0_0_0[k#4])
+      dcol_0_0_0#9[k#4] = (_temp_0_0_0#46[k#4] + correction_term_0_0_0#4[k#4])
+      _temp_0_0_0#47[k#4] = (ccol_0_0_0[(k#4 - 1)] * acol_0_0_0#2[k#4])
+      _temp_0_0_0#48[k#4] = (bcol_0_0_0#4[k#4] - _temp_0_0_0#47[k#4])
+      divided_0_0_0#4[k#4] = (1.0 / _temp_0_0_0#48[k#4])
+      _temp_0_0_0#49[k#4] = (dcol_0_0_0[(k#4 - 1)] * acol_0_0_0#2[k#4])
+      _temp_0_0_0#50[k#4] = (dcol_0_0_0#9[k#4] - _temp_0_0_0#49[k#4])
+      dcol_0_0_0#10[k#4] = (_temp_0_0_0#50[k#4] * divided_0_0_0#4[k#4])
+      dcol_0_0_0[k#4] = dcol_0_0_0#10[k#4]
     }
     awaitall
     for i32 k#5 in [3:2:-1] {
       datacol_0_0_0[k#5] = dcol_0_0_0[k#5]
       data_col_0_0_0#1[k#5] = datacol_0_0_0[k#5]
-      _temp_0_0_0#30[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
-      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#30[k#5])
+      _temp_0_0_0#60[k#5] = (datacol_0_0_0[k#5] - u_pos_0_0_0[k#5])
+      utens_stage_0_0_0#2[k#5] = (_dtr_stage * _temp_0_0_0#60[k#5])
       data_col_0_0_0[k#5] = data_col_0_0_0#1[k#5]
       utens_stage_0_0_0#1[k#5] = utens_stage_0_0_0#2[k#5]
     }
     awaitall
     for i32 k#6 in [2:-1:-1] {
-      _temp_0_0_0#31[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
-      datacol_0_0_0#1[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#31[k#6])
-      data_col_0_0_0#2[k#6] = datacol_0_0_0#1[k#6]
-      _temp_0_0_0#32[k#6] = (datacol_0_0_0#1[k#6] - u_pos_0_0_0[k#6])
-      utens_stage_0_0_0#3[k#6] = (_dtr_stage * _temp_0_0_0#32[k#6])
-      data_col_0_0_0[k#6] = data_col_0_0_0#2[k#6]
-      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#3[k#6]
+      _temp_0_0_0#62[k#6] = (ccol_0_0_0[k#6] * data_col_0_0_0[(k#6 + 1)])
+      datacol_0_0_0#2[k#6] = (dcol_0_0_0[k#6] - _temp_0_0_0#62[k#6])
+      data_col_0_0_0#3[k#6] = datacol_0_0_0#2[k#6]
+      _temp_0_0_0#63[k#6] = (datacol_0_0_0#2[k#6] - u_pos_0_0_0[k#6])
+      utens_stage_0_0_0#4[k#6] = (_dtr_stage * _temp_0_0_0#63[k#6])
+      data_col_0_0_0[k#6] = data_col_0_0_0#3[k#6]
+      utens_stage_0_0_0#1[k#6] = utens_stage_0_0_0#4[k#6]
     }
     awaitall
     await send(utens_stage_0_0_0#1, __kernel_out_0[(i#2 - 1), (j#2 - 0)])
diff --git a/spatialstencil/lowering/stencil_to_spatial_compute_fwbw.py b/spatialstencil/lowering/stencil_to_spatial_compute_fwbw.py
index 86ef9ba8..b5a3bf2b 100644
--- a/spatialstencil/lowering/stencil_to_spatial_compute_fwbw.py
+++ b/spatialstencil/lowering/stencil_to_spatial_compute_fwbw.py
@@ -153,7 +153,18 @@ def visit_Subscript(self, node: sast.Subscript):
         assert node.subscript[0] == 0
         assert node.subscript[1] == 0
         z_offset = node.subscript[2]
-        array = self.placement.get_storage(node.value)
+        if z_offset != 0:
+            # For non-zero k-offsets in a FORWARD/BACKWARD stencil, the access
+            # targets the *accumulated* (final) field value at the neighbouring
+            # k-level — i.e. the value after every assignment at that level has
+            # completed.  Local SSA intermediates (e.g. fresh_d before Thomas
+            # elimination) only hold the correct value at the *current* k; using
+            # them at k±1 yields wrong (often uninitialized) results.
+            array = self.placement.get_accumulated_storage(node.value.name)
+            if array is None:
+                array = self.placement.get_storage(node.value)
+        else:
+            array = self.placement.get_storage(node.value)
         if isinstance(array[1], spa.ArrayType):
             if z_offset == 0:
                 access = self.iteration_variable.identifier
diff --git a/spatialstencil/lowering/stencil_to_spatial_place.py b/spatialstencil/lowering/stencil_to_spatial_place.py
index 8176a75f..032d7781 100644
--- a/spatialstencil/lowering/stencil_to_spatial_place.py
+++ b/spatialstencil/lowering/stencil_to_spatial_place.py
@@ -161,6 +161,24 @@ def get_shift(self) -> tuple[int, int, int]:
         """
         return self.domain_shift
 
+    def get_accumulated_storage(self,
+                               name: str,
+                               offset: sast.Offset = sast.Offset.zero()) -> tuple[spa.Identifier, spa.ArrayType | spa.ScalarType] | None:
+        """Look up the program-scope storage backing a field's accumulated value.
+
+        Unlike get_storage(), which is keyed by an identifier, this lookup
+        starts from the plain field name and resolves it through the
+        program-scope field table only.  The result is the storage registered
+        for the given offset, or None when the name is not a program-scope
+        field or nothing is registered at that offset.  Intended for accesses
+        with a non-zero k-offset inside a FORWARD or BACKWARD computation body.
+        """
+        if name not in self._program_scope_fields:
+            return None
+        # Resolve name -> program-scope identifier -> per-offset storage table.
+        per_offset = self._storage_map[self._program_scope_fields[name]]
+        return per_offset[offset] if offset in per_offset else None
+
     def get_storage(self,
                     identifier: sast.Identifier,
                     offset: sast.Offset = sast.Offset.zero()) -> tuple[spa.Identifier, spa.ArrayType | spa.ScalarType] | None:
diff --git a/tests/csl_runtime/test_vadv.sh b/tests/csl_runtime/test_vadv.sh
new file mode 100644
index 00000000..69749a43
--- /dev/null
+++ b/tests/csl_runtime/test_vadv.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+
+BNAME=vadv_sptl
+# Compile the spatial stencil program; extra script arguments are forwarded to sptlc unchanged
+sptlc "$SCRIPT_DIR/../../samples/benchmarks/vertical_advection_4_4_4.sptl" "$BNAME" "$@"
+
+python <<'EOF'
+import numpy as np
+dtr_stage = np.random.rand(1).astype(np.float32)
+u_pos = np.random.rand(4, 4, 4).astype(np.float32)
+u_stage = np.random.rand(4, 4, 5).astype(np.float32)
+utens = np.random.rand(4, 4, 4).astype(np.float32)
+utens_stage = np.random.rand(4, 4, 4).astype(np.float32)
+wcon = np.random.rand(5, 4, 4).astype(np.float32)
+# Save the arrays to .npy files in the current working directory
+np.save('dtr_stage.npy', dtr_stage)
+np.save('u_pos.npy', u_pos)
+np.save('u_stage.npy', u_stage)
+np.save('utens.npy', utens)
+np.save('utens_stage.npy', utens_stage)
+np.save('wcon.npy', wcon)
+EOF
+
+# Run the compiled program with the Python runtime and the simulator
+timeout -s 9 120 cs_python "$SCRIPT_DIR/../../spatialstencil/runtime/runtime.py" --benchmark "$BNAME" dtr_stage.npy u_pos.npy u_stage.npy utens.npy utens_stage.npy wcon.npy
+
+# Compute the NumPy reference result, then compare the program output against it
+python "$SCRIPT_DIR/vertical_advection.py" dtr_stage.npy u_pos.npy u_stage.npy utens.npy utens_stage.npy wcon.npy -o expected_out.npy
+python <<'EOF'
+import numpy as np
+# Load the arrays
+ref = np.load('expected_out.npy')
+output = np.load('OUT___kernel_out_0.npy')
+# Check if the output is correct
+if not np.allclose(output, ref, atol=1e-6, rtol=1e-5):
+    print("Test failed: Output does not match expected result.")
+    raise SystemExit(1)
+else:
+    print("Test passed: Output matches expected result.")
+EOF
+
+# Clean up generated files
+rm -rf "$BNAME"
+rm -f OUT___kernel_out_0.npy expected_out.npy
+rm -f dtr_stage.npy u_pos.npy u_stage.npy utens.npy utens_stage.npy wcon.npy
diff --git a/tests/csl_runtime/vertical_advection.py b/tests/csl_runtime/vertical_advection.py
new file mode 100644
index 00000000..4dea1ac3
--- /dev/null
+++ b/tests/csl_runtime/vertical_advection.py
@@ -0,0 +1,94 @@
+import argparse
+import numpy as np
+import numpy.typing as npt
+
+# Adapted from NPBench: https://github.com/spcl/npbench/blob/main/npbench/benchmarks/weather_stencils/vadv/vadv_numpy.py
+
+# Sample constants
+BET_M = 0.5
+BET_P = 0.5
+
+
+# Adapted from https://github.com/GridTools/gt4py/blob/1caca893034a18d5df1522ed251486659f846589/tests/test_integration/stencil_definitions.py#L111
+def vadv(utens_stage, u_stage, wcon, u_pos, utens, dtr_stage):  # NumPy reference; overwrites and returns utens_stage
+    I, J, K = utens_stage.shape[0], utens_stage.shape[1], utens_stage.shape[2]
+    ccol = np.empty((I, J, K), dtype=utens_stage.dtype)  # uninitialised scratch: entries are assigned before they are read
+    dcol = np.empty((I, J, K), dtype=utens_stage.dtype)
+    data_col = np.empty((I, J), dtype=utens_stage.dtype)
+
+    for k in range(1):  # first level: no k-1 neighbour, so no gav/as_/acol terms
+        gcv = 0.25 * (wcon[1:, :, k + 1] + wcon[:-1, :, k + 1])
+        cs = gcv * BET_M
+
+        ccol[:, :, k] = gcv * BET_P
+        bcol = dtr_stage - ccol[:, :, k]
+
+        # update the d column
+        correction_term = -cs * (u_stage[:, :, k + 1] - u_stage[:, :, k])
+        dcol[:, :, k] = (dtr_stage * u_pos[:, :, k] + utens[:, :, k] + utens_stage[:, :, k] + correction_term)
+
+        # Thomas forward
+        divided = 1.0 / bcol
+        ccol[:, :, k] = ccol[:, :, k] * divided
+        dcol[:, :, k] = dcol[:, :, k] * divided
+
+    for k in range(1, K - 1):  # interior levels: both the k-1 and the k+1 neighbour contribute
+        gav = -0.25 * (wcon[1:, :, k] + wcon[:-1, :, k])
+        gcv = 0.25 * (wcon[1:, :, k + 1] + wcon[:-1, :, k + 1])
+
+        as_ = gav * BET_M
+        cs = gcv * BET_M
+
+        acol = gav * BET_P
+        ccol[:, :, k] = gcv * BET_P
+        bcol = dtr_stage - acol - ccol[:, :, k]
+
+        # update the d column
+        correction_term = -as_ * (u_stage[:, :, k - 1] - u_stage[:, :, k]) - cs * (
+            u_stage[:, :, k + 1] - u_stage[:, :, k])
+        dcol[:, :, k] = (dtr_stage * u_pos[:, :, k] + utens[:, :, k] + utens_stage[:, :, k] + correction_term)
+
+        # Thomas forward: eliminate using the already-processed level k-1
+        divided = 1.0 / (bcol - ccol[:, :, k - 1] * acol)
+        ccol[:, :, k] = ccol[:, :, k] * divided
+        dcol[:, :, k] = (dcol[:, :, k] - (dcol[:, :, k - 1]) * acol) * divided
+
+    for k in range(K - 1, K):  # last level: no k+1 neighbour, so no gcv/cs/ccol terms
+        gav = -0.25 * (wcon[1:, :, k] + wcon[:-1, :, k])
+        as_ = gav * BET_M
+        acol = gav * BET_P
+        bcol = dtr_stage - acol
+
+        # update the d column
+        correction_term = -as_ * (u_stage[:, :, k - 1] - u_stage[:, :, k])
+        dcol[:, :, k] = (dtr_stage * u_pos[:, :, k] + utens[:, :, k] + utens_stage[:, :, k] + correction_term)
+
+        # Thomas forward: eliminate using the already-processed level k-1
+        divided = 1.0 / (bcol - ccol[:, :, k - 1] * acol)
+        dcol[:, :, k] = (dcol[:, :, k] - (dcol[:, :, k - 1]) * acol) * divided
+
+    for k in range(K - 1, K - 2, -1):  # back substitution starts at the last level
+        datacol = dcol[:, :, k]
+        data_col[:] = datacol
+        utens_stage[:, :, k] = dtr_stage * (datacol - u_pos[:, :, k])
+
+    for k in range(K - 2, -1, -1):  # remaining levels, walking down to k = 0; data_col holds the level k+1 result
+        datacol = dcol[:, :, k] - ccol[:, :, k] * data_col[:, :]
+        data_col[:] = datacol
+        utens_stage[:, :, k] = dtr_stage * (datacol - u_pos[:, :, k])
+
+    return utens_stage
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Vertical Advection Stencil")
+    parser.add_argument("inputs", nargs=6, type=str, help="Input .npy paths, in order: dtr_stage u_pos u_stage utens utens_stage wcon")
+    parser.add_argument("--output", "-o", type=str, default='vadv_out.npy', help="Output file path")
+    args = parser.parse_args()
+
+    inputs = [np.load(input_file) for input_file in args.inputs]
+    inputs[0] = inputs[0].item()  # .item() unwraps the one-element dtr_stage array to a Python scalar
+    dtr_stage, u_pos, u_stage, utens, utens_stage, wcon = inputs
+
+    result = vadv(utens_stage, u_stage, wcon, u_pos, utens, dtr_stage)
+    np.save(args.output, result)
diff --git a/tests/stencil_ir/test_lowering_stencil_to_spatial.py b/tests/stencil_ir/test_lowering_stencil_to_spatial.py
index d4b806ea..2c93c950 100644
--- a/tests/stencil_ir/test_lowering_stencil_to_spatial.py
+++ b/tests/stencil_ir/test_lowering_stencil_to_spatial.py
@@ -1,3 +1,4 @@
+import re
 import unittest
 from pathlib import Path
 from typing import Tuple
@@ -5,6 +6,7 @@
 import pytest
 
 from spatialstencil.cli.gt4py_to_spatial import lower_function, lower_gt4py_to_sptl
+from spatialstencil.lowering import gt4py_to_stencil_ir
 from spatialstencil.lowering.stencil_to_spatial_routing import ChannelStrategy
 from spatialstencil.lowering.stencil_to_spatial_compute import HorizontalStencilTransformer
 from spatialstencil.lowering.stencil_to_spatial_dataflow import ProgramDataflow
@@ -228,6 +230,39 @@ def test_vadv():
         assert subgrids_dont_overlap(spatial_program)
 
 
+def test_fwbw_koffset_uses_accumulated_storage():
+    """Regression: k±1 accesses in FORWARD/BACKWARD stencil bodies must use
+    accumulated (program-scope) storage, not SSA-versioned intermediates.
+
+    The Thomas forward sweep in vertical_advection needs dcol[k-1] equal to
+    the Thomas-eliminated value from the previous k-iteration.  The bug was
+    that visit_Subscript used the SSA-specific local array (e.g.
+    dcol_0_0_0#3) instead of the accumulated array (dcol_0_0_0), yielding
+    uninitialised reads for the first k in each interval.
+    """
+    from spatialstencil.syntax.gt4py import parser as gt4py_parser
+
+    gtfuncs = gt4py_parser.parse_file(str(
+        Path(__file__).parent / Path('../../samples/stencils.py')))
+    program = gtfuncs['vertical_advection']
+    irprogram = gt4py_to_stencil_ir.lower_gt4py_to_stencil_ir(program, domain=(4, 4, 4))
+    type_inference.infer_field_extents(irprogram)
+    type_inference.infer_field_domains(irprogram)
+    kernel = lower_stencil_to_spatial(irprogram)
+    ir = kernel.as_ir()
+
+    # Detect the bug: an SSA-versioned array name (name#N) directly followed
+    # by an index with a k-offset in either direction, e.g. [(k#3 - 1)] or
+    # [(k#3 + 1)].  Accumulated storage is printed without a version suffix,
+    # so any match means a local intermediate is read at a neighbouring k-level.
+    bad = re.findall(r'\w+#\d+\[\([^)]*[-+]\s*\d+\)\]', ir)
+    assert not bad, (
+        "k-offset accesses to SSA-versioned (non-accumulated) storage found; "
+        "these should use the accumulated field instead:\n  "
+        + "\n  ".join(bad)
+    )
+
+
 def test_gt4py_integration():
     from spatialstencil.syntax.gt4py import parser as gt4py_parser
     
@@ -247,4 +282,5 @@ def test_gt4py_integration():
     test_vertical_stencil_finishes()
     test_scalar_arguments()
     test_vadv()
+    test_fwbw_koffset_uses_accumulated_storage()
     test_gt4py_integration()