From 4fbdbb713efec68e9d426c6c16635b73a77a1e72 Mon Sep 17 00:00:00 2001 From: LonelyCat124 <3043914+LonelyCat124@users.noreply.github.com.> Date: Mon, 8 Jun 2026 14:51:59 +0100 Subject: [PATCH 01/10] Initial commits to imrpove lfric reproducuible reduction --- src/psyclone/psyGen.py | 79 ++++++++++++++++++++-- src/psyclone/psyir/nodes/omp_directives.py | 9 +++ 2 files changed, 81 insertions(+), 7 deletions(-) diff --git a/src/psyclone/psyGen.py b/src/psyclone/psyGen.py index 1f8c926592..a56f6b4ee5 100644 --- a/src/psyclone/psyGen.py +++ b/src/psyclone/psyGen.py @@ -967,6 +967,70 @@ def initialise_reduction_variable(self) -> None: insert_loc.addchild(assign, cursor) return new_node + def temp_to_array_assignment(self, + parent: Node, + table: SymbolTable) -> None: + ''' + Generate the appropriate code to transfer the private reduction + temporary into the shared array for reproducible reductions. + The assignment is always placed as the final child of parent. + + This method is designed to be used *after* a Kern has been lowered + (and thus detached) and therefore does not use `self.scope`. + + :param parent: the node to which to add the assignment as a child. + :param table: the SymbolTable to use. + ''' + tag = f"{self.name}:{self._reduction_arg.name}:local" + array_symbol = table.lookup_with_tag(tag) + local_var = table.lookup_with_tag( + f"{self.name}:{self._reduction_arg.name}:templocal") + thread_idx = table.lookup_with_tag("omp_thread_index") + arr_ref = ArrayReference.create( + array_symbol, [ + Literal("1", ScalarType.integer_type()), + Reference(thread_idx) + ]) + + assign = Assignment.create( + arr_ref, Reference(local_var) + ) + + assign.preceding_comment = ( + "Store the thread private value of the reduction into the " + "shared array as required for reproducible reductions." + ) + + # Where to put it? + parent.addchild(assign) + + def initialise_and_privatise_scalar_store( + self, parallel_region: Node, position: int, table: SymbolTable + ) -> None: + ''' + Generate the appropriate code to initialise the scalar reduction + variable inside the parallel region, and add it to the private clause. + + :param parent: the parallel region to which to add the initialisation + as a child. + :param position: where in the parent's list of children to add + the new Loop. + :param table: the SymbolTable to use. + ''' + local_var = table.lookup_with_tag( + f"{self.name}:{self._reduction_arg.name}:templocal") + assign = Assignment.create(Reference(local_var), + Literal("0", local_var.datatype)) + parallel_region.dir_body.addchild(assign, position) + assign.preceding_comment = ( + "Initialise thread-private reduction variable" + ) + # The local var also needs to be thread private. + parallel_region.private_clause.addchild( + Reference(local_var) + ) + + def reduction_sum_loop(self, parent: Node, position: int, @@ -1020,7 +1084,7 @@ def _reduction_reference(self): Return the reference to the reduction variable if OpenMP is set to be unreproducible, as we will be using the OpenMP reduction clause. Otherwise we will be computing the reduction ourselves and therefore - need to store values into a (padded) array separately for each + need to store values into a temporary variable for each thread. :returns: reference to the variable to be reduced. @@ -1031,12 +1095,8 @@ def _reduction_reference(self): symtab = self.ancestor(InvokeSchedule).symbol_table if self.reprod_reduction: local_var = symtab.lookup_with_tag( - f"{self.name}:{self._reduction_arg.name}:local") - # Return a multi-valued ArrayReference for a reproducible reduction - array_dim = [ - Literal("1", ScalarType.integer_type()), - Reference(symtab.lookup_with_tag("omp_thread_index"))] - return ArrayReference.create(local_var, array_dim) + f"{self.name}:{self._reduction_arg.name}:templocal") + return Reference(local_var) # Return a single-valued Reference for a non-reproducible reduction local_var = symtab.lookup_with_tag( f"AlgArgs_{self._reduction_arg.text}") @@ -1104,6 +1164,11 @@ def lower_to_language_level(self): array_type = ArrayType(arg_sym.datatype, 2*[ArrayType.Extent.DEFERRED]) + # Create a scalar temp to store to in the loop. + local_temp_var = table.find_or_create_tag( + root_name = f"local_temp_{self._reduction_arg.name}", + tag = f"{self.name}:{self._reduction_arg.name}:templocal", + symbol_type=DataSymbol, datatype=arg_sym.datatype) local_var = table.find_or_create_tag( root_name="local_"+self._reduction_arg.name, tag=f"{self.name}:{self._reduction_arg.name}:local", diff --git a/src/psyclone/psyir/nodes/omp_directives.py b/src/psyclone/psyir/nodes/omp_directives.py index 0884b20b86..bdce606a28 100644 --- a/src/psyclone/psyir/nodes/omp_directives.py +++ b/src/psyclone/psyir/nodes/omp_directives.py @@ -1401,6 +1401,15 @@ def lower_to_language_level(self) -> Node: self._add_reduction_clauses() for call in reversed(reprod_red_call_list): + # Get the reduction variable to initialise and privatise. + tag = f"{call.name}:{call._reduction_arg.name}:templocal" + sym = self.scope.symbol_table.lookup_with_tag(tag) + call.initialise_and_privatise_scalar_store( + self, 0, self.scope.symbol_table + ) + call.temp_to_array_assignment( + self.dir_body, self.scope.symbol_table + ) call.reduction_sum_loop(self.parent, self.position, self.scope.symbol_table) From 03b3fd8e7a7931d719e5abebb9359a44a5c3af85 Mon Sep 17 00:00:00 2001 From: LonelyCat124 <3043914+LonelyCat124@users.noreply.github.com.> Date: Mon, 8 Jun 2026 14:59:03 +0100 Subject: [PATCH 02/10] #3409 linting updates from initial commit --- src/psyclone/psyGen.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/psyclone/psyGen.py b/src/psyclone/psyGen.py index a56f6b4ee5..2ed319c5c7 100644 --- a/src/psyclone/psyGen.py +++ b/src/psyclone/psyGen.py @@ -990,7 +990,7 @@ def temp_to_array_assignment(self, array_symbol, [ Literal("1", ScalarType.integer_type()), Reference(thread_idx) - ]) + ]) assign = Assignment.create( arr_ref, Reference(local_var) @@ -1030,7 +1030,6 @@ def initialise_and_privatise_scalar_store( Reference(local_var) ) - def reduction_sum_loop(self, parent: Node, position: int, @@ -1165,9 +1164,9 @@ def lower_to_language_level(self): array_type = ArrayType(arg_sym.datatype, 2*[ArrayType.Extent.DEFERRED]) # Create a scalar temp to store to in the loop. - local_temp_var = table.find_or_create_tag( - root_name = f"local_temp_{self._reduction_arg.name}", - tag = f"{self.name}:{self._reduction_arg.name}:templocal", + _ = table.find_or_create_tag( + root_name=f"local_temp_{self._reduction_arg.name}", + tag=f"{self.name}:{self._reduction_arg.name}:templocal", symbol_type=DataSymbol, datatype=arg_sym.datatype) local_var = table.find_or_create_tag( root_name="local_"+self._reduction_arg.name, From 571a0ad7fba0cf6bae30e39590718fad4bbb8b7e Mon Sep 17 00:00:00 2001 From: LonelyCat124 <3043914+LonelyCat124@users.noreply.github.com.> Date: Tue, 9 Jun 2026 15:47:56 +0100 Subject: [PATCH 03/10] changes to remove the unneeded reduction statements and fix the tests --- src/psyclone/psyir/nodes/omp_directives.py | 5 +- .../lfric_transformations_test.py | 289 ++++++++++++------ 2 files changed, 199 insertions(+), 95 deletions(-) diff --git a/src/psyclone/psyir/nodes/omp_directives.py b/src/psyclone/psyir/nodes/omp_directives.py index bdce606a28..f33572c9df 100644 --- a/src/psyclone/psyir/nodes/omp_directives.py +++ b/src/psyclone/psyir/nodes/omp_directives.py @@ -1925,8 +1925,9 @@ def lower_to_language_level(self): # We only attempt to *automatically* add reduction clauses if we have a # high-level (DSL) reduction operation. reductions = self.reductions() - if not reductions: - # No high-level reduction operations. + if not reductions or self.reprod: + # No high-level reduction operations, or + # using reproducible reductions return super().lower_to_language_level() self.children[0].lower_to_language_level() diff --git a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py index 1619d1eac7..9bf5e02b02 100644 --- a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py +++ b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py @@ -2182,7 +2182,10 @@ def test_reprod_red_after_normal_real_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels and communication routines\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" @@ -2193,9 +2196,12 @@ def test_reprod_red_after_normal_real_do(tmpdir, monkeypatch, annexed, " !$omp do schedule(static)\n" " do df = loop1_start, loop1_stop, 1\n" " ! Built-in: sum_X (sum a real-valued field)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + f1_data(df)\n" + " local_temp_asum = local_temp_asum + f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into the " + "shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -2222,7 +2228,10 @@ def test_reprod_red_after_normal_real_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" @@ -2233,9 +2242,12 @@ def test_reprod_red_after_normal_real_do(tmpdir, monkeypatch, annexed, " !$omp do schedule(static)\n" " do df = loop1_start, loop1_stop, 1\n" " ! Built-in: sum_X (sum a real-valued field)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + f1_data(df)\n" + " local_temp_asum = local_temp_asum + f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into the " + "shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -2385,21 +2397,32 @@ def test_two_reprod_reductions_real_do(tmpdir, dist_mem): " local_bsum = 0.0_r_def\n" "\n" " ! Call kernels and communication routines\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_bsum,local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_bsum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + " + " local_temp_asum = local_temp_asum + " "f1_data(df) * f2_data(df)\n" " enddo\n" " !$omp end do\n" " !$omp do schedule(static)\n" " do df = loop1_start, loop1_stop, 1\n" " ! Built-in: sum_X (sum a real-valued field)\n" - " local_bsum(1,th_idx) = local_bsum(1,th_idx) + f1_data(df)\n" + " local_temp_bsum = local_temp_bsum + f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_bsum(1,th_idx) = local_temp_bsum\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -2437,21 +2460,32 @@ def test_two_reprod_reductions_real_do(tmpdir, dist_mem): " local_bsum = 0.0_r_def\n" "\n" " ! Call kernels\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_bsum,local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_bsum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + " + " local_temp_asum = local_temp_asum + " "f1_data(df) * f2_data(df)\n" " enddo\n" " !$omp end do\n" " !$omp do schedule(static)\n" " do df = loop1_start, loop1_stop, 1\n" " ! Built-in: sum_X (sum a real-valued field)\n" - " local_bsum(1,th_idx) = local_bsum(1,th_idx) + f1_data(df)\n" + " local_temp_bsum = local_temp_bsum + f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_bsum(1,th_idx) = local_temp_bsum\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3259,15 +3293,21 @@ def test_reprod_reduction_real_do(tmpdir, dist_mem): " local_asum = 0.0_r_def\n" "\n" " ! Call kernels and communication routines\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + f1_data(df) " + " local_temp_asum = local_temp_asum + f1_data(df) " "* f2_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3289,15 +3329,21 @@ def test_reprod_reduction_real_do(tmpdir, dist_mem): " local_asum = 0.0_r_def\n" "\n" " ! Call kernels\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + f1_data(df) " + " local_temp_asum = local_temp_asum + f1_data(df) " "* f2_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3383,12 +3429,15 @@ def test_reprod_builtins_red_then_usual_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels and communication routines\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + f1_data(df) " + " local_temp_asum = local_temp_asum + f1_data(df) " "* f2_data(df)\n" " enddo\n" " !$omp end do\n" @@ -3397,7 +3446,10 @@ def test_reprod_builtins_red_then_usual_do(tmpdir, monkeypatch, annexed, " ! Built-in: inc_a_times_X (scale a real-valued field)\n" " f1_data(df) = bsum * f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3425,12 +3477,15 @@ def test_reprod_builtins_red_then_usual_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + f1_data(df) " + " local_temp_asum = local_temp_asum + f1_data(df) " "* f2_data(df)\n" " enddo\n" " !$omp end do\n" @@ -3439,7 +3494,10 @@ def test_reprod_builtins_red_then_usual_do(tmpdir, monkeypatch, annexed, " ! Built-in: inc_a_times_X (scale a real-valued field)\n" " f1_data(df) = bsum * f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3506,18 +3564,24 @@ def test_repr_bltins_red_then_usual_fuse_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels and communication routines\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + " + " local_temp_asum = local_temp_asum + " "f1_data(df) * f2_data(df)\n" "\n" " ! Built-in: inc_a_times_X (scale a real-valued field)\n" " f1_data(df) = bsum * f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3543,18 +3607,24 @@ def test_repr_bltins_red_then_usual_fuse_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + " + " local_temp_asum = local_temp_asum + " "f1_data(df) * f2_data(df)\n" "\n" " ! Built-in: inc_a_times_X (scale a real-valued field)\n" " f1_data(df) = bsum * f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3608,7 +3678,10 @@ def test_repr_bltins_usual_then_red_fuse_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels and communication routines\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" @@ -3616,10 +3689,13 @@ def test_repr_bltins_usual_then_red_fuse_do(tmpdir, monkeypatch, annexed, " f1_data(df) = bvalue * f1_data(df)\n" "\n" " ! Built-in: sum_X (sum a real-valued field)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + " + " local_temp_asum = local_temp_asum + " "f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3645,7 +3721,10 @@ def test_repr_bltins_usual_then_red_fuse_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" @@ -3653,10 +3732,13 @@ def test_repr_bltins_usual_then_red_fuse_do(tmpdir, monkeypatch, annexed, " f1_data(df) = bvalue * f1_data(df)\n" "\n" " ! Built-in: sum_X (sum a real-valued field)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + " + " local_temp_asum = local_temp_asum + " "f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3694,82 +3776,97 @@ def test_repr_3_builtins_2_reductions_do(tmpdir, dist_mem): assert "loop2_stop = f2_proxy%vspace%get_last_dof_owned()" in code for names in [ - {"var": "asum", "lvar": "local_asum", "loop_idx": "0", + {"var": "asum", "lvar": "local_temp_asum", + "avar": "local_asum", "loop_idx": "0", "rhs": "f1_data(df) * f2_data(df)", "builtin": "! Built-in: X_innerproduct_Y (real-valued fields)" }, - {"var": "bsum", "lvar": "local_bsum", "loop_idx": "2", + {"var": "bsum", "lvar": "local_temp_bsum", + "avar": "local_bsum", "loop_idx": "2", "rhs": "f2_data(df)", "builtin": "! Built-in: sum_X (sum a real-valued field)"}]: assert ( - " ALLOCATE(" + names["lvar"] + "(8,nthreads))\n" + " ALLOCATE(" + names["avar"] + "(8,nthreads))\n" "\n" " ! Initialise reduction variable\n" " " + names["var"] + " = 0.0_r_def\n" - " " + names["lvar"] + " = 0.0_r_def\n") in code - assert ( - " !$omp parallel default(shared) private(df,th_idx)\n" - " th_idx = omp_get_thread_num() + 1\n" - " !$omp do schedule(static)\n" - " do df = loop"+names["loop_idx"]+"_start, " - "loop"+names["loop_idx"]+"_stop, 1\n" - " " + names["builtin"] + "\n" - " " + names["lvar"] + "(1,th_idx) = " + - names["lvar"] + "(1,th_idx) + " + names["rhs"] + "\n" - " enddo\n" - " !$omp end do\n" - " !$omp end parallel\n" - "\n" - " ! sum the partial results sequentially\n" - " do th_idx = 1, nthreads, 1\n" - " " + names["var"] + " = " + names["var"] + " + " + - names["lvar"] + "(1,th_idx)\n" - " enddo\n" - " DEALLOCATE(" + names["lvar"] + ")\n" - "\n" - " ! Perform global sum\n" - " global_sum%value = " + names["var"] + "\n" - " " + names["var"] + " = " - "global_sum%get_sum()\n") in code + " " + names["avar"] + " = 0.0_r_def\n") in code + + correct = (f""" + !$omp parallel default(shared) private(df,th_idx,{names["lvar"]}) + ! Initialise thread-private reduction variable + {names["lvar"]} = 0.0_r_def + th_idx = omp_get_thread_num() + 1 + !$omp do schedule(static) + do df = loop{names["loop_idx"]}_start, loop{names["loop_idx"]}_stop, 1 + {names["builtin"]} + {names["lvar"]} = {names["lvar"]} + {names["rhs"]} + enddo + !$omp end do + + ! Store the thread private value of the reduction into the \ +shared array as required for reproducible reductions. + {names["avar"]}(1,th_idx) = {names["lvar"]} + !$omp end parallel + + ! sum the partial results sequentially + do th_idx = 1, nthreads, 1 + {names["var"]} = {names["var"]} + {names["avar"]}(1,th_idx) + enddo + DEALLOCATE({names["avar"]}) + + ! Perform global sum + global_sum%value = {names["var"]} + {names["var"]} = global_sum%get_sum() +""" + ) + assert correct in code else: assert "loop0_stop = undf_as1_f1" in code assert "loop1_stop = undf_as1_f1" in code assert "loop2_stop = undf_as1_f2" in code for names in [ - {"var": "asum", "lvar": "local_asum", "loop_idx": "0", + {"var": "asum", "lvar": "local_temp_asum", + "avar": "local_asum", "loop_idx": "0", "rhs": "f1_data(df) * f2_data(df)", "builtin": "! Built-in: X_innerproduct_Y (real-valued fields)" }, - {"var": "bsum", "lvar": "local_bsum", - "loop_idx": "2", "rhs": "f2_data(df)", + {"var": "bsum", "lvar": "local_temp_bsum", + "avar": "local_bsum", "loop_idx": "2", + "rhs": "f2_data(df)", "builtin": "! Built-in: sum_X (sum a real-valued field)"}]: assert ( - " ALLOCATE(" + names["lvar"] + "(8,nthreads))\n" + " ALLOCATE(" + names["avar"] + "(8,nthreads))\n" "\n" " ! Initialise reduction variable\n" " " + names["var"] + " = 0.0_r_def\n" - " " + names["lvar"] + " = 0.0_r_def\n") in code - expected = ( - " !$omp parallel default(shared) private(df,th_idx)\n" - " th_idx = omp_get_thread_num() + 1\n" - " !$omp do schedule(static)\n" - " do df = loop"+names["loop_idx"]+"_start, " - "loop" + names["loop_idx"]+"_stop, 1\n" - " " + names["builtin"] + "\n" - " " + names["lvar"] + "(1,th_idx) = " + - names["lvar"] + "(1,th_idx) + " + names["rhs"] + "\n" - " enddo\n" - " !$omp end do\n" - " !$omp end parallel\n" - "\n" - " ! sum the partial results sequentially\n" - " do th_idx = 1, nthreads, 1\n" - " " + names["var"] + " = " + names["var"] + " + " + - names["lvar"] + "(1,th_idx)\n" - " enddo\n" - " DEALLOCATE(" + names["lvar"] + ")\n") - assert expected in code + " " + names["avar"] + " = 0.0_r_def\n") in code + correct = (f""" + !$omp parallel default(shared) private(df,th_idx,{names["lvar"]}) + ! Initialise thread-private reduction variable + {names["lvar"]} = 0.0_r_def + th_idx = omp_get_thread_num() + 1 + !$omp do schedule(static) + do df = loop{names["loop_idx"]}_start, loop{names["loop_idx"]}_stop, 1 + {names["builtin"]} + {names["lvar"]} = {names["lvar"]} + {names["rhs"]} + enddo + !$omp end do + + ! Store the thread private value of the reduction into the \ +shared array as required for reproducible reductions. + {names["avar"]}(1,th_idx) = {names["lvar"]} + !$omp end parallel + + ! sum the partial results sequentially + do th_idx = 1, nthreads, 1 + {names["var"]} = {names["var"]} + {names["avar"]}(1,th_idx) + enddo + DEALLOCATE({names["avar"]}) +""" + ) + assert correct in code def test_repr_reductions_fused(tmpdir, dist_mem): @@ -3801,7 +3898,9 @@ def test_repr_reductions_fused(tmpdir, dist_mem): rtrans.apply(child) code = str(psy.gen) expected = '''\ - !$omp parallel default(shared) private(df,th_idx) + !$omp parallel default(shared) private(df,th_idx,local_temp_bsum) + ! Initialise thread-private reduction variable + local_temp_bsum = 0.0_r_def th_idx = omp_get_thread_num() + 1 !$omp do schedule(static) do df = loop1_start, loop1_stop, 1 @@ -3809,9 +3908,13 @@ def test_repr_reductions_fused(tmpdir, dist_mem): f1_data(df) = asum * f1_data(df) ! Built-in: sum_X (sum a real-valued field) - local_bsum(1,th_idx) = local_bsum(1,th_idx) + f2_data(df) + local_temp_bsum = local_temp_bsum + f2_data(df) enddo !$omp end do + + ! Store the thread private value of the reduction into the \ +shared array as required for reproducible reductions. + local_bsum(1,th_idx) = local_temp_bsum !$omp end parallel ! sum the partial results sequentially From 5ece0925a4efc58dfaf0d74fa8834b1622d3729c Mon Sep 17 00:00:00 2001 From: LonelyCat124 <3043914+LonelyCat124@users.noreply.github.com.> Date: Tue, 9 Jun 2026 16:11:32 +0100 Subject: [PATCH 04/10] Fixed linting error that didn't appear locally --- .../domain/lfric/transformations/lfric_transformations_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py index 9bf5e02b02..65abfc3f38 100644 --- a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py +++ b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py @@ -3820,7 +3820,7 @@ def test_repr_3_builtins_2_reductions_do(tmpdir, dist_mem): {names["var"]} = global_sum%get_sum() """ ) - assert correct in code + assert correct in code else: assert "loop0_stop = undf_as1_f1" in code assert "loop1_stop = undf_as1_f1" in code From 23a93e552369478e8e35a643e00e2970919c5a8b Mon Sep 17 00:00:00 2001 From: LonelyCat124 <3043914+LonelyCat124@users.noreply.github.com.> Date: Tue, 9 Jun 2026 16:17:03 +0100 Subject: [PATCH 05/10] fix coding standard for 3.9 --- .../lfric/transformations/lfric_transformations_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py index 65abfc3f38..6e37804f84 100644 --- a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py +++ b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py @@ -3819,7 +3819,7 @@ def test_repr_3_builtins_2_reductions_do(tmpdir, dist_mem): global_sum%value = {names["var"]} {names["var"]} = global_sum%get_sum() """ - ) + ) assert correct in code else: assert "loop0_stop = undf_as1_f1" in code @@ -3865,7 +3865,7 @@ def test_repr_3_builtins_2_reductions_do(tmpdir, dist_mem): enddo DEALLOCATE({names["avar"]}) """ - ) + ) assert correct in code From b4c2c5a14102d6d5a2137d4328a2898d43b3abe8 Mon Sep 17 00:00:00 2001 From: LonelyCat124 <3043914+LonelyCat124@users.noreply.github.com.> Date: Tue, 9 Jun 2026 16:22:41 +0100 Subject: [PATCH 06/10] linting for 3.9 --- .../lfric/transformations/lfric_transformations_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py index 6e37804f84..5dd74482ac 100644 --- a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py +++ b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py @@ -3819,7 +3819,7 @@ def test_repr_3_builtins_2_reductions_do(tmpdir, dist_mem): global_sum%value = {names["var"]} {names["var"]} = global_sum%get_sum() """ - ) + ) assert correct in code else: assert "loop0_stop = undf_as1_f1" in code @@ -3865,7 +3865,7 @@ def test_repr_3_builtins_2_reductions_do(tmpdir, dist_mem): enddo DEALLOCATE({names["avar"]}) """ - ) + ) assert correct in code From 5698069a6b442944123ff099a379145043dc67dd Mon Sep 17 00:00:00 2001 From: LonelyCat124 <3043914+LonelyCat124@users.noreply.github.com.> Date: Tue, 9 Jun 2026 16:39:35 +0100 Subject: [PATCH 07/10] Another try at linting --- .../lfric/transformations/lfric_transformations_test.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py index 5dd74482ac..c3940af7f7 100644 --- a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py +++ b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py @@ -3818,8 +3818,7 @@ def test_repr_3_builtins_2_reductions_do(tmpdir, dist_mem): ! Perform global sum global_sum%value = {names["var"]} {names["var"]} = global_sum%get_sum() -""" - ) +""") assert correct in code else: assert "loop0_stop = undf_as1_f1" in code @@ -3864,8 +3863,7 @@ def test_repr_3_builtins_2_reductions_do(tmpdir, dist_mem): {names["var"]} = {names["var"]} + {names["avar"]}(1,th_idx) enddo DEALLOCATE({names["avar"]}) -""" - ) +""") assert correct in code From 49729a66e84d5d7d55cea9ca44dc95ac2a2987b7 Mon Sep 17 00:00:00 2001 From: LonelyCat124 <3043914+LonelyCat124@users.noreply.github.com.> Date: Wed, 10 Jun 2026 15:39:33 +0100 Subject: [PATCH 08/10] Changes for ITs --- examples/lfric/scripts/KGOs/lfric_psyclone.cfg | 2 +- examples/lfric/scripts/KGOs/lfric_psyclone_no_annexed.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/lfric/scripts/KGOs/lfric_psyclone.cfg b/examples/lfric/scripts/KGOs/lfric_psyclone.cfg index b08795e7a1..ee7207fbbc 100644 --- a/examples/lfric/scripts/KGOs/lfric_psyclone.cfg +++ b/examples/lfric/scripts/KGOs/lfric_psyclone.cfg @@ -42,7 +42,7 @@ # Settings common to all APIs [DEFAULT] DISTRIBUTED_MEMORY = true -REPRODUCIBLE_REDUCTIONS = false +REPRODUCIBLE_REDUCTIONS = true # Amount to pad the local summation array when REPRODUCIBLE_REDUCTIONS is true REPROD_PAD_SIZE = 8 PSYIR_ROOT_NAME = psyir_tmp diff --git a/examples/lfric/scripts/KGOs/lfric_psyclone_no_annexed.cfg b/examples/lfric/scripts/KGOs/lfric_psyclone_no_annexed.cfg index 970d19575b..208fd84c53 100644 --- a/examples/lfric/scripts/KGOs/lfric_psyclone_no_annexed.cfg +++ b/examples/lfric/scripts/KGOs/lfric_psyclone_no_annexed.cfg @@ -42,7 +42,7 @@ # Settings common to all APIs [DEFAULT] DISTRIBUTED_MEMORY = true -REPRODUCIBLE_REDUCTIONS = false +REPRODUCIBLE_REDUCTIONS = true # Amount to pad the local summation array when REPRODUCIBLE_REDUCTIONS is true REPROD_PAD_SIZE = 8 PSYIR_ROOT_NAME = psyir_tmp From 7fd3f1fab2e16dbf571bbead9d8f272e67ee60a7 Mon Sep 17 00:00:00 2001 From: LonelyCat124 <3043914+LonelyCat124@users.noreply.github.com.> Date: Fri, 12 Jun 2026 14:03:20 +0100 Subject: [PATCH 09/10] Fixes for ITs with a change to LFRicOMPParallelLoopTrans --- .../lfric_transformations_test.py | 93 +++++++++++++++++++ src/psyclone/transformations.py | 30 +++++- 2 files changed, 121 insertions(+), 2 deletions(-) diff --git a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py index c3940af7f7..74d43b57f3 100644 --- a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py +++ b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py @@ -6731,3 +6731,96 @@ def test_deprecated_names(capsys): assert (f"Deprecation warning: the script uses the legacy name " f"'{old_name}', please use new name " f"'{new_name}' instead.") in out + + +def test_reprod_red_with_parallel_do(tmpdir, monkeypatch, annexed, + dist_mem): + '''Test that when using reproducible reductions with + LFRicOMParallelLoopTrans it switches to using LFRicLoopTrans and + OMPParallelLoopTrans instead to ensure reproducibility. + ''' + file_name = "15.17.2_one_standard_builtin_one_reduction.f90" + api_config = Config.get().api_conf(TEST_API) + monkeypatch.setattr(api_config, "_compute_annexed_dofs", annexed) + # Enabled reproducible reductions + Config.get()._reproducible_reductions = True + psy, invoke = get_invoke(file_name, TEST_API, idx=0, dist_mem=dist_mem) + schedule = invoke.schedule + otrans = LFRicOMPParallelLoopTrans() + # Apply an OpenMP do to the loop + for child in schedule.children: + if isinstance(child, Loop): + otrans.apply(child, {"reprod": True}) + result = str(psy.gen) + assert LFRicBuild(tmpdir).code_compiles(psy) + + if dist_mem: # annexed can be True or False + if annexed: + assert ("loop0_stop = f1_proxy%vspace%get_last_dof_annexed()" + in result) + else: + assert ("loop0_stop = f1_proxy%vspace%get_last_dof_owned()" + in result) + assert "loop1_stop = f1_proxy%vspace%get_last_dof_owned()" in result + expected_output = ( + " ALLOCATE(local_asum(8,nthreads))\n" + "\n" + " ! Initialise reduction variable\n" + " asum = 0.0_r_def\n" + " local_asum = 0.0_r_def\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" + " th_idx = omp_get_thread_num() + 1\n" + " !$omp do schedule(static)\n" + " do df = loop1_start, loop1_stop, 1\n" + " ! Built-in: sum_X (sum a real-valued field)\n" + " local_temp_asum = local_temp_asum + f1_data(df)\n" + " enddo\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into the " + "shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" + " !$omp end parallel\n" + "\n" + " ! sum the partial results sequentially\n" + " do th_idx = 1, nthreads, 1\n" + " asum = asum + local_asum(1,th_idx)\n" + " enddo\n" + " DEALLOCATE(local_asum)\n" + "\n" + " ! Perform global sum\n" + " global_sum%value = asum\n" + " asum = global_sum%get_sum()") + assert expected_output in result + else: # not distmem. annexed can be True or False + assert "loop0_stop = undf_as1_f1" in result + assert "loop1_stop = undf_as1_f1" in result + expected_output = ( + " ALLOCATE(local_asum(8,nthreads))\n\n" + " ! Initialise reduction variable\n" + " asum = 0.0_r_def\n" + " local_asum = 0.0_r_def\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" + " th_idx = omp_get_thread_num() + 1\n" + " !$omp do schedule(static)\n" + " do df = loop1_start, loop1_stop, 1\n" + " ! Built-in: sum_X (sum a real-valued field)\n" + " local_temp_asum = local_temp_asum + f1_data(df)\n" + " enddo\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into the " + "shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" + " !$omp end parallel\n" + "\n" + " ! sum the partial results sequentially\n" + " do th_idx = 1, nthreads, 1\n" + " asum = asum + local_asum(1,th_idx)\n" + " enddo\n" + " DEALLOCATE(local_asum)\n") + assert expected_output in result diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index 30dd962303..63f4b0b464 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -78,6 +78,7 @@ from psyclone.psyir.transformations.callee_transformation_mixin import ( CalleeTransformationMixin) from psyclone.psyir.transformations.loop_trans import LoopTrans +from psyclone.psyir.transformations.omp_parallel_trans import OMPParallelTrans from psyclone.psyir.transformations.omp_loop_trans import OMPLoopTrans from psyclone.psyir.transformations.region_trans import RegionTrans from psyclone.psyir.transformations.transformation_error import ( @@ -137,13 +138,12 @@ def __init__(self, omp_directive="do", omp_schedule="static"): def __str__(self): return "Add an OpenMP Parallel Do directive to an LFRic loop" - def validate(self, node, options=None): + def validate(self, node: LFRicLoop, options=None): ''' Perform LFRic-specific loop validity checks then call the `validate` method of the base class. :param node: the Node in the Schedule to check - :type node: :py:class:`psyclone.psyir.nodes.Node` :param options: a dictionary with options for transformations. :type options: Optional[Dict[str, Any]] @@ -177,6 +177,32 @@ def validate(self, node, options=None): local_options["force"] = True super().validate(node, options=local_options) + def apply(self, node: LFRicLoop, options=None): + ''' + Applies the LFRicOMPParallelLoopTrans to the supplied Loop. + + If reproducible reductions are enabled, and the specified + omp_directive is 'do' then this will instead apply + LFRicOMPLoopTrans and LFricOMPParallelTrans to the input + node instead. + + :param node: the Node in the Schedule to transform. + :param options: a dictionary with options for transformations. + :type options: Optional[Dict[str, Any]] + ''' + if (Config.get().reproducible_reductions + and self.omp_directive == "do"): + ltrans = LFRicOMPLoopTrans(omp_schedule=self.omp_schedule) + # TODO 2668: When we move to new kwarg options we may need to + # split the options using the sub_transformations methods, + # but disable the sub transformation option inheritance. + ltrans.apply(node, options=options) + ptrans = OMPParallelTrans() + ptrans.apply(node.parent.parent, options=options) + return + + super().apply(node, options) + class GOceanOMPParallelLoopTrans(OMPParallelLoopTrans): From b7c73eaf39669fcf77095a18448ce7ecd537edd5 Mon Sep 17 00:00:00 2001 From: LonelyCat124 <3043914+LonelyCat124@users.noreply.github.com.> Date: Mon, 15 Jun 2026 13:39:30 +0100 Subject: [PATCH 10/10] Fix only applying different transformations if reductions are present --- src/psyclone/transformations.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index 63f4b0b464..34f650e077 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -192,14 +192,16 @@ def apply(self, node: LFRicLoop, options=None): ''' if (Config.get().reproducible_reductions and self.omp_directive == "do"): - ltrans = LFRicOMPLoopTrans(omp_schedule=self.omp_schedule) - # TODO 2668: When we move to new kwarg options we may need to - # split the options using the sub_transformations methods, - # but disable the sub transformation option inheritance. - ltrans.apply(node, options=options) - ptrans = OMPParallelTrans() - ptrans.apply(node.parent.parent, options=options) - return + kerns = node.walk(Kern) + if any([kern.is_reduction for kern in kerns]): + ltrans = LFRicOMPLoopTrans(omp_schedule=self.omp_schedule) + # TODO 2668: When we move to new kwarg options we may need to + # split the options using the sub_transformations methods, + # but disable the sub transformation option inheritance. + ltrans.apply(node, options=options) + ptrans = OMPParallelTrans() + ptrans.apply(node.parent.parent, options=options) + return super().apply(node, options)