diff --git a/examples/lfric/scripts/KGOs/lfric_psyclone.cfg b/examples/lfric/scripts/KGOs/lfric_psyclone.cfg index b08795e7a1..ee7207fbbc 100644 --- a/examples/lfric/scripts/KGOs/lfric_psyclone.cfg +++ b/examples/lfric/scripts/KGOs/lfric_psyclone.cfg @@ -42,7 +42,7 @@ # Settings common to all APIs [DEFAULT] DISTRIBUTED_MEMORY = true -REPRODUCIBLE_REDUCTIONS = false +REPRODUCIBLE_REDUCTIONS = true # Amount to pad the local summation array when REPRODUCIBLE_REDUCTIONS is true REPROD_PAD_SIZE = 8 PSYIR_ROOT_NAME = psyir_tmp diff --git a/examples/lfric/scripts/KGOs/lfric_psyclone_no_annexed.cfg b/examples/lfric/scripts/KGOs/lfric_psyclone_no_annexed.cfg index 970d19575b..208fd84c53 100644 --- a/examples/lfric/scripts/KGOs/lfric_psyclone_no_annexed.cfg +++ b/examples/lfric/scripts/KGOs/lfric_psyclone_no_annexed.cfg @@ -42,7 +42,7 @@ # Settings common to all APIs [DEFAULT] DISTRIBUTED_MEMORY = true -REPRODUCIBLE_REDUCTIONS = false +REPRODUCIBLE_REDUCTIONS = true # Amount to pad the local summation array when REPRODUCIBLE_REDUCTIONS is true REPROD_PAD_SIZE = 8 PSYIR_ROOT_NAME = psyir_tmp diff --git a/src/psyclone/psyGen.py b/src/psyclone/psyGen.py index 1f8c926592..2ed319c5c7 100644 --- a/src/psyclone/psyGen.py +++ b/src/psyclone/psyGen.py @@ -967,6 +967,69 @@ def initialise_reduction_variable(self) -> None: insert_loc.addchild(assign, cursor) return new_node + def temp_to_array_assignment(self, + parent: Node, + table: SymbolTable) -> None: + ''' + Generate the appropriate code to transfer the private reduction + temporary into the shared array for reproducible reductions. + The assignment is always placed as the final child of parent. + + This method is designed to be used *after* a Kern has been lowered + (and thus detached) and therefore does not use `self.scope`. + + :param parent: the node to which to add the assignment as a child. + :param table: the SymbolTable to use. + ''' + tag = f"{self.name}:{self._reduction_arg.name}:local" + array_symbol = table.lookup_with_tag(tag) + local_var = table.lookup_with_tag( + f"{self.name}:{self._reduction_arg.name}:templocal") + thread_idx = table.lookup_with_tag("omp_thread_index") + arr_ref = ArrayReference.create( + array_symbol, [ + Literal("1", ScalarType.integer_type()), + Reference(thread_idx) + ]) + + assign = Assignment.create( + arr_ref, Reference(local_var) + ) + + assign.preceding_comment = ( + "Store the thread private value of the reduction into the " + "shared array as required for reproducible reductions." + ) + + # Where to put it? + parent.addchild(assign) + + def initialise_and_privatise_scalar_store( + self, parallel_region: Node, position: int, table: SymbolTable + ) -> None: + ''' + Generate the appropriate code to initialise the scalar reduction + variable inside the parallel region, and add it to the private clause. + + :param parent: the parallel region to which to add the initialisation + as a child. + :param position: where in the parent's list of children to add + the new Loop. + :param table: the SymbolTable to use. + ''' + local_var = table.lookup_with_tag( + f"{self.name}:{self._reduction_arg.name}:templocal") + assign = Assignment.create(Reference(local_var), + Literal("0", local_var.datatype)) + parallel_region.dir_body.addchild(assign, position) + assign.preceding_comment = ( + "Initialise thread-private reduction variable" + ) + # The local var also needs to be thread private. + parallel_region.private_clause.addchild( + Reference(local_var) + ) + def reduction_sum_loop(self, parent: Node, position: int, @@ -1020,7 +1083,7 @@ def _reduction_reference(self): Return the reference to the reduction variable if OpenMP is set to be unreproducible, as we will be using the OpenMP reduction clause. Otherwise we will be computing the reduction ourselves and therefore - need to store values into a (padded) array separately for each + need to store values into a temporary variable for each thread. :returns: reference to the variable to be reduced. @@ -1031,12 +1094,8 @@ def _reduction_reference(self): symtab = self.ancestor(InvokeSchedule).symbol_table if self.reprod_reduction: local_var = symtab.lookup_with_tag( - f"{self.name}:{self._reduction_arg.name}:local") - # Return a multi-valued ArrayReference for a reproducible reduction - array_dim = [ - Literal("1", ScalarType.integer_type()), - Reference(symtab.lookup_with_tag("omp_thread_index"))] - return ArrayReference.create(local_var, array_dim) + f"{self.name}:{self._reduction_arg.name}:templocal") + return Reference(local_var) # Return a single-valued Reference for a non-reproducible reduction local_var = symtab.lookup_with_tag( f"AlgArgs_{self._reduction_arg.text}") @@ -1104,6 +1163,11 @@ def lower_to_language_level(self): array_type = ArrayType(arg_sym.datatype, 2*[ArrayType.Extent.DEFERRED]) + # Create a scalar temp to store to in the loop. + _ = table.find_or_create_tag( + root_name=f"local_temp_{self._reduction_arg.name}", + tag=f"{self.name}:{self._reduction_arg.name}:templocal", + symbol_type=DataSymbol, datatype=arg_sym.datatype) local_var = table.find_or_create_tag( root_name="local_"+self._reduction_arg.name, tag=f"{self.name}:{self._reduction_arg.name}:local", diff --git a/src/psyclone/psyir/nodes/omp_directives.py b/src/psyclone/psyir/nodes/omp_directives.py index 0884b20b86..f33572c9df 100644 --- a/src/psyclone/psyir/nodes/omp_directives.py +++ b/src/psyclone/psyir/nodes/omp_directives.py @@ -1401,6 +1401,15 @@ def lower_to_language_level(self) -> Node: self._add_reduction_clauses() for call in reversed(reprod_red_call_list): + # Get the reduction variable to initialise and privatise. + tag = f"{call.name}:{call._reduction_arg.name}:templocal" + sym = self.scope.symbol_table.lookup_with_tag(tag) + call.initialise_and_privatise_scalar_store( + self, 0, self.scope.symbol_table + ) + call.temp_to_array_assignment( + self.dir_body, self.scope.symbol_table + ) call.reduction_sum_loop(self.parent, self.position, self.scope.symbol_table) @@ -1916,8 +1925,9 @@ def lower_to_language_level(self): # We only attempt to *automatically* add reduction clauses if we have a # high-level (DSL) reduction operation. reductions = self.reductions() - if not reductions: - # No high-level reduction operations. + if not reductions or self.reprod: + # No high-level reduction operations, or + # using reproducible reductions return super().lower_to_language_level() self.children[0].lower_to_language_level() diff --git a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py index 1619d1eac7..74d43b57f3 100644 --- a/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py +++ b/src/psyclone/tests/domain/lfric/transformations/lfric_transformations_test.py @@ -2182,7 +2182,10 @@ def test_reprod_red_after_normal_real_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels and communication routines\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" @@ -2193,9 +2196,12 @@ def test_reprod_red_after_normal_real_do(tmpdir, monkeypatch, annexed, " !$omp do schedule(static)\n" " do df = loop1_start, loop1_stop, 1\n" " ! Built-in: sum_X (sum a real-valued field)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + f1_data(df)\n" + " local_temp_asum = local_temp_asum + f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into the " + "shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -2222,7 +2228,10 @@ def test_reprod_red_after_normal_real_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" @@ -2233,9 +2242,12 @@ def test_reprod_red_after_normal_real_do(tmpdir, monkeypatch, annexed, " !$omp do schedule(static)\n" " do df = loop1_start, loop1_stop, 1\n" " ! Built-in: sum_X (sum a real-valued field)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + f1_data(df)\n" + " local_temp_asum = local_temp_asum + f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into the " + "shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -2385,21 +2397,32 @@ def test_two_reprod_reductions_real_do(tmpdir, dist_mem): " local_bsum = 0.0_r_def\n" "\n" " ! Call kernels and communication routines\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_bsum,local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_bsum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + " + " local_temp_asum = local_temp_asum + " "f1_data(df) * f2_data(df)\n" " enddo\n" " !$omp end do\n" " !$omp do schedule(static)\n" " do df = loop1_start, loop1_stop, 1\n" " ! Built-in: sum_X (sum a real-valued field)\n" - " local_bsum(1,th_idx) = local_bsum(1,th_idx) + f1_data(df)\n" + " local_temp_bsum = local_temp_bsum + f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_bsum(1,th_idx) = local_temp_bsum\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -2437,21 +2460,32 @@ def test_two_reprod_reductions_real_do(tmpdir, dist_mem): " local_bsum = 0.0_r_def\n" "\n" " ! Call kernels\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_bsum,local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_bsum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + " + " local_temp_asum = local_temp_asum + " "f1_data(df) * f2_data(df)\n" " enddo\n" " !$omp end do\n" " !$omp do schedule(static)\n" " do df = loop1_start, loop1_stop, 1\n" " ! Built-in: sum_X (sum a real-valued field)\n" - " local_bsum(1,th_idx) = local_bsum(1,th_idx) + f1_data(df)\n" + " local_temp_bsum = local_temp_bsum + f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_bsum(1,th_idx) = local_temp_bsum\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3259,15 +3293,21 @@ def test_reprod_reduction_real_do(tmpdir, dist_mem): " local_asum = 0.0_r_def\n" "\n" " ! Call kernels and communication routines\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + f1_data(df) " + " local_temp_asum = local_temp_asum + f1_data(df) " "* f2_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3289,15 +3329,21 @@ def test_reprod_reduction_real_do(tmpdir, dist_mem): " local_asum = 0.0_r_def\n" "\n" " ! Call kernels\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + f1_data(df) " + " local_temp_asum = local_temp_asum + f1_data(df) " "* f2_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3383,12 +3429,15 @@ def test_reprod_builtins_red_then_usual_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels and communication routines\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + f1_data(df) " + " local_temp_asum = local_temp_asum + f1_data(df) " "* f2_data(df)\n" " enddo\n" " !$omp end do\n" @@ -3397,7 +3446,10 @@ def test_reprod_builtins_red_then_usual_do(tmpdir, monkeypatch, annexed, " ! Built-in: inc_a_times_X (scale a real-valued field)\n" " f1_data(df) = bsum * f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3425,12 +3477,15 @@ def test_reprod_builtins_red_then_usual_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + f1_data(df) " + " local_temp_asum = local_temp_asum + f1_data(df) " "* f2_data(df)\n" " enddo\n" " !$omp end do\n" @@ -3439,7 +3494,10 @@ def test_reprod_builtins_red_then_usual_do(tmpdir, monkeypatch, annexed, " ! Built-in: inc_a_times_X (scale a real-valued field)\n" " f1_data(df) = bsum * f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3506,18 +3564,24 @@ def test_repr_bltins_red_then_usual_fuse_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels and communication routines\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + " + " local_temp_asum = local_temp_asum + " "f1_data(df) * f2_data(df)\n" "\n" " ! Built-in: inc_a_times_X (scale a real-valued field)\n" " f1_data(df) = bsum * f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3543,18 +3607,24 @@ def test_repr_bltins_red_then_usual_fuse_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" " ! Built-in: X_innerproduct_Y (real-valued fields)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + " + " local_temp_asum = local_temp_asum + " "f1_data(df) * f2_data(df)\n" "\n" " ! Built-in: inc_a_times_X (scale a real-valued field)\n" " f1_data(df) = bsum * f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3608,7 +3678,10 @@ def test_repr_bltins_usual_then_red_fuse_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels and communication routines\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" @@ -3616,10 +3689,13 @@ def test_repr_bltins_usual_then_red_fuse_do(tmpdir, monkeypatch, annexed, " f1_data(df) = bvalue * f1_data(df)\n" "\n" " ! Built-in: sum_X (sum a real-valued field)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + " + " local_temp_asum = local_temp_asum + " "f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3645,7 +3721,10 @@ def test_repr_bltins_usual_then_red_fuse_do(tmpdir, monkeypatch, annexed, " local_asum = 0.0_r_def\n" "\n" " ! Call kernels\n" - " !$omp parallel default(shared) private(df,th_idx)\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" " th_idx = omp_get_thread_num() + 1\n" " !$omp do schedule(static)\n" " do df = loop0_start, loop0_stop, 1\n" @@ -3653,10 +3732,13 @@ def test_repr_bltins_usual_then_red_fuse_do(tmpdir, monkeypatch, annexed, " f1_data(df) = bvalue * f1_data(df)\n" "\n" " ! Built-in: sum_X (sum a real-valued field)\n" - " local_asum(1,th_idx) = local_asum(1,th_idx) + " + " local_temp_asum = local_temp_asum + " "f1_data(df)\n" " enddo\n" - " !$omp end do\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into " + "the shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" " !$omp end parallel\n" "\n" " ! sum the partial results sequentially\n" @@ -3694,82 +3776,95 @@ def test_repr_3_builtins_2_reductions_do(tmpdir, dist_mem): assert "loop2_stop = f2_proxy%vspace%get_last_dof_owned()" in code for names in [ - {"var": "asum", "lvar": "local_asum", "loop_idx": "0", + {"var": "asum", "lvar": "local_temp_asum", + "avar": "local_asum", "loop_idx": "0", "rhs": "f1_data(df) * f2_data(df)", "builtin": "! Built-in: X_innerproduct_Y (real-valued fields)" }, - {"var": "bsum", "lvar": "local_bsum", "loop_idx": "2", + {"var": "bsum", "lvar": "local_temp_bsum", + "avar": "local_bsum", "loop_idx": "2", "rhs": "f2_data(df)", "builtin": "! Built-in: sum_X (sum a real-valued field)"}]: assert ( - " ALLOCATE(" + names["lvar"] + "(8,nthreads))\n" + " ALLOCATE(" + names["avar"] + "(8,nthreads))\n" "\n" " ! Initialise reduction variable\n" " " + names["var"] + " = 0.0_r_def\n" - " " + names["lvar"] + " = 0.0_r_def\n") in code - assert ( - " !$omp parallel default(shared) private(df,th_idx)\n" - " th_idx = omp_get_thread_num() + 1\n" - " !$omp do schedule(static)\n" - " do df = loop"+names["loop_idx"]+"_start, " - "loop"+names["loop_idx"]+"_stop, 1\n" - " " + names["builtin"] + "\n" - " " + names["lvar"] + "(1,th_idx) = " + - names["lvar"] + "(1,th_idx) + " + names["rhs"] + "\n" - " enddo\n" - " !$omp end do\n" - " !$omp end parallel\n" - "\n" - " ! sum the partial results sequentially\n" - " do th_idx = 1, nthreads, 1\n" - " " + names["var"] + " = " + names["var"] + " + " + - names["lvar"] + "(1,th_idx)\n" - " enddo\n" - " DEALLOCATE(" + names["lvar"] + ")\n" - "\n" - " ! Perform global sum\n" - " global_sum%value = " + names["var"] + "\n" - " " + names["var"] + " = " - "global_sum%get_sum()\n") in code + " " + names["avar"] + " = 0.0_r_def\n") in code + + correct = (f""" + !$omp parallel default(shared) private(df,th_idx,{names["lvar"]}) + ! Initialise thread-private reduction variable + {names["lvar"]} = 0.0_r_def + th_idx = omp_get_thread_num() + 1 + !$omp do schedule(static) + do df = loop{names["loop_idx"]}_start, loop{names["loop_idx"]}_stop, 1 + {names["builtin"]} + {names["lvar"]} = {names["lvar"]} + {names["rhs"]} + enddo + !$omp end do + + ! Store the thread private value of the reduction into the \ +shared array as required for reproducible reductions. + {names["avar"]}(1,th_idx) = {names["lvar"]} + !$omp end parallel + + ! sum the partial results sequentially + do th_idx = 1, nthreads, 1 + {names["var"]} = {names["var"]} + {names["avar"]}(1,th_idx) + enddo + DEALLOCATE({names["avar"]}) + + ! Perform global sum + global_sum%value = {names["var"]} + {names["var"]} = global_sum%get_sum() +""") + assert correct in code else: assert "loop0_stop = undf_as1_f1" in code assert "loop1_stop = undf_as1_f1" in code assert "loop2_stop = undf_as1_f2" in code for names in [ - {"var": "asum", "lvar": "local_asum", "loop_idx": "0", + {"var": "asum", "lvar": "local_temp_asum", + "avar": "local_asum", "loop_idx": "0", "rhs": "f1_data(df) * f2_data(df)", "builtin": "! Built-in: X_innerproduct_Y (real-valued fields)" }, - {"var": "bsum", "lvar": "local_bsum", - "loop_idx": "2", "rhs": "f2_data(df)", + {"var": "bsum", "lvar": "local_temp_bsum", + "avar": "local_bsum", "loop_idx": "2", + "rhs": "f2_data(df)", "builtin": "! Built-in: sum_X (sum a real-valued field)"}]: assert ( - " ALLOCATE(" + names["lvar"] + "(8,nthreads))\n" + " ALLOCATE(" + names["avar"] + "(8,nthreads))\n" "\n" " ! Initialise reduction variable\n" " " + names["var"] + " = 0.0_r_def\n" - " " + names["lvar"] + " = 0.0_r_def\n") in code - expected = ( - " !$omp parallel default(shared) private(df,th_idx)\n" - " th_idx = omp_get_thread_num() + 1\n" - " !$omp do schedule(static)\n" - " do df = loop"+names["loop_idx"]+"_start, " - "loop" + names["loop_idx"]+"_stop, 1\n" - " " + names["builtin"] + "\n" - " " + names["lvar"] + "(1,th_idx) = " + - names["lvar"] + "(1,th_idx) + " + names["rhs"] + "\n" - " enddo\n" - " !$omp end do\n" - " !$omp end parallel\n" - "\n" - " ! sum the partial results sequentially\n" - " do th_idx = 1, nthreads, 1\n" - " " + names["var"] + " = " + names["var"] + " + " + - names["lvar"] + "(1,th_idx)\n" - " enddo\n" - " DEALLOCATE(" + names["lvar"] + ")\n") - assert expected in code + " " + names["avar"] + " = 0.0_r_def\n") in code + correct = (f""" + !$omp parallel default(shared) private(df,th_idx,{names["lvar"]}) + ! Initialise thread-private reduction variable + {names["lvar"]} = 0.0_r_def + th_idx = omp_get_thread_num() + 1 + !$omp do schedule(static) + do df = loop{names["loop_idx"]}_start, loop{names["loop_idx"]}_stop, 1 + {names["builtin"]} + {names["lvar"]} = {names["lvar"]} + {names["rhs"]} + enddo + !$omp end do + + ! Store the thread private value of the reduction into the \ +shared array as required for reproducible reductions. + {names["avar"]}(1,th_idx) = {names["lvar"]} + !$omp end parallel + + ! sum the partial results sequentially + do th_idx = 1, nthreads, 1 + {names["var"]} = {names["var"]} + {names["avar"]}(1,th_idx) + enddo + DEALLOCATE({names["avar"]}) +""") + assert correct in code def test_repr_reductions_fused(tmpdir, dist_mem): @@ -3801,7 +3896,9 @@ def test_repr_reductions_fused(tmpdir, dist_mem): rtrans.apply(child) code = str(psy.gen) expected = '''\ - !$omp parallel default(shared) private(df,th_idx) + !$omp parallel default(shared) private(df,th_idx,local_temp_bsum) + ! Initialise thread-private reduction variable + local_temp_bsum = 0.0_r_def th_idx = omp_get_thread_num() + 1 !$omp do schedule(static) do df = loop1_start, loop1_stop, 1 @@ -3809,9 +3906,13 @@ def test_repr_reductions_fused(tmpdir, dist_mem): f1_data(df) = asum * f1_data(df) ! Built-in: sum_X (sum a real-valued field) - local_bsum(1,th_idx) = local_bsum(1,th_idx) + f2_data(df) + local_temp_bsum = local_temp_bsum + f2_data(df) enddo !$omp end do + + ! Store the thread private value of the reduction into the \ +shared array as required for reproducible reductions. + local_bsum(1,th_idx) = local_temp_bsum !$omp end parallel ! sum the partial results sequentially @@ -6630,3 +6731,96 @@ def test_deprecated_names(capsys): assert (f"Deprecation warning: the script uses the legacy name " f"'{old_name}', please use new name " f"'{new_name}' instead.") in out + + +def test_reprod_red_with_parallel_do(tmpdir, monkeypatch, annexed, + dist_mem): + '''Test that when using reproducible reductions with + LFRicOMParallelLoopTrans it switches to using LFRicLoopTrans and + OMPParallelLoopTrans instead to ensure reproducibility. + ''' + file_name = "15.17.2_one_standard_builtin_one_reduction.f90" + api_config = Config.get().api_conf(TEST_API) + monkeypatch.setattr(api_config, "_compute_annexed_dofs", annexed) + # Enabled reproducible reductions + Config.get()._reproducible_reductions = True + psy, invoke = get_invoke(file_name, TEST_API, idx=0, dist_mem=dist_mem) + schedule = invoke.schedule + otrans = LFRicOMPParallelLoopTrans() + # Apply an OpenMP do to the loop + for child in schedule.children: + if isinstance(child, Loop): + otrans.apply(child, {"reprod": True}) + result = str(psy.gen) + assert LFRicBuild(tmpdir).code_compiles(psy) + + if dist_mem: # annexed can be True or False + if annexed: + assert ("loop0_stop = f1_proxy%vspace%get_last_dof_annexed()" + in result) + else: + assert ("loop0_stop = f1_proxy%vspace%get_last_dof_owned()" + in result) + assert "loop1_stop = f1_proxy%vspace%get_last_dof_owned()" in result + expected_output = ( + " ALLOCATE(local_asum(8,nthreads))\n" + "\n" + " ! Initialise reduction variable\n" + " asum = 0.0_r_def\n" + " local_asum = 0.0_r_def\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" + " th_idx = omp_get_thread_num() + 1\n" + " !$omp do schedule(static)\n" + " do df = loop1_start, loop1_stop, 1\n" + " ! Built-in: sum_X (sum a real-valued field)\n" + " local_temp_asum = local_temp_asum + f1_data(df)\n" + " enddo\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into the " + "shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" + " !$omp end parallel\n" + "\n" + " ! sum the partial results sequentially\n" + " do th_idx = 1, nthreads, 1\n" + " asum = asum + local_asum(1,th_idx)\n" + " enddo\n" + " DEALLOCATE(local_asum)\n" + "\n" + " ! Perform global sum\n" + " global_sum%value = asum\n" + " asum = global_sum%get_sum()") + assert expected_output in result + else: # not distmem. annexed can be True or False + assert "loop0_stop = undf_as1_f1" in result + assert "loop1_stop = undf_as1_f1" in result + expected_output = ( + " ALLOCATE(local_asum(8,nthreads))\n\n" + " ! Initialise reduction variable\n" + " asum = 0.0_r_def\n" + " local_asum = 0.0_r_def\n" + " !$omp parallel default(shared) private(df,th_idx," + "local_temp_asum)\n" + " ! Initialise thread-private reduction variable\n" + " local_temp_asum = 0.0_r_def\n" + " th_idx = omp_get_thread_num() + 1\n" + " !$omp do schedule(static)\n" + " do df = loop1_start, loop1_stop, 1\n" + " ! Built-in: sum_X (sum a real-valued field)\n" + " local_temp_asum = local_temp_asum + f1_data(df)\n" + " enddo\n" + " !$omp end do\n\n" + " ! Store the thread private value of the reduction into the " + "shared array as required for reproducible reductions.\n" + " local_asum(1,th_idx) = local_temp_asum\n" + " !$omp end parallel\n" + "\n" + " ! sum the partial results sequentially\n" + " do th_idx = 1, nthreads, 1\n" + " asum = asum + local_asum(1,th_idx)\n" + " enddo\n" + " DEALLOCATE(local_asum)\n") + assert expected_output in result diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index 30dd962303..34f650e077 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -78,6 +78,7 @@ from psyclone.psyir.transformations.callee_transformation_mixin import ( CalleeTransformationMixin) from psyclone.psyir.transformations.loop_trans import LoopTrans +from psyclone.psyir.transformations.omp_parallel_trans import OMPParallelTrans from psyclone.psyir.transformations.omp_loop_trans import OMPLoopTrans from psyclone.psyir.transformations.region_trans import RegionTrans from psyclone.psyir.transformations.transformation_error import ( @@ -137,13 +138,12 @@ def __init__(self, omp_directive="do", omp_schedule="static"): def __str__(self): return "Add an OpenMP Parallel Do directive to an LFRic loop" - def validate(self, node, options=None): + def validate(self, node: LFRicLoop, options=None): ''' Perform LFRic-specific loop validity checks then call the `validate` method of the base class. :param node: the Node in the Schedule to check - :type node: :py:class:`psyclone.psyir.nodes.Node` :param options: a dictionary with options for transformations. :type options: Optional[Dict[str, Any]] @@ -177,6 +177,34 @@ def validate(self, node, options=None): local_options["force"] = True super().validate(node, options=local_options) + def apply(self, node: LFRicLoop, options=None): + ''' + Applies the LFRicOMPParallelLoopTrans to the supplied Loop. + + If reproducible reductions are enabled, and the specified + omp_directive is 'do' then this will instead apply + LFRicOMPLoopTrans and LFricOMPParallelTrans to the input + node instead. + + :param node: the Node in the Schedule to transform. + :param options: a dictionary with options for transformations. + :type options: Optional[Dict[str, Any]] + ''' + if (Config.get().reproducible_reductions + and self.omp_directive == "do"): + kerns = node.walk(Kern) + if any([kern.is_reduction for kern in kerns]): + ltrans = LFRicOMPLoopTrans(omp_schedule=self.omp_schedule) + # TODO 2668: When we move to new kwarg options we may need to + # split the options using the sub_transformations methods, + # but disable the sub transformation option inheritance. + ltrans.apply(node, options=options) + ptrans = OMPParallelTrans() + ptrans.apply(node.parent.parent, options=options) + return + + super().apply(node, options) + class GOceanOMPParallelLoopTrans(OMPParallelLoopTrans):