diff --git a/tests/csl_runtime/samples/task_recycling_merge.sptl b/tests/csl_runtime/samples/task_recycling_merge_wse2.sptl similarity index 100% rename from tests/csl_runtime/samples/task_recycling_merge.sptl rename to tests/csl_runtime/samples/task_recycling_merge_wse2.sptl diff --git a/tests/csl_runtime/samples/task_recycling_merge_wse3.sptl b/tests/csl_runtime/samples/task_recycling_merge_wse3.sptl new file mode 100644 index 00000000..0431b945 --- /dev/null +++ b/tests/csl_runtime/samples/task_recycling_merge_wse3.sptl @@ -0,0 +1,121 @@ +// Chain of 20 dependent completions (s0..s19) followed by two completions +// (left, right) that both depend on v19; one awaitall waits for all 22. +// +// Each step uses v * 0.0 + buf[k], which adds a code-order dependency on v +// while (for finite v) leaving the value equal to buf[k], so the final result +// is sum(buf[0:22]). +kernel @task_recycling_merge<>(stream[1, 1] readonly input, stream[1, 1] writeonly output) { + place u16 i, u16 j in [0:1, 0:1] { + f32[22] buf + f32 v0 + f32 v1 + f32 v2 + f32 v3 + f32 v4 + f32 v5 + f32 v6 + f32 v7 + f32 v8 + f32 v9 + f32 v10 + f32 v11 + f32 v12 + f32 v13 + f32 v14 + f32 v15 + f32 v16 + f32 v17 + f32 v18 + f32 v19 + f32 left_val + f32 right_val + f32 output_val + f32[1] out_buf + } + + dataflow u16 i, u16 j in [0:1, 0:1] { + } + + compute u16 i, u16 j in [0:1, 0:1] { + await receive(buf, input[i, j]) + + completion s0 = async { + v0 = buf[0] + } + completion s1 = async { + v1 = v0 * 0.0 + buf[1] + } + completion s2 = async { + v2 = v1 * 0.0 + buf[2] + } + completion s3 = async { + v3 = v2 * 0.0 + buf[3] + } + completion s4 = async { + v4 = v3 * 0.0 + buf[4] + } + completion s5 = async { + v5 = v4 * 0.0 + buf[5] + } + completion s6 = async { + v6 = v5 * 0.0 + buf[6] + } + completion s7 = async { + v7 = v6 * 0.0 + buf[7] + } + completion s8 = async { + v8 = v7 * 0.0 + buf[8] + } + completion s9 = async { + v9 = v8 * 0.0 + buf[9] + } + completion s10 = async { + v10 = v9 * 0.0 + buf[10] + } + completion s11 = async { + v11 = v10 * 0.0 + buf[11] + } + + completion s12 = async { + v12 = v11 * 0.0 + buf[12] + } + completion s13 = async { + v13 = v12 * 0.0 + buf[13] + } + completion s14 = async { + v14 = v13 * 0.0 + buf[14] + } + completion s15 = async { + v15 = v14 * 0.0 + buf[15] + } + completion s16 = async { + v16 = v15 * 
0.0 + buf[16] + } + completion s17 = async { + v17 = v16 * 0.0 + buf[17] + } + completion s18 = async { + v18 = v17 * 0.0 + buf[18] + } + completion s19 = async { + v19 = v18 * 0.0 + buf[19] + } + completion left = async { + left_val = v19 * 0.0 + buf[20] + } + completion right = async { + right_val = v19 * 0.0 + buf[21] + } + + awaitall + + output_val = v0 + v1 + v2 + v3 + v4 + v5 + v6 + output_val = output_val + v7 + v8 + v9 + v10 + v11 + output_val = output_val + v12 + v13 + v14 + v15 + v16 + output_val = output_val + v17 + v18 + v19 + output_val = output_val + left_val + right_val + out_buf[0] = output_val + + await send(out_buf, output[i, j]) + } +} diff --git a/tests/csl_runtime/samples/task_recycling_three_stage.sptl b/tests/csl_runtime/samples/task_recycling_three_stage_wse2.sptl similarity index 100% rename from tests/csl_runtime/samples/task_recycling_three_stage.sptl rename to tests/csl_runtime/samples/task_recycling_three_stage_wse2.sptl diff --git a/tests/csl_runtime/samples/task_recycling_three_stage_wse3.sptl b/tests/csl_runtime/samples/task_recycling_three_stage_wse3.sptl new file mode 100644 index 00000000..1bd825ac --- /dev/null +++ b/tests/csl_runtime/samples/task_recycling_three_stage_wse3.sptl @@ -0,0 +1,93 @@ +// Three sequential awaitall groups of 8 completions each. +// +// Key coloring property: +// Each awaitall group with 8 completions produces a 7-clique of blocked join +// tasks in the conflict graph. Tasks from different stages do not conflict +// (each stage is fully downstream of the previous one), so all three 7-cliques +// share the same 7 hardware slots. +// +// Total local tasks: ~24 (7 + 7 + 7 blocked tasks, T0, and join/final tasks). +// Chromatic number: 7. Average slot depth: ~3.4 tasks/slot. This exercises +// deeper slot reuse than the two-stage variant and demonstrates that the +// greedy coloring correctly reuses colors across non-conflicting stages. +// +// Mathematical result: sum(buf[0:24]). 
+kernel @task_recycling_three_stage<>(stream[1, 1] readonly input, + stream[1, 1] writeonly output) { + place u16 i, u16 j in [0:1, 0:1] { + f32[24] buf + f32 a0 + f32 a1 + f32 a2 + f32 a3 + f32 a4 + f32 a5 + f32 a6 + f32 a7 + f32 b0 + f32 b1 + f32 b2 + f32 b3 + f32 b4 + f32 b5 + f32 b6 + f32 b7 + f32 c0 + f32 c1 + f32 c2 + f32 c3 + f32 c4 + f32 c5 + f32 c6 + f32 c7 + f32 output_val + f32[1] out_buf + } + + dataflow u16 i, u16 j in [0:1, 0:1] { + } + + compute u16 i, u16 j in [0:1, 0:1] { + await receive(buf, input[i, j]) + + // Stage 1: 8 completions, produces a 7-clique + completion s1_0 = async { a0 = buf[0] } + completion s1_1 = async { a1 = a0 * 0.0 + buf[1] } + completion s1_2 = async { a2 = a1 * 0.0 + buf[2] } + completion s1_3 = async { a3 = a2 * 0.0 + buf[3] } + completion s1_4 = async { a4 = a3 * 0.0 + buf[4] } + completion s1_5 = async { a5 = a4 * 0.0 + buf[5] } + completion s1_6 = async { a6 = a5 * 0.0 + buf[6] } + completion s1_7 = async { a7 = a6 * 0.0 + buf[7] } + awaitall + + // Stage 2: 8 completions, another 7-clique sharing stage-1 slots + completion s2_0 = async { b0 = buf[8] } + completion s2_1 = async { b1 = b0 * 0.0 + buf[9] } + completion s2_2 = async { b2 = b1 * 0.0 + buf[10] } + completion s2_3 = async { b3 = b2 * 0.0 + buf[11] } + completion s2_4 = async { b4 = b3 * 0.0 + buf[12] } + completion s2_5 = async { b5 = b4 * 0.0 + buf[13] } + completion s2_6 = async { b6 = b5 * 0.0 + buf[14] } + completion s2_7 = async { b7 = b6 * 0.0 + buf[15] } + awaitall + + // Stage 3: 8 completions, third 7-clique sharing the same 7 slots + completion s3_0 = async { c0 = buf[16] } + completion s3_1 = async { c1 = c0 * 0.0 + buf[17] } + completion s3_2 = async { c2 = c1 * 0.0 + buf[18] } + completion s3_3 = async { c3 = c2 * 0.0 + buf[19] } + completion s3_4 = async { c4 = c3 * 0.0 + buf[20] } + completion s3_5 = async { c5 = c4 * 0.0 + buf[21] } + completion s3_6 = async { c6 = c5 * 0.0 + buf[22] } + completion s3_7 = async { c7 = c6 * 0.0 + buf[23] } + 
awaitall + + output_val = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + output_val = output_val + b0 + b1 + b2 + b3 + b4 + b5 + b6 + b7 + output_val = output_val + c0 + c1 + c2 + c3 + c4 + c5 + c6 + c7 + out_buf[0] = output_val + + await send(out_buf, output[i, j]) + } +} diff --git a/tests/csl_runtime/samples/task_recycling_two_stage.sptl b/tests/csl_runtime/samples/task_recycling_two_stage_wse2.sptl similarity index 100% rename from tests/csl_runtime/samples/task_recycling_two_stage.sptl rename to tests/csl_runtime/samples/task_recycling_two_stage_wse2.sptl diff --git a/tests/csl_runtime/samples/task_recycling_two_stage_wse3.sptl b/tests/csl_runtime/samples/task_recycling_two_stage_wse3.sptl new file mode 100644 index 00000000..95c6b45c --- /dev/null +++ b/tests/csl_runtime/samples/task_recycling_two_stage_wse3.sptl @@ -0,0 +1,93 @@ +// Two sequential awaitall groups of 12 completions each. +// +// Key coloring property: +// Each awaitall with N completions creates N-1 blocked join tasks that form +// a clique in the conflict graph (they all have the main task as a common +// predecessor). With 12 completions per group that is an 11-clique per group. +// However, the two groups are fully sequential — every task in group 2 has +// every task in group 1 as an ancestor — so the two 11-cliques share no edges +// and can reuse the same 11 hardware slots. +// +// Total local tasks: ~23 (11 blocked in group 1, 11 blocked in group 2, T0, +// T_final, ...). Chromatic number: 11. Average slot depth: ~2 tasks/slot. +// +// The multiplication-by-zero trick (v * 0.0 + buf[k]) introduces a code-order +// dependency on v while keeping the mathematical value equal to buf[k], so the +// final result is simply sum(buf[0:24]). 
+kernel @task_recycling_two_stage<>(stream[1, 1] readonly input, + stream[1, 1] writeonly output) { + place u16 i, u16 j in [0:1, 0:1] { + f32[24] buf + f32 a0 + f32 a1 + f32 a2 + f32 a3 + f32 a4 + f32 a5 + f32 a6 + f32 a7 + f32 a8 + f32 a9 + f32 a10 + f32 a11 + f32 b0 + f32 b1 + f32 b2 + f32 b3 + f32 b4 + f32 b5 + f32 b6 + f32 b7 + f32 b8 + f32 b9 + f32 b10 + f32 b11 + f32 output_val + f32[1] out_buf + } + + dataflow u16 i, u16 j in [0:1, 0:1] { + } + + compute u16 i, u16 j in [0:1, 0:1] { + await receive(buf, input[i, j]) + + // Stage 1: 12 completions + completion s1_0 = async { a0 = buf[0] } + completion s1_1 = async { a1 = a0 * 0.0 + buf[1] } + completion s1_2 = async { a2 = a1 * 0.0 + buf[2] } + completion s1_3 = async { a3 = a2 * 0.0 + buf[3] } + completion s1_4 = async { a4 = a3 * 0.0 + buf[4] } + completion s1_5 = async { a5 = a4 * 0.0 + buf[5] } + completion s1_6 = async { a6 = a5 * 0.0 + buf[6] } + completion s1_7 = async { a7 = a6 * 0.0 + buf[7] } + completion s1_8 = async { a8 = a7 * 0.0 + buf[8] } + completion s1_9 = async { a9 = a8 * 0.0 + buf[9] } + completion s1_10 = async { a10 = a9 * 0.0 + buf[10] } + completion s1_11 = async { a11 = a10 * 0.0 + buf[11] } + awaitall + + // Stage 2: 12 completions (independent of stage 1's values) + completion s2_0 = async { b0 = buf[12] } + completion s2_1 = async { b1 = b0 * 0.0 + buf[13] } + completion s2_2 = async { b2 = b1 * 0.0 + buf[14] } + completion s2_3 = async { b3 = b2 * 0.0 + buf[15] } + completion s2_4 = async { b4 = b3 * 0.0 + buf[16] } + completion s2_5 = async { b5 = b4 * 0.0 + buf[17] } + completion s2_6 = async { b6 = b5 * 0.0 + buf[18] } + completion s2_7 = async { b7 = b6 * 0.0 + buf[19] } + completion s2_8 = async { b8 = b7 * 0.0 + buf[20] } + completion s2_9 = async { b9 = b8 * 0.0 + buf[21] } + completion s2_10 = async { b10 = b9 * 0.0 + buf[22] } + completion s2_11 = async { b11 = b10 * 0.0 + buf[23] } + awaitall + + output_val = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + output_val = 
output_val + a8 + a9 + a10 + a11 + output_val = output_val + b0 + b1 + b2 + b3 + b4 + b5 + b6 + b7 + output_val = output_val + b8 + b9 + b10 + b11 + out_buf[0] = output_val + + await send(out_buf, output[i, j]) + } +} diff --git a/tests/spatial_ir/test_task_recycling.py b/tests/spatial_ir/test_task_recycling.py index 9b026d8b..723102ad 100644 --- a/tests/spatial_ir/test_task_recycling.py +++ b/tests/spatial_ir/test_task_recycling.py @@ -8,7 +8,8 @@ def _load_sample_kernel(): - sample = os.path.join(os.path.dirname(__file__), '..', 'csl_runtime', 'samples', 'task_recycling_merge.sptl') + sample = os.path.join( + os.path.dirname(__file__), '..', 'csl_runtime', 'samples', f'task_recycling_merge_{constants.ARCH}.sptl') kernel = parser.parse_file(sample) return passes.constexpr_propagation(kernel) @@ -95,12 +96,12 @@ def test_task_recycling_plan_reuses_local_slots(): # Conflict graph helpers # --------------------------------------------------------------------------- + def _create_fork_tasks(): """Task 0 activates tasks 1 and 2 independently; 1 and 2 are concurrent.""" return [ - tdag.CSLTask(0, 'local', [0], - [(1, tdag.InterTaskEdge.ACTIVATE), (2, tdag.InterTaskEdge.ACTIVATE)], - blocked=False), + tdag.CSLTask( + 0, 'local', [0], [(1, tdag.InterTaskEdge.ACTIVATE), (2, tdag.InterTaskEdge.ACTIVATE)], blocked=False), tdag.CSLTask(1, 'local', [1], [(-1, tdag.InterTaskEdge.SEQUENCE)], blocked=False), tdag.CSLTask(2, 'local', [2], [(-1, tdag.InterTaskEdge.SEQUENCE)], blocked=False), ] @@ -109,9 +110,8 @@ def _create_fork_tasks(): def _create_diamond_tasks(): """0 forks into 1 and 2, which join at the blocked task 3.""" return [ - tdag.CSLTask(0, 'local', [0], - [(1, tdag.InterTaskEdge.ACTIVATE), (2, tdag.InterTaskEdge.ACTIVATE)], - blocked=False), + tdag.CSLTask( + 0, 'local', [0], [(1, tdag.InterTaskEdge.ACTIVATE), (2, tdag.InterTaskEdge.ACTIVATE)], blocked=False), tdag.CSLTask(1, 'local', [1], [(3, tdag.InterTaskEdge.ACTIVATE)], blocked=False), tdag.CSLTask(2, 'local', 
[2], [(3, tdag.InterTaskEdge.UNBLOCK)], blocked=False), tdag.CSLTask(3, 'local', [3], [(-1, tdag.InterTaskEdge.SEQUENCE)], blocked=True), @@ -153,15 +153,14 @@ def test_no_conflicting_tasks_share_slot(): for slot in plan.local_slots: for i, a in enumerate(slot.task_indices): for b in slot.task_indices[i + 1:]: - assert b not in conflict_graph[a], ( - f'Tasks {a} and {b} conflict but were placed in the same slot' - ) + assert b not in conflict_graph[a], (f'Tasks {a} and {b} conflict but were placed in the same slot') # --------------------------------------------------------------------------- # greedy_coloring helper # --------------------------------------------------------------------------- + def test_greedy_coloring_max_colors_infeasible(): """A triangle (K3) needs 3 colors; requesting 2 must return None.""" k3 = {0: {1, 2}, 1: {0, 2}, 2: {0, 1}} @@ -205,6 +204,7 @@ def test_greedy_coloring_fixed_conflict_honored(): # plan_task_bindings mode behavior # --------------------------------------------------------------------------- + def test_fail_on_overrun_raises(): tasks = _create_linear_local_tasks(len(constants.LOCAL_TASK_IDS) + 1) with pytest.raises(ValueError, match='Too many local tasks'): @@ -236,6 +236,7 @@ def test_empty_task_list_returns_empty_plan(): # Transition preamble # --------------------------------------------------------------------------- + def test_transition_preamble_empty_for_non_recycled(): tasks = _create_linear_local_tasks(len(constants.LOCAL_TASK_IDS)) plan = task_recycling.plan_task_bindings(tasks, tdag.TaskCreationBehavior.STATE_MACHINE_ON_OVERRUN) @@ -244,11 +245,11 @@ def test_transition_preamble_empty_for_non_recycled(): assert plan.emit_local_transition_preamble(i, blocked=True) == '' - # --------------------------------------------------------------------------- # Determinism # --------------------------------------------------------------------------- + def test_plan_is_deterministic(): tasks = 
_create_linear_local_tasks(len(constants.LOCAL_TASK_IDS) + 5) plan1 = task_recycling.plan_task_bindings(tasks, tdag.TaskCreationBehavior.STATE_MACHINE_ON_OVERRUN) diff --git a/tests/spatial_ir/test_task_recycling_codegen.py b/tests/spatial_ir/test_task_recycling_codegen.py index dc023cf0..13e91635 100644 --- a/tests/spatial_ir/test_task_recycling_codegen.py +++ b/tests/spatial_ir/test_task_recycling_codegen.py @@ -5,9 +5,9 @@ from spada.lowering.spatial_ir_to_csl import lower_spatial_ir_to_csl from spada.syntax.spatial_ir import parser, passes +from spada.syntax.csl import constants -_CSL_RUNTIME_TASK_RECYCLING_SAMPLES = os.path.join( - os.path.dirname(__file__), '..', 'csl_runtime', 'samples') +_CSL_RUNTIME_TASK_RECYCLING_SAMPLES = os.path.join(os.path.dirname(__file__), '..', 'csl_runtime', 'samples') def test_task_recycling_codegen_uses_else_if_dispatch_for_recycled_slots(): @@ -37,9 +37,9 @@ def test_task_recycling_codegen_uses_else_if_dispatch_for_recycled_slots(): @pytest.mark.parametrize( 'filename', ( - 'task_recycling_merge.sptl', - 'task_recycling_two_stage.sptl', - 'task_recycling_three_stage.sptl', + f'task_recycling_merge_{constants.ARCH}.sptl', + f'task_recycling_two_stage_{constants.ARCH}.sptl', + f'task_recycling_three_stage_{constants.ARCH}.sptl', ), ) def test_csl_runtime_task_recycling_sample_lowers(filename: str): @@ -47,8 +47,7 @@ def test_csl_runtime_task_recycling_sample_lowers(filename: str): path = os.path.join(_CSL_RUNTIME_TASK_RECYCLING_SAMPLES, filename) kernel = parser.parse_file(path) kernel = passes.constexpr_propagation(kernel) - csl_files = lower_spatial_ir_to_csl( - kernel, task_fusion=False, copy_elision=True, prune_memory=True) + csl_files = lower_spatial_ir_to_csl(kernel, task_fusion=False, copy_elision=True, prune_memory=True) assert csl_files, 'expected at least one generated CSL file' combined = '\n'.join(f.code for f in csl_files) assert combined.strip(), 'expected non-empty CSL' @@ -60,8 +59,7 @@ def 
test_codegen_avoids_local_task_id_color_overlap(): kernel = parser.parse_file(path) kernel = passes.constexpr_propagation(kernel) - csl_files = lower_spatial_ir_to_csl( - kernel, task_fusion=False, copy_elision=True, prune_memory=True) + csl_files = lower_spatial_ir_to_csl(kernel, task_fusion=False, copy_elision=True, prune_memory=True) combined = '\n'.join(f.code for f in csl_files) local_task_ids = {int(v) for v in re.findall(r'@get_local_task_id\((\d+)\)', combined)} @@ -70,5 +68,4 @@ def test_codegen_avoids_local_task_id_color_overlap(): assert 8 in colors, 'sample should force color 8 to be allocated' assert local_task_ids assert local_task_ids.isdisjoint(colors), ( - f'local task IDs overlap communication colors: ids={sorted(local_task_ids)}, colors={sorted(colors)}' - ) + f'local task IDs overlap communication colors: ids={sorted(local_task_ids)}, colors={sorted(colors)}')