Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions tests/csl_runtime/samples/task_recycling_merge_wse3.sptl
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// Linear chain of 20 completions that forks into two branches (`left` and
// `right`) and merges again at a single awaitall.
//
// s0..s19 each write one of v0..v19. Every completion after s0 reads the
// previous variable through `v * 0.0 + buf[k]`, which makes it depend on v
// while its value stays equal to buf[k] (for finite inputs). `left` and
// `right` both read v19, so each depends on the end of the chain but not on
// the other branch.
//
// Mathematical result: sum(buf[0:22]).
kernel @task_recycling_merge<>(stream<f32, 22>[1, 1] readonly input, stream<f32, 1>[1, 1] writeonly output) {
    place u16 i, u16 j in [0:1, 0:1] {
        f32[22] buf
        f32 v0
        f32 v1
        f32 v2
        f32 v3
        f32 v4
        f32 v5
        f32 v6
        f32 v7
        f32 v8
        f32 v9
        f32 v10
        f32 v11
        f32 v12
        f32 v13
        f32 v14
        f32 v15
        f32 v16
        f32 v17
        f32 v18
        // Tail of the chain; read by both `left` and `right` and by the final sum.
        f32 v19
        f32 left_val
        f32 right_val
        f32 output_val
        f32[1] out_buf
    }

    dataflow u16 i, u16 j in [0:1, 0:1] {
    }

    compute u16 i, u16 j in [0:1, 0:1] {
        await receive(buf, input[i, j])

        // Chain: completion k consumes buf[k] and reads v(k-1).
        completion s0 = async {
            v0 = buf[0]
        }
        completion s1 = async {
            v1 = v0 * 0.0 + buf[1]
        }
        completion s2 = async {
            v2 = v1 * 0.0 + buf[2]
        }
        completion s3 = async {
            v3 = v2 * 0.0 + buf[3]
        }
        completion s4 = async {
            v4 = v3 * 0.0 + buf[4]
        }
        completion s5 = async {
            v5 = v4 * 0.0 + buf[5]
        }
        completion s6 = async {
            v6 = v5 * 0.0 + buf[6]
        }
        completion s7 = async {
            v7 = v6 * 0.0 + buf[7]
        }
        completion s8 = async {
            v8 = v7 * 0.0 + buf[8]
        }
        completion s9 = async {
            v9 = v8 * 0.0 + buf[9]
        }
        completion s10 = async {
            v10 = v9 * 0.0 + buf[10]
        }
        completion s11 = async {
            v11 = v10 * 0.0 + buf[11]
        }

        completion s12 = async {
            v12 = v11 * 0.0 + buf[12]
        }
        completion s13 = async {
            v13 = v12 * 0.0 + buf[13]
        }
        completion s14 = async {
            v14 = v13 * 0.0 + buf[14]
        }
        completion s15 = async {
            v15 = v14 * 0.0 + buf[15]
        }
        completion s16 = async {
            v16 = v15 * 0.0 + buf[16]
        }
        completion s17 = async {
            v17 = v16 * 0.0 + buf[17]
        }
        completion s18 = async {
            v18 = v17 * 0.0 + buf[18]
        }
        // Last link of the chain: without it v19 is never written and buf[19]
        // is never consumed.
        completion s19 = async {
            v19 = v18 * 0.0 + buf[19]
        }

        // Fork: both branches hang off v19.
        completion left = async {
            left_val = v19 * 0.0 + buf[20]
        }
        completion right = async {
            right_val = v19 * 0.0 + buf[21]
        }

        // Merge point for the chain and both branches.
        awaitall

        output_val = v0 + v1 + v2 + v3 + v4 + v5 + v6
        output_val = output_val + v7 + v8 + v9 + v10 + v11
        output_val = output_val + v12 + v13 + v14 + v15 + v16
        output_val = output_val + v17 + v18 + v19
        output_val = output_val + left_val + right_val
        out_buf[0] = output_val

        await send(out_buf, output[i, j])
    }
}
93 changes: 93 additions & 0 deletions tests/csl_runtime/samples/task_recycling_three_stage_wse3.sptl
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Three sequential awaitall groups of 8 completions each.
//
// Key coloring property:
// Each awaitall group with 8 completions produces a 7-clique of blocked join
// tasks in the conflict graph. Tasks from different stages do not conflict
// (each stage is fully downstream of the previous one), so all three 7-cliques
// share the same 7 hardware slots.
//
// Total local tasks: ~24 (7 + 7 + 7 blocked tasks, T0, and join/final tasks).
// Chromatic number: 7. Average slot depth: ~3.4 tasks/slot. This exercises
// deeper slot reuse than the two-stage variant and demonstrates that the
// greedy coloring correctly reuses colors across non-conflicting stages.
// NOTE(review): the task/slot counts above are derived from the
// "N completions -> N-1 blocked join tasks" rule; confirm against the
// generated task graph.
//
// Each completion after the first in a stage uses `x * 0.0 + buf[k]`, so it
// reads the previous stage variable while its value stays equal to buf[k]
// (for finite inputs).
//
// Mathematical result: sum(buf[0:24]).
kernel @task_recycling_three_stage<>(stream<f32, 24>[1, 1] readonly input,
stream<f32, 1>[1, 1] writeonly output) {
place u16 i, u16 j in [0:1, 0:1] {
f32[24] buf
f32 a0
f32 a1
f32 a2
f32 a3
f32 a4
f32 a5
f32 a6
f32 a7
f32 b0
f32 b1
f32 b2
f32 b3
f32 b4
f32 b5
f32 b6
f32 b7
f32 c0
f32 c1
f32 c2
f32 c3
f32 c4
f32 c5
f32 c6
f32 c7
f32 output_val
f32[1] out_buf
}

dataflow u16 i, u16 j in [0:1, 0:1] {
}

compute u16 i, u16 j in [0:1, 0:1] {
await receive(buf, input[i, j])

// Stage 1: 8 completions over buf[0:8], chained through a0..a7
completion s1_0 = async { a0 = buf[0] }
completion s1_1 = async { a1 = a0 * 0.0 + buf[1] }
completion s1_2 = async { a2 = a1 * 0.0 + buf[2] }
completion s1_3 = async { a3 = a2 * 0.0 + buf[3] }
completion s1_4 = async { a4 = a3 * 0.0 + buf[4] }
completion s1_5 = async { a5 = a4 * 0.0 + buf[5] }
completion s1_6 = async { a6 = a5 * 0.0 + buf[6] }
completion s1_7 = async { a7 = a6 * 0.0 + buf[7] }
awaitall

// Stage 2: 8 completions over buf[8:16], chained through b0..b7;
// starts only after the stage-1 awaitall
completion s2_0 = async { b0 = buf[8] }
completion s2_1 = async { b1 = b0 * 0.0 + buf[9] }
completion s2_2 = async { b2 = b1 * 0.0 + buf[10] }
completion s2_3 = async { b3 = b2 * 0.0 + buf[11] }
completion s2_4 = async { b4 = b3 * 0.0 + buf[12] }
completion s2_5 = async { b5 = b4 * 0.0 + buf[13] }
completion s2_6 = async { b6 = b5 * 0.0 + buf[14] }
completion s2_7 = async { b7 = b6 * 0.0 + buf[15] }
awaitall

// Stage 3: 8 completions over buf[16:24], chained through c0..c7;
// starts only after the stage-2 awaitall
completion s3_0 = async { c0 = buf[16] }
completion s3_1 = async { c1 = c0 * 0.0 + buf[17] }
completion s3_2 = async { c2 = c1 * 0.0 + buf[18] }
completion s3_3 = async { c3 = c2 * 0.0 + buf[19] }
completion s3_4 = async { c4 = c3 * 0.0 + buf[20] }
completion s3_5 = async { c5 = c4 * 0.0 + buf[21] }
completion s3_6 = async { c6 = c5 * 0.0 + buf[22] }
completion s3_7 = async { c7 = c6 * 0.0 + buf[23] }
awaitall

// Accumulate all 24 stage variables into the single output element.
output_val = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7
output_val = output_val + b0 + b1 + b2 + b3 + b4 + b5 + b6 + b7
output_val = output_val + c0 + c1 + c2 + c3 + c4 + c5 + c6 + c7
out_buf[0] = output_val

await send(out_buf, output[i, j])
}
}
93 changes: 93 additions & 0 deletions tests/csl_runtime/samples/task_recycling_two_stage_wse3.sptl
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Two sequential awaitall groups of 12 completions each.
//
// Key coloring property:
// Each awaitall with N completions creates N-1 blocked join tasks that form
// a clique in the conflict graph (they all have the main task as a common
// predecessor). With 12 completions per group that is an 11-clique per group.
// However, the two groups are fully sequential — every task in group 2 has
// every task in group 1 as an ancestor — so the two 11-cliques share no edges
// and can reuse the same 11 hardware slots.
//
// Total local tasks: ~24 (11 blocked in group 1, 11 blocked in group 2, T0,
// T_final, ...). Chromatic number: 11. Average slot depth: ~2 tasks/slot.
// NOTE(review): these counts are derived from the N-1 rule above; confirm
// against the generated task graph.
//
// The multiplication-by-zero trick (v * 0.0 + buf[k]) introduces a code-order
// dependency on v while keeping the mathematical value equal to buf[k], so the
// final result is simply sum(buf[0:24]).
kernel @task_recycling_two_stage<>(stream<f32, 24>[1, 1] readonly input,
stream<f32, 1>[1, 1] writeonly output) {
place u16 i, u16 j in [0:1, 0:1] {
f32[24] buf
f32 a0
f32 a1
f32 a2
f32 a3
f32 a4
f32 a5
f32 a6
f32 a7
f32 a8
f32 a9
f32 a10
f32 a11
f32 b0
f32 b1
f32 b2
f32 b3
f32 b4
f32 b5
f32 b6
f32 b7
f32 b8
f32 b9
f32 b10
f32 b11
f32 output_val
f32[1] out_buf
}

dataflow u16 i, u16 j in [0:1, 0:1] {
}

compute u16 i, u16 j in [0:1, 0:1] {
await receive(buf, input[i, j])

// Stage 1: 12 completions over buf[0:12], chained through a0..a11
completion s1_0 = async { a0 = buf[0] }
completion s1_1 = async { a1 = a0 * 0.0 + buf[1] }
completion s1_2 = async { a2 = a1 * 0.0 + buf[2] }
completion s1_3 = async { a3 = a2 * 0.0 + buf[3] }
completion s1_4 = async { a4 = a3 * 0.0 + buf[4] }
completion s1_5 = async { a5 = a4 * 0.0 + buf[5] }
completion s1_6 = async { a6 = a5 * 0.0 + buf[6] }
completion s1_7 = async { a7 = a6 * 0.0 + buf[7] }
completion s1_8 = async { a8 = a7 * 0.0 + buf[8] }
completion s1_9 = async { a9 = a8 * 0.0 + buf[9] }
completion s1_10 = async { a10 = a9 * 0.0 + buf[10] }
completion s1_11 = async { a11 = a10 * 0.0 + buf[11] }
awaitall

// Stage 2: 12 completions over buf[12:24] (independent of stage 1's values)
completion s2_0 = async { b0 = buf[12] }
completion s2_1 = async { b1 = b0 * 0.0 + buf[13] }
completion s2_2 = async { b2 = b1 * 0.0 + buf[14] }
completion s2_3 = async { b3 = b2 * 0.0 + buf[15] }
completion s2_4 = async { b4 = b3 * 0.0 + buf[16] }
completion s2_5 = async { b5 = b4 * 0.0 + buf[17] }
completion s2_6 = async { b6 = b5 * 0.0 + buf[18] }
completion s2_7 = async { b7 = b6 * 0.0 + buf[19] }
completion s2_8 = async { b8 = b7 * 0.0 + buf[20] }
completion s2_9 = async { b9 = b8 * 0.0 + buf[21] }
completion s2_10 = async { b10 = b9 * 0.0 + buf[22] }
completion s2_11 = async { b11 = b10 * 0.0 + buf[23] }
awaitall

// Accumulate all 24 stage variables into the single output element.
output_val = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7
output_val = output_val + a8 + a9 + a10 + a11
output_val = output_val + b0 + b1 + b2 + b3 + b4 + b5 + b6 + b7
output_val = output_val + b8 + b9 + b10 + b11
out_buf[0] = output_val

await send(out_buf, output[i, j])
}
}
23 changes: 12 additions & 11 deletions tests/spatial_ir/test_task_recycling.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@


def _load_sample_kernel():
    """Parse the merge sample for the current architecture and constexpr-propagate it.

    The sample file name is suffixed with ``constants.ARCH``
    (``task_recycling_merge_<arch>.sptl``) and is looked up under
    ``../csl_runtime/samples`` relative to this test module.

    Returns:
        The result of ``passes.constexpr_propagation`` applied to the parsed kernel.
    """
    # Only the arch-specific path is built; the older hard-coded
    # 'task_recycling_merge.sptl' assignment was immediately overwritten.
    sample = os.path.join(
        os.path.dirname(__file__), '..', 'csl_runtime', 'samples', f'task_recycling_merge_{constants.ARCH}.sptl')
    kernel = parser.parse_file(sample)
    return passes.constexpr_propagation(kernel)

Expand Down Expand Up @@ -95,12 +96,12 @@ def test_task_recycling_plan_reuses_local_slots():
# Conflict graph helpers
# ---------------------------------------------------------------------------


def _create_fork_tasks():
    """Task 0 activates tasks 1 and 2 independently; 1 and 2 are concurrent.

    Returns:
        A list of exactly three unblocked local ``tdag.CSLTask`` objects with
        indices 0, 1 and 2.
    """
    # Task 0 appears once: a second entry with index 0 would contradict the
    # three-task fork described above.
    return [
        tdag.CSLTask(0,
                     'local', [0], [(1, tdag.InterTaskEdge.ACTIVATE), (2, tdag.InterTaskEdge.ACTIVATE)],
                     blocked=False),
        tdag.CSLTask(1, 'local', [1], [(-1, tdag.InterTaskEdge.SEQUENCE)], blocked=False),
        tdag.CSLTask(2, 'local', [2], [(-1, tdag.InterTaskEdge.SEQUENCE)], blocked=False),
    ]
Expand All @@ -109,9 +110,8 @@ def _create_fork_tasks():
def _create_diamond_tasks():
"""0 forks into 1 and 2, which join at the blocked task 3."""
return [
tdag.CSLTask(0, 'local', [0],
[(1, tdag.InterTaskEdge.ACTIVATE), (2, tdag.InterTaskEdge.ACTIVATE)],
blocked=False),
tdag.CSLTask(
0, 'local', [0], [(1, tdag.InterTaskEdge.ACTIVATE), (2, tdag.InterTaskEdge.ACTIVATE)], blocked=False),
tdag.CSLTask(1, 'local', [1], [(3, tdag.InterTaskEdge.ACTIVATE)], blocked=False),
tdag.CSLTask(2, 'local', [2], [(3, tdag.InterTaskEdge.UNBLOCK)], blocked=False),
tdag.CSLTask(3, 'local', [3], [(-1, tdag.InterTaskEdge.SEQUENCE)], blocked=True),
Expand Down Expand Up @@ -153,15 +153,14 @@ def test_no_conflicting_tasks_share_slot():
for slot in plan.local_slots:
for i, a in enumerate(slot.task_indices):
for b in slot.task_indices[i + 1:]:
assert b not in conflict_graph[a], (
f'Tasks {a} and {b} conflict but were placed in the same slot'
)
assert b not in conflict_graph[a], (f'Tasks {a} and {b} conflict but were placed in the same slot')


# ---------------------------------------------------------------------------
# greedy_coloring helper
# ---------------------------------------------------------------------------


def test_greedy_coloring_max_colors_infeasible():
"""A triangle (K3) needs 3 colors; requesting 2 must return None."""
k3 = {0: {1, 2}, 1: {0, 2}, 2: {0, 1}}
Expand Down Expand Up @@ -205,6 +204,7 @@ def test_greedy_coloring_fixed_conflict_honored():
# plan_task_bindings mode behavior
# ---------------------------------------------------------------------------


def test_fail_on_overrun_raises():
tasks = _create_linear_local_tasks(len(constants.LOCAL_TASK_IDS) + 1)
with pytest.raises(ValueError, match='Too many local tasks'):
Expand Down Expand Up @@ -236,6 +236,7 @@ def test_empty_task_list_returns_empty_plan():
# Transition preamble
# ---------------------------------------------------------------------------


def test_transition_preamble_empty_for_non_recycled():
tasks = _create_linear_local_tasks(len(constants.LOCAL_TASK_IDS))
plan = task_recycling.plan_task_bindings(tasks, tdag.TaskCreationBehavior.STATE_MACHINE_ON_OVERRUN)
Expand All @@ -244,11 +245,11 @@ def test_transition_preamble_empty_for_non_recycled():
assert plan.emit_local_transition_preamble(i, blocked=True) == ''



# ---------------------------------------------------------------------------
# Determinism
# ---------------------------------------------------------------------------


def test_plan_is_deterministic():
tasks = _create_linear_local_tasks(len(constants.LOCAL_TASK_IDS) + 5)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like some formatter was used that is different from what we use (yapf): https://github.com/spcl/spada/blob/main/.style.yapf

This creates a larger diff than it should be here.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the README, the required style formatter for contributing is black (which is also installed as part of the dev environment). I did not know that yapf should be used; I will correct that!

plan1 = task_recycling.plan_task_bindings(tasks, tdag.TaskCreationBehavior.STATE_MACHINE_ON_OVERRUN)
Expand Down
Loading
Loading