From 4641eb0713d0825744295db784a297e1a0d35286 Mon Sep 17 00:00:00 2001 From: jariji <96840304+jariji@users.noreply.github.com> Date: Wed, 4 Mar 2026 23:17:47 +0000 Subject: [PATCH] Fix operand order in accumulate for non-commutative operators The scan implementation applied `op(right, left)` instead of `op(left, right)` in several places, producing wrong results for associative-but-non-commutative operators (e.g. matrix multiply). This was invisible in existing tests since they all used `+`. Fixes: - CPU: cross-task prefix combination (accumulate_1d_cpu.jl) - GPU: up-sweep reduction (accumulate_1d_gpu.jl line 70) - GPU: decoupled lookback prefix accumulation (lines 176, 179) - GPU: coupled preblocks prefix accumulation (lines 239-241) Adds test using 2x2 matrix multiplication as a non-commutative op. Co-Authored-By: Claude Opus 4.6 --- src/accumulate/accumulate_1d_cpu.jl | 2 +- src/accumulate/accumulate_1d_gpu.jl | 15 +++++--- test/accumulate.jl | 58 +++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 6 deletions(-) diff --git a/src/accumulate/accumulate_1d_cpu.jl b/src/accumulate/accumulate_1d_cpu.jl index 9f45ada..67e7444 100644 --- a/src/accumulate/accumulate_1d_cpu.jl +++ b/src/accumulate/accumulate_1d_cpu.jl @@ -59,7 +59,7 @@ function accumulate_1d_cpu!( @inbounds begin if itask != 1 for i in irange - v[i] = op(v[i], shared[itask - 1]) + v[i] = op(shared[itask - 1], v[i]) end end end diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl index 9202692..dbc775a 100644 --- a/src/accumulate/accumulate_1d_gpu.jl +++ b/src/accumulate/accumulate_1d_gpu.jl @@ -67,7 +67,7 @@ end _ai += conflict_free_offset(_ai) _bi += conflict_free_offset(_bi) - temp[_bi + 0x1] = op(temp[_bi + 0x1], temp[_ai + 0x1]) + temp[_bi + 0x1] = op(temp[_ai + 0x1], temp[_bi + 0x1]) end offset = offset << 0x1 @@ -173,10 +173,10 @@ end if UnsafeAtomics.load(pointer(flags, inspected_block + 0x1), UnsafeAtomics.monotonic) == ACC_FLAG_A UnsafeAtomics.fence(UnsafeAtomics.acquire) # (fence before reading from v) # Previous blocks (except last) always have filled values in v, so index is inbounds - running_prefix = op(running_prefix, v[(inspected_block + 0x1) * block_size * 0x2]) + running_prefix = op(v[(inspected_block + 0x1) * block_size * 0x2], running_prefix) break else - running_prefix = op(running_prefix, prefixes[inspected_block + 0x1]) + running_prefix = op(prefixes[inspected_block + 0x1], running_prefix) end inspected_block -= 0x1 @@ -236,8 +236,13 @@ end # along the chunks. We need to accumulate the prefixes of the previous chunks into # running_prefix. num_preblocks = (iblock - 0x1) ÷ (block_size * 0x2) - for i in 0x1:num_preblocks - running_prefix = op(running_prefix, prefixes[i * block_size * 0x2]) + if num_preblocks >= 0x1 + # Accumulate earlier chunk prefixes left-to-right, then prepend to running_prefix + chunk_prefix = prefixes[0x1 * block_size * 0x2] + for i in 0x2:num_preblocks + chunk_prefix = op(chunk_prefix, prefixes[i * block_size * 0x2]) + end + running_prefix = op(chunk_prefix, running_prefix) end # Now we have aggregate prefix of all previous blocks, add it to all our elements diff --git a/test/accumulate.jl b/test/accumulate.jl index 5022e38..2a3cc02 100644 --- a/test/accumulate.jl +++ b/test/accumulate.jl @@ -229,6 +229,64 @@ end temp=array_from_host(zeros(Int32, 3, 4, 1)), ) end +# 2x2 matrix stored as a flat struct — matrix multiply is associative but not commutative +struct Mat2x2 + a::Int32; b::Int32 + c::Int32; d::Int32 +end + +Base.zero(::Type{Mat2x2}) = Mat2x2(1, 0, 0, 1) # identity matrix + +@inline mat2_mul(x::Mat2x2, y::Mat2x2) = Mat2x2( + x.a*y.a + x.b*y.c, x.a*y.b + x.b*y.d, + x.c*y.a + x.d*y.c, x.c*y.b + x.d*y.d, +) + +const mat2_id = Mat2x2(Int32(1), Int32(0), Int32(0), Int32(1)) + +@testset "accumulate_1d_noncommutative $(alg isa AK.DecoupledLookback ? "DL" : "SP")" for alg in ALGS + # 2x2 matrix multiplication is associative but NOT commutative. + # This test verifies that the scan computes op(left, right), not op(right, left). + + # Sanity checks + A = Mat2x2(1, 2, 3, 4) + B = Mat2x2(5, 6, 7, 8) + @test mat2_mul(A, B) != mat2_mul(B, A) + C = Mat2x2(1, 0, 1, 1) + @test mat2_mul(mat2_mul(A, B), C) == mat2_mul(A, mat2_mul(B, C)) + + # Small case + data_h = [Mat2x2(1, 2, 3, 4), Mat2x2(0, 1, 1, 0)] + data = array_from_host(data_h) + result = AK.accumulate(mat2_mul, data; init=mat2_id, neutral=mat2_id, alg) + expected = accumulate(mat2_mul, data_h) + @test Array(result) == expected + + # Larger random test + Random.seed!(42) + for _ in 1:100 + n = rand(2:10_000) + h = [Mat2x2(rand(Int32(-3):Int32(3)), rand(Int32(-3):Int32(3)), + rand(Int32(-3):Int32(3)), rand(Int32(-3):Int32(3))) for _ in 1:n] + d = array_from_host(h) + expected = accumulate(mat2_mul, h) + result = Array(AK.accumulate(mat2_mul, d; init=mat2_id, neutral=mat2_id, alg)) + @test result == expected + end + + # Small block size to exercise multi-block path + for _ in 1:100 + n = rand(2:10_000) + h = [Mat2x2(rand(Int32(-3):Int32(3)), rand(Int32(-3):Int32(3)), + rand(Int32(-3):Int32(3)), rand(Int32(-3):Int32(3))) for _ in 1:n] + d = array_from_host(h) + expected = accumulate(mat2_mul, h) + result = Array(AK.accumulate(mat2_mul, d; init=mat2_id, neutral=mat2_id, block_size=16, alg)) + @test result == expected + end +end + + @testset "cumsum" begin Random.seed!(0)