From 4641eb0713d0825744295db784a297e1a0d35286 Mon Sep 17 00:00:00 2001
From: jariji <96840304+jariji@users.noreply.github.com>
Date: Wed, 4 Mar 2026 23:17:47 +0000
Subject: [PATCH] Fix operand order in accumulate for non-commutative operators

The scan implementation applied `op(right, left)` instead of
`op(left, right)` in several places, producing wrong results for
associative-but-non-commutative operators (e.g. matrix multiply).
This was invisible in existing tests since they all used `+`.

Fixes:
- CPU: cross-task prefix combination (accumulate_1d_cpu.jl)
- GPU: up-sweep reduction (accumulate_1d_gpu.jl line 70)
- GPU: decoupled lookback prefix accumulation (lines 176, 179)
- GPU: coupled preblocks prefix accumulation (lines 239-241)

Adds test using 2x2 matrix multiplication as a non-commutative op.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/accumulate/accumulate_1d_cpu.jl |  2 +-
 src/accumulate/accumulate_1d_gpu.jl | 15 +++++---
 test/accumulate.jl                  | 58 +++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/src/accumulate/accumulate_1d_cpu.jl b/src/accumulate/accumulate_1d_cpu.jl
index 9f45ada..67e7444 100644
--- a/src/accumulate/accumulate_1d_cpu.jl
+++ b/src/accumulate/accumulate_1d_cpu.jl
@@ -59,7 +59,7 @@ function accumulate_1d_cpu!(
         @inbounds begin
             if itask != 1
                 for i in irange
-                    v[i] = op(v[i], shared[itask - 1])
+                    v[i] = op(shared[itask - 1], v[i])
                 end
             end
         end
diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl
index 9202692..dbc775a 100644
--- a/src/accumulate/accumulate_1d_gpu.jl
+++ b/src/accumulate/accumulate_1d_gpu.jl
@@ -67,7 +67,7 @@ end
             _ai += conflict_free_offset(_ai)
             _bi += conflict_free_offset(_bi)
 
-            temp[_bi + 0x1] = op(temp[_bi + 0x1], temp[_ai + 0x1])
+            temp[_bi + 0x1] = op(temp[_ai + 0x1], temp[_bi + 0x1])
         end
 
         offset = offset << 0x1
@@ -173,10 +173,10 @@ end
         if UnsafeAtomics.load(pointer(flags, inspected_block + 0x1), UnsafeAtomics.monotonic) == ACC_FLAG_A
             UnsafeAtomics.fence(UnsafeAtomics.acquire) # (fence before reading from v)
             # Previous blocks (except last) always have filled values in v, so index is inbounds
-            running_prefix = op(running_prefix, v[(inspected_block + 0x1) * block_size * 0x2])
+            running_prefix = op(v[(inspected_block + 0x1) * block_size * 0x2], running_prefix)
             break
         else
-            running_prefix = op(running_prefix, prefixes[inspected_block + 0x1])
+            running_prefix = op(prefixes[inspected_block + 0x1], running_prefix)
         end
 
         inspected_block -= 0x1
@@ -236,8 +236,13 @@ end
     #     along the chunks. We need to accumulate the prefixes of the previous chunks into
     #     running_prefix.
     num_preblocks = (iblock - 0x1) ÷ (block_size * 0x2)
-    for i in 0x1:num_preblocks
-        running_prefix = op(running_prefix, prefixes[i * block_size * 0x2])
+    if num_preblocks >= 0x1
+        # Accumulate earlier chunk prefixes left-to-right, then prepend to running_prefix
+        chunk_prefix = prefixes[0x1 * block_size * 0x2]
+        for i in 0x2:num_preblocks
+            chunk_prefix = op(chunk_prefix, prefixes[i * block_size * 0x2])
+        end
+        running_prefix = op(chunk_prefix, running_prefix)
     end
 
     # Now we have aggregate prefix of all previous blocks, add it to all our elements
diff --git a/test/accumulate.jl b/test/accumulate.jl
index 5022e38..2a3cc02 100644
--- a/test/accumulate.jl
+++ b/test/accumulate.jl
@@ -229,6 +229,64 @@ end
         temp=array_from_host(zeros(Int32, 3, 4, 1)),
     )
 end
+# 2x2 matrix stored as a flat struct — matrix multiply is associative but not commutative
+struct Mat2x2
+    a::Int32; b::Int32
+    c::Int32; d::Int32
+end
+
+Base.zero(::Type{Mat2x2}) = Mat2x2(1, 0, 0, 1)  # identity matrix
+
+@inline mat2_mul(x::Mat2x2, y::Mat2x2) = Mat2x2(
+    x.a*y.a + x.b*y.c, x.a*y.b + x.b*y.d,
+    x.c*y.a + x.d*y.c, x.c*y.b + x.d*y.d,
+)
+
+const mat2_id = Mat2x2(Int32(1), Int32(0), Int32(0), Int32(1))
+
+@testset "accumulate_1d_noncommutative $(alg isa AK.DecoupledLookback ? "DL" : "SP")" for alg in ALGS
+    # 2x2 matrix multiplication is associative but NOT commutative.
+    # This test verifies that the scan computes op(left, right), not op(right, left).
+
+    # Sanity checks
+    A = Mat2x2(1, 2, 3, 4)
+    B = Mat2x2(5, 6, 7, 8)
+    @test mat2_mul(A, B) != mat2_mul(B, A)
+    C = Mat2x2(1, 0, 1, 1)
+    @test mat2_mul(mat2_mul(A, B), C) == mat2_mul(A, mat2_mul(B, C))
+
+    # Small case
+    data_h = [Mat2x2(1, 2, 3, 4), Mat2x2(0, 1, 1, 0)]
+    data = array_from_host(data_h)
+    result = AK.accumulate(mat2_mul, data; init=mat2_id, neutral=mat2_id, alg)
+    expected = accumulate(mat2_mul, data_h)
+    @test Array(result) == expected
+
+    # Larger random test
+    Random.seed!(42)
+    for _ in 1:100
+        n = rand(2:10_000)
+        h = [Mat2x2(rand(Int32(-3):Int32(3)), rand(Int32(-3):Int32(3)),
+                     rand(Int32(-3):Int32(3)), rand(Int32(-3):Int32(3))) for _ in 1:n]
+        d = array_from_host(h)
+        expected = accumulate(mat2_mul, h)
+        result = Array(AK.accumulate(mat2_mul, d; init=mat2_id, neutral=mat2_id, alg))
+        @test result == expected
+    end
+
+    # Small block size to exercise multi-block path
+    for _ in 1:100
+        n = rand(2:10_000)
+        h = [Mat2x2(rand(Int32(-3):Int32(3)), rand(Int32(-3):Int32(3)),
+                     rand(Int32(-3):Int32(3)), rand(Int32(-3):Int32(3))) for _ in 1:n]
+        d = array_from_host(h)
+        expected = accumulate(mat2_mul, h)
+        result = Array(AK.accumulate(mat2_mul, d; init=mat2_id, neutral=mat2_id, block_size=16, alg))
+        @test result == expected
+    end
+end
+
+
 @testset "cumsum" begin
 
     Random.seed!(0)