Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/accumulate/accumulate_1d_cpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ function accumulate_1d_cpu!(
@inbounds begin
if itask != 1
for i in irange
v[i] = op(v[i], shared[itask - 1])
v[i] = op(shared[itask - 1], v[i])
end
end
end
Expand Down
15 changes: 10 additions & 5 deletions src/accumulate/accumulate_1d_gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ end
_ai += conflict_free_offset(_ai)
_bi += conflict_free_offset(_bi)

temp[_bi + 0x1] = op(temp[_bi + 0x1], temp[_ai + 0x1])
temp[_bi + 0x1] = op(temp[_ai + 0x1], temp[_bi + 0x1])
end

offset = offset << 0x1
Expand Down Expand Up @@ -173,10 +173,10 @@ end
if UnsafeAtomics.load(pointer(flags, inspected_block + 0x1), UnsafeAtomics.monotonic) == ACC_FLAG_A
UnsafeAtomics.fence(UnsafeAtomics.acquire) # (fence before reading from v)
# Previous blocks (except last) always have filled values in v, so index is inbounds
running_prefix = op(running_prefix, v[(inspected_block + 0x1) * block_size * 0x2])
running_prefix = op(v[(inspected_block + 0x1) * block_size * 0x2], running_prefix)
break
else
running_prefix = op(running_prefix, prefixes[inspected_block + 0x1])
running_prefix = op(prefixes[inspected_block + 0x1], running_prefix)
end

inspected_block -= 0x1
Expand Down Expand Up @@ -236,8 +236,13 @@ end
# along the chunks. We need to accumulate the prefixes of the previous chunks into
# running_prefix.
num_preblocks = (iblock - 0x1) ÷ (block_size * 0x2)
for i in 0x1:num_preblocks
running_prefix = op(running_prefix, prefixes[i * block_size * 0x2])
if num_preblocks >= 0x1
# Accumulate earlier chunk prefixes left-to-right, then prepend to running_prefix
chunk_prefix = prefixes[0x1 * block_size * 0x2]
for i in 0x2:num_preblocks
chunk_prefix = op(chunk_prefix, prefixes[i * block_size * 0x2])
end
running_prefix = op(chunk_prefix, running_prefix)
end

# Now we have aggregate prefix of all previous blocks, add it to all our elements
Expand Down
58 changes: 58 additions & 0 deletions test/accumulate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,64 @@ end
temp=array_from_host(zeros(Int32, 3, 4, 1)),
)
end
# Plain-data 2x2 integer matrix with entries laid out as [a b; c d].
# Multiplying such matrices is associative but not commutative, which makes the
# type useful for detecting swapped operands in a scan.
struct Mat2x2
    a::Int32
    b::Int32
    c::Int32
    d::Int32
end

# Returns the identity matrix, not the all-zero matrix.
# NOTE(review): presumably defined so that code asking for `zero(Mat2x2)` receives
# the neutral element of `mat2_mul` — confirm against the callers.
function Base.zero(::Type{Mat2x2})
    return Mat2x2(1, 0, 0, 1)  # identity matrix
end

# Row-by-column product `x * y` of two 2x2 matrices; entries use Int32 arithmetic.
@inline function mat2_mul(x::Mat2x2, y::Mat2x2)
    top_left = x.a * y.a + x.b * y.c
    top_right = x.a * y.b + x.b * y.d
    bottom_left = x.c * y.a + x.d * y.c
    bottom_right = x.c * y.b + x.d * y.d
    return Mat2x2(top_left, top_right, bottom_left, bottom_right)
end

# Identity matrix: the neutral element of `mat2_mul`.
const mat2_id = Mat2x2(one(Int32), zero(Int32), zero(Int32), one(Int32))

@testset "accumulate_1d_noncommutative $(alg isa AK.DecoupledLookback ? "DL" : "SP")" for alg in ALGS
    # Multiplying 2x2 matrices is associative yet non-commutative, so comparing against
    # Base.accumulate reveals any place where the scan evaluates op(right, left)
    # instead of op(left, right).

    # Confirm the operator really has the properties this test relies on
    lhs = Mat2x2(1, 2, 3, 4)
    rhs = Mat2x2(5, 6, 7, 8)
    @test mat2_mul(lhs, rhs) != mat2_mul(rhs, lhs)
    third = Mat2x2(1, 0, 1, 1)
    @test mat2_mul(mat2_mul(lhs, rhs), third) == mat2_mul(lhs, mat2_mul(rhs, third))

    # Two-element input
    host_pair = [Mat2x2(1, 2, 3, 4), Mat2x2(0, 1, 1, 0)]
    device_pair = array_from_host(host_pair)
    scanned_pair = AK.accumulate(mat2_mul, device_pair; init=mat2_id, neutral=mat2_id, alg)
    @test Array(scanned_pair) == accumulate(mat2_mul, host_pair)

    # Matrix with four entries drawn (in a, b, c, d order) from -3:3
    entry_range = Int32(-3):Int32(3)
    random_matrix = () -> Mat2x2(
        rand(entry_range), rand(entry_range),
        rand(entry_range), rand(entry_range),
    )

    # Randomly sized inputs with the default launch parameters
    Random.seed!(42)
    for _ in 1:100
        len = rand(2:10_000)
        host = [random_matrix() for _ in 1:len]
        device = array_from_host(host)
        reference = accumulate(mat2_mul, host)
        scanned = AK.accumulate(mat2_mul, device; init=mat2_id, neutral=mat2_id, alg)
        @test Array(scanned) == reference
    end

    # Same check with block_size=16, intended to exercise the multi-block path
    for _ in 1:100
        len = rand(2:10_000)
        host = [random_matrix() for _ in 1:len]
        device = array_from_host(host)
        reference = accumulate(mat2_mul, host)
        scanned = AK.accumulate(
            mat2_mul, device;
            init=mat2_id, neutral=mat2_id, block_size=16, alg,
        )
        @test Array(scanned) == reference
    end
end


@testset "cumsum" begin

Random.seed!(0)
Expand Down