From 691f8350e5d692bda3bd9644dd26bc84bb6548db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Thu, 28 May 2026 19:51:49 +0200 Subject: [PATCH 1/2] preload non-block values as well --- src/reverse_mode.jl | 2 +- src/sizes.jl | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl index 39fe286..72987f5 100644 --- a/src/reverse_mode.jl +++ b/src/reverse_mode.jl @@ -177,7 +177,7 @@ function _forward_eval( # elseif node.type == Nonlinear.NODE_MOI_VARIABLE # f.forward_storage[k] = x[node.index] elseif node.type == NODE_VALUE - f.forward_storage[j] = f.const_values[node.index] + # Pre-loaded into `forward_storage` at construction. elseif node.type == NODE_VARIABLE_BLOCK # Contiguous-to-contiguous copy from `x` into the tape: on CPU a # `copyto!`, on GPU a single `cudaMemcpy`. This is the fast path diff --git a/src/sizes.jl b/src/sizes.jl index 0a853ef..9ae7a77 100644 --- a/src/sizes.jl +++ b/src/sizes.jl @@ -700,10 +700,13 @@ struct _SubexpressionStorage{T<:Real,S<:AbstractVector{T}} ) where {T<:Real,S<:AbstractVector{T}} sizes = _infer_sizes(nodes, adj, block_shapes, const_values, operators) N = _length(sizes) - # Pre-load value blocks into forward_storage once at construction; - # each block is a contiguous-to-contiguous bulk copy. Individual - # `NODE_VALUE` scalars (rare — exponents, constant divisors, etc) and - # variable nodes are loaded by `_forward_eval` in the per-node loop. + # Pre-load all constants (value blocks and individual `NODE_VALUE` + # scalars like exponents or constant divisors) into a host buffer here, + # then bulk-copy to `forward_storage` once. With a `CuVector` backing + # `S`, this avoids a per-call `cuMemcpyHtoDAsync` for every scalar + # `NODE_VALUE` in `_forward_eval` — those scalar setindex writes show + # up as dozens of microseconds of host launch overhead per gradient + # call. Variable nodes are still loaded per-call. cpu_buffer = zeros(T, N) for k in 1:length(nodes) node = nodes[k] @@ -712,6 +715,9 @@ struct _SubexpressionStorage{T<:Real,S<:AbstractVector{T}} len = _length(sizes, k) cpu_buffer[j:(j+len-1)] .= view(const_values, (node.index):(node.index+len-1)) + elseif node.type == NODE_VALUE + cpu_buffer[sizes.storage_offset[k]+1] = + const_values[node.index] end end forward_storage = convert(S, cpu_buffer) From 4fec915449e8d62a766c8c37d3280d703c621160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Thu, 28 May 2026 21:05:39 +0200 Subject: [PATCH 2/2] Fix format --- src/sizes.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/sizes.jl b/src/sizes.jl index 9ae7a77..bf5469d 100644 --- a/src/sizes.jl +++ b/src/sizes.jl @@ -716,8 +716,7 @@ struct _SubexpressionStorage{T<:Real,S<:AbstractVector{T}} cpu_buffer[j:(j+len-1)] .= view(const_values, (node.index):(node.index+len-1)) elseif node.type == NODE_VALUE - cpu_buffer[sizes.storage_offset[k]+1] = - const_values[node.index] + cpu_buffer[sizes.storage_offset[k]+1] = const_values[node.index] end end forward_storage = convert(S, cpu_buffer)