From cd517127536ba786ca52ea4b660f70adb20068f0 Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Thu, 2 Apr 2026 16:04:21 +0530 Subject: [PATCH 1/5] rewrite L-BFGS with custom Strong Wolfe line search Signed-off-by: AdityaPandeyCN --- Project.toml | 2 +- src/hybrid.jl | 253 ++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 206 insertions(+), 49 deletions(-) diff --git a/Project.toml b/Project.toml index a4c76f8..05e182e 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" @@ -39,7 +40,6 @@ julia = "1.10" [extras] JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/hybrid.jl b/src/hybrid.jl index 60f80ed..8b0302d 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -1,69 +1,226 @@ -@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol) +using LinearAlgebra: norm, dot +using KernelAbstractions +using SciMLBase +using Optimization + +# Clamp x to interior of bounds and evaluate objective +@inline function _safe_eval(f, p, x, lb, ub) + T = eltype(x) + ε = T(1e-6) + xc = clamp.(x, lb .+ ε, ub .- ε) + xc = map(xi -> abs(xi) < ε ? ε : xi, xc) + v = f(xc, p) + ifelse(isfinite(v), v, T(Inf)) +end + +# Clamp x to interior of bounds and evaluate gradient +@inline function _safe_grad(grad_f, p, x, lb, ub) + T = eltype(x) + ε = T(1e-6) + xc = clamp.(x, lb .+ ε, ub .- ε) + xc = map(xi -> abs(xi) < ε ? 
ε : xi, xc) + g = grad_f(xc, p) + map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) +end + +# Cubic interpolation for line search; falls back to bisection if non-convex +@inline function _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi) + d1 = dϕ_lo + dϕ_hi - 3 * (ϕ_lo - ϕ_hi) / (a_lo - a_hi) + desc = d1 * d1 - dϕ_lo * dϕ_hi + # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff) + d2 = sqrt(max(desc, zero(desc))) + ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) +end + +# Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6) +@inline function _zoom(f, grad_f, p, x, dir, lb, ub, a_lo, a_hi, ϕ_0, dϕ_0, ϕ_lo, c1, c2) + T = eltype(x) + xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, _safe_grad(grad_f, p, x, lb, ub), false + done = false + for _ in 1:10 + if !done + a_j = _interpolate(a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), + dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir), + dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir)) + a_j = clamp(a_j, min(a_lo, a_hi) + T(1e-3), max(a_lo, a_hi) - T(1e-3)) + xn_j = clamp.(x + a_j * dir, lb, ub) + ϕ_j = _safe_eval(f, p, xn_j, lb, ub) + if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo) + a_hi = a_j + else + gn_j = _safe_grad(grad_f, p, xn_j, lb, ub) + dϕ_j = dot(gn_j, dir) + if abs(dϕ_j) <= -c2 * dϕ_0 + xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true + else + if dϕ_j * (a_hi - a_lo) >= zero(T); a_hi = a_lo; end + a_lo, ϕ_lo = a_j, ϕ_j + end + end + end + end + if !done + xn_lo = clamp.(x + a_lo * dir, lb, ub) + xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub) + end + (xn_out, ϕ_out, gn_out, ok_out) +end + +# Strong Wolfe line search (Nocedal & Wright Algorithm 3.5) +@inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) + T = eltype(x) + c1, c2 = T(1e-4), T(0.9) + dϕ_0 = dot(g, dir) + xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false + if dϕ_0 < zero(T) + a_prev, a_i, ϕ_0, ϕ_prev, done = zero(T), one(T), fx, fx, false + for i in 1:10 + if !done + xn = clamp.(x + a_i * dir, lb, ub) + ϕ_i = _safe_eval(f, p, xn, lb, ub) + if (ϕ_i > ϕ_0 + c1 * a_i * dϕ_0) || (ϕ_i >= ϕ_prev && i > 1) + xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_prev, a_i, ϕ_0, dϕ_0, ϕ_prev, c1, c2)..., true + else + gn_i = _safe_grad(grad_f, p, xn, lb, ub) + dϕ_i = dot(gn_i, dir) + if abs(dϕ_i) <= -c2 * dϕ_0 + xn_out, ϕ_out, gn_out, ok_out, done = xn, ϕ_i, gn_i, true, true + elseif dϕ_i >= zero(T) + xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_i, a_prev, ϕ_0, dϕ_0, ϕ_i, c1, c2)..., true + else + a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2.0) + end + end + end + end + if !done + xn = clamp.(x + a_prev * dir, lb, ub) + xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true + end + end + (xn_out, ϕ_out, gn_out, ok_out) +end + +# L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) +@inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M} + T = eltype(g) + q, a = g, ntuple(_ -> zero(T), Val(M)) + for j in 0:M-1 + idx = k - j + if idx >= 1 + ii = mod1(idx, M) + a = Base.setindex(a, Rho[ii] * dot(S[ii], q), ii) + q = q - a[ii] * Y[ii] + end + end + kk = mod1(k, M) + yy = sum(abs2, Y[kk]) + γ = ifelse(k >= 1 && yy > T(1e-30), dot(S[kk], Y[kk]) / yy, one(T)) + γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T)) + r = 
γ * q
+    for j in M-1:-1:0
+        idx = k - j
+        if idx >= 1
+            ii = mod1(idx, M)
+            r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii]
+        end
+    end
+    -r
+end
+
+# Run L-BFGS independently on each starting point
+@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, lb, ub, maxiters, ::Val{M}) where {M}
     i = @index(Global, Linear)
-    nlcache = remake(nlprob; u0 = x0s[i])
-    sol = solve(nlcache, opt; maxiters, abstol, reltol)
-    @inbounds result[i] = sol.u
+    x = clamp.(x0s[i], lb, ub)
+    T = eltype(x)
+    z = zero(typeof(x))
+    S, Y = ntuple(_ -> z, Val(M)), ntuple(_ -> z, Val(M))
+    Rho = ntuple(_ -> zero(T), Val(M))
+    fx = _safe_eval(f, p, x, lb, ub)
+    g = _safe_grad(grad_f, p, x, lb, ub)
+    k, active = 0, isfinite(fx) && all(isfinite, g)
+    for _ in 1:maxiters
+        if active && norm(g) >= T(1e-7)
+            dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k)
+            if dot(g, dir) >= zero(T); dir, k = -g, 0; end
+            xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
+            if !ok; xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0; end
+            if ok && isfinite(fn) && all(isfinite, gn)
+                s, y = xn - x, gn - g
+                sy = dot(s, y)
+                if isfinite(one(T) / sy) && sy > T(1e-10)
+                    k += 1; ii = mod1(k, M)
+                    S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii)
+                else
+                    k = 0
+                end
+                x, g, fx = xn, gn, fn
+            else
+                active = false
+            end
+        end
+    end
+    @inbounds result[i] = x
 end
 
+# Main solve: runs PSO for global exploration, then L-BFGS for local refinement
 function SciMLBase.solve!(
         cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...;
-        abstol = nothing,
-        reltol = nothing,
-        maxiters = 100, local_maxiters = 10, kwargs...
-    ) where {
-        Backend, LocalOpt <: Union{LBFGS, BFGS},
-    }
-    pso_cache = cache.pso_cache
+        abstol = nothing, reltol = nothing, maxiters = 100,
+        local_maxiters = 50, n_starts = 20, kwargs...
+    ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}}
 
-    sol_pso = solve!(pso_cache)
-    x0s = sol_pso.original
+    # Phase 1: Global search with PSO
+    sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters)
 
-    backend = opt.backend
+    prob = cache.prob
+    f_raw, p = prob.f.f, prob.p
+    lb = prob.lb === nothing ? convert.(eltype(prob.u0), -Inf) : prob.lb
+    ub = prob.ub === nothing ? convert.(eltype(prob.u0), Inf) : prob.ub
 
-    prob = remake(cache.prob, lb = nothing, ub = nothing)
+    best_u = sol_pso.u
+    best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[]
 
-    result = cache.start_points
-    copyto!(result, x0s)
+    _obj(x) = let v = prob.f(clamp.(x, lb, ub), p)
+        (isnan(v) || isinf(v)) ? convert(eltype(best_obj), Inf) : v
+    end
 
-    ∇f = instantiate_gradient(prob.f.f, prob.f.adtype)
+    # Build candidate pool: inject global best into particle positions
+    x0s = copy(sol_pso.original)
+    x0s[1] = best_u
 
-    kernel = simplebfgs_run!(backend)
-    nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(∇f, prob.u0, prob.p)
+    costs = map(_obj, x0s)
+    n = min(n_starts, length(x0s))
+    pool = [x0s[j] for j in partialsortperm(Vector(costs), 1:n)]
 
-    nlalg = opt.local_opt isa LBFGS ?
-            SimpleLimitedMemoryBroyden(;
-        threshold = opt.local_opt.threshold,
-        linesearch = Val(true)
-    ) : SimpleBroyden(; linesearch = Val(true))
+    D = length(first(pool))
+    m_val = D > 20 ? Val(5) : Val(10)
+    grad_f = instantiate_gradient(f_raw, prob.f.adtype)
 
     t0 = time()
-    kernel(
-        nlprob,
-        x0s,
-        result,
-        nlalg,
-        local_maxiters,
-        abstol,
-        reltol;
-        ndrange = length(x0s)
-    )
-    sol_bfgs = (x -> prob.f(x, prob.p)).(result)
-    sol_bfgs = (x -> isnan(x) ? convert(eltype(prob.u0), Inf) : x).(sol_bfgs)
+    result = similar(pool)
+    copyto!(result, pool)
 
-    minobj, ind = findmin(sol_bfgs)
-    sol_u,
-    sol_obj = minobj > sol_pso.objective ? (sol_pso.u, sol_pso.objective) :
-              (view(result, ind), minobj)
-    t1 = time()
+    # Phase 2: Local refinement with L-BFGS on top candidates
+    kernel = lbfgs_kernel!(opt.backend)
+    kernel(f_raw, grad_f, p, pool, result, lb, ub, local_maxiters, m_val; ndrange = n)
+    KernelAbstractions.synchronize(opt.backend)
 
-    # @show sol_pso.stats.time
+    # Select best result across all refined candidates
+    for j in 1:n
+        r = clamp.(result[j], lb, ub)
+        fval = _obj(r)
+        if fval < best_obj
+            best_obj = fval
+            best_u = r
+        end
+    end
 
-    solve_time = (t1 - t0) + sol_pso.stats.time
+    solve_time = (time() - t0) + sol_pso.stats.time
 
-    return SciMLBase.build_solution(
-        SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
-        sol_u, sol_obj,
+    SciMLBase.build_solution(
+        SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj;
        stats = Optimization.OptimizationStats(; time = solve_time)
     )
-end
+end
\ No newline at end of file
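Reviewer note on patch 1: `_strong_wolfe` accepts a step only when both Strong Wolfe conditions hold, sufficient decrease (Armijo) and the curvature bound, with c1 = 1e-4 and c2 = 0.9 as in Nocedal & Wright. The two tests can be checked in isolation on the CPU. A minimal sketch with plain vectors; `wolfe_ok` and the quadratic objective are illustrative names, not part of the patch:

    using LinearAlgebra: dot

    f(x) = dot(x, x)          # simple strictly convex test objective
    grad(x) = 2 .* x          # its exact gradient

    # Check sufficient decrease and the curvature bound for step length a
    function wolfe_ok(x, dir; a = 1.0, c1 = 1e-4, c2 = 0.9)
        ϕ0, dϕ0 = f(x), dot(grad(x), dir)            # values at a = 0
        xa = x .+ a .* dir
        armijo = f(xa) <= ϕ0 + c1 * a * dϕ0          # Armijo condition
        curv = abs(dot(grad(xa), dir)) <= -c2 * dϕ0  # Strong Wolfe curvature
        return armijo && curv
    end

    x = [1.0, -2.0]
    wolfe_ok(x, -grad(x); a = 0.5)  # true: a = 1/2 is the exact line minimizer here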
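The `_interpolate` helper is the standard cubic step used between line-search brackets; on a quadratic ϕ it should recover the minimizer exactly, which makes a quick sanity check. A sketch under that assumption, where the free-standing `interpolate` below simply restates the patch's formula outside the kernel:

    # ϕ(a) = (a - 1)^2, bracketed by a_lo = 0 and a_hi = 2
    function interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi)
        d1 = dϕ_lo + dϕ_hi - 3 * (ϕ_lo - ϕ_hi) / (a_lo - a_hi)
        desc = d1 * d1 - dϕ_lo * dϕ_hi
        d2 = sqrt(max(desc, zero(desc)))   # clamped, as in the patch
        return ifelse(desc < 0, (a_lo + a_hi) / 2,
            a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)))
    end

    interpolate(0.0, 2.0, 1.0, 1.0, -2.0, 2.0)  # returns 1.0, the exact minimizer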
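The ring-buffer arithmetic in `_lbfgs_dir` (`mod1(k, M)` indexing over fixed-size tuples) exists so the kernel stays allocation-free on the GPU. The same two-loop recursion written against plain growing arrays, without the wraparound bookkeeping, may be easier to review; `lbfgs_direction` is an illustrative name for this sketch:

    using LinearAlgebra: dot

    # Two-loop recursion (Nocedal & Wright Alg. 7.4) over the full history;
    # S[i] = x_{i+1} - x_i, Y[i] = g_{i+1} - g_i, rho[i] = 1 / dot(S[i], Y[i])
    function lbfgs_direction(g, S, Y, rho)
        q, m = copy(g), length(S)
        a = zeros(m)
        for i in m:-1:1                  # first loop: newest to oldest
            a[i] = rho[i] * dot(S[i], q)
            q .-= a[i] .* Y[i]
        end
        γ = m == 0 ? 1.0 : dot(S[end], Y[end]) / dot(Y[end], Y[end])
        r = γ .* q                       # initial Hessian approximation H0 = γI
        for i in 1:m                     # second loop: oldest to newest
            β = rho[i] * dot(Y[i], r)
            r .+= (a[i] - β) .* S[i]
        end
        return -r                        # quasi-Newton descent direction
    end

With an empty history this reduces to -g, which matches the kernel's steepest-descent resets after a failed line search or rejected curvature pair.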
ε : xi, xc) g = grad_f(xc, p) - map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) + return map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) end # Cubic interpolation for line search; falls back to bisection if non-convex @@ -29,7 +29,7 @@ end desc = d1 * d1 - dϕ_lo * dϕ_hi # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff) d2 = sqrt(max(desc, zero(desc))) - ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) + return ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) end # Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6) @@ -39,10 +39,12 @@ end done = false for _ in 1:10 if !done - a_j = _interpolate(a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), + a_j = _interpolate( + a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir), - dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir)) - a_j = clamp(a_j, min(a_lo, a_hi) + T(1e-3), max(a_lo, a_hi) - T(1e-3)) + dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir) + ) + a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3)) xn_j = clamp.(x + a_j * dir, lb, ub) ϕ_j = _safe_eval(f, p, xn_j, lb, ub) if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo) @@ -53,7 +55,9 @@ end if abs(dϕ_j) <= -c2 * dϕ_0 xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true else - if dϕ_j * (a_hi - a_lo) >= zero(T); a_hi = a_lo; end + if dϕ_j * (a_hi - a_lo) >= zero(T) + a_hi = a_lo + end a_lo, ϕ_lo = a_j, ϕ_j end end @@ -63,13 +67,13 @@ end xn_lo = clamp.(x + a_lo * dir, lb, ub) xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub) end - (xn_out, ϕ_out, gn_out, ok_out) + return (xn_out, ϕ_out, gn_out, ok_out) end # Strong Wolfe line search (Nocedal & Wright Algorithm 3.5) @inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) T = eltype(x) - c1, c2 = T(1e-4), T(0.9) + c1, c2 = T(1.0e-4), T(0.9) dϕ_0 = dot(g, dir) xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false if dϕ_0 < zero(T) @@ -98,14 +102,14 @@ end xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true end end - (xn_out, ϕ_out, gn_out, ok_out) + return (xn_out, ϕ_out, gn_out, ok_out) end # L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) @inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M} T = eltype(g) q, a = g, ntuple(_ -> zero(T), Val(M)) - for j in 0:M-1 + for j in 0:(M - 1) idx = k - j if idx >= 1 ii = mod1(idx, M) @@ -115,17 +119,17 @@ end end kk = mod1(k, M) yy = sum(abs2, Y[kk]) - γ = ifelse(k >= 1 && yy > T(1e-30), dot(S[kk], Y[kk]) / yy, one(T)) + γ = ifelse(k >= 1 && yy > T(1.0e-30), dot(S[kk], Y[kk]) / yy, one(T)) γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T)) r = γ * q - for j in M-1:-1:0 + for j in (M - 1):-1:0 idx = k - j if idx >= 1 ii = mod1(idx, M) r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii] end end - -r + return -r end # Run L-BFGS independently on each starting point @@ -140,15 +144,19 @@ end g = _safe_grad(grad_f, p, x, lb, ub) k, active = 0, isfinite(fx) && all(isfinite, g) for _ in 1:maxiters - if active && norm(g) >= T(1e-7) + if active && norm(g) >= T(1.0e-7) dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k) - if dot(g, dir) >= zero(T); dir, k = -g, 0; end + if dot(g, dir) >= zero(T) + dir, k = -g, 0 + end xn, 
fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) - if !ok; xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0; end + if !ok + xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0 + end if ok && isfinite(fn) && all(isfinite, gn) s, y = xn - x, gn - g sy = dot(s, y) - if isfinite(one(T) / sy) && sy > T(1e-10) + if isfinite(one(T) / sy) && sy > T(1.0e-10) k += 1; ii = mod1(k, M) S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii) else @@ -219,8 +227,8 @@ function SciMLBase.solve!( solve_time = (time() - t0) + sol_pso.stats.time - SciMLBase.build_solution( + return SciMLBase.build_solution( SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj; stats = Optimization.OptimizationStats(; time = solve_time) ) -end \ No newline at end of file +end From c7a60f6e8c60a1d9a1fbffc59d3973880fdc285a Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Thu, 2 Apr 2026 18:15:29 +0530 Subject: [PATCH 3/5] remove scalar indexing Signed-off-by: AdityaPandeyCN --- src/hybrid.jl | 106 +++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 57 deletions(-) diff --git a/src/hybrid.jl b/src/hybrid.jl index ca1b4eb..3cb82e7 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -6,21 +6,21 @@ using Optimization # Clamp x to interior of bounds and evaluate objective @inline function _safe_eval(f, p, x, lb, ub) T = eltype(x) - ε = T(1.0e-6) + ε = T(1e-6) xc = clamp.(x, lb .+ ε, ub .- ε) xc = map(xi -> abs(xi) < ε ? ε : xi, xc) v = f(xc, p) - return ifelse(isfinite(v), v, T(Inf)) + ifelse(isfinite(v), v, T(Inf)) end # Clamp x to interior of bounds and evaluate gradient @inline function _safe_grad(grad_f, p, x, lb, ub) T = eltype(x) - ε = T(1.0e-6) + ε = T(1e-6) xc = clamp.(x, lb .+ ε, ub .- ε) xc = map(xi -> abs(xi) < ε ? 
ε : xi, xc) g = grad_f(xc, p) - return map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) + map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) end # Cubic interpolation for line search; falls back to bisection if non-convex @@ -29,7 +29,7 @@ end desc = d1 * d1 - dϕ_lo * dϕ_hi # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff) d2 = sqrt(max(desc, zero(desc))) - return ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) + ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) end # Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6) @@ -39,12 +39,10 @@ end done = false for _ in 1:10 if !done - a_j = _interpolate( - a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), + a_j = _interpolate(a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir), - dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir) - ) - a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3)) + dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir)) + a_j = clamp(a_j, min(a_lo, a_hi) + T(1e-3), max(a_lo, a_hi) - T(1e-3)) xn_j = clamp.(x + a_j * dir, lb, ub) ϕ_j = _safe_eval(f, p, xn_j, lb, ub) if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo) @@ -55,9 +53,7 @@ end if abs(dϕ_j) <= -c2 * dϕ_0 xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true else - if dϕ_j * (a_hi - a_lo) >= zero(T) - a_hi = a_lo - end + if dϕ_j * (a_hi - a_lo) >= zero(T); a_hi = a_lo; end a_lo, ϕ_lo = a_j, ϕ_j end end @@ -67,16 +63,16 @@ end xn_lo = clamp.(x + a_lo * dir, lb, ub) xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub) end - return (xn_out, ϕ_out, gn_out, ok_out) + (xn_out, ϕ_out, gn_out, ok_out) end # Strong Wolfe line search (Nocedal & Wright Algorithm 3.5) @inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) T = eltype(x) - c1, c2 = T(1.0e-4), T(0.9) + c1, c2 = T(1e-4), T(0.9) dϕ_0 = dot(g, dir) xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false - if dϕ_0 < zero(T) + if dϕ_0 < zero(T) # valid descent direction a_prev, a_i, ϕ_0, ϕ_prev, done = zero(T), one(T), fx, fx, false for i in 1:10 if !done @@ -102,14 +98,15 @@ end xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true end end - return (xn_out, ϕ_out, gn_out, ok_out) + (xn_out, ϕ_out, gn_out, ok_out) end # L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) @inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M} T = eltype(g) q, a = g, ntuple(_ -> zero(T), Val(M)) - for j in 0:(M - 1) + # First loop: newest to oldest + for j in 0:M-1 idx = k - j if idx >= 1 ii = mod1(idx, M) @@ -117,23 +114,26 @@ end q = q - a[ii] * Y[ii] end end + # Compute scaling factor γ kk = mod1(k, M) yy = sum(abs2, Y[kk]) - γ = ifelse(k >= 1 && yy > T(1.0e-30), dot(S[kk], Y[kk]) / yy, one(T)) + γ = ifelse(k >= 1 && yy > T(1e-30), dot(S[kk], Y[kk]) / yy, one(T)) γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T)) r = γ * q - for j in (M - 1):-1:0 + # Second loop: oldest to newest + for j in M-1:-1:0 idx = k - j if idx >= 1 ii = mod1(idx, M) r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii] end end - return -r + -r end # Run L-BFGS independently on each starting point -@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, lb, ub, maxiters, ::Val{M}) 
where {M} +# History buffers stored as tuples for GPU compatibility +@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, result_fx, lb, ub, maxiters, ::Val{M}) where {M} i = @index(Global, Linear) x = clamp.(x0s[i], lb, ub) T = eltype(x) @@ -144,19 +144,18 @@ end g = _safe_grad(grad_f, p, x, lb, ub) k, active = 0, isfinite(fx) && all(isfinite, g) for _ in 1:maxiters - if active && norm(g) >= T(1.0e-7) + if active && norm(g) >= T(1e-7) dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k) - if dot(g, dir) >= zero(T) - dir, k = -g, 0 - end + # Reset to steepest descent if direction is not descent + if dot(g, dir) >= zero(T); dir, k = -g, 0; end xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) - if !ok - xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0 - end + # Fall back to steepest descent if line search fails + if !ok; xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0; end if ok && isfinite(fn) && all(isfinite, gn) s, y = xn - x, gn - g sy = dot(s, y) - if isfinite(one(T) / sy) && sy > T(1.0e-10) + # Update history only if curvature condition holds + if isfinite(one(T) / sy) && sy > T(1e-10) k += 1; ii = mod1(k, M) S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii) else @@ -169,13 +168,14 @@ end end end @inbounds result[i] = x + @inbounds result_fx[i] = fx end # Main solve: runs PSO for global exploration, then L-BFGS for local refinement function SciMLBase.solve!( cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...; abstol = nothing, reltol = nothing, maxiters = 100, - local_maxiters = 50, n_starts = 20, kwargs... + local_maxiters = 50, kwargs... ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}} # Phase 1: Global search with PSO @@ -189,46 +189,38 @@ function SciMLBase.solve!( best_u = sol_pso.u best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[] - _obj(x) = let v = prob.f(clamp.(x, lb, ub), p) - (isnan(v) || isinf(v)) ? convert(eltype(best_obj), Inf) : v - end - - # Build candidate pool: inject global best into particle positions - x0s = copy(sol_pso.original) - x0s[1] = best_u + # Run L-BFGS on all particles + x0s = sol_pso.original + n = length(x0s) - costs = map(_obj, x0s) - n = min(n_starts, length(x0s)) - pool = [x0s[j] for j in partialsortperm(Vector(costs), 1:n)] - - D = length(first(pool)) + D = length(prob.u0) m_val = D > 20 ? 
Val(5) : Val(10) grad_f = instantiate_gradient(f_raw, prob.f.adtype) t0 = time() - result = similar(pool) - copyto!(result, pool) + result = similar(x0s) + copyto!(result, x0s) + result_fx = KernelAbstractions.allocate(opt.backend, typeof(best_obj), n) - # Phase 2: Local refinement with L-BFGS on top candidates + # Phase 2: Local refinement with L-BFGS on all candidates kernel = lbfgs_kernel!(opt.backend) - kernel(f_raw, grad_f, p, pool, result, lb, ub, local_maxiters, m_val; ndrange = n) + kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n) KernelAbstractions.synchronize(opt.backend) - # Select best result across all refined candidates - for j in 1:n - r = clamp.(result[j], lb, ub) - fval = _obj(r) - if fval < best_obj - best_obj = fval - best_u = r - end + # Find best result + minobj, ind = findmin(result_fx) + + # Single bulk transfer for winning solution only + if minobj < best_obj + best_obj = minobj + best_u = Array(view(result, ind:ind))[1] end solve_time = (time() - t0) + sol_pso.stats.time - return SciMLBase.build_solution( + SciMLBase.build_solution( SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj; stats = Optimization.OptimizationStats(; time = solve_time) ) -end +end \ No newline at end of file From 1d3c3b146bef457c8a0ef9764aef778065772706 Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Thu, 2 Apr 2026 18:18:05 +0530 Subject: [PATCH 4/5] formatting Signed-off-by: AdityaPandeyCN --- src/hybrid.jl | 54 +++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/src/hybrid.jl b/src/hybrid.jl index 3cb82e7..945306f 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -6,21 +6,21 @@ using Optimization # Clamp x to interior of bounds and evaluate objective @inline function _safe_eval(f, p, x, lb, ub) T = eltype(x) - ε = T(1e-6) + ε = T(1.0e-6) xc = clamp.(x, lb .+ ε, ub .- ε) xc = map(xi -> abs(xi) < ε ? ε : xi, xc) v = f(xc, p) - ifelse(isfinite(v), v, T(Inf)) + return ifelse(isfinite(v), v, T(Inf)) end # Clamp x to interior of bounds and evaluate gradient @inline function _safe_grad(grad_f, p, x, lb, ub) T = eltype(x) - ε = T(1e-6) + ε = T(1.0e-6) xc = clamp.(x, lb .+ ε, ub .- ε) xc = map(xi -> abs(xi) < ε ? 
ε : xi, xc) g = grad_f(xc, p) - map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) + return map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) end # Cubic interpolation for line search; falls back to bisection if non-convex @@ -29,7 +29,7 @@ end desc = d1 * d1 - dϕ_lo * dϕ_hi # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff) d2 = sqrt(max(desc, zero(desc))) - ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) + return ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) end # Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6) @@ -39,10 +39,12 @@ end done = false for _ in 1:10 if !done - a_j = _interpolate(a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), + a_j = _interpolate( + a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir), - dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir)) - a_j = clamp(a_j, min(a_lo, a_hi) + T(1e-3), max(a_lo, a_hi) - T(1e-3)) + dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir) + ) + a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3)) xn_j = clamp.(x + a_j * dir, lb, ub) ϕ_j = _safe_eval(f, p, xn_j, lb, ub) if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo) @@ -53,7 +55,9 @@ end if abs(dϕ_j) <= -c2 * dϕ_0 xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true else - if dϕ_j * (a_hi - a_lo) >= zero(T); a_hi = a_lo; end + if dϕ_j * (a_hi - a_lo) >= zero(T) + a_hi = a_lo + end a_lo, ϕ_lo = a_j, ϕ_j end end @@ -63,13 +67,13 @@ end xn_lo = clamp.(x + a_lo * dir, lb, ub) xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub) end - (xn_out, ϕ_out, gn_out, ok_out) + return (xn_out, ϕ_out, gn_out, ok_out) end # Strong Wolfe line search (Nocedal & Wright Algorithm 3.5) @inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) T = eltype(x) - c1, c2 = T(1e-4), T(0.9) + c1, c2 = T(1.0e-4), T(0.9) dϕ_0 = dot(g, dir) xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false if dϕ_0 < zero(T) # valid descent direction @@ -98,7 +102,7 @@ end xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true end end - (xn_out, ϕ_out, gn_out, ok_out) + return (xn_out, ϕ_out, gn_out, ok_out) end # L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) @@ -106,7 +110,7 @@ end T = eltype(g) q, a = g, ntuple(_ -> zero(T), Val(M)) # First loop: newest to oldest - for j in 0:M-1 + for j in 0:(M - 1) idx = k - j if idx >= 1 ii = mod1(idx, M) @@ -117,18 +121,18 @@ end # Compute scaling factor γ kk = mod1(k, M) yy = sum(abs2, Y[kk]) - γ = ifelse(k >= 1 && yy > T(1e-30), dot(S[kk], Y[kk]) / yy, one(T)) + γ = ifelse(k >= 1 && yy > T(1.0e-30), dot(S[kk], Y[kk]) / yy, one(T)) γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T)) r = γ * q # Second loop: oldest to newest - for j in M-1:-1:0 + for j in (M - 1):-1:0 idx = k - j if idx >= 1 ii = mod1(idx, M) r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii] end end - -r + return -r end # Run L-BFGS independently on each starting point @@ -144,18 +148,22 @@ end g = _safe_grad(grad_f, p, x, lb, ub) k, active = 0, isfinite(fx) && all(isfinite, g) for _ in 1:maxiters - if active && norm(g) >= T(1e-7) + if active && norm(g) >= T(1.0e-7) dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k) # Reset to steepest descent if 
direction is not descent - if dot(g, dir) >= zero(T); dir, k = -g, 0; end + if dot(g, dir) >= zero(T) + dir, k = -g, 0 + end xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) # Fall back to steepest descent if line search fails - if !ok; xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0; end + if !ok + xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0 + end if ok && isfinite(fn) && all(isfinite, gn) s, y = xn - x, gn - g sy = dot(s, y) # Update history only if curvature condition holds - if isfinite(one(T) / sy) && sy > T(1e-10) + if isfinite(one(T) / sy) && sy > T(1.0e-10) k += 1; ii = mod1(k, M) S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii) else @@ -208,7 +216,7 @@ function SciMLBase.solve!( kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n) KernelAbstractions.synchronize(opt.backend) - # Find best result + # Find best result minobj, ind = findmin(result_fx) # Single bulk transfer for winning solution only @@ -219,8 +227,8 @@ function SciMLBase.solve!( solve_time = (time() - t0) + sol_pso.stats.time - SciMLBase.build_solution( + return SciMLBase.build_solution( SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj; stats = Optimization.OptimizationStats(; time = solve_time) ) -end \ No newline at end of file +end From b6b3443674685480c656f02f7f7cdb8977d9f0d9 Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Thu, 2 Apr 2026 22:12:57 +0530 Subject: [PATCH 5/5] add comments Signed-off-by: AdityaPandeyCN --- Project.toml | 5 ++- src/hybrid.jl | 115 +++++++++++++++++++++----------------------------- 2 files changed, 51 insertions(+), 69 deletions(-) diff --git a/Project.toml b/Project.toml index 05e182e..7c97061 100644 --- a/Project.toml +++ b/Project.toml @@ -1,20 +1,21 @@ name = "ParallelParticleSwarms" uuid = "ab63da0c-63b4-40fa-a3b7-d2cba5be6419" -version = "1.0.0" authors = ["Utkarsh and contributors"] +version = "1.0.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +Runic = "62bfec6d-59d7-401d-8490-b29ee721c001" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46" SimpleNonlinearSolve = "727e6d20-b764-4bd8-a329-72de5adea6c7" diff --git a/src/hybrid.jl b/src/hybrid.jl index 945306f..8ba398c 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -3,69 +3,64 @@ using KernelAbstractions using SciMLBase using Optimization -# Clamp x to interior of bounds and evaluate objective -@inline function _safe_eval(f, p, x, lb, ub) - T = eltype(x) - ε = T(1.0e-6) - xc = clamp.(x, lb .+ ε, ub .- ε) - xc = map(xi -> abs(xi) < ε ? 
ε : xi, xc) - v = f(xc, p) - return ifelse(isfinite(v), v, T(Inf)) +# Clamp to bounds and evaluate; NaN firewall prevents poison through interpolation +@inline _eval(f, p, x, lb, ub) = +let v = f(clamp.(x, lb, ub), p) + ifelse(isfinite(v), v, eltype(x)(Inf)) end -# Clamp x to interior of bounds and evaluate gradient -@inline function _safe_grad(grad_f, p, x, lb, ub) - T = eltype(x) - ε = T(1.0e-6) - xc = clamp.(x, lb .+ ε, ub .- ε) - xc = map(xi -> abs(xi) < ε ? ε : xi, xc) - g = grad_f(xc, p) - return map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) -end +# Clamp to bounds and evaluate gradient; zero out non-finite components +@inline _grad(grad_f, p, x, lb, ub) = + map(gi -> ifelse(isfinite(gi), gi, zero(gi)), grad_f(clamp.(x, lb, ub), p)) -# Cubic interpolation for line search; falls back to bisection if non-convex +# Cubic interpolation with bisection fallback @inline function _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi) d1 = dϕ_lo + dϕ_hi - 3 * (ϕ_lo - ϕ_hi) / (a_lo - a_hi) desc = d1 * d1 - dϕ_lo * dϕ_hi - # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff) d2 = sqrt(max(desc, zero(desc))) - return ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) + return ifelse( + desc < 0, (a_lo + a_hi) / 2, + a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)) + ) end # Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6) @inline function _zoom(f, grad_f, p, x, dir, lb, ub, a_lo, a_hi, ϕ_0, dϕ_0, ϕ_lo, c1, c2) T = eltype(x) - xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, _safe_grad(grad_f, p, x, lb, ub), false + xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, zero(typeof(x)), false + + # Cache endpoint values; updated only when endpoints shift + ϕ_hi = _eval(f, p, x + a_hi * dir, lb, ub) + dϕ_lo = dot(_grad(grad_f, p, x + a_lo * dir, lb, ub), dir) + dϕ_hi = dot(_grad(grad_f, p, x + a_hi * dir, lb, ub), dir) + done = false for _ in 1:10 if !done - a_j = _interpolate( - a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), - dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir), - dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir) - ) + a_j = _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi) a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3)) xn_j = clamp.(x + a_j * dir, lb, ub) - ϕ_j = _safe_eval(f, p, xn_j, lb, ub) + ϕ_j = _eval(f, p, xn_j, lb, ub) if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo) - a_hi = a_j + a_hi, ϕ_hi = a_j, ϕ_j + dϕ_hi = dot(_grad(grad_f, p, xn_j, lb, ub), dir) else - gn_j = _safe_grad(grad_f, p, xn_j, lb, ub) + gn_j = _grad(grad_f, p, xn_j, lb, ub) dϕ_j = dot(gn_j, dir) if abs(dϕ_j) <= -c2 * dϕ_0 xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true else if dϕ_j * (a_hi - a_lo) >= zero(T) - a_hi = a_lo + a_hi, ϕ_hi, dϕ_hi = a_lo, ϕ_lo, dϕ_lo end - a_lo, ϕ_lo = a_j, ϕ_j + a_lo, ϕ_lo, dϕ_lo = a_j, ϕ_j, dϕ_j end end end end if !done - xn_lo = clamp.(x + a_lo * dir, lb, ub) - xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub) + xn = clamp.(x + a_lo * dir, lb, ub) + xn_out, ϕ_out, gn_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub) end return (xn_out, ϕ_out, gn_out, ok_out) end @@ -81,25 +76,25 @@ end for i in 1:10 if !done xn = clamp.(x + a_i * dir, lb, ub) - ϕ_i = _safe_eval(f, p, xn, lb, ub) + ϕ_i = _eval(f, p, xn, lb, ub) if (ϕ_i > ϕ_0 + c1 * a_i * dϕ_0) || (ϕ_i >= ϕ_prev && i > 1) xn_out, ϕ_out, 
gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_prev, a_i, ϕ_0, dϕ_0, ϕ_prev, c1, c2)..., true else - gn_i = _safe_grad(grad_f, p, xn, lb, ub) + gn_i = _grad(grad_f, p, xn, lb, ub) dϕ_i = dot(gn_i, dir) if abs(dϕ_i) <= -c2 * dϕ_0 xn_out, ϕ_out, gn_out, ok_out, done = xn, ϕ_i, gn_i, true, true elseif dϕ_i >= zero(T) xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_i, a_prev, ϕ_0, dϕ_0, ϕ_i, c1, c2)..., true else - a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2.0) + a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2) end end end end if !done xn = clamp.(x + a_prev * dir, lb, ub) - xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true + xn_out, ϕ_out, gn_out, ok_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub), true end end return (xn_out, ϕ_out, gn_out, ok_out) @@ -109,7 +104,6 @@ end @inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M} T = eltype(g) q, a = g, ntuple(_ -> zero(T), Val(M)) - # First loop: newest to oldest for j in 0:(M - 1) idx = k - j if idx >= 1 @@ -118,13 +112,11 @@ end q = q - a[ii] * Y[ii] end end - # Compute scaling factor γ kk = mod1(k, M) - yy = sum(abs2, Y[kk]) - γ = ifelse(k >= 1 && yy > T(1.0e-30), dot(S[kk], Y[kk]) / yy, one(T)) - γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T)) + sy, yy = dot(S[kk], Y[kk]), sum(abs2, Y[kk]) + γ = sy / yy + γ = ifelse(k >= 1 && yy > T(1.0e-30) && isfinite(γ) && γ > zero(T), γ, one(T)) r = γ * q - # Second loop: oldest to newest for j in (M - 1):-1:0 idx = k - j if idx >= 1 @@ -135,8 +127,7 @@ end return -r end -# Run L-BFGS independently on each starting point -# History buffers stored as tuples for GPU compatibility +# Run L-BFGS independently per starting point; tuples for GPU-compatible history buffers @kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, result_fx, lb, ub, maxiters, ::Val{M}) where {M} i = @index(Global, Linear) x = clamp.(x0s[i], lb, ub) @@ -144,28 +135,28 @@ end z = zero(typeof(x)) S, Y = ntuple(_ -> z, Val(M)), ntuple(_ -> z, Val(M)) Rho = ntuple(_ -> zero(T), Val(M)) - fx = _safe_eval(f, p, x, lb, ub) - g = _safe_grad(grad_f, p, x, lb, ub) + fx = _eval(f, p, x, lb, ub) + g = _grad(grad_f, p, x, lb, ub) k, active = 0, isfinite(fx) && all(isfinite, g) for _ in 1:maxiters if active && norm(g) >= T(1.0e-7) dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k) - # Reset to steepest descent if direction is not descent - if dot(g, dir) >= zero(T) + if dot(g, dir) >= zero(T) # reset to steepest descent dir, k = -g, 0 end xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) - # Fall back to steepest descent if line search fails - if !ok - xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0 + if !ok # fall back to steepest descent + xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub) + k = 0 end if ok && isfinite(fn) && all(isfinite, gn) s, y = xn - x, gn - g sy = dot(s, y) - # Update history only if curvature condition holds - if isfinite(one(T) / sy) && sy > T(1.0e-10) + if isfinite(one(T) / sy) && sy > T(1.0e-10) # curvature condition k += 1; ii = mod1(k, M) - S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii) + S = Base.setindex(S, s, ii) + Y = Base.setindex(Y, y, ii) + Rho = Base.setindex(Rho, one(T) / sy, ii) else k = 0 end @@ -179,14 +170,13 @@ end @inbounds result_fx[i] = fx end -# Main solve: runs PSO for global exploration, then L-BFGS for local refinement +# PSO for global exploration, then L-BFGS for local 
refinement on all particles function SciMLBase.solve!( cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...; abstol = nothing, reltol = nothing, maxiters = 100, local_maxiters = 50, kwargs... ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}} - # Phase 1: Global search with PSO sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters) prob = cache.prob @@ -197,36 +187,27 @@ function SciMLBase.solve!( best_u = sol_pso.u best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[] - # Run L-BFGS on all particles x0s = sol_pso.original n = length(x0s) - - D = length(prob.u0) - m_val = D > 20 ? Val(5) : Val(10) + m_val = length(prob.u0) > 20 ? Val(5) : Val(10) grad_f = instantiate_gradient(f_raw, prob.f.adtype) t0 = time() result = similar(x0s) - copyto!(result, x0s) result_fx = KernelAbstractions.allocate(opt.backend, typeof(best_obj), n) - # Phase 2: Local refinement with L-BFGS on all candidates kernel = lbfgs_kernel!(opt.backend) kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n) KernelAbstractions.synchronize(opt.backend) - # Find best result minobj, ind = findmin(result_fx) - - # Single bulk transfer for winning solution only if minobj < best_obj best_obj = minobj best_u = Array(view(result, ind:ind))[1] end solve_time = (time() - t0) + sol_pso.stats.time - return SciMLBase.build_solution( SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj; stats = Optimization.OptimizationStats(; time = solve_time)
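For an end-to-end smoke test of the series, something along these lines should exercise the PSO phase followed by the kernel line search. The `OptimizationProblem` setup uses the documented SciML API; the `HybridPSO` constructor call is left commented out because its argument names are an assumption here, not shown in the diff:

    using Optimization, SciMLBase, StaticArrays
    using KernelAbstractions: CPU

    # 2-D Rosenbrock; SVectors keep the GPU kernels allocation-free
    rosen(x, p) = (p[1] - x[1])^2 + p[2] * (x[2] - x[1]^2)^2

    x0 = SVector(0.0, 0.0)
    p = SVector(1.0, 100.0)
    optf = OptimizationFunction(rosen, Optimization.AutoForwardDiff())
    prob = OptimizationProblem(optf, x0, p;
        lb = SVector(-5.0, -5.0), ub = SVector(5.0, 5.0))

    # Assumed constructor, for illustration only:
    # opt = HybridPSO(; backend = CPU(), local_opt = LBFGS())
    # sol = solve(prob, opt; maxiters = 100, local_maxiters = 50)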