Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
name = "ParallelParticleSwarms"
uuid = "ab63da0c-63b4-40fa-a3b7-d2cba5be6419"
version = "1.0.0"
authors = ["Utkarsh <rajpututkarsh530@gmail.com> and contributors"]
version = "1.0.0"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Runic = "62bfec6d-59d7-401d-8490-b29ee721c001"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
SimpleNonlinearSolve = "727e6d20-b764-4bd8-a329-72de5adea6c7"
Expand All @@ -39,7 +41,6 @@ julia = "1.10"

[extras]
JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

Expand Down
246 changes: 196 additions & 50 deletions src/hybrid.jl
Original file line number Diff line number Diff line change
@@ -1,69 +1,215 @@
@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol)
using LinearAlgebra: norm, dot
using KernelAbstractions
using SciMLBase
using Optimization

# Evaluate the objective at `x` projected into the box [lb, ub].
# Non-finite objective values (NaN / overflow) are replaced by +Inf so they
# cannot poison later interpolation steps ("NaN firewall").
@inline function _eval(f, p, x, lb, ub)
    value = f(clamp.(x, lb, ub), p)
    return isfinite(value) ? value : eltype(x)(Inf)
end

# Evaluate the gradient at `x` projected into the box [lb, ub].
# Any non-finite component is zeroed so a single bad partial derivative does
# not derail the whole search direction.
@inline function _grad(grad_f, p, x, lb, ub)
    g = grad_f(clamp.(x, lb, ub), p)
    return map(component -> isfinite(component) ? component : zero(component), g)
end

# Cubic interpolation between two bracket endpoints, with bisection fallback.
# Finds the minimizer of the cubic through (a_lo, ϕ_lo) and (a_hi, ϕ_hi) with
# matching slopes dϕ_lo / dϕ_hi (Nocedal & Wright, eq. 3.59).
@inline function _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi)
    d1 = dϕ_lo + dϕ_hi - 3 * (ϕ_lo - ϕ_hi) / (a_lo - a_hi)
    discriminant = d1 * d1 - dϕ_lo * dϕ_hi
    if discriminant < 0
        # No real cubic minimizer: fall back to bisecting the bracket.
        return (a_lo + a_hi) / 2
    end
    d2 = sqrt(discriminant)
    return a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))
end

# Zoom phase of the Strong Wolfe line search (Nocedal & Wright Algorithm 3.6).
# Shrinks the bracket [a_lo, a_hi] — known to contain a step satisfying the
# strong Wolfe conditions — until an acceptable step is found or the fixed
# iteration budget runs out.
#
# Arguments: objective `f` and gradient `grad_f` with parameters `p`, current
# iterate `x`, search direction `dir`, box bounds `lb`/`ub`, bracket edges
# `a_lo`/`a_hi`, value `ϕ_0` and directional derivative `dϕ_0` at step 0,
# value `ϕ_lo` at `a_lo`, and Wolfe constants `c1`/`c2`.
# Returns `(x_new, f_new, g_new, success)`.
#
# NOTE(review): uses a `done` flag instead of `break` so every lane in a GPU
# kernel executes the same fixed-trip-count loop structure.
@inline function _zoom(f, grad_f, p, x, dir, lb, ub, a_lo, a_hi, ϕ_0, dϕ_0, ϕ_lo, c1, c2)
    T = eltype(x)
    xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, zero(typeof(x)), false

    # Cache endpoint values; updated only when endpoints shift
    ϕ_hi = _eval(f, p, x + a_hi * dir, lb, ub)
    dϕ_lo = dot(_grad(grad_f, p, x + a_lo * dir, lb, ub), dir)
    dϕ_hi = dot(_grad(grad_f, p, x + a_hi * dir, lb, ub), dir)

    done = false
    for _ in 1:10
        if !done
            # Candidate step from cubic interpolation, nudged off the bracket
            # endpoints so the bracket strictly shrinks each iteration.
            a_j = _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi)
            a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3))
            xn_j = clamp.(x + a_j * dir, lb, ub)
            ϕ_j = _eval(f, p, xn_j, lb, ub)
            if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo)
                # Sufficient decrease failed: move the high end down to a_j.
                a_hi, ϕ_hi = a_j, ϕ_j
                dϕ_hi = dot(_grad(grad_f, p, xn_j, lb, ub), dir)
            else
                gn_j = _grad(grad_f, p, xn_j, lb, ub)
                dϕ_j = dot(gn_j, dir)
                if abs(dϕ_j) <= -c2 * dϕ_0
                    # Curvature condition also holds: accept the step.
                    xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true
                else
                    if dϕ_j * (a_hi - a_lo) >= zero(T)
                        # Minimizer lies on the other side: old low end becomes high end.
                        a_hi, ϕ_hi, dϕ_hi = a_lo, ϕ_lo, dϕ_lo
                    end
                    a_lo, ϕ_lo, dϕ_lo = a_j, ϕ_j, dϕ_j
                end
            end
        end
    end
    if !done
        # Budget exhausted: return the best bracketed low point, flagged not-ok.
        xn = clamp.(x + a_lo * dir, lb, ub)
        xn_out, ϕ_out, gn_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub)
    end
    return (xn_out, ϕ_out, gn_out, ok_out)
end

# Strong Wolfe line search (Nocedal & Wright Algorithm 3.5).
# Expands a trial step along `dir` until a bracket containing a strong-Wolfe
# point is found, then delegates to `_zoom`. Returns
# `(x_new, f_new, g_new, success)`; `success` is false when `dir` is not a
# descent direction (dϕ_0 >= 0), in which case the inputs are returned as-is.
@inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
    T = eltype(x)
    c1, c2 = T(1.0e-4), T(0.9)
    dϕ_0 = dot(g, dir)
    xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false
    if dϕ_0 < zero(T) # valid descent direction
        a_prev, a_i, ϕ_0, ϕ_prev, done = zero(T), one(T), fx, fx, false
        # `done` flag instead of `break`: keeps the fixed-trip-count loop
        # structure uniform across GPU lanes.
        for i in 1:10
            if !done
                xn = clamp.(x + a_i * dir, lb, ub)
                ϕ_i = _eval(f, p, xn, lb, ub)
                if (ϕ_i > ϕ_0 + c1 * a_i * dϕ_0) || (ϕ_i >= ϕ_prev && i > 1)
                    # Sufficient decrease violated: minimizer bracketed in [a_prev, a_i].
                    xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_prev, a_i, ϕ_0, dϕ_0, ϕ_prev, c1, c2)..., true
                else
                    gn_i = _grad(grad_f, p, xn, lb, ub)
                    dϕ_i = dot(gn_i, dir)
                    if abs(dϕ_i) <= -c2 * dϕ_0
                        # Both Wolfe conditions hold at a_i: accept.
                        xn_out, ϕ_out, gn_out, ok_out, done = xn, ϕ_i, gn_i, true, true
                    elseif dϕ_i >= zero(T)
                        # Slope turned non-negative: minimizer bracketed in [a_i, a_prev].
                        xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_i, a_prev, ϕ_0, dϕ_0, ϕ_i, c1, c2)..., true
                    else
                        # Still descending steeply: double the trial step.
                        a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2)
                    end
                end
            end
        end
        if !done
            # Budget exhausted without a decision: take the last accepted step.
            xn = clamp.(x + a_prev * dir, lb, ub)
            xn_out, ϕ_out, gn_out, ok_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub), true
        end
    end
    return (xn_out, ϕ_out, gn_out, ok_out)
end

# L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4).
# `S`/`Y` are circular tuple buffers holding the last `M` step and
# gradient-difference pairs, `Rho` the matching 1/(sᵀy) values, and `k` the
# number of curvature updates stored so far. Returns the quasi-Newton search
# direction -H_k·g (steepest descent -g when no history is usable).
@inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M}
    T = eltype(g)
    q = g
    α = ntuple(_ -> zero(T), Val(M))
    # First loop: walk stored pairs from newest to oldest.
    for offset in 0:(M - 1)
        step = k - offset
        if step >= 1
            slot = mod1(step, M)
            α = Base.setindex(α, Rho[slot] * dot(S[slot], q), slot)
            q = q - α[slot] * Y[slot]
        end
    end
    # Initial Hessian scaling γ = sᵀy / yᵀy from the newest pair; fall back to
    # 1 when there is no history or the pair is numerically unusable.
    newest = mod1(k, M)
    sy = dot(S[newest], Y[newest])
    yy = sum(abs2, Y[newest])
    γ = sy / yy
    usable = k >= 1 && yy > T(1.0e-30) && isfinite(γ) && γ > zero(T)
    γ = ifelse(usable, γ, one(T))
    r = γ * q
    # Second loop: walk stored pairs from oldest to newest.
    for offset in (M - 1):-1:0
        step = k - offset
        if step >= 1
            slot = mod1(step, M)
            r = r + (α[slot] - Rho[slot] * dot(Y[slot], r)) * S[slot]
        end
    end
    return -r
end

# Run bounded L-BFGS independently per starting point, one GPU thread each.
# Tuples (`S`, `Y`, `Rho`) serve as fixed-size, register-resident history
# buffers for the last `M` curvature pairs — GPU-compatible, no heap arrays.
# Writes the refined point into `result[i]` and its objective into
# `result_fx[i]`.
#
# FIX(review): removed stale diff leftovers from the old `simplebfgs_run!`
# kernel (`remake(nlprob; ...)` / `solve(nlcache, opt; ...)`) that referenced
# names not in this kernel's signature and clobbered `result[i]`.
@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, result_fx, lb, ub, maxiters, ::Val{M}) where {M}
    i = @index(Global, Linear)
    # Start from this thread's PSO particle, projected into the bounds.
    x = clamp.(x0s[i], lb, ub)
    T = eltype(x)
    z = zero(typeof(x))
    S, Y = ntuple(_ -> z, Val(M)), ntuple(_ -> z, Val(M))
    Rho = ntuple(_ -> zero(T), Val(M))
    fx = _eval(f, p, x, lb, ub)
    g = _grad(grad_f, p, x, lb, ub)
    # `active` goes false on line-search failure or non-finite values; the
    # loop keeps its fixed trip count (uniform across lanes) but becomes a no-op.
    k, active = 0, isfinite(fx) && all(isfinite, g)
    for _ in 1:maxiters
        if active && norm(g) >= T(1.0e-7) # gradient-norm convergence test
            dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k)
            if dot(g, dir) >= zero(T) # not a descent direction: reset to steepest descent
                dir, k = -g, 0
            end
            xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
            if !ok # line search failed: retry along steepest descent
                xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub)
                k = 0
            end
            if ok && isfinite(fn) && all(isfinite, gn)
                s, y = xn - x, gn - g
                sy = dot(s, y)
                if isfinite(one(T) / sy) && sy > T(1.0e-10) # curvature condition: store the pair
                    k += 1; ii = mod1(k, M)
                    S = Base.setindex(S, s, ii)
                    Y = Base.setindex(Y, y, ii)
                    Rho = Base.setindex(Rho, one(T) / sy, ii)
                else
                    k = 0 # discard history after a non-curvature step
                end
                x, g, fx = xn, gn, fn
            else
                active = false
            end
        end
    end
    @inbounds result[i] = x
    @inbounds result_fx[i] = fx
end

# PSO for global exploration, then per-particle L-BFGS local refinement.
#
# FIX(review): this span was a diff artifact with the old and new versions of
# the method interleaved (two signatures, two kernel dispatches, leftover
# `nlprob`/`simplebfgs_run!` plumbing, commented-out debug line). Reconstructed
# as the single new implementation.
#
# Keyword arguments: `abstol`/`reltol` are accepted for interface
# compatibility but unused by the kernel; `maxiters` bounds the PSO phase and
# `local_maxiters` the per-particle L-BFGS phase.
function SciMLBase.solve!(
        cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...;
        abstol = nothing, reltol = nothing, maxiters = 100,
        local_maxiters = 50, kwargs...
    ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}}

    # Global phase: particle swarm over the full search space.
    sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters)

    prob = cache.prob
    f_raw, p = prob.f.f, prob.p
    # Unbounded problems fall back to infinite box bounds so the kernels can
    # clamp unconditionally.
    lb = prob.lb === nothing ? convert.(eltype(prob.u0), -Inf) : prob.lb
    ub = prob.ub === nothing ? convert.(eltype(prob.u0), Inf) : prob.ub

    best_u = sol_pso.u
    best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[]

    x0s = sol_pso.original # final position of every particle
    n = length(x0s)
    # Smaller L-BFGS history in high dimension to limit per-thread register use.
    m_val = length(prob.u0) > 20 ? Val(5) : Val(10)

    grad_f = instantiate_gradient(f_raw, prob.f.adtype)
    t0 = time()
    result = similar(x0s)
    result_fx = KernelAbstractions.allocate(opt.backend, typeof(best_obj), n)

    # Local phase: refine every particle independently on the backend.
    kernel = lbfgs_kernel!(opt.backend)
    kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n)
    KernelAbstractions.synchronize(opt.backend)

    # Keep the PSO optimum unless some refined particle improved on it.
    minobj, ind = findmin(result_fx)
    if minobj < best_obj
        best_obj = minobj
        best_u = Array(view(result, ind:ind))[1] # single-element copy off the device
    end

    solve_time = (time() - t0) + sol_pso.stats.time
    return SciMLBase.build_solution(
        SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj;
        stats = Optimization.OptimizationStats(; time = solve_time)
    )
end
Loading