diff --git a/Project.toml b/Project.toml index a4c76f8..7c97061 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ParallelParticleSwarms" uuid = "ab63da0c-63b4-40fa-a3b7-d2cba5be6419" -version = "1.0.0" authors = ["Utkarsh and contributors"] +version = "1.0.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" @@ -9,11 +9,13 @@ DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +Runic = "62bfec6d-59d7-401d-8490-b29ee721c001" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46" SimpleNonlinearSolve = "727e6d20-b764-4bd8-a329-72de5adea6c7" @@ -39,7 +41,6 @@ julia = "1.10" [extras] JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/hybrid.jl b/src/hybrid.jl index 60f80ed..8ba398c 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -1,69 +1,215 @@ -@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol) +using LinearAlgebra: norm, dot +using KernelAbstractions +using SciMLBase +using Optimization + +# Clamp to bounds and evaluate; NaN firewall prevents poison through interpolation +@inline _eval(f, p, x, lb, ub) = +let v = f(clamp.(x, lb, ub), p) + ifelse(isfinite(v), v, eltype(x)(Inf)) +end + +# Clamp to bounds and evaluate gradient; zero out non-finite components +@inline _grad(grad_f, p, x, lb, ub) = + map(gi -> ifelse(isfinite(gi), gi, zero(gi)), grad_f(clamp.(x, lb, ub), p)) + +# Cubic interpolation with bisection fallback +@inline function _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi) + d1 = dϕ_lo + dϕ_hi - 3 * (ϕ_lo - ϕ_hi) / (a_lo - a_hi) + desc = d1 * d1 - dϕ_lo * dϕ_hi + d2 = sqrt(max(desc, zero(desc))) + return ifelse( + desc < 0, (a_lo + a_hi) / 2, + a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)) + ) +end + +# Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6) +@inline function _zoom(f, grad_f, p, x, dir, lb, ub, a_lo, a_hi, ϕ_0, dϕ_0, ϕ_lo, c1, c2) + T = eltype(x) + xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, zero(typeof(x)), false + + # Cache endpoint values; updated only when endpoints shift + ϕ_hi = _eval(f, p, x + a_hi * dir, lb, ub) + dϕ_lo = dot(_grad(grad_f, p, x + a_lo * dir, lb, ub), dir) + dϕ_hi = dot(_grad(grad_f, p, x + a_hi * dir, lb, ub), dir) + + done = false + for _ in 1:10 + if !done + a_j = _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi) + a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3)) + xn_j = clamp.(x + a_j * dir, lb, ub) + ϕ_j = _eval(f, p, xn_j, lb, ub) + if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo) + a_hi, ϕ_hi = a_j, ϕ_j + dϕ_hi = dot(_grad(grad_f, p, xn_j, lb, ub), dir) + else + gn_j = _grad(grad_f, p, xn_j, lb, ub) + dϕ_j = dot(gn_j, dir) + if abs(dϕ_j) <= -c2 * dϕ_0 + xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true + else + if dϕ_j * (a_hi - a_lo) >= zero(T) + a_hi, ϕ_hi, dϕ_hi = a_lo, ϕ_lo, dϕ_lo + end + a_lo, ϕ_lo, dϕ_lo = a_j, ϕ_j, dϕ_j + end + end + end + end + if !done + xn = clamp.(x + a_lo * dir, lb, ub) + xn_out, ϕ_out, gn_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub) + end + return (xn_out, ϕ_out, gn_out, ok_out) +end + +# Strong Wolfe line search (Nocedal & Wright Algorithm 3.5) +@inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) + T = eltype(x) + c1, c2 = T(1.0e-4), T(0.9) + dϕ_0 = dot(g, dir) + xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false + if dϕ_0 < zero(T) # valid descent direction + a_prev, a_i, ϕ_0, ϕ_prev, done = zero(T), one(T), fx, fx, false + for i in 1:10 + if !done + xn = clamp.(x + a_i * dir, lb, ub) + ϕ_i = _eval(f, p, xn, lb, ub) + if (ϕ_i > ϕ_0 + c1 * a_i * dϕ_0) || (ϕ_i >= ϕ_prev && i > 1) + xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_prev, a_i, ϕ_0, dϕ_0, ϕ_prev, c1, c2)..., true + else + gn_i = _grad(grad_f, p, xn, lb, ub) + dϕ_i = dot(gn_i, dir) + if abs(dϕ_i) <= -c2 * dϕ_0 + xn_out, ϕ_out, gn_out, ok_out, done = xn, ϕ_i, gn_i, true, true + elseif dϕ_i >= zero(T) + xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_i, a_prev, ϕ_0, dϕ_0, ϕ_i, c1, c2)..., true + else + a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2) + end + end + end + end + if !done + xn = clamp.(x + a_prev * dir, lb, ub) + xn_out, ϕ_out, gn_out, ok_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub), true + end + end + return (xn_out, ϕ_out, gn_out, ok_out) +end + +# L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) +@inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M} + T = eltype(g) + q, a = g, ntuple(_ -> zero(T), Val(M)) + for j in 0:(M - 1) + idx = k - j + if idx >= 1 + ii = mod1(idx, M) + a = Base.setindex(a, Rho[ii] * dot(S[ii], q), ii) + q = q - a[ii] * Y[ii] + end + end + kk = mod1(k, M) + sy, yy = dot(S[kk], Y[kk]), sum(abs2, Y[kk]) + γ = sy / yy + γ = ifelse(k >= 1 && yy > T(1.0e-30) && isfinite(γ) && γ > zero(T), γ, one(T)) + r = γ * q + for j in (M - 1):-1:0 + idx = k - j + if idx >= 1 + ii = mod1(idx, M) + r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii] + end + end + return -r +end + +# Run L-BFGS independently per starting point; tuples for GPU-compatible history buffers +@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, result_fx, lb, ub, maxiters, ::Val{M}) where {M} i = @index(Global, Linear) - nlcache = remake(nlprob; u0 = x0s[i]) - sol = solve(nlcache, opt; maxiters, abstol, reltol) - @inbounds result[i] = sol.u + x = clamp.(x0s[i], lb, ub) + T = eltype(x) + z = zero(typeof(x)) + S, Y = ntuple(_ -> z, Val(M)), ntuple(_ -> z, Val(M)) + Rho = ntuple(_ -> zero(T), Val(M)) + fx = _eval(f, p, x, lb, ub) + g = _grad(grad_f, p, x, lb, ub) + k, active = 0, isfinite(fx) && all(isfinite, g) + for _ in 1:maxiters + if active && norm(g) >= T(1.0e-7) + dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k) + if dot(g, dir) >= zero(T) # reset to steepest descent + dir, k = -g, 0 + end + xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) + if !ok # fall back to steepest descent + xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub) + k = 0 + end + if ok && isfinite(fn) && all(isfinite, gn) + s, y = xn - x, gn - g + sy = dot(s, y) + if isfinite(one(T) / sy) && sy > T(1.0e-10) # curvature condition + k += 1; ii = mod1(k, M) + S = Base.setindex(S, s, ii) + Y = Base.setindex(Y, y, ii) + Rho = Base.setindex(Rho, one(T) / sy, ii) + else + k = 0 + end + x, g, fx = xn, gn, fn + else + active = false + end + end + end + @inbounds result[i] = x + @inbounds result_fx[i] = fx end +# PSO for global exploration, then L-BFGS for local refinement on all particles function SciMLBase.solve!( cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...; - abstol = nothing, - reltol = nothing, - maxiters = 100, local_maxiters = 10, kwargs... - ) where { - Backend, LocalOpt <: Union{LBFGS, BFGS}, - } - pso_cache = cache.pso_cache - - sol_pso = solve!(pso_cache) - x0s = sol_pso.original + abstol = nothing, reltol = nothing, maxiters = 100, + local_maxiters = 50, kwargs... + ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}} - backend = opt.backend + sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters) - prob = remake(cache.prob, lb = nothing, ub = nothing) + prob = cache.prob + f_raw, p = prob.f.f, prob.p + lb = prob.lb === nothing ? convert.(eltype(prob.u0), -Inf) : prob.lb + ub = prob.ub === nothing ? convert.(eltype(prob.u0), Inf) : prob.ub - result = cache.start_points - copyto!(result, x0s) + best_u = sol_pso.u + best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[] - ∇f = instantiate_gradient(prob.f.f, prob.f.adtype) - - kernel = simplebfgs_run!(backend) - nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(∇f, prob.u0, prob.p) - - nlalg = opt.local_opt isa LBFGS ? - SimpleLimitedMemoryBroyden(; - threshold = opt.local_opt.threshold, - linesearch = Val(true) - ) : SimpleBroyden(; linesearch = Val(true)) + x0s = sol_pso.original + n = length(x0s) + m_val = length(prob.u0) > 20 ? Val(5) : Val(10) + grad_f = instantiate_gradient(f_raw, prob.f.adtype) t0 = time() - kernel( - nlprob, - x0s, - result, - nlalg, - local_maxiters, - abstol, - reltol; - ndrange = length(x0s) - ) - - sol_bfgs = (x -> prob.f(x, prob.p)).(result) - sol_bfgs = (x -> isnan(x) ? convert(eltype(prob.u0), Inf) : x).(sol_bfgs) - minobj, ind = findmin(sol_bfgs) - sol_u, - sol_obj = minobj > sol_pso.objective ? (sol_pso.u, sol_pso.objective) : - (view(result, ind), minobj) - t1 = time() + result = similar(x0s) + result_fx = KernelAbstractions.allocate(opt.backend, typeof(best_obj), n) - # @show sol_pso.stats.time + kernel = lbfgs_kernel!(opt.backend) + kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n) + KernelAbstractions.synchronize(opt.backend) - solve_time = (t1 - t0) + sol_pso.stats.time + minobj, ind = findmin(result_fx) + if minobj < best_obj + best_obj = minobj + best_u = Array(view(result, ind:ind))[1] + end + solve_time = (time() - t0) + sol_pso.stats.time return SciMLBase.build_solution( - SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, - sol_u, sol_obj, + SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj; stats = Optimization.OptimizationStats(; time = solve_time) ) end