diff --git a/Project.toml b/Project.toml
index a4c76f8..7c97061 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "ParallelParticleSwarms"
 uuid = "ab63da0c-63b4-40fa-a3b7-d2cba5be6419"
-version = "1.0.0"
 authors = ["Utkarsh <rajpututkarsh530@gmail.com> and contributors"]
+version = "1.0.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -9,11 +9,13 @@ DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+Runic = "62bfec6d-59d7-401d-8490-b29ee721c001"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
 SimpleNonlinearSolve = "727e6d20-b764-4bd8-a329-72de5adea6c7"
@@ -39,7 +41,6 @@ julia = "1.10"
 
 [extras]
 JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
diff --git a/src/hybrid.jl b/src/hybrid.jl
index 60f80ed..8ba398c 100644
--- a/src/hybrid.jl
+++ b/src/hybrid.jl
@@ -1,69 +1,215 @@
-@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol)
+using LinearAlgebra: norm, dot
+using KernelAbstractions
+using SciMLBase
+using Optimization
+
+# Clamp to bounds and evaluate; NaN firewall prevents poison through interpolation
+@inline _eval(f, p, x, lb, ub) =
+let v = f(clamp.(x, lb, ub), p)
+    ifelse(isfinite(v), v, eltype(x)(Inf))
+end
+
+# Clamp to bounds and evaluate gradient; zero out non-finite components
+@inline _grad(grad_f, p, x, lb, ub) =
+    map(gi -> ifelse(isfinite(gi), gi, zero(gi)), grad_f(clamp.(x, lb, ub), p))
+
+# Cubic interpolation with bisection fallback
+@inline function _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi)
+    d1 = dϕ_lo + dϕ_hi - 3 * (ϕ_lo - ϕ_hi) / (a_lo - a_hi)
+    desc = d1 * d1 - dϕ_lo * dϕ_hi
+    d2 = sqrt(max(desc, zero(desc)))
+    return ifelse(
+        desc < 0, (a_lo + a_hi) / 2,
+        a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))
+    )
+end
+
+# Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6)
+@inline function _zoom(f, grad_f, p, x, dir, lb, ub, a_lo, a_hi, ϕ_0, dϕ_0, ϕ_lo, c1, c2)
+    T = eltype(x)
+    xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, zero(typeof(x)), false
+
+    # Cache endpoint values; updated only when endpoints shift
+    ϕ_hi = _eval(f, p, x + a_hi * dir, lb, ub)
+    dϕ_lo = dot(_grad(grad_f, p, x + a_lo * dir, lb, ub), dir)
+    dϕ_hi = dot(_grad(grad_f, p, x + a_hi * dir, lb, ub), dir)
+
+    done = false
+    for _ in 1:10
+        if !done
+            a_j = _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi)
+            a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3))
+            xn_j = clamp.(x + a_j * dir, lb, ub)
+            ϕ_j = _eval(f, p, xn_j, lb, ub)
+            if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo)
+                a_hi, ϕ_hi = a_j, ϕ_j
+                dϕ_hi = dot(_grad(grad_f, p, xn_j, lb, ub), dir)
+            else
+                gn_j = _grad(grad_f, p, xn_j, lb, ub)
+                dϕ_j = dot(gn_j, dir)
+                if abs(dϕ_j) <= -c2 * dϕ_0
+                    xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true
+                else
+                    if dϕ_j * (a_hi - a_lo) >= zero(T)
+                        a_hi, ϕ_hi, dϕ_hi = a_lo, ϕ_lo, dϕ_lo
+                    end
+                    a_lo, ϕ_lo, dϕ_lo = a_j, ϕ_j, dϕ_j
+                end
+            end
+        end
+    end
+    if !done
+        xn = clamp.(x + a_lo * dir, lb, ub)
+        xn_out, ϕ_out, gn_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub)
+    end
+    return (xn_out, ϕ_out, gn_out, ok_out)
+end
+
+# Strong Wolfe line search (Nocedal & Wright Algorithm 3.5)
+@inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
+    T = eltype(x)
+    c1, c2 = T(1.0e-4), T(0.9)
+    dϕ_0 = dot(g, dir)
+    xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false
+    if dϕ_0 < zero(T)  # valid descent direction
+        a_prev, a_i, ϕ_0, ϕ_prev, done = zero(T), one(T), fx, fx, false
+        for i in 1:10
+            if !done
+                xn = clamp.(x + a_i * dir, lb, ub)
+                ϕ_i = _eval(f, p, xn, lb, ub)
+                if (ϕ_i > ϕ_0 + c1 * a_i * dϕ_0) || (ϕ_i >= ϕ_prev && i > 1)
+                    xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_prev, a_i, ϕ_0, dϕ_0, ϕ_prev, c1, c2)..., true
+                else
+                    gn_i = _grad(grad_f, p, xn, lb, ub)
+                    dϕ_i = dot(gn_i, dir)
+                    if abs(dϕ_i) <= -c2 * dϕ_0
+                        xn_out, ϕ_out, gn_out, ok_out, done = xn, ϕ_i, gn_i, true, true
+                    elseif dϕ_i >= zero(T)
+                        xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_i, a_prev, ϕ_0, dϕ_0, ϕ_i, c1, c2)..., true
+                    else
+                        a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2)
+                    end
+                end
+            end
+        end
+        if !done
+            xn = clamp.(x + a_prev * dir, lb, ub)
+            xn_out, ϕ_out, gn_out, ok_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub), true
+        end
+    end
+    return (xn_out, ϕ_out, gn_out, ok_out)
+end
+
+# L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4)
+@inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M}
+    T = eltype(g)
+    q, a = g, ntuple(_ -> zero(T), Val(M))
+    for j in 0:(M - 1)
+        idx = k - j
+        if idx >= 1
+            ii = mod1(idx, M)
+            a = Base.setindex(a, Rho[ii] * dot(S[ii], q), ii)
+            q = q - a[ii] * Y[ii]
+        end
+    end
+    kk = mod1(k, M)
+    sy, yy = dot(S[kk], Y[kk]), sum(abs2, Y[kk])
+    γ = sy / yy
+    γ = ifelse(k >= 1 && yy > T(1.0e-30) && isfinite(γ) && γ > zero(T), γ, one(T))
+    r = γ * q
+    for j in (M - 1):-1:0
+        idx = k - j
+        if idx >= 1
+            ii = mod1(idx, M)
+            r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii]
+        end
+    end
+    return -r
+end
+
+# Run L-BFGS independently per starting point; tuples for GPU-compatible history buffers
+@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, result_fx, lb, ub, maxiters, ::Val{M}) where {M}
     i = @index(Global, Linear)
-    nlcache = remake(nlprob; u0 = x0s[i])
-    sol = solve(nlcache, opt; maxiters, abstol, reltol)
-    @inbounds result[i] = sol.u
+    x = clamp.(x0s[i], lb, ub)
+    T = eltype(x)
+    z = zero(typeof(x))
+    S, Y = ntuple(_ -> z, Val(M)), ntuple(_ -> z, Val(M))
+    Rho = ntuple(_ -> zero(T), Val(M))
+    fx = _eval(f, p, x, lb, ub)
+    g = _grad(grad_f, p, x, lb, ub)
+    k, active = 0, isfinite(fx) && all(isfinite, g)
+    for _ in 1:maxiters
+        if active && norm(g) >= T(1.0e-7)
+            dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k)
+            if dot(g, dir) >= zero(T)  # reset to steepest descent
+                dir, k = -g, 0
+            end
+            xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
+            if !ok  # fall back to steepest descent
+                xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub)
+                k = 0
+            end
+            if ok && isfinite(fn) && all(isfinite, gn)
+                s, y = xn - x, gn - g
+                sy = dot(s, y)
+                if isfinite(one(T) / sy) && sy > T(1.0e-10)  # curvature condition
+                    k += 1; ii = mod1(k, M)
+                    S = Base.setindex(S, s, ii)
+                    Y = Base.setindex(Y, y, ii)
+                    Rho = Base.setindex(Rho, one(T) / sy, ii)
+                else
+                    k = 0
+                end
+                x, g, fx = xn, gn, fn
+            else
+                active = false
+            end
+        end
+    end
+    @inbounds result[i] = x
+    @inbounds result_fx[i] = fx
 end
 
+# PSO for global exploration, then L-BFGS for local refinement on all particles
 function SciMLBase.solve!(
         cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...;
-        abstol = nothing,
-        reltol = nothing,
-        maxiters = 100, local_maxiters = 10, kwargs...
-    ) where {
-        Backend, LocalOpt <: Union{LBFGS, BFGS},
-    }
-    pso_cache = cache.pso_cache
-
-    sol_pso = solve!(pso_cache)
-    x0s = sol_pso.original
+        abstol = nothing, reltol = nothing, maxiters = 100,
+        local_maxiters = 50, kwargs...
+    ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}}
 
-    backend = opt.backend
+    sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters)
 
-    prob = remake(cache.prob, lb = nothing, ub = nothing)
+    prob = cache.prob
+    f_raw, p = prob.f.f, prob.p
+    lb = prob.lb === nothing ? convert.(eltype(prob.u0), -Inf) : prob.lb
+    ub = prob.ub === nothing ? convert.(eltype(prob.u0), Inf) : prob.ub
 
-    result = cache.start_points
-    copyto!(result, x0s)
+    best_u = sol_pso.u
+    best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[]
 
-    ∇f = instantiate_gradient(prob.f.f, prob.f.adtype)
-
-    kernel = simplebfgs_run!(backend)
-    nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(∇f, prob.u0, prob.p)
-
-    nlalg = opt.local_opt isa LBFGS ?
-        SimpleLimitedMemoryBroyden(;
-            threshold = opt.local_opt.threshold,
-            linesearch = Val(true)
-        ) : SimpleBroyden(; linesearch = Val(true))
+    x0s = sol_pso.original
+    n = length(x0s)
+    m_val = length(prob.u0) > 20 ? Val(5) : Val(10)
 
+    grad_f = instantiate_gradient(f_raw, prob.f.adtype)
     t0 = time()
-    kernel(
-        nlprob,
-        x0s,
-        result,
-        nlalg,
-        local_maxiters,
-        abstol,
-        reltol;
-        ndrange = length(x0s)
-    )
-
-    sol_bfgs = (x -> prob.f(x, prob.p)).(result)
-    sol_bfgs = (x -> isnan(x) ? convert(eltype(prob.u0), Inf) : x).(sol_bfgs)
 
-    minobj, ind = findmin(sol_bfgs)
-    sol_u,
-        sol_obj = minobj > sol_pso.objective ? (sol_pso.u, sol_pso.objective) :
-        (view(result, ind), minobj)
-    t1 = time()
+    result = similar(x0s)
+    result_fx = KernelAbstractions.allocate(opt.backend, typeof(best_obj), n)
 
-    # @show sol_pso.stats.time
+    kernel = lbfgs_kernel!(opt.backend)
+    kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n)
+    KernelAbstractions.synchronize(opt.backend)
 
-    solve_time = (t1 - t0) + sol_pso.stats.time
+    minobj, ind = findmin(result_fx)
+    if minobj < best_obj
+        best_obj = minobj
+        best_u = Array(view(result, ind:ind))[1]
+    end
 
+    solve_time = (time() - t0) + sol_pso.stats.time
     return SciMLBase.build_solution(
-        SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
-        sol_u, sol_obj,
+        SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj;
         stats = Optimization.OptimizationStats(; time = solve_time)
     )
 end