diff --git a/Project.toml b/Project.toml index a5c275c8..5c9785dc 100644 --- a/Project.toml +++ b/Project.toml @@ -1,13 +1,14 @@ name = "ExponentialUtilities" uuid = "d4d017d3-3776-5f7e-afef-a10c40355c18" authors = ["Chris Rackauckas ", "José E. Cruz Serrallés "] -version = "1.30.2" +version = "1.31.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" GenericSchur = "c145ed77-6b09-5dd9-b285-bf645a82121e" +LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -29,6 +30,7 @@ ForwardDiff = "0.10.13" GPUArraysCore = "0.1, 0.2" GenericSchur = "0.5.3" LinearAlgebra = "1.10" +LinearSolve = "4" PrecompileTools = "1" Printf = "1.10" Random = "1" diff --git a/docs/src/matrix_exponentials.md b/docs/src/matrix_exponentials.md index 48aca6b9..8a5e56a9 100644 --- a/docs/src/matrix_exponentials.md +++ b/docs/src/matrix_exponentials.md @@ -3,6 +3,8 @@ ```@docs exponential! phi +phi! +PhiPadeCache ``` ## Methods diff --git a/src/ExponentialUtilities.jl b/src/ExponentialUtilities.jl index 9e89db38..579ab7e1 100644 --- a/src/ExponentialUtilities.jl +++ b/src/ExponentialUtilities.jl @@ -29,6 +29,7 @@ include("exp_noalloc.jl") include("exp_generic.jl") include("exp_sparse.jl") include("phi.jl") +include("phi_almohy.jl") include("arnoldi.jl") include("krylov_phiv.jl") include("krylov_phiv_adaptive.jl") @@ -41,7 +42,7 @@ include("precompile.jl") export phi, phi!, KrylovSubspace, arnoldi, arnoldi!, lanczos!, ExpvCache, PhivCache, expv, expv!, phiv, phiv!, kiops, expv_timestep, expv_timestep!, phiv_timestep, phiv_timestep!, - StegrCache, get_subspace_cache, exponential! + StegrCache, PhiPadeCache, get_subspace_cache, exponential! export ExpMethodHigham2005, ExpMethodHigham2005Base, ExpMethodGeneric, ExpMethodNative, ExpMethodDiagonalization diff --git a/src/phi.jl b/src/phi.jl index 8faa9eb2..010ee37c 100644 --- a/src/phi.jl +++ b/src/phi.jl @@ -103,9 +103,12 @@ The phi functions are defined as \\varphi_0(z) = \\exp(z),\\quad \\varphi_{k+1}(z) = \\frac{\\varphi_k(z) - \\varphi_k(0)}{z} ``` -Calls `phiv_dense` on each of the basis vectors to obtain the answer. If A -is `Diagonal`, instead calls the scalar `phi` on each diagonal element and the -return values are also `Diagonal`s +For `Float64`/`ComplexF64` matrices this uses the scaling-and-recovering +algorithm of Al-Mohy and Liu (arXiv:2506.01193), which computes all of +`phi_0(A), ..., phi_k(A)` simultaneously in `O(k m^3)` operations. Other element +types fall back to calling `phiv_dense` on each basis vector (`O(m (m+k)^3)`). If +`A` is `Diagonal`, the scalar `phi` is instead applied to each diagonal element +and the return values are also `Diagonal`s. """ function phi(A::AbstractMatrix{T}, k; kwargs...) where {T <: Number} m = size(A, 1) @@ -119,7 +122,28 @@ end """ phi!(out,A,k[;caches]) -> out -Non-allocating version of `phi` for non-diagonal matrix inputs. +Non-allocating version of `phi` for non-diagonal matrix inputs, writing +`phi_j(A)` into `out[j+1]`. + +For dense `Float64`/`ComplexF64` matrices, pass a reusable +[`PhiPadeCache`](@ref) as `caches` to make repeated evaluations of the same +size and order allocation-free (the linear solve runs through an embedded +LinearSolve.jl cache): + +```julia +cache = PhiPadeCache(A, k) +phi!(out, A, k; caches = cache) +``` + +Numerical failure (a singular Padé denominator or non-finite result, only +possible for pathological inputs such as matrices containing `NaN`/`Inf`) does +not throw: the outputs are filled with `NaN` and, when a `PhiPadeCache` is +used, `cache.info[]` is set to a nonzero return code (`0` on success). This +lets adaptive integrators reject the step instead of aborting. + +For the legacy basis-vector algorithm, `caches` is instead the tuple +`(Vector{T}(undef, m), Matrix{T}(undef, m, k+1), Matrix{T}(undef, m+k, m+k))`; +supplying it forces that code path. """ function phi!( out::Vector{Matrix{T}}, A::AbstractMatrix{T}, k::Integer; caches = nothing, @@ -127,6 +151,27 @@ function phi!( ) where {T <: Number} m = size(A, 1) @assert length(out) == k + 1&&all(P -> size(P) == (m, m), out) "Dimension mismatch" + # The scaling-and-recovering algorithm of Al-Mohy and Liu (arXiv:2506.01193) + # computes phi_0..phi_k simultaneously in O(k m^3), versus O(m (m+k)^3) for + # the basis-vector approach below. Its Pade tables are tuned for double + # precision, so it is used only for Float64/ComplexF64. The legacy path is + # kept for other element types and when a caller supplies the legacy + # `caches` tuple. + if k >= 1 && T <: Union{Float64, ComplexF64} && !(caches isa Tuple) + if A isa StridedMatrix && (isnothing(caches) || caches isa PhiPadeCache) + cache = caches isa PhiPadeCache ? caches : PhiPadeCache(A, k) + return _phi_almohy!(out, A, k, cache) + elseif isnothing(caches) && !ismutable(A) + # Container-preserving path for immutable dense matrices (e.g. + # `SMatrix`); mutable non-strided types (e.g. sparse) fall through to + # the legacy path below. + Rm = _phi_almohy_generic(A, k) + for j in 1:(k + 1) + copyto!(out[j], Rm[j]) + end + return out + end + end if isnothing(caches) e = Vector{T}(undef, m) W = Matrix{T}(undef, m, k + 1) diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl new file mode 100644 index 00000000..057d3278 --- /dev/null +++ b/src/phi_almohy.jl @@ -0,0 +1,614 @@ +# Scaling-and-recovering algorithm for the matrix phi-functions, computing +# phi_0(A), ..., phi_p(A) simultaneously in O(p*n^3) work. +# +# This is a port of Algorithm 5.1 of +# +# Awad H. Al-Mohy and Xiaobo Liu, "A scaling and recovering algorithm for the +# matrix phi-functions", arXiv:2506.01193 (2025), +# +# and follows the authors' reference MATLAB implementation `phi_funm`. +# +# The idea: scale A down by 2^s, evaluate the [m/m] diagonal Pade approximant to +# phi_p of the scaled matrix, recover the lower-index approximants via the +# recurrence R^{(j)}_m(z) = z R^{(j+1)}_m(z) + 1/j!, then undo the scaling with +# the double-argument formula +# +# phi_j(2A) = 2^{-j} ( phi_0(A) phi_j(A) + sum_{k=1}^{j} phi_k(A)/(j-k)! ). +# +# Two implementations share the scalar machinery below: +# * `_phi_almohy!` -- fully in-place via a reusable `PhiPadeCache` workspace +# (for strided mutable matrices); the only per-call +# allocation is the O(n) LU pivot vector. +# * `_phi_almohy_generic` -- out-of-place and element-container-preserving, so +# a static input (e.g. `SMatrix`) stays static. + +using LinearSolve: LinearSolve, LinearProblem, GenericLUFactorization + +# theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the +# scaled matrix for which the [m/m] Pade approximant is backward stable to +# double-precision unit roundoff). Rows are m = 1:20, columns p = 1:10. +const _PHI_THETA_MP = [ + 1.999463452408407e-5 3.7631213142601604e-5 7.366006416045163e-5 0.00014973317297025854 0.0003152443333771182 0.0006855209983764435 0.0015357294906993542 0.0035357368946407606 0.008345789028062234 0.02013882808928226; + 0.0038062018282832713 0.006090206286125726 0.009869682746779615 0.016211831146383013 0.026984240563843326 0.0454757968381803 0.07749273259855331 0.13324895779231027 0.2304625604362521 0.3988991104549146; + 0.03971636005661334 0.05806968886880692 0.08534220076759817 0.12612151517169362 0.18736524835661617 0.27955116495245524 0.41822735418681667 0.6258702800351991 0.9335970443986562 1.1616793320890249; + 0.15442675548312682 0.21278117034577634 0.29371996708854947 0.40617647304246707 0.5623843002320996 0.7787883505754265 1.0464245027100287 1.2572921799132364 1.480350861451984 1.713871003185325; + 0.37980898016147974 0.5014624976587007 0.6621033824456818 0.8739858777744354 1.110828442901451 1.3375805277521873 1.5770955744229305 1.8273848586096524 2.08673114576671 2.3536689190936406; + 0.726177195703321 0.9281910159646274 1.1591052927815408 1.4012982671152012 1.6570386251184455 1.924026602416338 2.20029216725632 2.4841706972729956 2.7742685942850143 3.069426294589578; + 1.1898666361923196 1.4469917284172122 1.7187282676344413 2.0024009817133357 2.295729523138923 2.5968019781751672 2.9040336511997444 3.216121483847106 3.531999855119243 3.8508005672569903; + 1.7605812331512907 2.060907194742016 2.371480030315257 2.690078917436836 3.014877598333601 3.3443898815994957 3.677415494646134 4.01299056190377 4.350344448873932 4.688863279363638; + 2.425818958547233 2.7623053687512256 3.105174730511504 3.4527057639100476 3.8035234182863555 4.156538184296475 4.510892686686127 4.865916398054041 5.2210881567935115 5.57600564043454; + 3.173113456793749 3.5393251225873685 3.9086764034913664 4.279911937822335 4.652058062541427 5.024366624424222 5.396268333440519 5.767334816025469 6.1372481669904 6.505776744744432; + 3.991025201329815 4.3813599805888455 4.772204432206606 5.162703837030561 5.552220615863406 5.940286915538899 6.326566605567779 6.710825187972159 7.092906173082131 7.4727126397309815; + 4.869485489784578 5.279199870248916 5.687344501175957 6.09339280009268 6.496977936247796 6.8978550856394785 7.295871912058635 7.690945666690415 8.083045525424248 8.472179024890941; + 5.799827904547885 6.224971321619904 6.64693480650714 7.065451733211646 7.48036740469753 7.891610269499859 8.29916985381997 8.703079946679036 9.103405852177376 9.500234768451538; + 6.774681996458441 7.211997240187581 7.644910128682907 8.073354767800579 8.497338949398504 8.916922998846443 9.332203860678709 9.74330319935391 10.150358556572542 10.5535168239286; + 7.787817051572435 8.234634975085694 8.676142196766529 9.112424389503664 9.543611219824884 9.969861373150518 10.391351525913848 10.808268293369911 11.220802409044003 11.629144570605774; + 8.833978214833847 9.28811956766922 9.736292792866143 10.178695474892383 10.615546677548995 11.04707680323746 11.473520338548882 11.895110742359616 12.312076916038997 12.724640835984562; + 9.908733823133844 10.368423129623023 10.821685340709736 11.26879859255289 11.710045790427412 12.145708155762449 12.576060822320501 13.001369927621694 13.421890788016032 13.837866852904227; + 11.008341035221061 11.472133539821815 11.929195701618417 12.379861767254168 12.824458619109002 13.263302077220441 13.696694607678852 14.124924034990169 14.548262963142152 14.966968689718191; + 12.129631147167878 12.596352058386916 13.056160721729096 13.509428665638305 13.956511465304834 14.397747062985934 14.83345501195776 15.263936358537684 15.689473955602649 16.110333058994453; + 13.269913405576162 13.738607964511967 14.20030229038679 14.655390835644507 15.104246216977073 15.547218984875734 15.984637960610213 16.41681094336813 16.84402564730148 17.266550769620974; +] + +# Largest Pade degree used; bounds cond(D_m) (Section 4). Fixes the block-size +# and power-buffer counts used to size the workspace. +const _PHI_M_MAX = 12 +# max tau+1 over m <= _PHI_M_MAX, tau = ceil(sqrt(2m)); ceil(sqrt(24)) = 5. +const _PHI_NPOW = 6 +# max i (Paterson--Stockmeyer cost index) for _PHI_M_MAX. +const _PHI_IMAX = ceil(Int, sqrt(8 * (_PHI_M_MAX + 1)) - 3) - 1 + +# theta_{m,p} with the p>7 rule of the paper: for p>7 the p=7 column is used, and +# indices are guarded so m outside 1:20 does not error. +@inline function _phi_theta(m::Integer, p::Integer) + return _PHI_THETA_MP[m, p > 7 ? 7 : p] +end + +# k! as a Float64 (k small; avoids Int overflow for the recovery coefficients). +@inline function _factf(k::Integer) + r = 1.0 + for i in 2:k + r *= i + end + return r +end + +# First-factor coefficient of the backward-error series h_{m,p} (Eq. (3.4)), +# +# (m+p)! m! / ((2m+p)! (2m+p+1)!) +# = prod_{j=1}^{m} 1/(m+p+j) * prod_{j=1}^{m+p+1} 1/(m+j), +# +# accumulated as a product of sub-unit factors so no intermediate factorial can +# spuriously overflow (each factor < 1, so the running product is monotonically +# decreasing and underflows only when the true value does). +@inline function _phi_be_coeff(m::Integer, p::Integer) + r = 1.0 + for j in 1:m + r /= (m + p + j) + end + for j in 1:(m + p + 1) + r /= (m + j) + end + return r +end + +""" + PhiPadeCache(A, p) + +Reusable workspace for the in-place phi-function evaluator [`phi!`](@ref) on +strided `Float64`/`ComplexF64` matrices. Allocate once for a given size and `p`, +then pass it as the `caches` keyword to `phi!` to reuse across calls. The +linear solve runs through an embedded LinearSolve.jl cache (batched matrix +right-hand side, `GenericLUFactorization`); reuse across calls is +allocation-free — all buffers, including the LU pivots, live in the workspace +and the LinearSolve cache. + +After each `phi!` call, `cache.info[]` holds a return code: `0` means success; +a positive value indicates a numerically singular +Padé denominator; `-1` means the result was non-finite (`NaN`/`Inf` input). Both +failures only occur for pathological inputs, and no error is thrown — the +outputs are filled with `NaN` so that adaptive integrators can detect the +failed evaluation via `cache.info[] != 0` (or an `isfinite` check) and reject +the step rather than abort. +""" +struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}, LS} + As::MT + Apow::Vector{MT} + Nm::MT + Dm::MT + Dfact::MT + rhs::MT + linsolve::LS + Naux::MT + Daux::MT + tmp::MT + pow1::MT + pow2::MT + absA::RMT + rvec1::Vector{RT} + rvec2::Vector{RT} + Ncoef::Vector{Float64} + Dcoef::Vector{Float64} + Amat::Matrix{Float64} + eta::Vector{Float64} + alpha::Vector{Float64} + tvals::Vector{Int} + Cost::Matrix{Float64} + info::typeof(Ref(0)) +end + +# Largest r for which alpha_r may be probed (Eq. (3.10) with phat = p; theta_{12,p} +# >= 1 for all p, so phat = p always applies at m = _PHI_M_MAX). +_phi_rmax(p::Integer) = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + p + 1))) / 2) + +function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T} + n = size(A, 1) + RT = real(T) + r_max = _phi_rmax(p) + mk() = Matrix{T}(undef, n, n) + Apow = Vector{Matrix{T}}(undef, _PHI_NPOW) + Apow[1] = Matrix{T}(I, n, n) + for i in 2:_PHI_NPOW + Apow[i] = mk() + end + Dfact = Matrix{T}(I, n, n) + rhs = zeros(T, n, n) + # GenericLUFactorization refactorizes in place with a cached pivot vector, + # so cache reuse is allocation-free (LUFactorization currently copies A on + # every dense refactorization). Dfact/rhs are workspace buffers refilled + # before each solve. + linsolve = LinearSolve.init(LinearProblem(Dfact, rhs), GenericLUFactorization()) + return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}, typeof(linsolve)}( + mk(), Apow, mk(), mk(), Dfact, rhs, linsolve, + mk(), mk(), mk(), mk(), mk(), + Matrix{RT}(undef, n, n), + Vector{RT}(undef, n), Vector{RT}(undef, n), + Vector{Float64}(undef, _PHI_M_MAX + 1), Vector{Float64}(undef, _PHI_M_MAX + 1), + Matrix{Float64}(undef, _PHI_M_MAX + 1, _PHI_M_MAX + 1), + Vector{Float64}(undef, r_max), Vector{Float64}(undef, r_max), + Vector{Int}(undef, _PHI_IMAX + 1), + Matrix{Float64}(undef, _PHI_IMAX + 1, r_max - 1), + Ref(0), + ) +end + +@noinline function _phi_cache_check(cache::PhiPadeCache, n::Integer, r_max::Integer) + size(cache.As, 1) == n || throw( + DimensionMismatch( + "PhiPadeCache was constructed for size $(size(cache.As, 1)) but A has size $n" + ) + ) + length(cache.eta) >= r_max || throw( + ArgumentError( + "PhiPadeCache was constructed for a smaller p; construct it with PhiPadeCache(A, p) for the p in use" + ) + ) + return nothing +end + +# Renormalized [m/m] Pade coefficients (numerator and denominator, low order +# first) of the approximant to phi_p, written into the preallocated `Ncoef`, +# `Dcoef` (length >= m+1) using scratch `Amat` (size >= (m+1)^2). Follows the +# recurrences of Berland, Skaflestad, and Wright used in the reference +# `pade_coef`. +function _phi_pade_coef!(Ncoef, Dcoef, Amat, m::Integer, p::Integer) + n1 = 1.0 # (2m+1)!/m! + for i in (m + 1):(2m + 1) + n1 *= i + end + d1 = n1 + for k in 1:p + if k == p + # First column: Amat[r,1] = n1 * prod_{l=1}^{r-1} 1/(k+l). + cp = 1.0 + Amat[1, 1] = n1 + for r in 2:(m + 1) + cp /= (k + (r - 1)) + Amat[r, 1] = n1 * cp + end + for c in 2:(m + 1), r in 2:(m + 1) + I = r - 1 + J = c - 1 + Amat[r, c] = -(m + 1 - J) * (k + 1 + I - J) / ((2m + k + 1 - J) * J) + end + for r in 1:(m + 1) + s = 0.0 + pr = 1.0 + for c in 1:r + pr *= Amat[r, c] + s += pr + end + Ncoef[r] = s + end + Dcoef[1] = d1 + cpd = 1.0 + for r in 2:(m + 1) + i = r - 1 + cpd *= -(m + 1 - i) / (i * (2m + k + 1 - i)) + Dcoef[r] = d1 * cpd + end + end + n1 = n1 * (2m + k + 1) / (k + 1) + d1 = d1 * (2m + k + 1) + end + return Ncoef, Dcoef +end + +# Given the alpha_r sequence (first `nalpha` entries) and precomputed scaling +# parameters `tvals[i+1] = t(m_i)`, minimize the equivalent-matrix-product cost +# (Section 4) over the optimal degrees and return the chosen (m, s, tau). `Cost` +# is scratch of size >= (imax+1) x (r_max-1). +function _phi_select_from_alpha(alpha, nalpha::Integer, p::Integer, tvals, Cost) + i_max = _PHI_IMAX + r_max = nalpha + 1 + C = @view Cost[1:(i_max + 1), 1:(r_max - 1)] + fill!(C, 0.0) + for i in 0:i_max + m_i = (i + 3)^2 ÷ 8 + θ = _phi_theta(m_i, p) + phat_i = θ < 1 ? 0 : p + t = tvals[i + 1] + for r in 2:r_max + if 2m_i + phat_i + 1 >= r * (r - 1) + a = alpha[r - 1] + s0 = (a > 0 && isfinite(a)) ? max(ceil(Int, log2(a / θ)), t) : t + C[i + 1, r - 1] = i + p + s0 * (p + 1) + end + end + end + nrows = i_max + 1 + ncols = r_max - 1 + minval = Inf + for v in C + (v > 0 && v < minval) && (minval = v) + end + # Column-major "last" tie-break (matches the reference), giving the row index. + i_star = 0 + for cc in 1:ncols, rr in 1:nrows + C[rr, cc] == minval && (i_star = rr - 1) + end + m = (i_star + 3)^2 ÷ 8 + s = round(Int, (minval - i_star - p) / (p + 1)) + tau = floor(Int, sqrt(2m)) + if tau - 1 + 2 * (m ÷ tau) - 2 * (m % tau == 0) != i_star + tau = ceil(Int, sqrt(2m)) + end + return m, s, tau +end + +## -------------------- in-place (workspace) implementation -------------------- + +# Exact 1-norm of B^k for entrywise-nonnegative real B, ‖B^k‖_1 = ‖(B^T)^k 𝟙‖_∞, +# via k matrix-vector products into the workspace vectors. +function _normpow_nonneg!(cache::PhiPadeCache, B, k::Integer) + v = cache.rvec1 + tmp = cache.rvec2 + fill!(v, 1) + Bt = transpose(B) + for _ in 1:k + mul!(tmp, Bt, v) + v, tmp = tmp, v + end + return maximum(v) +end + +# Scaling parameter `t` from the first term of the backward-error series +# (Eq. (3.12)); `ell` subfunction of the reference, in-place. `normT` is the +# caller-computed `opnorm(A, 1)` (hoisted out of the per-degree loop). +function _phi_ell!(cache::PhiPadeCache, A, normT, m::Integer, p::Integer, phat::Integer) + normT == 0 && return 0 + t0 = normT > 1 ? log2(normT) : 0.0 + scalefac = exp2(t0) + normTs = normT / scalefac + delta = (p - 1) * (p - phat) / p + 1 + K = 2m + p + 1 + c = (_phi_be_coeff(m, p) / normTs^delta)^(1 / K) + absA = cache.absA + @. absA = c * abs(A) / scalefac + alpha = _normpow_nonneg!(cache, absA, K) + t = log2(2alpha / eps(Float64)) / (K - delta) + t0 + # NaN/Inf inputs make t non-finite; return 0 (never throw) and let the NaN + # propagate to the output, where step-rejection logic can see it. + isfinite(t) || return 0 + return max(ceil(Int, t), 0) +end + +function _select_parameters_phi!(cache::PhiPadeCache, A, p::Integer) + phat = _phi_theta(_PHI_M_MAX, p) < 1 ? 0 : p + r_max = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + phat + 1))) / 2) + _phi_cache_check(cache, size(A, 1), r_max) + eta = cache.eta + mul!(cache.pow1, A, A) + eta[1] = opnorm(cache.pow1, 1)^(1 / 2) + cur, oth = cache.pow1, cache.pow2 + for j in 2:r_max + mul!(oth, cur, A) + eta[j] = opnorm(oth, 1)^(1 / (j + 1)) + cur, oth = oth, cur + end + alpha = cache.alpha + for j in 1:(r_max - 1) + alpha[j] = max(eta[j], eta[j + 1]) + end + normT = opnorm(A, 1) + for i in 0:_PHI_IMAX + m_i = (i + 3)^2 ÷ 8 + phat_i = _phi_theta(m_i, p) < 1 ? 0 : p + cache.tvals[i + 1] = _phi_ell!(cache, A, normT, m_i, p, phat_i) + end + return _phi_select_from_alpha(alpha, r_max - 1, p, cache.tvals, cache.Cost) +end + +# Add block i's coefficient combination, sum_l coef[i*tau + l] * As^{l-1}, to M. +function _phi_ps_addblock!(M, coef, Apow, i::Integer, tau::Integer, m::Integer) + start = i * tau + 1 + stop = min((i + 1) * tau, m + 1) + for l in 1:(stop - start + 1) + c = coef[start + l - 1] + P = Apow[l] + @. M += c * P + end + return M +end + +# Evaluate N(As) and D(As) via Paterson--Stockmeyer in Horner form, +# +# q(As) = ( (B_nu * Atau + B_{nu-1}) * Atau + ... ) * Atau + B_0, +# +# which needs nu products per polynomial -- one fewer when tau | m, since then +# B_nu = c_m*I and the first fold is the broadcast c_m*Atau + B_{nu-1}. Together +# with the tau-1 explicit powers, the multiplication count is exactly Fasi's +# pi_m(tau) = tau - 1 + 2*floor(m/tau) - 2*(tau | m) that the parameter +# selection optimizes over. Returns (N, D) as views into cache buffers. +function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer) + Nc, Dc, Apow = cache.Ncoef, cache.Dcoef, cache.Apow + copyto!(Apow[2], As) # Apow[1] = I (set once); Apow[i] = As^{i-1} + for i in 3:(tau + 1) + mul!(Apow[i], Apow[i - 1], As) + end + Atau = Apow[tau + 1] + N, Naux = cache.Nm, cache.Naux + D, Daux = cache.Dm, cache.Daux + nu = m ÷ tau + if m % tau == 0 + cN, cD = Nc[m + 1], Dc[m + 1] + @. N = cN * Atau + @. D = cD * Atau + _phi_ps_addblock!(N, Nc, Apow, nu - 1, tau, m) + _phi_ps_addblock!(D, Dc, Apow, nu - 1, tau, m) + inext = nu - 2 + else + fill!(N, 0) + fill!(D, 0) + _phi_ps_addblock!(N, Nc, Apow, nu, tau, m) + _phi_ps_addblock!(D, Dc, Apow, nu, tau, m) + inext = nu - 1 + end + for i in inext:-1:0 + mul!(Naux, N, Atau) + mul!(Daux, D, Atau) + _phi_ps_addblock!(Naux, Nc, Apow, i, tau, m) + _phi_ps_addblock!(Daux, Dc, Apow, i, tau, m) + N, Naux = Naux, N + D, Daux = Daux, D + end + return N, D +end + +# Solve Dm * X = Nm through the workspace's LinearSolve cache (batched matrix +# right-hand side, LinearSolve >= 4), writing X into `X`. Returns an info code +# instead of throwing: 0 on success; 1 on a numerically singular Dm +# (pathological input, e.g. NaN/Inf), in which case `X` is filled with NaN so +# downstream step-rejection logic can detect the failure. +function _phi_solve!(cache::PhiPadeCache, Dm, Nm, X) + copyto!(cache.Dfact, Dm) + copyto!(cache.rhs, Nm) + linsolve = cache.linsolve + linsolve.A = cache.Dfact + linsolve.b = cache.rhs + sol = LinearSolve.solve!(linsolve) + if sol.retcode == LinearSolve.ReturnCode.Success + copyto!(X, sol.u) + return 0 + else + fill!(X, NaN) + return 1 + end +end + +""" + _phi_almohy!(out, A, p, cache) -> out + +In-place evaluation of `phi_0(A), ..., phi_p(A)` for a strided +`Float64`/`ComplexF64` matrix `A` (`p >= 1`), writing `phi_j(A)` into `out[j+1]` +and using the reusable [`PhiPadeCache`](@ref) `cache` for all scratch storage. +The `out` matrices double as the recovery buffers. +""" +function _phi_almohy!( + out::AbstractVector{<:AbstractMatrix}, A::AbstractMatrix, p::Integer, + cache::PhiPadeCache + ) + n = size(A, 1) + m, s, tau = _select_parameters_phi!(cache, A, p) + As = cache.As + @. As = A / exp2(s) + _phi_pade_coef!(cache.Ncoef, cache.Dcoef, cache.Amat, m, p) + Nm, Dm = _paterson_stockmeyer!(cache, As, m, tau) + + info = _phi_solve!(cache, Dm, Nm, out[p + 1]) + if info == 0 && !all(isfinite, out[p + 1]) + # NaN/Inf inputs reach here with info == 0 (the LU does not flag NaN + # pivots); report them so integrators can test cache.info[] alone. + info = -1 + end + cache.info[] = info + if info != 0 + # Return code instead of an exception: fill the outputs with NaN so an + # adaptive integrator can reject the step rather than crash. + for j in 1:(p + 1) + fill!(out[j], NaN) + end + return out + end + + # Recurrence (2.9): R^{(j)} = As R^{(j+1)} + I/j!, j = p-1 : -1 : 0. + for k in p:-1:1 + mul!(out[k], As, out[k + 1]) + f = 1 / _factf(k - 1) + @inbounds for d in 1:n + out[k][d, d] += f + end + end + + # Undo the scaling with the double-argument formula (2.10), s times. + tmp = cache.tmp + for _ in 1:s + for j in p:-1:1 + mul!(tmp, out[1], out[j + 1]) + for k in 1:j + @. tmp += (1 / _factf(j - k)) * out[k + 1] + end + tmp ./= exp2(j) + copyto!(out[j + 1], tmp) + end + mul!(tmp, out[1], out[1]) + copyto!(out[1], tmp) + end + return out +end + +## ----------------- generic (container-preserving) implementation ------------- + +function _normpow_nonneg(B::AbstractMatrix{<:Real}, k::Integer) + n = size(B, 1) + Bt = transpose(B) + v = ones(eltype(B), n) + tmp = similar(v) + for _ in 1:k + mul!(tmp, Bt, v) + v, tmp = tmp, v + end + return maximum(v) +end + +function _phi_ell(A::AbstractMatrix, normT, m::Integer, p::Integer, phat::Integer) + normT == 0 && return 0 + t0 = normT > 1 ? log2(normT) : 0.0 + scalefac = exp2(t0) + normTs = normT / scalefac + delta = (p - 1) * (p - phat) / p + 1 + K = 2m + p + 1 + c = (_phi_be_coeff(m, p) / normTs^delta)^(1 / K) + scaledT = c .* abs.(A ./ scalefac) + alpha = _normpow_nonneg(scaledT, K) + t = log2(2alpha / eps(Float64)) / (K - delta) + t0 + isfinite(t) || return 0 + return max(ceil(Int, t), 0) +end + +function _select_parameters_phi(A::AbstractMatrix, p::Integer) + phat = _phi_theta(_PHI_M_MAX, p) < 1 ? 0 : p + r_max = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + phat + 1))) / 2) + eta = Vector{Float64}(undef, r_max) + P = A * A + eta[1] = opnorm(P, 1)^(1 / 2) + for j in 2:r_max + P = P * A + eta[j] = opnorm(P, 1)^(1 / (j + 1)) + end + alpha = [max(eta[j], eta[j + 1]) for j in 1:(r_max - 1)] + tvals = Vector{Int}(undef, _PHI_IMAX + 1) + normT = opnorm(A, 1) + for i in 0:_PHI_IMAX + m_i = (i + 3)^2 ÷ 8 + phat_i = _phi_theta(m_i, p) < 1 ? 0 : p + tvals[i + 1] = _phi_ell(A, normT, m_i, p, phat_i) + end + Cost = zeros(Float64, _PHI_IMAX + 1, r_max - 1) + return _phi_select_from_alpha(alpha, r_max - 1, p, tvals, Cost) +end + +# Out-of-place counterpart of `_phi_ps_addblock!`: returns `init` plus block i's +# coefficient combination. +function _phi_ps_block(init, coef, Apow, i::Integer, tau::Integer, m::Integer) + start = i * tau + 1 + stop = min((i + 1) * tau, m + 1) + M = init + for l in 1:(stop - start + 1) + M = M + coef[start + l - 1] * Apow[l] + end + return M +end + +# Out-of-place Horner-form Paterson--Stockmeyer; same multiplication count as +# the in-place version (Fasi's pi_m(tau)). +function _paterson_stockmeyer(As, Nc, Dc, tau::Integer, m::Integer) + MT = typeof(As) + Apow = Vector{MT}(undef, tau + 1) + Apow[1] = one(As) + Apow[2] = As + for i in 3:(tau + 1) + Apow[i] = Apow[i - 1] * As + end + Atau = Apow[tau + 1] + nu = m ÷ tau + local N::MT, D::MT + if m % tau == 0 + N = _phi_ps_block(Nc[m + 1] * Atau, Nc, Apow, nu - 1, tau, m) + D = _phi_ps_block(Dc[m + 1] * Atau, Dc, Apow, nu - 1, tau, m) + inext = nu - 2 + else + N = _phi_ps_block(zero(As), Nc, Apow, nu, tau, m) + D = _phi_ps_block(zero(As), Dc, Apow, nu, tau, m) + inext = nu - 1 + end + for i in inext:-1:0 + N = _phi_ps_block(N * Atau, Nc, Apow, i, tau, m) + D = _phi_ps_block(D * Atau, Dc, Apow, i, tau, m) + end + return N, D +end + +""" + _phi_almohy_generic(A, p) -> Vector + +Container-preserving evaluation of `phi_0(A), ..., phi_p(A)` (`p >= 1`): the +returned matrices have the same type as `A` (e.g. an `SMatrix` input yields +`SMatrix` results). Used for immutable/static matrices, where the in-place +workspace path does not apply. +""" +function _phi_almohy_generic(A::AbstractMatrix, p::Integer) + m, s, tau = _select_parameters_phi(A, p) + As = A ./ exp2(s) + m1 = m + 1 + Ncoef = Vector{Float64}(undef, m1) + Dcoef = Vector{Float64}(undef, m1) + Amat = Matrix{Float64}(undef, m1, m1) + _phi_pade_coef!(Ncoef, Dcoef, Amat, m, p) + Nm, Dm = _paterson_stockmeyer(As, Ncoef, Dcoef, tau, m) + + Rm = Vector{typeof(As)}(undef, p + 1) + # No-throw on a numerically singular denominator (pathological input): NaN + # results let adaptive integrators reject the step instead of aborting. + Rm[p + 1] = try + Dm \ Nm + catch e + e isa SingularException || rethrow() + zero(As) .+ eltype(As)(NaN) + end + Id = one(As) + for k in p:-1:1 + Rm[k] = As * Rm[k + 1] + Id / _factf(k - 1) + end + for _ in 1:s + for j in p:-1:1 + M = Rm[1] * Rm[j + 1] + for k in 1:j + M += Rm[k + 1] / _factf(j - k) + end + Rm[j + 1] = M / exp2(j) + end + Rm[1] = Rm[1] * Rm[1] + end + return Rm +end diff --git a/test/basictests.jl b/test/basictests.jl index 539ec2cc..faf721c1 100644 --- a/test/basictests.jl +++ b/test/basictests.jl @@ -256,6 +256,151 @@ end end end +@testset "Phi (Al-Mohy--Liu vs reference)" begin + # Independent reference: phi_0..phi_p are the first block row of exp(W) for + # the augmented block matrix W = [A E; 0 J] (Al-Mohy--Liu, Theorem 2.1), with + # E = [I 0 ... 0] and J the nilpotent shift. This is unrelated to the + # scaling-and-recovering algorithm under test. + function phi_block_reference(A, p) + # `local`: without it these assignments capture and clobber the + # enclosing testset's variables of the same name (closures in a local + # scope assign to existing outer locals). + local n = size(A, 1) + local T = eltype(A) + W = zeros(T, n * (p + 1), n * (p + 1)) + W[1:n, 1:n] .= A + for j in 0:(p - 1) + rb = j * n + W[(rb + 1):(rb + n), (rb + n + 1):(rb + 2n)] .= Matrix{T}(I, n, n) + end + eW = exp(W) + return [eW[1:n, (j * n + 1):((j + 1) * n)] for j in 0:p] + end + + # Force the legacy basis-vector path (Sidje/`phiv_dense`) for cross-checking. + function phi_legacy(A, p) + local n = size(A, 1) + local T = eltype(A) + local caches = ( + Vector{T}(undef, n), Matrix{T}(undef, n, p + 1), + Matrix{T}(undef, n + p, n + p), + ) + local out = [Matrix{T}(undef, n, n) for _ in 1:(p + 1)] + return phi!(out, A, p; caches = caches) + end + + Random.seed!(20250701) + testmats = Dict( + "small 2x2" => [0.1 0.2; 0.3 0.4], + "random 8x8" => randn(8, 8) ./ 4, + "nonnormal 10x10" => 3 * (triu(randn(10, 10), 1) + Diagonal(randn(10))), + "large-norm 6x6" => 6 * randn(6, 6), + "complex 5x5" => randn(ComplexF64, 5, 5), + "hessenberg 12x12" => Matrix(hessenberg(randn(12, 12)).H), + "zero 4x4" => zeros(4, 4), + ) + for (name, A) in testmats, p in (1, 2, 3, 5) + ref = phi_block_reference(A, p) + new = phi(A, p) # default path == Al-Mohy--Liu + leg = phi_legacy(A, p) + for j in 1:(p + 1) + @test new[j] ≈ ref[j] rtol = 1.0e-8 atol = 1.0e-12 + @test new[j] ≈ leg[j] rtol = 1.0e-7 atol = 1.0e-10 + end + end + + # phi_0 must equal the matrix exponential. + A = randn(9, 9) ./ 3 + @test phi(A, 4)[1] ≈ exp(A) + + # Large p exercises the recurrence form of the backward-error coefficient + # (naive factorial ratios overflow long before their quotient does). + A = randn(3, 3) + P30 = phi(A, 30) + ref30 = phi_block_reference(A, 30) + for j in 1:31 + @test P30[j] ≈ ref30[j] rtol = 1.0e-8 atol = 1.0e-13 + end + + # Preallocated in-place entry point returns phi_j in out[j+1]. + n = 7 + A = randn(n, n) ./ 3 + out = [Matrix{Float64}(undef, n, n) for _ in 1:4] + phi!(out, A, 3) + @test out ≈ phi_block_reference(A, 3) + + # Reusable workspace: same result, and reuse is allocation-free (the linear + # solve runs through the workspace's LinearSolve cache, which owns the LU + # pivot storage). + cache = PhiPadeCache(A, 3) + out2 = [Matrix{Float64}(undef, n, n) for _ in 1:4] + phi!(out2, A, 3; caches = cache) + @test out2 ≈ out + run_phi!(o, M, c) = phi!(o, M, 3; caches = c) + run_phi!(out2, A, cache) # warm up / compile + @test (@allocated run_phi!(out2, A, cache)) == 0 + + # Complex-matrix workspace path, also allocation-free on reuse. + Ac = randn(ComplexF64, 6, 6) ./ 3 + cachec = PhiPadeCache(Ac, 2) + outc = [Matrix{ComplexF64}(undef, 6, 6) for _ in 1:3] + phi!(outc, Ac, 2; caches = cachec) + @test outc ≈ phi_block_reference(Ac, 2) + @test cachec.info[] == 0 + run_phic!(o, M, c) = phi!(o, M, 2; caches = c) + run_phic!(outc, Ac, cachec) + @test (@allocated run_phic!(outc, Ac, cachec)) == 0 + + # Numerical failure must not throw: NaN input yields NaN-filled outputs and + # a nonzero return code in cache.info[], so adaptive integrators can reject + # the step instead of aborting. + Anan = fill(NaN, 5, 5) + Pnan = phi(Anan, 2) + @test all(P -> any(!isfinite, P), Pnan) + cachenan = PhiPadeCache(Anan, 2) + outnan = [Matrix{Float64}(undef, 5, 5) for _ in 1:3] + phi!(outnan, Anan, 2; caches = cachenan) + @test cachenan.info[] != 0 + @test all(P -> all(isnan, P), outnan) + # ... and a subsequent good call resets the code. + phi!(outnan, randn(5, 5), 2; caches = cachenan) + @test cachenan.info[] == 0 + + # Wrongly sized workspaces are programmer errors, not numerical failures, + # and do throw. + A3 = randn(3, 3) + out3 = [Matrix{Float64}(undef, 3, 3) for _ in 1:4] + @test_throws DimensionMismatch phi!(out3, A3, 3; caches = cache) + cache_p1 = PhiPadeCache(A, 1) + out11 = [Matrix{Float64}(undef, n, n) for _ in 1:11] + @test_throws ArgumentError phi!(out11, A, 10; caches = cache_p1) +end + +@testset "Phi static arrays (container-preserving)" begin + Random.seed!(4321) + function phi_block_reference(A, p) + local n = size(A, 1) + local T = eltype(A) + W = zeros(T, n * (p + 1), n * (p + 1)) + W[1:n, 1:n] .= A + for j in 0:(p - 1) + rb = j * n + W[(rb + 1):(rb + n), (rb + n + 1):(rb + 2n)] .= Matrix{T}(I, n, n) + end + eW = exp(W) + return [eW[1:n, (j * n + 1):((j + 1) * n)] for j in 0:p] + end + for N in (3, 5), p in (1, 2, 3) + A = SMatrix{N, N}(randn(N, N) ./ 3) + Rm = ExponentialUtilities._phi_almohy_generic(A, p) + @test all(r -> r isa SMatrix, Rm) # static in, static out + ref = phi_block_reference(Matrix(A), p) + for j in 1:(p + 1) + @test Matrix(Rm[j]) ≈ ref[j] rtol = 1.0e-8 atol = 1.0e-12 + end + end +end + @testset "Static Arrays" begin Random.seed!(0) for N in (3, 4, 6, 8), t in (0.1, 1.0, 10.0)