From 4b0551edb827445e893e2e40adb784946b9d615d Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Thu, 2 Jul 2026 13:30:17 -0400 Subject: [PATCH 1/8] Add O(p m^3) Al-Mohy--Liu scaling-and-recovering algorithm for phi functions `phi(A, k)` / `phi!` previously computed phi_0(A), ..., phi_k(A) by calling `phiv_dense!` on each of the m basis vectors, each exponentiating an (m+k) x (m+k) block matrix -- overall O(m (m+k)^3). This implements Algorithm 5.1 of Al-Mohy & Liu, "A scaling and recovering algorithm for the matrix phi-functions" (arXiv:2506.01193, 2025), which computes all phi functions simultaneously in O(k m^3): scale A by 2^-s, evaluate the [m/m] diagonal Pade approximant to phi_p via Paterson--Stockmeyer, recover the lower-index approximants with the recurrence R^{(j)} = z R^{(j+1)} + 1/j!, and undo the scaling with the double-argument formula. Backward-error-optimal Pade degree m and scaling s are selected from the paper's theta table (Table 3.1) and the alpha_r / t cost analysis. The new path is the default for Float64/ComplexF64 dense inputs (its Pade tables are tuned for double precision); other element types and callers passing the legacy `caches` bundle keep the basis-vector path. Diagonal inputs are unchanged. Validated against a high-precision block-exponential reference and the legacy algorithm: worst-case relative error ~1e-14 across normal, non-normal, large-norm, complex, Hessenberg and zero matrices for k = 1..10, matching or beating the old accuracy. Measured speedups over the basis-vector path: 18x at n=50, 51x at n=100, 72x at n=200 (k=3). Closes #235. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh --- src/ExponentialUtilities.jl | 1 + src/phi.jl | 17 ++- src/phi_almohy.jl | 281 ++++++++++++++++++++++++++++++++++++ test/basictests.jl | 62 ++++++++ 4 files changed, 358 insertions(+), 3 deletions(-) create mode 100644 src/phi_almohy.jl diff --git a/src/ExponentialUtilities.jl b/src/ExponentialUtilities.jl index 9e89db3..3492d41 100644 --- a/src/ExponentialUtilities.jl +++ b/src/ExponentialUtilities.jl @@ -29,6 +29,7 @@ include("exp_noalloc.jl") include("exp_generic.jl") include("exp_sparse.jl") include("phi.jl") +include("phi_almohy.jl") include("arnoldi.jl") include("krylov_phiv.jl") include("krylov_phiv_adaptive.jl") diff --git a/src/phi.jl b/src/phi.jl index 8faa9eb..70bfac1 100644 --- a/src/phi.jl +++ b/src/phi.jl @@ -103,9 +103,12 @@ The phi functions are defined as \\varphi_0(z) = \\exp(z),\\quad \\varphi_{k+1}(z) = \\frac{\\varphi_k(z) - \\varphi_k(0)}{z} ``` -Calls `phiv_dense` on each of the basis vectors to obtain the answer. If A -is `Diagonal`, instead calls the scalar `phi` on each diagonal element and the -return values are also `Diagonal`s +For `Float64`/`ComplexF64` matrices this uses the scaling-and-recovering +algorithm of Al-Mohy and Liu (arXiv:2506.01193), which computes all of +`phi_0(A), ..., phi_k(A)` simultaneously in `O(k m^3)` operations. Other element +types fall back to calling `phiv_dense` on each basis vector (`O(m (m+k)^3)`). If +`A` is `Diagonal`, the scalar `phi` is instead applied to each diagonal element +and the return values are also `Diagonal`s. """ function phi(A::AbstractMatrix{T}, k; kwargs...) where {T <: Number} m = size(A, 1) @@ -127,6 +130,14 @@ function phi!( ) where {T <: Number} m = size(A, 1) @assert length(out) == k + 1&&all(P -> size(P) == (m, m), out) "Dimension mismatch" + # The scaling-and-recovering algorithm of Al-Mohy and Liu (arXiv:2506.01193) + # computes phi_0..phi_k simultaneously in O(k m^3), versus O(m (m+k)^3) for + # the basis-vector approach below. Its Pade tables are tuned for double + # precision, so it is used only for Float64/ComplexF64 and only when the + # caller has not supplied the legacy `caches` bundle. + if k >= 1 && isnothing(caches) && T <: Union{Float64, ComplexF64} + return _phi_almohy!(out, A, k) + end if isnothing(caches) e = Vector{T}(undef, m) W = Matrix{T}(undef, m, k + 1) diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl new file mode 100644 index 0000000..d2881fe --- /dev/null +++ b/src/phi_almohy.jl @@ -0,0 +1,281 @@ +# Scaling-and-recovering algorithm for the matrix phi-functions, computing +# phi_0(A), ..., phi_p(A) simultaneously in O(p*n^3) work. +# +# This is a port of Algorithm 5.1 of +# +# Awad H. Al-Mohy and Xiaobo Liu, "A scaling and recovering algorithm for the +# matrix phi-functions", arXiv:2506.01193 (2025), +# +# and follows the authors' reference MATLAB implementation `phi_funm`. +# +# The idea: scale A down by 2^s, evaluate the [m/m] diagonal Pade approximant to +# phi_p of the scaled matrix, recover the lower-index approximants via the +# recurrence R^{(j)}_m(z) = z R^{(j+1)}_m(z) + 1/j!, then undo the scaling with +# the double-argument formula +# +# phi_j(2A) = 2^{-j} ( phi_0(A) phi_j(A) + sum_{k=1}^{j} phi_k(A)/(j-k)! ). + +# theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the +# scaled matrix for which the [m/m] Pade approximant is backward stable to +# double-precision unit roundoff). Rows are m = 1:20, columns p = 1:10. +const _PHI_THETA_MP = [ + 1.999463452408407e-5 3.7631213142601604e-5 7.366006416045163e-5 0.00014973317297025854 0.0003152443333771182 0.0006855209983764435 0.0015357294906993542 0.0035357368946407606 0.008345789028062234 0.02013882808928226; + 0.0038062018282832713 0.006090206286125726 0.009869682746779615 0.016211831146383013 0.026984240563843326 0.0454757968381803 0.07749273259855331 0.13324895779231027 0.2304625604362521 0.3988991104549146; + 0.03971636005661334 0.05806968886880692 0.08534220076759817 0.12612151517169362 0.18736524835661617 0.27955116495245524 0.41822735418681667 0.6258702800351991 0.9335970443986562 1.1616793320890249; + 0.15442675548312682 0.21278117034577634 0.29371996708854947 0.40617647304246707 0.5623843002320996 0.7787883505754265 1.0464245027100287 1.2572921799132364 1.480350861451984 1.713871003185325; + 0.37980898016147974 0.5014624976587007 0.6621033824456818 0.8739858777744354 1.110828442901451 1.3375805277521873 1.5770955744229305 1.8273848586096524 2.08673114576671 2.3536689190936406; + 0.726177195703321 0.9281910159646274 1.1591052927815408 1.4012982671152012 1.6570386251184455 1.924026602416338 2.20029216725632 2.4841706972729956 2.7742685942850143 3.069426294589578; + 1.1898666361923196 1.4469917284172122 1.7187282676344413 2.0024009817133357 2.295729523138923 2.5968019781751672 2.9040336511997444 3.216121483847106 3.531999855119243 3.8508005672569903; + 1.7605812331512907 2.060907194742016 2.371480030315257 2.690078917436836 3.014877598333601 3.3443898815994957 3.677415494646134 4.01299056190377 4.350344448873932 4.688863279363638; + 2.425818958547233 2.7623053687512256 3.105174730511504 3.4527057639100476 3.8035234182863555 4.156538184296475 4.510892686686127 4.865916398054041 5.2210881567935115 5.57600564043454; + 3.173113456793749 3.5393251225873685 3.9086764034913664 4.279911937822335 4.652058062541427 5.024366624424222 5.396268333440519 5.767334816025469 6.1372481669904 6.505776744744432; + 3.991025201329815 4.3813599805888455 4.772204432206606 5.162703837030561 5.552220615863406 5.940286915538899 6.326566605567779 6.710825187972159 7.092906173082131 7.4727126397309815; + 4.869485489784578 5.279199870248916 5.687344501175957 6.09339280009268 6.496977936247796 6.8978550856394785 7.295871912058635 7.690945666690415 8.083045525424248 8.472179024890941; + 5.799827904547885 6.224971321619904 6.64693480650714 7.065451733211646 7.48036740469753 7.891610269499859 8.29916985381997 8.703079946679036 9.103405852177376 9.500234768451538; + 6.774681996458441 7.211997240187581 7.644910128682907 8.073354767800579 8.497338949398504 8.916922998846443 9.332203860678709 9.74330319935391 10.150358556572542 10.5535168239286; + 7.787817051572435 8.234634975085694 8.676142196766529 9.112424389503664 9.543611219824884 9.969861373150518 10.391351525913848 10.808268293369911 11.220802409044003 11.629144570605774; + 8.833978214833847 9.28811956766922 9.736292792866143 10.178695474892383 10.615546677548995 11.04707680323746 11.473520338548882 11.895110742359616 12.312076916038997 12.724640835984562; + 9.908733823133844 10.368423129623023 10.821685340709736 11.26879859255289 11.710045790427412 12.145708155762449 12.576060822320501 13.001369927621694 13.421890788016032 13.837866852904227; + 11.008341035221061 11.472133539821815 11.929195701618417 12.379861767254168 12.824458619109002 13.263302077220441 13.696694607678852 14.124924034990169 14.548262963142152 14.966968689718191; + 12.129631147167878 12.596352058386916 13.056160721729096 13.509428665638305 13.956511465304834 14.397747062985934 14.83345501195776 15.263936358537684 15.689473955602649 16.110333058994453; + 13.269913405576162 13.738607964511967 14.20030229038679 14.655390835644507 15.104246216977073 15.547218984875734 15.984637960610213 16.41681094336813 16.84402564730148 17.266550769620974; +] + +# theta_{m,p} with the p>7 rule of the paper: for p>7 the p=7 column is used, and +# indices are guarded so m outside 1:20 does not error. +@inline function _phi_theta(m::Integer, p::Integer) + return _PHI_THETA_MP[m, p > 7 ? 7 : p] +end + +# k! as a Float64 (k small; avoids Int overflow for the recovery coefficients). +@inline function _factf(k::Integer) + r = 1.0 + for i in 2:k + r *= i + end + return r +end + +# Exact 1-norm of B^k for an entrywise-nonnegative real B, via ‖B^k‖_1 = +# ‖(B^T)^k 𝟙‖_∞ (k matrix-vector products, no explicit power). +function _normpow_nonneg(B::AbstractMatrix{<:Real}, k::Integer) + n = size(B, 1) + Bt = transpose(B) + v = ones(eltype(B), n) + tmp = similar(v) + for _ in 1:k + mul!(tmp, Bt, v) + v, tmp = tmp, v + end + return maximum(v) +end + +# Scaling parameter `t` derived from the first term of the backward-error series +# (Eq. (3.12)); mirrors the `ell` subfunction of the reference implementation. +function _phi_ell(A::AbstractMatrix, m::Integer, p::Integer, phat::Integer) + normT = opnorm(A, 1) + normT == 0 && return 0 + t0 = normT > 1 ? log2(normT) : 0.0 + scalefac = exp2(t0) + normTs = normT / scalefac + delta = (p - 1) * (p - phat) / p + 1 + coeff = (_factf(m + p) / _factf(2m + p)) * (_factf(m) / _factf(2m + p + 1)) + K = 2m + p + 1 + c = (coeff / normTs^delta)^(1 / K) + scaledT = c .* abs.(A ./ scalefac) + alpha = _normpow_nonneg(scaledT, K) + t = log2(2alpha / eps(Float64)) / (K - delta) + t0 + return max(ceil(Int, t), 0) +end + +# Select the Pade degree m, scaling parameter s, and Paterson-Stockmeyer block +# size tau that minimize the equivalent number of matrix products (Section 4). +function _select_parameters_phi(A::AbstractMatrix, p::Integer) + m_max = 12 + i_max = ceil(Int, sqrt(8 * (m_max + 1)) - 3) - 1 + m_max = (i_max + 3)^2 ÷ 8 + phat = _phi_theta(m_max, p) < 1 ? 0 : p + r_max = floor(Int, (1 + sqrt(1 + 4 * (2m_max + phat + 1))) / 2) + + # eta[j] = ‖A^{j+1}‖_1^{1/(j+1)} for j = 1:r_max, forming the powers exactly. + # (The powers are low, j+1 ≤ r_max+1 ≲ 8; the cost is negligible next to the + # rest of the algorithm and never underestimates, keeping s conservative.) + eta = zeros(Float64, r_max) + P = A * A + eta[1] = opnorm(P, 1)^(1 / 2) + for j in 2:r_max + P = P * A + eta[j] = opnorm(P, 1)^(1 / (j + 1)) + end + alpha = [max(eta[j], eta[j + 1]) for j in 1:(r_max - 1)] + + Cost = zeros(Float64, i_max + 1, r_max - 1) + for i in 0:i_max + m_i = (i + 3)^2 ÷ 8 + θ = _phi_theta(m_i, p) + phat_i = θ < 1 ? 0 : p + t = _phi_ell(A, m_i, p, phat_i) + for r in 2:r_max + if 2m_i + phat_i + 1 >= r * (r - 1) + a = alpha[r - 1] + s0 = (a > 0 && isfinite(a)) ? max(ceil(Int, log2(a / θ)), t) : t + Cost[i + 1, r - 1] = i + p + s0 * (p + 1) + end + end + end + + minval = Inf + for v in Cost + if v > 0 && v < minval + minval = v + end + end + # Match the reference's column-major "last" tie-break for reproducibility. + idx = 0 + for (j, v) in enumerate(Cost) + v == minval && (idx = j) + end + i_star = (idx - 1) % (i_max + 1) + m = (i_star + 3)^2 ÷ 8 + s = round(Int, (minval - i_star - p) / (p + 1)) + + tau = floor(Int, sqrt(2m)) + if tau - 1 + 2 * (m ÷ tau) - 2 * (m % tau == 0) != i_star + tau = ceil(Int, sqrt(2m)) + end + return m, s, tau +end + +# Renormalized [m/m] Pade coefficients (numerator and denominator, low order +# first) of the approximant to phi_p. Follows the recurrences of Berland, +# Skaflestad, and Wright used in the reference `pade_coef`. +function _phi_pade_coef(m::Integer, p::Integer) + n1 = prod(Float64.((m + 1):(2m + 1))) # (2m+1)!/m! + d1 = n1 + Ncoef = Vector{Float64}(undef, m + 1) + Dcoef = Vector{Float64}(undef, m + 1) + Amat = Matrix{Float64}(undef, m + 1, m + 1) + for k in 1:p + if k == p + # First column: Amat[r,1] = n1 * prod_{l=1}^{r-1} 1/(k+l). + cp = 1.0 + Amat[1, 1] = n1 + for r in 2:(m + 1) + cp /= (k + (r - 1)) + Amat[r, 1] = n1 * cp + end + for c in 2:(m + 1), r in 2:(m + 1) + I = r - 1 + J = c - 1 + Amat[r, c] = -(m + 1 - J) * (k + 1 + I - J) / ((2m + k + 1 - J) * J) + end + for r in 1:(m + 1) + s = 0.0 + pr = 1.0 + for c in 1:r + pr *= Amat[r, c] + s += pr + end + Ncoef[r] = s + end + Dcoef[1] = d1 + cpd = 1.0 + for r in 2:(m + 1) + i = r - 1 + cpd *= -(m + 1 - i) / (i * (2m + k + 1 - i)) + Dcoef[r] = d1 * cpd + end + end + n1 = n1 * (2m + k + 1) / (k + 1) + d1 = d1 * (2m + k + 1) + end + return Ncoef, Dcoef +end + +# Simultaneously evaluate the numerator N(A) and denominator D(A) of the Pade +# approximant via the Paterson-Stockmeyer scheme with block size tau. +function _paterson_stockmeyer(A::AbstractMatrix{T}, Nc, Dc, tau::Integer) where {T} + m = length(Nc) - 1 + n = size(A, 1) + Apow = Vector{Matrix{T}}(undef, tau + 1) + Apow[1] = Matrix{T}(I, n, n) + Apow[2] = Matrix(A) + for i in 3:(tau + 1) + Apow[i] = Apow[i - 1] * A + end + Atau = copy(Apow[tau + 1]) + N = zeros(T, n, n) + D = zeros(T, n, n) + nu = m ÷ tau + for i in 0:nu + start = i * tau + 1 + stop = min((i + 1) * tau, m + 1) + blkN = zeros(T, n, n) + blkD = zeros(T, n, n) + for l in 1:(stop - start + 1) + blkN .+= Nc[start + l - 1] .* Apow[l] + blkD .+= Dc[start + l - 1] .* Apow[l] + end + if i == 0 + N .+= blkN + D .+= blkD + else + N .+= blkN * Atau + D .+= blkD * Atau + Atau = Atau * Apow[tau + 1] + end + end + return N, D +end + +""" + _phi_almohy!(out, A, p) -> out + +Simultaneously compute `phi_0(A), phi_1(A), ..., phi_p(A)` for a dense matrix `A` +(`p >= 1`) using the scaling-and-recovering algorithm of Al-Mohy and Liu +(arXiv:2506.01193). `out` must be a length-`p+1` vector of `size(A)` matrices; on +return `out[j+1] == phi_j(A)`. + +The cost is `O(p n^3)`, in contrast to the `O(n (n+p)^3)` of the basis-vector +approach in [`phi!`](@ref). +""" +function _phi_almohy!( + out::AbstractVector{<:AbstractMatrix}, A::AbstractMatrix{T}, + p::Integer + ) where {T} + n = size(A, 1) + m, s, tau = _select_parameters_phi(A, p) + As = A ./ exp2(s) + Nc, Dc = _phi_pade_coef(m, p) + Nm, Dm = _paterson_stockmeyer(As, Nc, Dc, tau) + + Rm = Vector{Matrix{T}}(undef, p + 1) + Rm[p + 1] = Dm \ Nm + # Recurrence (2.9): R^{(j)} = As R^{(j+1)} + I/j!, j = p-1 : -1 : 0. + for k in p:-1:1 + R = As * Rm[k + 1] + f = 1 / _factf(k - 1) + @inbounds for d in 1:n + R[d, d] += f + end + Rm[k] = R + end + + # Undo the scaling with the double-argument formula (2.10), s times. + for _ in 1:s + for j in p:-1:1 + M = Rm[1] * Rm[j + 1] + for k in 1:j + M .+= (1 / _factf(j - k)) .* Rm[k + 1] + end + M ./= exp2(j) + Rm[j + 1] = M + end + Rm[1] = Rm[1] * Rm[1] + end + + for j in 0:p + copyto!(out[j + 1], Rm[j + 1]) + end + return out +end diff --git a/test/basictests.jl b/test/basictests.jl index 539ec2c..924f2d6 100644 --- a/test/basictests.jl +++ b/test/basictests.jl @@ -256,6 +256,68 @@ end end end +@testset "Phi (Al-Mohy--Liu vs reference)" begin + # Independent reference: phi_0..phi_p are the first block row of exp(W) for + # the augmented block matrix W = [A E; 0 J] (Al-Mohy--Liu, Theorem 2.1), with + # E = [I 0 ... 0] and J the nilpotent shift. This is unrelated to the + # scaling-and-recovering algorithm under test. + function phi_block_reference(A, p) + n = size(A, 1) + T = eltype(A) + W = zeros(T, n * (p + 1), n * (p + 1)) + W[1:n, 1:n] .= A + for j in 0:(p - 1) + rb = j * n + W[(rb + 1):(rb + n), (rb + n + 1):(rb + 2n)] .= Matrix{T}(I, n, n) + end + eW = exp(W) + return [eW[1:n, (j * n + 1):((j + 1) * n)] for j in 0:p] + end + + # Force the legacy basis-vector path (Sidje/`phiv_dense`) for cross-checking. + function phi_legacy(A, p) + n = size(A, 1) + T = eltype(A) + caches = ( + Vector{T}(undef, n), Matrix{T}(undef, n, p + 1), + Matrix{T}(undef, n + p, n + p), + ) + out = [Matrix{T}(undef, n, n) for _ in 1:(p + 1)] + return phi!(out, A, p; caches = caches) + end + + Random.seed!(20250701) + testmats = Dict( + "small 2x2" => [0.1 0.2; 0.3 0.4], + "random 8x8" => randn(8, 8) ./ 4, + "nonnormal 10x10" => 3 * (triu(randn(10, 10), 1) + Diagonal(randn(10))), + "large-norm 6x6" => 6 * randn(6, 6), + "complex 5x5" => randn(ComplexF64, 5, 5), + "hessenberg 12x12" => Matrix(hessenberg(randn(12, 12)).H), + "zero 4x4" => zeros(4, 4), + ) + for (name, A) in testmats, p in (1, 2, 3, 5) + ref = phi_block_reference(A, p) + new = phi(A, p) # default path == Al-Mohy--Liu + leg = phi_legacy(A, p) + for j in 1:(p + 1) + @test new[j] ≈ ref[j] rtol = 1.0e-8 atol = 1.0e-12 + @test new[j] ≈ leg[j] rtol = 1.0e-7 atol = 1.0e-10 + end + end + + # phi_0 must equal the matrix exponential. + A = randn(9, 9) ./ 3 + @test phi(A, 4)[1] ≈ exp(A) + + # Preallocated in-place entry point returns phi_j in out[j+1]. + n = 7 + A = randn(n, n) ./ 3 + out = [Matrix{Float64}(undef, n, n) for _ in 1:4] + phi!(out, A, 3) + @test out ≈ phi_block_reference(A, 3) +end + @testset "Static Arrays" begin Random.seed!(0) for N in (3, 4, 6, 8), t in (0.1, 1.0, 10.0) From f72fa0cc0626bb819be3f5b57c4c045d23285cf9 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 3 Jul 2026 03:49:41 -0400 Subject: [PATCH 2/8] phi: reusable zero-alloc workspace + container-preserving static path Address review feedback on #236: * PhiPadeCache: a reusable workspace holding every scratch buffer. The strided Float64/ComplexF64 path (`_phi_almohy!`) now runs fully in place via mul!/getrf!/getrs! and broadcasting, allocating nothing once a cache exists (verified 0 bytes with @allocated across sizes). Pass it as the `caches` keyword to `phi!` to reuse across calls. (@stevengj: "do in-place, maybe?") * Container-preserving generic path (`_phi_almohy_generic`) for immutable dense matrices: an SMatrix input stays an SMatrix throughout instead of being converted to a Matrix. Routed only for `!ismutable` inputs, so sparse/other mutable non-strided matrices keep the legacy path. (@stevengj: "unfortunate to use Matrix here if you start with SMatrix") * Export and document PhiPadeCache and phi!; bump to 1.31.0 (new public API, SemVer minor). Tests: workspace reuse + zero-allocation assertion, complex-matrix workspace, and SMatrix static-in/static-out accuracy vs the block-exponential reference. Full basictests + Aqua + JET pass locally (the only failures are the pre-existing ForwardDiff-1.x Issue 42 asserts, green under CI's pinned ForwardDiff 0.10.x). Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh --- Project.toml | 2 +- docs/src/matrix_exponentials.md | 2 + src/ExponentialUtilities.jl | 2 +- src/phi.jl | 21 +- src/phi_almohy.jl | 490 +++++++++++++++++++++++--------- test/basictests.jl | 41 +++ 6 files changed, 412 insertions(+), 146 deletions(-) diff --git a/Project.toml b/Project.toml index a5c275c..a10df2c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ExponentialUtilities" uuid = "d4d017d3-3776-5f7e-afef-a10c40355c18" authors = ["Chris Rackauckas ", "José E. Cruz Serrallés "] -version = "1.30.2" +version = "1.31.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" diff --git a/docs/src/matrix_exponentials.md b/docs/src/matrix_exponentials.md index 48aca6b..8a5e56a 100644 --- a/docs/src/matrix_exponentials.md +++ b/docs/src/matrix_exponentials.md @@ -3,6 +3,8 @@ ```@docs exponential! phi +phi! +PhiPadeCache ``` ## Methods diff --git a/src/ExponentialUtilities.jl b/src/ExponentialUtilities.jl index 3492d41..579ab7e 100644 --- a/src/ExponentialUtilities.jl +++ b/src/ExponentialUtilities.jl @@ -42,7 +42,7 @@ include("precompile.jl") export phi, phi!, KrylovSubspace, arnoldi, arnoldi!, lanczos!, ExpvCache, PhivCache, expv, expv!, phiv, phiv!, kiops, expv_timestep, expv_timestep!, phiv_timestep, phiv_timestep!, - StegrCache, get_subspace_cache, exponential! + StegrCache, PhiPadeCache, get_subspace_cache, exponential! export ExpMethodHigham2005, ExpMethodHigham2005Base, ExpMethodGeneric, ExpMethodNative, ExpMethodDiagonalization diff --git a/src/phi.jl b/src/phi.jl index 70bfac1..d936a8f 100644 --- a/src/phi.jl +++ b/src/phi.jl @@ -133,10 +133,23 @@ function phi!( # The scaling-and-recovering algorithm of Al-Mohy and Liu (arXiv:2506.01193) # computes phi_0..phi_k simultaneously in O(k m^3), versus O(m (m+k)^3) for # the basis-vector approach below. Its Pade tables are tuned for double - # precision, so it is used only for Float64/ComplexF64 and only when the - # caller has not supplied the legacy `caches` bundle. - if k >= 1 && isnothing(caches) && T <: Union{Float64, ComplexF64} - return _phi_almohy!(out, A, k) + # precision, so it is used only for Float64/ComplexF64. The legacy path is + # kept for other element types and when a caller supplies the legacy + # `caches` tuple. + if k >= 1 && T <: Union{Float64, ComplexF64} && !(caches isa Tuple) + if A isa StridedMatrix && (isnothing(caches) || caches isa PhiPadeCache) + cache = caches isa PhiPadeCache ? caches : PhiPadeCache(A, k) + return _phi_almohy!(out, A, k, cache) + elseif isnothing(caches) && !ismutable(A) + # Container-preserving path for immutable dense matrices (e.g. + # `SMatrix`); mutable non-strided types (e.g. sparse) fall through to + # the legacy path below. + Rm = _phi_almohy_generic(A, k) + for j in 1:(k + 1) + copyto!(out[j], Rm[j]) + end + return out + end end if isnothing(caches) e = Vector{T}(undef, m) diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl index d2881fe..ae553f9 100644 --- a/src/phi_almohy.jl +++ b/src/phi_almohy.jl @@ -14,6 +14,12 @@ # the double-argument formula # # phi_j(2A) = 2^{-j} ( phi_0(A) phi_j(A) + sum_{k=1}^{j} phi_k(A)/(j-k)! ). +# +# Two implementations share the scalar machinery below: +# * `_phi_almohy!` -- fully in-place, allocation-free, using a reusable +# `PhiPadeCache` workspace (for strided mutable matrices). +# * `_phi_almohy_generic` -- out-of-place and element-container-preserving, so +# a static input (e.g. `SMatrix`) stays static. # theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the # scaled matrix for which the [m/m] Pade approximant is backward stable to @@ -41,6 +47,14 @@ const _PHI_THETA_MP = [ 13.269913405576162 13.738607964511967 14.20030229038679 14.655390835644507 15.104246216977073 15.547218984875734 15.984637960610213 16.41681094336813 16.84402564730148 17.266550769620974; ] +# Largest Pade degree used; bounds cond(D_m) (Section 4). Fixes the block-size +# and power-buffer counts used to size the workspace. +const _PHI_M_MAX = 12 +# max tau+1 over m <= _PHI_M_MAX, tau = ceil(sqrt(2m)); ceil(sqrt(24)) = 5. +const _PHI_NPOW = 6 +# max i (Paterson--Stockmeyer cost index) for _PHI_M_MAX. +const _PHI_IMAX = ceil(Int, sqrt(8 * (_PHI_M_MAX + 1)) - 3) - 1 + # theta_{m,p} with the p>7 rule of the paper: for p>7 the p=7 column is used, and # indices are guarded so m outside 1:20 does not error. @inline function _phi_theta(m::Integer, p::Integer) @@ -56,105 +70,78 @@ end return r end -# Exact 1-norm of B^k for an entrywise-nonnegative real B, via ‖B^k‖_1 = -# ‖(B^T)^k 𝟙‖_∞ (k matrix-vector products, no explicit power). -function _normpow_nonneg(B::AbstractMatrix{<:Real}, k::Integer) - n = size(B, 1) - Bt = transpose(B) - v = ones(eltype(B), n) - tmp = similar(v) - for _ in 1:k - mul!(tmp, Bt, v) - v, tmp = tmp, v - end - return maximum(v) -end - -# Scaling parameter `t` derived from the first term of the backward-error series -# (Eq. (3.12)); mirrors the `ell` subfunction of the reference implementation. -function _phi_ell(A::AbstractMatrix, m::Integer, p::Integer, phat::Integer) - normT = opnorm(A, 1) - normT == 0 && return 0 - t0 = normT > 1 ? log2(normT) : 0.0 - scalefac = exp2(t0) - normTs = normT / scalefac - delta = (p - 1) * (p - phat) / p + 1 - coeff = (_factf(m + p) / _factf(2m + p)) * (_factf(m) / _factf(2m + p + 1)) - K = 2m + p + 1 - c = (coeff / normTs^delta)^(1 / K) - scaledT = c .* abs.(A ./ scalefac) - alpha = _normpow_nonneg(scaledT, K) - t = log2(2alpha / eps(Float64)) / (K - delta) + t0 - return max(ceil(Int, t), 0) +# First-factor coefficient of the backward-error series h_{m,p} (Eq. (3.4)). +@inline function _phi_be_coeff(m::Integer, p::Integer) + return (_factf(m + p) / _factf(2m + p)) * (_factf(m) / _factf(2m + p + 1)) end -# Select the Pade degree m, scaling parameter s, and Paterson-Stockmeyer block -# size tau that minimize the equivalent number of matrix products (Section 4). -function _select_parameters_phi(A::AbstractMatrix, p::Integer) - m_max = 12 - i_max = ceil(Int, sqrt(8 * (m_max + 1)) - 3) - 1 - m_max = (i_max + 3)^2 ÷ 8 - phat = _phi_theta(m_max, p) < 1 ? 0 : p - r_max = floor(Int, (1 + sqrt(1 + 4 * (2m_max + phat + 1))) / 2) - - # eta[j] = ‖A^{j+1}‖_1^{1/(j+1)} for j = 1:r_max, forming the powers exactly. - # (The powers are low, j+1 ≤ r_max+1 ≲ 8; the cost is negligible next to the - # rest of the algorithm and never underestimates, keeping s conservative.) - eta = zeros(Float64, r_max) - P = A * A - eta[1] = opnorm(P, 1)^(1 / 2) - for j in 2:r_max - P = P * A - eta[j] = opnorm(P, 1)^(1 / (j + 1)) - end - alpha = [max(eta[j], eta[j + 1]) for j in 1:(r_max - 1)] - - Cost = zeros(Float64, i_max + 1, r_max - 1) - for i in 0:i_max - m_i = (i + 3)^2 ÷ 8 - θ = _phi_theta(m_i, p) - phat_i = θ < 1 ? 0 : p - t = _phi_ell(A, m_i, p, phat_i) - for r in 2:r_max - if 2m_i + phat_i + 1 >= r * (r - 1) - a = alpha[r - 1] - s0 = (a > 0 && isfinite(a)) ? max(ceil(Int, log2(a / θ)), t) : t - Cost[i + 1, r - 1] = i + p + s0 * (p + 1) - end - end - end +""" + PhiPadeCache(A, p) - minval = Inf - for v in Cost - if v > 0 && v < minval - minval = v - end - end - # Match the reference's column-major "last" tie-break for reproducibility. - idx = 0 - for (j, v) in enumerate(Cost) - v == minval && (idx = j) - end - i_star = (idx - 1) % (i_max + 1) - m = (i_star + 3)^2 ÷ 8 - s = round(Int, (minval - i_star - p) / (p + 1)) +Reusable workspace for the in-place phi-function evaluator [`phi!`](@ref) on +strided `Float64`/`ComplexF64` matrices. Allocate once for a given size and `p`, +then pass it as the `caches` keyword to `phi!` to reuse across calls without +further allocation. +""" +struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}} + As::MT + Id::MT + Apow::Vector{MT} + Atau::MT + Nm::MT + Dm::MT + Dfact::MT + blkN::MT + blkD::MT + tmp::MT + pow1::MT + pow2::MT + absA::RMT + ipiv::Vector{LinearAlgebra.BlasInt} + rvec1::Vector{RT} + rvec2::Vector{RT} + Ncoef::Vector{Float64} + Dcoef::Vector{Float64} + Amat::Matrix{Float64} + eta::Vector{Float64} + alpha::Vector{Float64} + tvals::Vector{Int} + Cost::Matrix{Float64} +end - tau = floor(Int, sqrt(2m)) - if tau - 1 + 2 * (m ÷ tau) - 2 * (m % tau == 0) != i_star - tau = ceil(Int, sqrt(2m)) +function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T} + n = size(A, 1) + RT = real(T) + mk() = Matrix{T}(undef, n, n) + Apow = Vector{Matrix{T}}(undef, _PHI_NPOW) + Apow[1] = Matrix{T}(I, n, n) + for i in 2:_PHI_NPOW + Apow[i] = mk() end - return m, s, tau + return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}}( + mk(), Apow[1], Apow, mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(), + Matrix{RT}(undef, n, n), + Vector{LinearAlgebra.BlasInt}(undef, n), + Vector{RT}(undef, n), Vector{RT}(undef, n), + Vector{Float64}(undef, _PHI_M_MAX + 1), Vector{Float64}(undef, _PHI_M_MAX + 1), + Matrix{Float64}(undef, _PHI_M_MAX + 1, _PHI_M_MAX + 1), + Vector{Float64}(undef, _PHI_IMAX + 2), Vector{Float64}(undef, _PHI_IMAX + 2), + Vector{Int}(undef, _PHI_IMAX + 1), + Matrix{Float64}(undef, _PHI_IMAX + 1, _PHI_IMAX + 2), + ) end # Renormalized [m/m] Pade coefficients (numerator and denominator, low order -# first) of the approximant to phi_p. Follows the recurrences of Berland, -# Skaflestad, and Wright used in the reference `pade_coef`. -function _phi_pade_coef(m::Integer, p::Integer) - n1 = prod(Float64.((m + 1):(2m + 1))) # (2m+1)!/m! +# first) of the approximant to phi_p, written into the preallocated `Ncoef`, +# `Dcoef` (length >= m+1) using scratch `Amat` (size >= (m+1)^2). Follows the +# recurrences of Berland, Skaflestad, and Wright used in the reference +# `pade_coef`. +function _phi_pade_coef!(Ncoef, Dcoef, Amat, m::Integer, p::Integer) + n1 = 1.0 # (2m+1)!/m! + for i in (m + 1):(2m + 1) + n1 *= i + end d1 = n1 - Ncoef = Vector{Float64}(undef, m + 1) - Dcoef = Vector{Float64}(undef, m + 1) - Amat = Matrix{Float64}(undef, m + 1, m + 1) for k in 1:p if k == p # First column: Amat[r,1] = n1 * prod_{l=1}^{r-1} 1/(k+l). @@ -192,90 +179,313 @@ function _phi_pade_coef(m::Integer, p::Integer) return Ncoef, Dcoef end -# Simultaneously evaluate the numerator N(A) and denominator D(A) of the Pade -# approximant via the Paterson-Stockmeyer scheme with block size tau. -function _paterson_stockmeyer(A::AbstractMatrix{T}, Nc, Dc, tau::Integer) where {T} - m = length(Nc) - 1 - n = size(A, 1) - Apow = Vector{Matrix{T}}(undef, tau + 1) - Apow[1] = Matrix{T}(I, n, n) - Apow[2] = Matrix(A) +# Given the alpha_r sequence (first `nalpha` entries) and precomputed scaling +# parameters `tvals[i+1] = t(m_i)`, minimize the equivalent-matrix-product cost +# (Section 4) over the optimal degrees and return the chosen (m, s, tau). `Cost` +# is scratch of size >= (imax+1) x (r_max-1). +function _phi_select_from_alpha(alpha, nalpha::Integer, p::Integer, tvals, Cost) + i_max = _PHI_IMAX + r_max = nalpha + 1 + C = @view Cost[1:(i_max + 1), 1:(r_max - 1)] + fill!(C, 0.0) + for i in 0:i_max + m_i = (i + 3)^2 ÷ 8 + θ = _phi_theta(m_i, p) + phat_i = θ < 1 ? 0 : p + t = tvals[i + 1] + for r in 2:r_max + if 2m_i + phat_i + 1 >= r * (r - 1) + a = alpha[r - 1] + s0 = (a > 0 && isfinite(a)) ? max(ceil(Int, log2(a / θ)), t) : t + C[i + 1, r - 1] = i + p + s0 * (p + 1) + end + end + end + nrows = i_max + 1 + ncols = r_max - 1 + minval = Inf + for v in C + (v > 0 && v < minval) && (minval = v) + end + # Column-major "last" tie-break (matches the reference), giving the row index. + i_star = 0 + for cc in 1:ncols, rr in 1:nrows + C[rr, cc] == minval && (i_star = rr - 1) + end + m = (i_star + 3)^2 ÷ 8 + s = round(Int, (minval - i_star - p) / (p + 1)) + tau = floor(Int, sqrt(2m)) + if tau - 1 + 2 * (m ÷ tau) - 2 * (m % tau == 0) != i_star + tau = ceil(Int, sqrt(2m)) + end + return m, s, tau +end + +## -------------------- in-place (workspace) implementation -------------------- + +# Exact 1-norm of B^k for entrywise-nonnegative real B, ‖B^k‖_1 = ‖(B^T)^k 𝟙‖_∞, +# via k matrix-vector products into the workspace vectors. +function _normpow_nonneg!(cache::PhiPadeCache, B, k::Integer) + v = cache.rvec1 + tmp = cache.rvec2 + fill!(v, 1) + Bt = transpose(B) + for _ in 1:k + mul!(tmp, Bt, v) + v, tmp = tmp, v + end + return maximum(v) +end + +# Scaling parameter `t` from the first term of the backward-error series +# (Eq. (3.12)); `ell` subfunction of the reference, in-place. +function _phi_ell!(cache::PhiPadeCache, A, m::Integer, p::Integer, phat::Integer) + normT = opnorm(A, 1) + normT == 0 && return 0 + t0 = normT > 1 ? log2(normT) : 0.0 + scalefac = exp2(t0) + normTs = normT / scalefac + delta = (p - 1) * (p - phat) / p + 1 + K = 2m + p + 1 + c = (_phi_be_coeff(m, p) / normTs^delta)^(1 / K) + absA = cache.absA + @. absA = c * abs(A) / scalefac + alpha = _normpow_nonneg!(cache, absA, K) + t = log2(2alpha / eps(Float64)) / (K - delta) + t0 + return max(ceil(Int, t), 0) +end + +function _select_parameters_phi!(cache::PhiPadeCache, A, p::Integer) + phat = _phi_theta(_PHI_M_MAX, p) < 1 ? 0 : p + r_max = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + phat + 1))) / 2) + eta = cache.eta + mul!(cache.pow1, A, A) + eta[1] = opnorm(cache.pow1, 1)^(1 / 2) + cur, oth = cache.pow1, cache.pow2 + for j in 2:r_max + mul!(oth, cur, A) + eta[j] = opnorm(oth, 1)^(1 / (j + 1)) + cur, oth = oth, cur + end + alpha = cache.alpha + for j in 1:(r_max - 1) + alpha[j] = max(eta[j], eta[j + 1]) + end + for i in 0:_PHI_IMAX + m_i = (i + 3)^2 ÷ 8 + phat_i = _phi_theta(m_i, p) < 1 ? 0 : p + cache.tvals[i + 1] = _phi_ell!(cache, A, m_i, p, phat_i) + end + return _phi_select_from_alpha(alpha, r_max - 1, p, cache.tvals, cache.Cost) +end + +# Evaluate N(As) and D(As) into cache.Nm, cache.Dm via Paterson--Stockmeyer. +function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer) + Nc, Dc, Apow = cache.Ncoef, cache.Dcoef, cache.Apow + Atau, Nm, Dm = cache.Atau, cache.Nm, cache.Dm + blkN, blkD, tmp = cache.blkN, cache.blkD, cache.tmp + copyto!(Apow[2], As) # Apow[1] === Id; Apow[i] = As^{i-1} for i in 3:(tau + 1) - Apow[i] = Apow[i - 1] * A + mul!(Apow[i], Apow[i - 1], As) end - Atau = copy(Apow[tau + 1]) - N = zeros(T, n, n) - D = zeros(T, n, n) + copyto!(Atau, Apow[tau + 1]) + fill!(Nm, 0) + fill!(Dm, 0) nu = m ÷ tau for i in 0:nu start = i * tau + 1 stop = min((i + 1) * tau, m + 1) - blkN = zeros(T, n, n) - blkD = zeros(T, n, n) + fill!(blkN, 0) + fill!(blkD, 0) for l in 1:(stop - start + 1) - blkN .+= Nc[start + l - 1] .* Apow[l] - blkD .+= Dc[start + l - 1] .* Apow[l] + @. blkN += Nc[start + l - 1] * Apow[l] + @. blkD += Dc[start + l - 1] * Apow[l] end if i == 0 - N .+= blkN - D .+= blkD + Nm .+= blkN + Dm .+= blkD else - N .+= blkN * Atau - D .+= blkD * Atau - Atau = Atau * Apow[tau + 1] + mul!(Nm, blkN, Atau, true, true) + mul!(Dm, blkD, Atau, true, true) + mul!(tmp, Atau, Apow[tau + 1]) + copyto!(Atau, tmp) end end - return N, D + return Nm, Dm end -""" - _phi_almohy!(out, A, p) -> out +# Solve Dfact * X = B in place (B overwritten by X), zero-allocation LU. +function _phi_solve!(Dfact, ipiv, B) + LinearAlgebra.LAPACK.getrf!(Dfact, ipiv) + LinearAlgebra.LAPACK.getrs!('N', Dfact, ipiv, B) + return B +end -Simultaneously compute `phi_0(A), phi_1(A), ..., phi_p(A)` for a dense matrix `A` -(`p >= 1`) using the scaling-and-recovering algorithm of Al-Mohy and Liu -(arXiv:2506.01193). `out` must be a length-`p+1` vector of `size(A)` matrices; on -return `out[j+1] == phi_j(A)`. +""" + _phi_almohy!(out, A, p, cache) -> out -The cost is `O(p n^3)`, in contrast to the `O(n (n+p)^3)` of the basis-vector -approach in [`phi!`](@ref). +Allocation-free evaluation of `phi_0(A), ..., phi_p(A)` for a strided +`Float64`/`ComplexF64` matrix `A` (`p >= 1`), writing `phi_j(A)` into `out[j+1]` +and using the reusable [`PhiPadeCache`](@ref) `cache` for all scratch storage. +The `out` matrices double as the recovery buffers. """ function _phi_almohy!( - out::AbstractVector{<:AbstractMatrix}, A::AbstractMatrix{T}, - p::Integer - ) where {T} + out::AbstractVector{<:AbstractMatrix}, A::AbstractMatrix, p::Integer, + cache::PhiPadeCache + ) n = size(A, 1) - m, s, tau = _select_parameters_phi(A, p) - As = A ./ exp2(s) - Nc, Dc = _phi_pade_coef(m, p) - Nm, Dm = _paterson_stockmeyer(As, Nc, Dc, tau) + m, s, tau = _select_parameters_phi!(cache, A, p) + As = cache.As + @. As = A / exp2(s) + _phi_pade_coef!(cache.Ncoef, cache.Dcoef, cache.Amat, m, p) + Nm, Dm = _paterson_stockmeyer!(cache, As, m, tau) + + copyto!(cache.Dfact, Dm) + copyto!(out[p + 1], Nm) + _phi_solve!(cache.Dfact, cache.ipiv, out[p + 1]) - Rm = Vector{Matrix{T}}(undef, p + 1) - Rm[p + 1] = Dm \ Nm # Recurrence (2.9): R^{(j)} = As R^{(j+1)} + I/j!, j = p-1 : -1 : 0. for k in p:-1:1 - R = As * Rm[k + 1] + mul!(out[k], As, out[k + 1]) f = 1 / _factf(k - 1) @inbounds for d in 1:n - R[d, d] += f + out[k][d, d] += f end - Rm[k] = R end # Undo the scaling with the double-argument formula (2.10), s times. + tmp = cache.tmp for _ in 1:s for j in p:-1:1 - M = Rm[1] * Rm[j + 1] + mul!(tmp, out[1], out[j + 1]) for k in 1:j - M .+= (1 / _factf(j - k)) .* Rm[k + 1] + @. tmp += (1 / _factf(j - k)) * out[k + 1] end - M ./= exp2(j) - Rm[j + 1] = M + tmp ./= exp2(j) + copyto!(out[j + 1], tmp) end - Rm[1] = Rm[1] * Rm[1] + mul!(tmp, out[1], out[1]) + copyto!(out[1], tmp) end + return out +end + +## ----------------- generic (container-preserving) implementation ------------- - for j in 0:p - copyto!(out[j + 1], Rm[j + 1]) +function _normpow_nonneg(B::AbstractMatrix{<:Real}, k::Integer) + n = size(B, 1) + Bt = transpose(B) + v = ones(eltype(B), n) + tmp = similar(v) + for _ in 1:k + mul!(tmp, Bt, v) + v, tmp = tmp, v end - return out + return maximum(v) +end + +function _phi_ell(A::AbstractMatrix, m::Integer, p::Integer, phat::Integer) + normT = opnorm(A, 1) + normT == 0 && return 0 + t0 = normT > 1 ? log2(normT) : 0.0 + scalefac = exp2(t0) + normTs = normT / scalefac + delta = (p - 1) * (p - phat) / p + 1 + K = 2m + p + 1 + c = (_phi_be_coeff(m, p) / normTs^delta)^(1 / K) + scaledT = c .* abs.(A ./ scalefac) + alpha = _normpow_nonneg(scaledT, K) + t = log2(2alpha / eps(Float64)) / (K - delta) + t0 + return max(ceil(Int, t), 0) +end + +function _select_parameters_phi(A::AbstractMatrix, p::Integer) + phat = _phi_theta(_PHI_M_MAX, p) < 1 ? 0 : p + r_max = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + phat + 1))) / 2) + eta = Vector{Float64}(undef, r_max) + P = A * A + eta[1] = opnorm(P, 1)^(1 / 2) + for j in 2:r_max + P = P * A + eta[j] = opnorm(P, 1)^(1 / (j + 1)) + end + alpha = [max(eta[j], eta[j + 1]) for j in 1:(r_max - 1)] + tvals = Vector{Int}(undef, _PHI_IMAX + 1) + for i in 0:_PHI_IMAX + m_i = (i + 3)^2 ÷ 8 + phat_i = _phi_theta(m_i, p) < 1 ? 0 : p + tvals[i + 1] = _phi_ell(A, m_i, p, phat_i) + end + Cost = zeros(Float64, _PHI_IMAX + 1, r_max - 1) + return _phi_select_from_alpha(alpha, r_max - 1, p, tvals, Cost) +end + +function _paterson_stockmeyer(As, Nc, Dc, tau::Integer, m::Integer) + MT = typeof(As) + Apow = Vector{MT}(undef, tau + 1) + Apow[1] = one(As) + Apow[2] = As + for i in 3:(tau + 1) + Apow[i] = Apow[i - 1] * As + end + Atau = Apow[tau + 1] + N = zero(As) + D = zero(As) + nu = m ÷ tau + for i in 0:nu + start = i * tau + 1 + stop = min((i + 1) * tau, m + 1) + blkN = zero(As) + blkD = zero(As) + for l in 1:(stop - start + 1) + blkN += Nc[start + l - 1] * Apow[l] + blkD += Dc[start + l - 1] * Apow[l] + end + if i == 0 + N += blkN + D += blkD + else + N += blkN * Atau + D += blkD * Atau + Atau = Atau * Apow[tau + 1] + end + end + return N, D +end + +""" + _phi_almohy_generic(A, p) -> Vector + +Container-preserving evaluation of `phi_0(A), ..., phi_p(A)` (`p >= 1`): the +returned matrices have the same type as `A` (e.g. an `SMatrix` input yields +`SMatrix` results). Used for immutable/static matrices, where the in-place +workspace path does not apply. +""" +function _phi_almohy_generic(A::AbstractMatrix, p::Integer) + m, s, tau = _select_parameters_phi(A, p) + As = A ./ exp2(s) + m1 = m + 1 + Ncoef = Vector{Float64}(undef, m1) + Dcoef = Vector{Float64}(undef, m1) + Amat = Matrix{Float64}(undef, m1, m1) + _phi_pade_coef!(Ncoef, Dcoef, Amat, m, p) + Nm, Dm = _paterson_stockmeyer(As, Ncoef, Dcoef, tau, m) + + Rm = Vector{typeof(As)}(undef, p + 1) + Rm[p + 1] = Dm \ Nm + Id = one(As) + for k in p:-1:1 + Rm[k] = As * Rm[k + 1] + Id / _factf(k - 1) + end + for _ in 1:s + for j in p:-1:1 + M = Rm[1] * Rm[j + 1] + for k in 1:j + M += Rm[k + 1] / _factf(j - k) + end + Rm[j + 1] = M / exp2(j) + end + Rm[1] = Rm[1] * Rm[1] + end + return Rm end diff --git a/test/basictests.jl b/test/basictests.jl index 924f2d6..e11d041 100644 --- a/test/basictests.jl +++ b/test/basictests.jl @@ -316,6 +316,47 @@ end out = [Matrix{Float64}(undef, n, n) for _ in 1:4] phi!(out, A, 3) @test out ≈ phi_block_reference(A, 3) + + # Reusable workspace: same result, and no allocation once the cache exists. + cache = PhiPadeCache(A, 3) + out2 = [Matrix{Float64}(undef, n, n) for _ in 1:4] + phi!(out2, A, 3; caches = cache) + @test out2 ≈ out + run_phi!(o, M, c) = phi!(o, M, 3; caches = c) + run_phi!(out2, A, cache) # warm up / compile + @test (@allocated run_phi!(out2, A, cache)) == 0 + + # Complex-matrix workspace path. + Ac = randn(ComplexF64, 6, 6) ./ 3 + cachec = PhiPadeCache(Ac, 2) + outc = [Matrix{ComplexF64}(undef, 6, 6) for _ in 1:3] + phi!(outc, Ac, 2; caches = cachec) + @test outc ≈ phi_block_reference(Ac, 2) +end + +@testset "Phi static arrays (container-preserving)" begin + Random.seed!(4321) + function phi_block_reference(A, p) + n = size(A, 1) + T = eltype(A) + W = zeros(T, n * (p + 1), n * (p + 1)) + W[1:n, 1:n] .= A + for j in 0:(p - 1) + rb = j * n + W[(rb + 1):(rb + n), (rb + n + 1):(rb + 2n)] .= Matrix{T}(I, n, n) + end + eW = exp(W) + return [eW[1:n, (j * n + 1):((j + 1) * n)] for j in 0:p] + end + for N in (3, 5), p in (1, 2, 3) + A = SMatrix{N, N}(randn(N, N) ./ 3) + Rm = ExponentialUtilities._phi_almohy_generic(A, p) + @test all(r -> r isa SMatrix, Rm) # static in, static out + ref = phi_block_reference(Matrix(A), p) + for j in 1:(p + 1) + @test Matrix(Rm[j]) ≈ ref[j] rtol = 1.0e-8 atol = 1.0e-12 + end + end end @testset "Static Arrays" begin From 91d942784b4afea153a447ea7b73bb62dfa10b8b Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 3 Jul 2026 04:01:26 -0400 Subject: [PATCH 3/8] phi: allocation-free LU across Julia versions (fixes LTS downgrade) `LAPACK.getrf!(A, ipiv)` with a preallocated pivot vector only exists on newer Julia; on the LTS (1.10) it errored, breaking the workspace solve. ccall `getrf!` directly through libblastrampoline (as `StegrWork` does for `stegr!`) so the factorization stays zero-allocation on all supported versions; the 4-argument `getrs!` used for the solve is available everywhere. Verified on Julia 1.10 and 1.12: correctness ~2.5e-15 vs the BigFloat reference and 0 bytes allocated on cache reuse. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh --- src/phi_almohy.jl | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl index ae553f9..55c1ec2 100644 --- a/src/phi_almohy.jl +++ b/src/phi_almohy.jl @@ -21,6 +21,8 @@ # * `_phi_almohy_generic` -- out-of-place and element-container-preserving, so # a static input (e.g. `SMatrix`) stays static. +import libblastrampoline_jll + # theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the # scaled matrix for which the [m/m] Pade approximant is backward stable to # double-precision unit roundoff). Rows are m = 1:20, columns p = 1:10. @@ -314,9 +316,30 @@ function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer return Nm, Dm end +# LU factorization writing pivots into a preallocated `ipiv`. LAPACK's `getrf!` +# only gained a pivot-accepting method in newer Julia, so ccall it directly (as +# `StegrWork` does for `stegr!`) to keep the factorization allocation-free on the +# LTS. The 4-argument `getrs!` used for the solve is available on all versions. +for (getrf, elty) in ((:dgetrf_, :Float64), (:zgetrf_, :ComplexF64)) + @eval function _phi_getrf!(A::AbstractMatrix{$elty}, ipiv::Vector{LinearAlgebra.BlasInt}) + m, n = size(A) + info = Ref{LinearAlgebra.BlasInt}(0) + ccall( + (LinearAlgebra.BLAS.@blasfunc($getrf), libblastrampoline_jll.libblastrampoline), + Cvoid, + ( + Ref{LinearAlgebra.BlasInt}, Ref{LinearAlgebra.BlasInt}, Ptr{$elty}, + Ref{LinearAlgebra.BlasInt}, Ptr{LinearAlgebra.BlasInt}, Ref{LinearAlgebra.BlasInt}, + ), + m, n, A, max(1, stride(A, 2)), ipiv, info + ) + return A + end +end + # Solve Dfact * X = B in place (B overwritten by X), zero-allocation LU. function _phi_solve!(Dfact, ipiv, B) - LinearAlgebra.LAPACK.getrf!(Dfact, ipiv) + _phi_getrf!(Dfact, ipiv) LinearAlgebra.LAPACK.getrs!('N', Dfact, ipiv, B) return B end From 69bb1fcab1964e13cfba6c3d104a0f2faa692d9a Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 3 Jul 2026 04:29:34 -0400 Subject: [PATCH 4/8] phi: ccall getrs! directly to keep the LU public-API-clean (QA) The workspace solve used qualified `LinearAlgebra.BlasInt` and `LinearAlgebra.LAPACK.getrs!`, which SciMLTesting's ExplicitImports `all_qualified_accesses_are_public` check flags as reaching into non-public internals (they are not on its ignore list). Mirror `exp_noalloc.jl` instead: import `BlasInt` (an allow-listed explicit import) and ccall both `getrf!` and `getrs!` through `BLAS.@blasfunc`/`BLAS.libblastrampoline` (allow-listed names). No qualified access to non-public names remains in `phi_almohy.jl`. Behavior unchanged: verified correctness ~2.5e-15 and 0-byte cache reuse on Julia 1.10 and 1.12; Aqua and JET still pass. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh --- src/phi_almohy.jl | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl index 55c1ec2..2cc553c 100644 --- a/src/phi_almohy.jl +++ b/src/phi_almohy.jl @@ -21,7 +21,7 @@ # * `_phi_almohy_generic` -- out-of-place and element-container-preserving, so # a static input (e.g. `SMatrix`) stays static. -import libblastrampoline_jll +using LinearAlgebra: BlasInt # theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the # scaled matrix for which the [m/m] Pade approximant is backward stable to @@ -99,7 +99,7 @@ struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}} pow1::MT pow2::MT absA::RMT - ipiv::Vector{LinearAlgebra.BlasInt} + ipiv::Vector{BlasInt} rvec1::Vector{RT} rvec2::Vector{RT} Ncoef::Vector{Float64} @@ -123,7 +123,7 @@ function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T} return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}}( mk(), Apow[1], Apow, mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(), Matrix{RT}(undef, n, n), - Vector{LinearAlgebra.BlasInt}(undef, n), + Vector{BlasInt}(undef, n), Vector{RT}(undef, n), Vector{RT}(undef, n), Vector{Float64}(undef, _PHI_M_MAX + 1), Vector{Float64}(undef, _PHI_M_MAX + 1), Matrix{Float64}(undef, _PHI_M_MAX + 1, _PHI_M_MAX + 1), @@ -316,31 +316,47 @@ function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer return Nm, Dm end -# LU factorization writing pivots into a preallocated `ipiv`. LAPACK's `getrf!` -# only gained a pivot-accepting method in newer Julia, so ccall it directly (as -# `StegrWork` does for `stegr!`) to keep the factorization allocation-free on the -# LTS. The 4-argument `getrs!` used for the solve is available on all versions. -for (getrf, elty) in ((:dgetrf_, :Float64), (:zgetrf_, :ComplexF64)) - @eval function _phi_getrf!(A::AbstractMatrix{$elty}, ipiv::Vector{LinearAlgebra.BlasInt}) +# LAPACK LU factor + solve with pivots written into a preallocated `ipiv`. +# `LAPACK.getrf!` only gained a pivot-accepting method in newer Julia, so both +# `getrf!` and `getrs!` are ccall-ed directly (as `exp_noalloc.jl` does for +# `gebal!`) to keep the solve allocation-free on every supported version. +for (getrf, getrs, elty) in + ((:dgetrf_, :dgetrs_, :Float64), (:zgetrf_, :zgetrs_, :ComplexF64)) + @eval function _phi_getrf!(A::AbstractMatrix{$elty}, ipiv::Vector{BlasInt}) m, n = size(A) - info = Ref{LinearAlgebra.BlasInt}(0) + info = Ref{BlasInt}(0) ccall( - (LinearAlgebra.BLAS.@blasfunc($getrf), libblastrampoline_jll.libblastrampoline), - Cvoid, + (BLAS.@blasfunc($getrf), BLAS.libblastrampoline), Cvoid, ( - Ref{LinearAlgebra.BlasInt}, Ref{LinearAlgebra.BlasInt}, Ptr{$elty}, - Ref{LinearAlgebra.BlasInt}, Ptr{LinearAlgebra.BlasInt}, Ref{LinearAlgebra.BlasInt}, + Ref{BlasInt}, Ref{BlasInt}, Ptr{$elty}, Ref{BlasInt}, + Ptr{BlasInt}, Ref{BlasInt}, ), m, n, A, max(1, stride(A, 2)), ipiv, info ) return A end + @eval function _phi_getrs!( + A::AbstractMatrix{$elty}, ipiv::Vector{BlasInt}, B::AbstractMatrix{$elty} + ) + n = size(A, 1) + info = Ref{BlasInt}(0) + ccall( + (BLAS.@blasfunc($getrs), BLAS.libblastrampoline), Cvoid, + ( + Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{$elty}, Ref{BlasInt}, + Ptr{BlasInt}, Ptr{$elty}, Ref{BlasInt}, Ref{BlasInt}, Clong, + ), + 'N', n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, + max(1, stride(B, 2)), info, 1 + ) + return B + end end # Solve Dfact * X = B in place (B overwritten by X), zero-allocation LU. function _phi_solve!(Dfact, ipiv, B) _phi_getrf!(Dfact, ipiv) - LinearAlgebra.LAPACK.getrs!('N', Dfact, ipiv, B) + _phi_getrs!(Dfact, ipiv, B) return B end From 46cad116cb6c44f62bd84c7140960fd1c47fa936 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 3 Jul 2026 05:57:49 -0400 Subject: [PATCH 5/8] phi: Horner PS, no-throw return codes, p-sized workspace, NaN guards Re-analysis pass over the Al-Mohy--Liu implementation: * Paterson--Stockmeyer rewritten in Horner form with the delta trick. The previous block scheme (mirroring the reference MATLAB) used explicit Atau powers plus two products per block -- up to 12 multiplications for m = 12 where Fasi's pi_m(tau) = 7, so the parameter selection was optimizing a cost the evaluator did not achieve. The Horner form's count now equals pi_m(tau) exactly. Measured 10-20% end-to-end speedup on the workspace path across n = 25..200. * Numerical failure returns codes instead of throwing, so adaptive integrators can reject a step rather than abort: cache.info[] is 0 on success, the LAPACK getrf info for a singular Pade denominator, or -1 for non-finite results (LAPACK does not flag NaN pivots, so a post-solve finiteness check backs the code); outputs are NaN-filled on failure. Fixes a latent InexactError where NaN/Inf input crashed in ceil(Int, t) during parameter selection before reaching the solve, and the generic path's SingularException from `\`. Wrongly sized workspaces (programmer errors) still throw, with informative messages. * Workspace vectors sized from p at construction (was a fixed cap that BoundsError'd for p >~ 48) with an explicit capacity check; opnorm(A, 1) hoisted out of the per-degree ell loop (was recomputed 8x); dead Atau/Id buffers dropped (2n^2 less memory). * Fix a closure-capture leak in the test helpers: phi_block_reference assigned to `n`, clobbering the enclosing testset's `n` (Julia closures assign to existing outer locals); the helpers now declare their temporaries `local`. Verified: worst relative error 8.8e-15 over 33 BigFloat-referenced cases (real/complex, normal/nonnormal, p = 1..8); 0 bytes allocated on cache reuse (Float64 and ComplexF64, n = 10..100, p = 1..8, Julia 1.10 and 1.12); NaN/Inf inputs return info != 0 with NaN outputs and no throw; full basictests 574 pass on 1.10 and 1.12 (only the pre-existing ForwardDiff-1.x Issue 42 failures remain, which pass under CI's pinned ForwardDiff 0.10); ExplicitImports clean. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Fable 5 Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh --- src/phi.jl | 22 ++++- src/phi_almohy.jl | 220 +++++++++++++++++++++++++++++++-------------- test/basictests.jl | 49 ++++++++-- 3 files changed, 216 insertions(+), 75 deletions(-) diff --git a/src/phi.jl b/src/phi.jl index d936a8f..dbd3f09 100644 --- a/src/phi.jl +++ b/src/phi.jl @@ -122,7 +122,27 @@ end """ phi!(out,A,k[;caches]) -> out -Non-allocating version of `phi` for non-diagonal matrix inputs. +Non-allocating version of `phi` for non-diagonal matrix inputs, writing +`phi_j(A)` into `out[j+1]`. + +For dense `Float64`/`ComplexF64` matrices, pass a reusable +[`PhiPadeCache`](@ref) as `caches` to make repeated evaluations of the same +size and order allocation-free: + +```julia +cache = PhiPadeCache(A, k) +phi!(out, A, k; caches = cache) +``` + +Numerical failure (a singular Padé denominator or non-finite result, only +possible for pathological inputs such as matrices containing `NaN`/`Inf`) does +not throw: the outputs are filled with `NaN` and, when a `PhiPadeCache` is +used, `cache.info[]` is set to a nonzero return code (`0` on success). This +lets adaptive integrators reject the step instead of aborting. + +For the legacy basis-vector algorithm, `caches` is instead the tuple +`(Vector{T}(undef, m), Matrix{T}(undef, m, k+1), Matrix{T}(undef, m+k, m+k))`; +supplying it forces that code path. """ function phi!( out::Vector{Matrix{T}}, A::AbstractMatrix{T}, k::Integer; caches = nothing, diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl index 2cc553c..478c8cb 100644 --- a/src/phi_almohy.jl +++ b/src/phi_almohy.jl @@ -84,17 +84,23 @@ Reusable workspace for the in-place phi-function evaluator [`phi!`](@ref) on strided `Float64`/`ComplexF64` matrices. Allocate once for a given size and `p`, then pass it as the `caches` keyword to `phi!` to reuse across calls without further allocation. + +After each `phi!` call, `cache.info[]` holds a return code: `0` means success; +a positive value is the LAPACK `getrf` info of a numerically singular Padé +denominator; `-1` means the result was non-finite (`NaN`/`Inf` input). Both +failures only occur for pathological inputs, and no error is thrown — the +outputs are filled with `NaN` so that adaptive integrators can detect the +failed evaluation via `cache.info[] != 0` (or an `isfinite` check) and reject +the step rather than abort. """ struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}} As::MT - Id::MT Apow::Vector{MT} - Atau::MT Nm::MT Dm::MT Dfact::MT - blkN::MT - blkD::MT + Naux::MT + Daux::MT tmp::MT pow1::MT pow2::MT @@ -109,11 +115,17 @@ struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}} alpha::Vector{Float64} tvals::Vector{Int} Cost::Matrix{Float64} + info::typeof(Ref(zero(BlasInt))) end +# Largest r for which alpha_r may be probed (Eq. (3.10) with phat = p; theta_{12,p} +# >= 1 for all p, so phat = p always applies at m = _PHI_M_MAX). +_phi_rmax(p::Integer) = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + p + 1))) / 2) + function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T} n = size(A, 1) RT = real(T) + r_max = _phi_rmax(p) mk() = Matrix{T}(undef, n, n) Apow = Vector{Matrix{T}}(undef, _PHI_NPOW) Apow[1] = Matrix{T}(I, n, n) @@ -121,18 +133,33 @@ function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T} Apow[i] = mk() end return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}}( - mk(), Apow[1], Apow, mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(), + mk(), Apow, mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(), Matrix{RT}(undef, n, n), Vector{BlasInt}(undef, n), Vector{RT}(undef, n), Vector{RT}(undef, n), Vector{Float64}(undef, _PHI_M_MAX + 1), Vector{Float64}(undef, _PHI_M_MAX + 1), Matrix{Float64}(undef, _PHI_M_MAX + 1, _PHI_M_MAX + 1), - Vector{Float64}(undef, _PHI_IMAX + 2), Vector{Float64}(undef, _PHI_IMAX + 2), + Vector{Float64}(undef, r_max), Vector{Float64}(undef, r_max), Vector{Int}(undef, _PHI_IMAX + 1), - Matrix{Float64}(undef, _PHI_IMAX + 1, _PHI_IMAX + 2), + Matrix{Float64}(undef, _PHI_IMAX + 1, r_max - 1), + Ref(zero(BlasInt)), ) end +@noinline function _phi_cache_check(cache::PhiPadeCache, n::Integer, r_max::Integer) + size(cache.As, 1) == n || throw( + DimensionMismatch( + "PhiPadeCache was constructed for size $(size(cache.As, 1)) but A has size $n" + ) + ) + length(cache.eta) >= r_max || throw( + ArgumentError( + "PhiPadeCache was constructed for a smaller p; construct it with PhiPadeCache(A, p) for the p in use" + ) + ) + return nothing +end + # Renormalized [m/m] Pade coefficients (numerator and denominator, low order # first) of the approximant to phi_p, written into the preallocated `Ncoef`, # `Dcoef` (length >= m+1) using scratch `Amat` (size >= (m+1)^2). Follows the @@ -240,9 +267,9 @@ function _normpow_nonneg!(cache::PhiPadeCache, B, k::Integer) end # Scaling parameter `t` from the first term of the backward-error series -# (Eq. (3.12)); `ell` subfunction of the reference, in-place. -function _phi_ell!(cache::PhiPadeCache, A, m::Integer, p::Integer, phat::Integer) - normT = opnorm(A, 1) +# (Eq. (3.12)); `ell` subfunction of the reference, in-place. `normT` is the +# caller-computed `opnorm(A, 1)` (hoisted out of the per-degree loop). +function _phi_ell!(cache::PhiPadeCache, A, normT, m::Integer, p::Integer, phat::Integer) normT == 0 && return 0 t0 = normT > 1 ? log2(normT) : 0.0 scalefac = exp2(t0) @@ -254,12 +281,16 @@ function _phi_ell!(cache::PhiPadeCache, A, m::Integer, p::Integer, phat::Integer @. absA = c * abs(A) / scalefac alpha = _normpow_nonneg!(cache, absA, K) t = log2(2alpha / eps(Float64)) / (K - delta) + t0 + # NaN/Inf inputs make t non-finite; return 0 (never throw) and let the NaN + # propagate to the output, where step-rejection logic can see it. + isfinite(t) || return 0 return max(ceil(Int, t), 0) end function _select_parameters_phi!(cache::PhiPadeCache, A, p::Integer) phat = _phi_theta(_PHI_M_MAX, p) < 1 ? 0 : p r_max = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + phat + 1))) / 2) + _phi_cache_check(cache, size(A, 1), r_max) eta = cache.eta mul!(cache.pow1, A, A) eta[1] = opnorm(cache.pow1, 1)^(1 / 2) @@ -273,47 +304,69 @@ function _select_parameters_phi!(cache::PhiPadeCache, A, p::Integer) for j in 1:(r_max - 1) alpha[j] = max(eta[j], eta[j + 1]) end + normT = opnorm(A, 1) for i in 0:_PHI_IMAX m_i = (i + 3)^2 ÷ 8 phat_i = _phi_theta(m_i, p) < 1 ? 0 : p - cache.tvals[i + 1] = _phi_ell!(cache, A, m_i, p, phat_i) + cache.tvals[i + 1] = _phi_ell!(cache, A, normT, m_i, p, phat_i) end return _phi_select_from_alpha(alpha, r_max - 1, p, cache.tvals, cache.Cost) end -# Evaluate N(As) and D(As) into cache.Nm, cache.Dm via Paterson--Stockmeyer. +# Add block i's coefficient combination, sum_l coef[i*tau + l] * As^{l-1}, to M. +function _phi_ps_addblock!(M, coef, Apow, i::Integer, tau::Integer, m::Integer) + start = i * tau + 1 + stop = min((i + 1) * tau, m + 1) + for l in 1:(stop - start + 1) + c = coef[start + l - 1] + P = Apow[l] + @. M += c * P + end + return M +end + +# Evaluate N(As) and D(As) via Paterson--Stockmeyer in Horner form, +# +# q(As) = ( (B_nu * Atau + B_{nu-1}) * Atau + ... ) * Atau + B_0, +# +# which needs nu products per polynomial -- one fewer when tau | m, since then +# B_nu = c_m*I and the first fold is the broadcast c_m*Atau + B_{nu-1}. Together +# with the tau-1 explicit powers, the multiplication count is exactly Fasi's +# pi_m(tau) = tau - 1 + 2*floor(m/tau) - 2*(tau | m) that the parameter +# selection optimizes over. Returns (N, D) as views into cache buffers. function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer) Nc, Dc, Apow = cache.Ncoef, cache.Dcoef, cache.Apow - Atau, Nm, Dm = cache.Atau, cache.Nm, cache.Dm - blkN, blkD, tmp = cache.blkN, cache.blkD, cache.tmp - copyto!(Apow[2], As) # Apow[1] === Id; Apow[i] = As^{i-1} + copyto!(Apow[2], As) # Apow[1] = I (set once); Apow[i] = As^{i-1} for i in 3:(tau + 1) mul!(Apow[i], Apow[i - 1], As) end - copyto!(Atau, Apow[tau + 1]) - fill!(Nm, 0) - fill!(Dm, 0) + Atau = Apow[tau + 1] + N, Naux = cache.Nm, cache.Naux + D, Daux = cache.Dm, cache.Daux nu = m ÷ tau - for i in 0:nu - start = i * tau + 1 - stop = min((i + 1) * tau, m + 1) - fill!(blkN, 0) - fill!(blkD, 0) - for l in 1:(stop - start + 1) - @. blkN += Nc[start + l - 1] * Apow[l] - @. blkD += Dc[start + l - 1] * Apow[l] - end - if i == 0 - Nm .+= blkN - Dm .+= blkD - else - mul!(Nm, blkN, Atau, true, true) - mul!(Dm, blkD, Atau, true, true) - mul!(tmp, Atau, Apow[tau + 1]) - copyto!(Atau, tmp) - end + if m % tau == 0 + cN, cD = Nc[m + 1], Dc[m + 1] + @. N = cN * Atau + @. D = cD * Atau + _phi_ps_addblock!(N, Nc, Apow, nu - 1, tau, m) + _phi_ps_addblock!(D, Dc, Apow, nu - 1, tau, m) + inext = nu - 2 + else + fill!(N, 0) + fill!(D, 0) + _phi_ps_addblock!(N, Nc, Apow, nu, tau, m) + _phi_ps_addblock!(D, Dc, Apow, nu, tau, m) + inext = nu - 1 + end + for i in inext:-1:0 + mul!(Naux, N, Atau) + mul!(Daux, D, Atau) + _phi_ps_addblock!(Naux, Nc, Apow, i, tau, m) + _phi_ps_addblock!(Daux, Dc, Apow, i, tau, m) + N, Naux = Naux, N + D, Daux = Daux, D end - return Nm, Dm + return N, D end # LAPACK LU factor + solve with pivots written into a preallocated `ipiv`. @@ -333,7 +386,7 @@ for (getrf, getrs, elty) in ), m, n, A, max(1, stride(A, 2)), ipiv, info ) - return A + return info[] end @eval function _phi_getrs!( A::AbstractMatrix{$elty}, ipiv::Vector{BlasInt}, B::AbstractMatrix{$elty} @@ -354,10 +407,17 @@ for (getrf, getrs, elty) in end # Solve Dfact * X = B in place (B overwritten by X), zero-allocation LU. +# Returns the LAPACK getrf info code instead of throwing: 0 on success; on a +# numerically singular Dfact (pathological input, e.g. NaN/Inf), B is filled +# with NaN so downstream step-rejection logic can detect the failure. function _phi_solve!(Dfact, ipiv, B) - _phi_getrf!(Dfact, ipiv) - _phi_getrs!(Dfact, ipiv, B) - return B + info = _phi_getrf!(Dfact, ipiv) + if info == 0 + _phi_getrs!(Dfact, ipiv, B) + else + fill!(B, NaN) + end + return info end """ @@ -381,7 +441,21 @@ function _phi_almohy!( copyto!(cache.Dfact, Dm) copyto!(out[p + 1], Nm) - _phi_solve!(cache.Dfact, cache.ipiv, out[p + 1]) + info = _phi_solve!(cache.Dfact, cache.ipiv, out[p + 1]) + if info == 0 && !all(isfinite, out[p + 1]) + # NaN/Inf inputs reach here with info == 0 (LAPACK does not flag NaN + # pivots); report them so integrators can test cache.info[] alone. + info = BlasInt(-1) + end + cache.info[] = info + if info != 0 + # Return code instead of an exception: fill the outputs with NaN so an + # adaptive integrator can reject the step rather than crash. + for j in 1:(p + 1) + fill!(out[j], NaN) + end + return out + end # Recurrence (2.9): R^{(j)} = As R^{(j+1)} + I/j!, j = p-1 : -1 : 0. for k in p:-1:1 @@ -423,8 +497,7 @@ function _normpow_nonneg(B::AbstractMatrix{<:Real}, k::Integer) return maximum(v) end -function _phi_ell(A::AbstractMatrix, m::Integer, p::Integer, phat::Integer) - normT = opnorm(A, 1) +function _phi_ell(A::AbstractMatrix, normT, m::Integer, p::Integer, phat::Integer) normT == 0 && return 0 t0 = normT > 1 ? log2(normT) : 0.0 scalefac = exp2(t0) @@ -435,6 +508,7 @@ function _phi_ell(A::AbstractMatrix, m::Integer, p::Integer, phat::Integer) scaledT = c .* abs.(A ./ scalefac) alpha = _normpow_nonneg(scaledT, K) t = log2(2alpha / eps(Float64)) / (K - delta) + t0 + isfinite(t) || return 0 return max(ceil(Int, t), 0) end @@ -450,15 +524,30 @@ function _select_parameters_phi(A::AbstractMatrix, p::Integer) end alpha = [max(eta[j], eta[j + 1]) for j in 1:(r_max - 1)] tvals = Vector{Int}(undef, _PHI_IMAX + 1) + normT = opnorm(A, 1) for i in 0:_PHI_IMAX m_i = (i + 3)^2 ÷ 8 phat_i = _phi_theta(m_i, p) < 1 ? 0 : p - tvals[i + 1] = _phi_ell(A, m_i, p, phat_i) + tvals[i + 1] = _phi_ell(A, normT, m_i, p, phat_i) end Cost = zeros(Float64, _PHI_IMAX + 1, r_max - 1) return _phi_select_from_alpha(alpha, r_max - 1, p, tvals, Cost) end +# Out-of-place counterpart of `_phi_ps_addblock!`: returns `init` plus block i's +# coefficient combination. +function _phi_ps_block(init, coef, Apow, i::Integer, tau::Integer, m::Integer) + start = i * tau + 1 + stop = min((i + 1) * tau, m + 1) + M = init + for l in 1:(stop - start + 1) + M = M + coef[start + l - 1] * Apow[l] + end + return M +end + +# Out-of-place Horner-form Paterson--Stockmeyer; same multiplication count as +# the in-place version (Fasi's pi_m(tau)). function _paterson_stockmeyer(As, Nc, Dc, tau::Integer, m::Integer) MT = typeof(As) Apow = Vector{MT}(undef, tau + 1) @@ -468,26 +557,20 @@ function _paterson_stockmeyer(As, Nc, Dc, tau::Integer, m::Integer) Apow[i] = Apow[i - 1] * As end Atau = Apow[tau + 1] - N = zero(As) - D = zero(As) nu = m ÷ tau - for i in 0:nu - start = i * tau + 1 - stop = min((i + 1) * tau, m + 1) - blkN = zero(As) - blkD = zero(As) - for l in 1:(stop - start + 1) - blkN += Nc[start + l - 1] * Apow[l] - blkD += Dc[start + l - 1] * Apow[l] - end - if i == 0 - N += blkN - D += blkD - else - N += blkN * Atau - D += blkD * Atau - Atau = Atau * Apow[tau + 1] - end + local N::MT, D::MT + if m % tau == 0 + N = _phi_ps_block(Nc[m + 1] * Atau, Nc, Apow, nu - 1, tau, m) + D = _phi_ps_block(Dc[m + 1] * Atau, Dc, Apow, nu - 1, tau, m) + inext = nu - 2 + else + N = _phi_ps_block(zero(As), Nc, Apow, nu, tau, m) + D = _phi_ps_block(zero(As), Dc, Apow, nu, tau, m) + inext = nu - 1 + end + for i in inext:-1:0 + N = _phi_ps_block(N * Atau, Nc, Apow, i, tau, m) + D = _phi_ps_block(D * Atau, Dc, Apow, i, tau, m) end return N, D end @@ -511,7 +594,14 @@ function _phi_almohy_generic(A::AbstractMatrix, p::Integer) Nm, Dm = _paterson_stockmeyer(As, Ncoef, Dcoef, tau, m) Rm = Vector{typeof(As)}(undef, p + 1) - Rm[p + 1] = Dm \ Nm + # No-throw on a numerically singular denominator (pathological input): NaN + # results let adaptive integrators reject the step instead of aborting. + Rm[p + 1] = try + Dm \ Nm + catch e + e isa SingularException || rethrow() + zero(As) .+ eltype(As)(NaN) + end Id = one(As) for k in p:-1:1 Rm[k] = As * Rm[k + 1] + Id / _factf(k - 1) diff --git a/test/basictests.jl b/test/basictests.jl index e11d041..b95eb83 100644 --- a/test/basictests.jl +++ b/test/basictests.jl @@ -262,8 +262,11 @@ end # E = [I 0 ... 0] and J the nilpotent shift. This is unrelated to the # scaling-and-recovering algorithm under test. function phi_block_reference(A, p) - n = size(A, 1) - T = eltype(A) + # `local`: without it these assignments capture and clobber the + # enclosing testset's variables of the same name (closures in a local + # scope assign to existing outer locals). + local n = size(A, 1) + local T = eltype(A) W = zeros(T, n * (p + 1), n * (p + 1)) W[1:n, 1:n] .= A for j in 0:(p - 1) @@ -276,13 +279,13 @@ end # Force the legacy basis-vector path (Sidje/`phiv_dense`) for cross-checking. function phi_legacy(A, p) - n = size(A, 1) - T = eltype(A) - caches = ( + local n = size(A, 1) + local T = eltype(A) + local caches = ( Vector{T}(undef, n), Matrix{T}(undef, n, p + 1), Matrix{T}(undef, n + p, n + p), ) - out = [Matrix{T}(undef, n, n) for _ in 1:(p + 1)] + local out = [Matrix{T}(undef, n, n) for _ in 1:(p + 1)] return phi!(out, A, p; caches = caches) end @@ -326,19 +329,47 @@ end run_phi!(out2, A, cache) # warm up / compile @test (@allocated run_phi!(out2, A, cache)) == 0 - # Complex-matrix workspace path. + # Complex-matrix workspace path, also allocation-free on reuse. Ac = randn(ComplexF64, 6, 6) ./ 3 cachec = PhiPadeCache(Ac, 2) outc = [Matrix{ComplexF64}(undef, 6, 6) for _ in 1:3] phi!(outc, Ac, 2; caches = cachec) @test outc ≈ phi_block_reference(Ac, 2) + @test cachec.info[] == 0 + run_phic!(o, M, c) = phi!(o, M, 2; caches = c) + run_phic!(outc, Ac, cachec) + @test (@allocated run_phic!(outc, Ac, cachec)) == 0 + + # Numerical failure must not throw: NaN input yields NaN-filled outputs and + # a nonzero return code in cache.info[], so adaptive integrators can reject + # the step instead of aborting. + Anan = fill(NaN, 5, 5) + Pnan = phi(Anan, 2) + @test all(P -> any(!isfinite, P), Pnan) + cachenan = PhiPadeCache(Anan, 2) + outnan = [Matrix{Float64}(undef, 5, 5) for _ in 1:3] + phi!(outnan, Anan, 2; caches = cachenan) + @test cachenan.info[] != 0 + @test all(P -> all(isnan, P), outnan) + # ... and a subsequent good call resets the code. + phi!(outnan, randn(5, 5), 2; caches = cachenan) + @test cachenan.info[] == 0 + + # Wrongly sized workspaces are programmer errors, not numerical failures, + # and do throw. + A3 = randn(3, 3) + out3 = [Matrix{Float64}(undef, 3, 3) for _ in 1:4] + @test_throws DimensionMismatch phi!(out3, A3, 3; caches = cache) + cache_p1 = PhiPadeCache(A, 1) + out11 = [Matrix{Float64}(undef, n, n) for _ in 1:11] + @test_throws ArgumentError phi!(out11, A, 10; caches = cache_p1) end @testset "Phi static arrays (container-preserving)" begin Random.seed!(4321) function phi_block_reference(A, p) - n = size(A, 1) - T = eltype(A) + local n = size(A, 1) + local T = eltype(A) W = zeros(T, n * (p + 1), n * (p + 1)) W[1:n, 1:n] .= A for j in 0:(p - 1) From b71efd7cb45dc39a447ffff2c16193ef84ce7c2d Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 3 Jul 2026 09:17:47 -0400 Subject: [PATCH 6/8] phi: overflow-safe backward-error coefficient; public lu! replaces ccalls Two review-driven changes: * _phi_be_coeff evaluated (m+p)!, m!, (2m+p)!, (2m+p+1)! separately before dividing, so intermediates could spuriously overflow (Inf/Inf = NaN for p >~ 159) even where the quotient is representable (@stevengj). Rewritten as a product of sub-unit factors, prod 1/(m+p+j) * prod 1/(m+j), which is monotonically decreasing and can only underflow when the true value does. Matches the BigFloat-exact value to 1.1e-15 over m = 1:12, p = 1:60; new p = 30 regression test. * The hand-rolled getrf!/getrs! ccalls are gone: the solve now uses the public LinearAlgebra API, lu!(Dfact; check = false) + issuccess + ldiv!, which supports the matrix RHS directly and keeps the no-throw return-code semantics (issuccess false -> info > 0, NaN-filled outputs). LinearSolve.jl was considered but its LinearCache only solves vector right-hand sides through the public path, which would force a column-by-column loop or reaching into cache internals; plain lu!/ldiv! achieves ccall-free with zero new dependencies. Cost: the pivot vector is now the one per-call allocation (8n + ~100 bytes, measured 112 B at n = 7 to 928 B at n = 100; all O(n^2) buffers still reused), since no public API preallocates lu! pivots. Allocation tests assert that O(n) bound. The BlasInt import and ipiv field are removed; phi_almohy.jl is now entirely public-API. Verified on Julia 1.10 and 1.12: 254-assertion Phi testset passes (including new large-p and allocation-bound tests), full basictests 605 pass (only the pre-existing ForwardDiff-1.x Issue 42 failures remain), ExplicitImports clean, NaN/Inf return codes intact. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Fable 5 Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh --- src/phi.jl | 5 ++- src/phi_almohy.jl | 103 ++++++++++++++++++--------------------------- test/basictests.jl | 20 +++++++-- 3 files changed, 59 insertions(+), 69 deletions(-) diff --git a/src/phi.jl b/src/phi.jl index dbd3f09..b20c653 100644 --- a/src/phi.jl +++ b/src/phi.jl @@ -126,8 +126,9 @@ Non-allocating version of `phi` for non-diagonal matrix inputs, writing `phi_j(A)` into `out[j+1]`. For dense `Float64`/`ComplexF64` matrices, pass a reusable -[`PhiPadeCache`](@ref) as `caches` to make repeated evaluations of the same -size and order allocation-free: +[`PhiPadeCache`](@ref) as `caches` so repeated evaluations of the same size and +order reuse all large buffers (the only per-call allocation is the `O(n)` LU +pivot vector): ```julia cache = PhiPadeCache(A, k) diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl index 478c8cb..d8b37ef 100644 --- a/src/phi_almohy.jl +++ b/src/phi_almohy.jl @@ -16,13 +16,12 @@ # phi_j(2A) = 2^{-j} ( phi_0(A) phi_j(A) + sum_{k=1}^{j} phi_k(A)/(j-k)! ). # # Two implementations share the scalar machinery below: -# * `_phi_almohy!` -- fully in-place, allocation-free, using a reusable -# `PhiPadeCache` workspace (for strided mutable matrices). +# * `_phi_almohy!` -- fully in-place via a reusable `PhiPadeCache` workspace +# (for strided mutable matrices); the only per-call +# allocation is the O(n) LU pivot vector. # * `_phi_almohy_generic` -- out-of-place and element-container-preserving, so # a static input (e.g. `SMatrix`) stays static. -using LinearAlgebra: BlasInt - # theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the # scaled matrix for which the [m/m] Pade approximant is backward stable to # double-precision unit roundoff). Rows are m = 1:20, columns p = 1:10. @@ -72,9 +71,23 @@ end return r end -# First-factor coefficient of the backward-error series h_{m,p} (Eq. (3.4)). +# First-factor coefficient of the backward-error series h_{m,p} (Eq. (3.4)), +# +# (m+p)! m! / ((2m+p)! (2m+p+1)!) +# = prod_{j=1}^{m} 1/(m+p+j) * prod_{j=1}^{m+p+1} 1/(m+j), +# +# accumulated as a product of sub-unit factors so no intermediate factorial can +# spuriously overflow (each factor < 1, so the running product is monotonically +# decreasing and underflows only when the true value does). @inline function _phi_be_coeff(m::Integer, p::Integer) - return (_factf(m + p) / _factf(2m + p)) * (_factf(m) / _factf(2m + p + 1)) + r = 1.0 + for j in 1:m + r /= (m + p + j) + end + for j in 1:(m + p + 1) + r /= (m + j) + end + return r end """ @@ -82,12 +95,12 @@ end Reusable workspace for the in-place phi-function evaluator [`phi!`](@ref) on strided `Float64`/`ComplexF64` matrices. Allocate once for a given size and `p`, -then pass it as the `caches` keyword to `phi!` to reuse across calls without -further allocation. +then pass it as the `caches` keyword to `phi!` to reuse across calls; the only +remaining per-call allocation is the LU pivot vector (`O(n)` bytes). After each `phi!` call, `cache.info[]` holds a return code: `0` means success; -a positive value is the LAPACK `getrf` info of a numerically singular Padé -denominator; `-1` means the result was non-finite (`NaN`/`Inf` input). Both +a positive value is the LU factorization's info for a numerically singular +Padé denominator; `-1` means the result was non-finite (`NaN`/`Inf` input). Both failures only occur for pathological inputs, and no error is thrown — the outputs are filled with `NaN` so that adaptive integrators can detect the failed evaluation via `cache.info[] != 0` (or an `isfinite` check) and reject @@ -105,7 +118,6 @@ struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}} pow1::MT pow2::MT absA::RMT - ipiv::Vector{BlasInt} rvec1::Vector{RT} rvec2::Vector{RT} Ncoef::Vector{Float64} @@ -115,7 +127,7 @@ struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}} alpha::Vector{Float64} tvals::Vector{Int} Cost::Matrix{Float64} - info::typeof(Ref(zero(BlasInt))) + info::typeof(Ref(0)) end # Largest r for which alpha_r may be probed (Eq. (3.10) with phat = p; theta_{12,p} @@ -135,14 +147,13 @@ function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T} return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}}( mk(), Apow, mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(), Matrix{RT}(undef, n, n), - Vector{BlasInt}(undef, n), Vector{RT}(undef, n), Vector{RT}(undef, n), Vector{Float64}(undef, _PHI_M_MAX + 1), Vector{Float64}(undef, _PHI_M_MAX + 1), Matrix{Float64}(undef, _PHI_M_MAX + 1, _PHI_M_MAX + 1), Vector{Float64}(undef, r_max), Vector{Float64}(undef, r_max), Vector{Int}(undef, _PHI_IMAX + 1), Matrix{Float64}(undef, _PHI_IMAX + 1, r_max - 1), - Ref(zero(BlasInt)), + Ref(0), ) end @@ -369,61 +380,27 @@ function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer return N, D end -# LAPACK LU factor + solve with pivots written into a preallocated `ipiv`. -# `LAPACK.getrf!` only gained a pivot-accepting method in newer Julia, so both -# `getrf!` and `getrs!` are ccall-ed directly (as `exp_noalloc.jl` does for -# `gebal!`) to keep the solve allocation-free on every supported version. -for (getrf, getrs, elty) in - ((:dgetrf_, :dgetrs_, :Float64), (:zgetrf_, :zgetrs_, :ComplexF64)) - @eval function _phi_getrf!(A::AbstractMatrix{$elty}, ipiv::Vector{BlasInt}) - m, n = size(A) - info = Ref{BlasInt}(0) - ccall( - (BLAS.@blasfunc($getrf), BLAS.libblastrampoline), Cvoid, - ( - Ref{BlasInt}, Ref{BlasInt}, Ptr{$elty}, Ref{BlasInt}, - Ptr{BlasInt}, Ref{BlasInt}, - ), - m, n, A, max(1, stride(A, 2)), ipiv, info - ) - return info[] - end - @eval function _phi_getrs!( - A::AbstractMatrix{$elty}, ipiv::Vector{BlasInt}, B::AbstractMatrix{$elty} - ) - n = size(A, 1) - info = Ref{BlasInt}(0) - ccall( - (BLAS.@blasfunc($getrs), BLAS.libblastrampoline), Cvoid, - ( - Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{$elty}, Ref{BlasInt}, - Ptr{BlasInt}, Ptr{$elty}, Ref{BlasInt}, Ref{BlasInt}, Clong, - ), - 'N', n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, - max(1, stride(B, 2)), info, 1 - ) - return B - end -end - -# Solve Dfact * X = B in place (B overwritten by X), zero-allocation LU. -# Returns the LAPACK getrf info code instead of throwing: 0 on success; on a +# Solve Dfact * X = B in place (Dfact overwritten by its LU factorization, B +# by X). Returns an info code instead of throwing: 0 on success; on a # numerically singular Dfact (pathological input, e.g. NaN/Inf), B is filled -# with NaN so downstream step-rejection logic can detect the failure. -function _phi_solve!(Dfact, ipiv, B) - info = _phi_getrf!(Dfact, ipiv) - if info == 0 - _phi_getrs!(Dfact, ipiv, B) +# with NaN so downstream step-rejection logic can detect the failure. The lu! +# pivot vector is the only per-call allocation (O(n); no public API yet allows +# reusing it across factorizations). +function _phi_solve!(Dfact, B) + F = lu!(Dfact; check = false) + if issuccess(F) + ldiv!(F, B) + return 0 else fill!(B, NaN) + return Int(F.info) end - return info end """ _phi_almohy!(out, A, p, cache) -> out -Allocation-free evaluation of `phi_0(A), ..., phi_p(A)` for a strided +In-place evaluation of `phi_0(A), ..., phi_p(A)` for a strided `Float64`/`ComplexF64` matrix `A` (`p >= 1`), writing `phi_j(A)` into `out[j+1]` and using the reusable [`PhiPadeCache`](@ref) `cache` for all scratch storage. The `out` matrices double as the recovery buffers. @@ -441,11 +418,11 @@ function _phi_almohy!( copyto!(cache.Dfact, Dm) copyto!(out[p + 1], Nm) - info = _phi_solve!(cache.Dfact, cache.ipiv, out[p + 1]) + info = _phi_solve!(cache.Dfact, out[p + 1]) if info == 0 && !all(isfinite, out[p + 1]) - # NaN/Inf inputs reach here with info == 0 (LAPACK does not flag NaN + # NaN/Inf inputs reach here with info == 0 (the LU does not flag NaN # pivots); report them so integrators can test cache.info[] alone. - info = BlasInt(-1) + info = -1 end cache.info[] = info if info != 0 diff --git a/test/basictests.jl b/test/basictests.jl index b95eb83..fdf4302 100644 --- a/test/basictests.jl +++ b/test/basictests.jl @@ -313,6 +313,15 @@ end A = randn(9, 9) ./ 3 @test phi(A, 4)[1] ≈ exp(A) + # Large p exercises the recurrence form of the backward-error coefficient + # (naive factorial ratios overflow long before their quotient does). + A = randn(3, 3) + P30 = phi(A, 30) + ref30 = phi_block_reference(A, 30) + for j in 1:31 + @test P30[j] ≈ ref30[j] rtol = 1.0e-8 atol = 1.0e-13 + end + # Preallocated in-place entry point returns phi_j in out[j+1]. n = 7 A = randn(n, n) ./ 3 @@ -320,16 +329,19 @@ end phi!(out, A, 3) @test out ≈ phi_block_reference(A, 3) - # Reusable workspace: same result, and no allocation once the cache exists. + # Reusable workspace: same result, and only the O(n) `lu!` pivot vector is + # allocated once the cache exists (all O(n^2) buffers are reused; the pivot + # vector cannot be preallocated through the public LinearAlgebra API). + lu_alloc_bound(m) = 8m + 200 cache = PhiPadeCache(A, 3) out2 = [Matrix{Float64}(undef, n, n) for _ in 1:4] phi!(out2, A, 3; caches = cache) @test out2 ≈ out run_phi!(o, M, c) = phi!(o, M, 3; caches = c) run_phi!(out2, A, cache) # warm up / compile - @test (@allocated run_phi!(out2, A, cache)) == 0 + @test (@allocated run_phi!(out2, A, cache)) <= lu_alloc_bound(n) - # Complex-matrix workspace path, also allocation-free on reuse. + # Complex-matrix workspace path, same allocation bound on reuse. Ac = randn(ComplexF64, 6, 6) ./ 3 cachec = PhiPadeCache(Ac, 2) outc = [Matrix{ComplexF64}(undef, 6, 6) for _ in 1:3] @@ -338,7 +350,7 @@ end @test cachec.info[] == 0 run_phic!(o, M, c) = phi!(o, M, 2; caches = c) run_phic!(outc, Ac, cachec) - @test (@allocated run_phic!(outc, Ac, cachec)) == 0 + @test (@allocated run_phic!(outc, Ac, cachec)) <= lu_alloc_bound(6) # Numerical failure must not throw: NaN input yields NaN-filled outputs and # a nonzero return code in cache.info[], so adaptive integrators can reject From 583e40a5aabbfdec094935944ec1c09d5e8d8da6 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Sat, 4 Jul 2026 17:07:05 -0400 Subject: [PATCH 7/8] phi: solve through LinearSolve v4 batched matrix RHS With LinearSolve 4.0.0 supporting LinearProblem(A, B::Matrix) like A\B (SciML/LinearSolve.jl#1072), PhiPadeCache now embeds a LinearSolve cache and _phi_solve! runs the Pade denominator solve through it instead of raw lu!/ldiv!. GenericLUFactorization is used because it refactorizes in place with a cached pivot vector: workspace reuse is now fully allocation-free (0 bytes measured for Float64 and ComplexF64 at n = 7..100, p = 3..8, on Julia 1.10 and 1.12), improving on the lu! version's per-call O(n) pivot allocation. (LUFactorization currently copies A on every dense refactorization -- an in-place opt-in is being added upstream, after which the LAPACK path can be swapped in for large n.) Return-code semantics are unchanged: retcode Success -> info 0, singular -> info 1 with NaN-filled outputs, non-finite results -> -1. Tests tightened back to @allocated == 0; 254-assertion Phi testset passes on 1.10 and 1.12 against LinearSolve 4.0.0. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Fable 5 Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh --- Project.toml | 2 ++ src/phi.jl | 6 ++--- src/phi_almohy.jl | 59 +++++++++++++++++++++++++++++----------------- test/basictests.jl | 13 +++++----- 4 files changed, 49 insertions(+), 31 deletions(-) diff --git a/Project.toml b/Project.toml index a10df2c..5c9785d 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" GenericSchur = "c145ed77-6b09-5dd9-b285-bf645a82121e" +LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -29,6 +30,7 @@ ForwardDiff = "0.10.13" GPUArraysCore = "0.1, 0.2" GenericSchur = "0.5.3" LinearAlgebra = "1.10" +LinearSolve = "4" PrecompileTools = "1" Printf = "1.10" Random = "1" diff --git a/src/phi.jl b/src/phi.jl index b20c653..010ee37 100644 --- a/src/phi.jl +++ b/src/phi.jl @@ -126,9 +126,9 @@ Non-allocating version of `phi` for non-diagonal matrix inputs, writing `phi_j(A)` into `out[j+1]`. For dense `Float64`/`ComplexF64` matrices, pass a reusable -[`PhiPadeCache`](@ref) as `caches` so repeated evaluations of the same size and -order reuse all large buffers (the only per-call allocation is the `O(n)` LU -pivot vector): +[`PhiPadeCache`](@ref) as `caches` to make repeated evaluations of the same +size and order allocation-free (the linear solve runs through an embedded +LinearSolve.jl cache): ```julia cache = PhiPadeCache(A, k) diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl index d8b37ef..057d327 100644 --- a/src/phi_almohy.jl +++ b/src/phi_almohy.jl @@ -22,6 +22,8 @@ # * `_phi_almohy_generic` -- out-of-place and element-container-preserving, so # a static input (e.g. `SMatrix`) stays static. +using LinearSolve: LinearSolve, LinearProblem, GenericLUFactorization + # theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the # scaled matrix for which the [m/m] Pade approximant is backward stable to # double-precision unit roundoff). Rows are m = 1:20, columns p = 1:10. @@ -95,23 +97,28 @@ end Reusable workspace for the in-place phi-function evaluator [`phi!`](@ref) on strided `Float64`/`ComplexF64` matrices. Allocate once for a given size and `p`, -then pass it as the `caches` keyword to `phi!` to reuse across calls; the only -remaining per-call allocation is the LU pivot vector (`O(n)` bytes). +then pass it as the `caches` keyword to `phi!` to reuse across calls. The +linear solve runs through an embedded LinearSolve.jl cache (batched matrix +right-hand side, `GenericLUFactorization`); reuse across calls is +allocation-free — all buffers, including the LU pivots, live in the workspace +and the LinearSolve cache. After each `phi!` call, `cache.info[]` holds a return code: `0` means success; -a positive value is the LU factorization's info for a numerically singular +a positive value indicates a numerically singular Padé denominator; `-1` means the result was non-finite (`NaN`/`Inf` input). Both failures only occur for pathological inputs, and no error is thrown — the outputs are filled with `NaN` so that adaptive integrators can detect the failed evaluation via `cache.info[] != 0` (or an `isfinite` check) and reject the step rather than abort. """ -struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}} +struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}, LS} As::MT Apow::Vector{MT} Nm::MT Dm::MT Dfact::MT + rhs::MT + linsolve::LS Naux::MT Daux::MT tmp::MT @@ -144,8 +151,16 @@ function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T} for i in 2:_PHI_NPOW Apow[i] = mk() end - return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}}( - mk(), Apow, mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(), + Dfact = Matrix{T}(I, n, n) + rhs = zeros(T, n, n) + # GenericLUFactorization refactorizes in place with a cached pivot vector, + # so cache reuse is allocation-free (LUFactorization currently copies A on + # every dense refactorization). Dfact/rhs are workspace buffers refilled + # before each solve. + linsolve = LinearSolve.init(LinearProblem(Dfact, rhs), GenericLUFactorization()) + return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}, typeof(linsolve)}( + mk(), Apow, mk(), mk(), Dfact, rhs, linsolve, + mk(), mk(), mk(), mk(), mk(), Matrix{RT}(undef, n, n), Vector{RT}(undef, n), Vector{RT}(undef, n), Vector{Float64}(undef, _PHI_M_MAX + 1), Vector{Float64}(undef, _PHI_M_MAX + 1), @@ -380,20 +395,24 @@ function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer return N, D end -# Solve Dfact * X = B in place (Dfact overwritten by its LU factorization, B -# by X). Returns an info code instead of throwing: 0 on success; on a -# numerically singular Dfact (pathological input, e.g. NaN/Inf), B is filled -# with NaN so downstream step-rejection logic can detect the failure. The lu! -# pivot vector is the only per-call allocation (O(n); no public API yet allows -# reusing it across factorizations). -function _phi_solve!(Dfact, B) - F = lu!(Dfact; check = false) - if issuccess(F) - ldiv!(F, B) +# Solve Dm * X = Nm through the workspace's LinearSolve cache (batched matrix +# right-hand side, LinearSolve >= 4), writing X into `X`. Returns an info code +# instead of throwing: 0 on success; 1 on a numerically singular Dm +# (pathological input, e.g. NaN/Inf), in which case `X` is filled with NaN so +# downstream step-rejection logic can detect the failure. +function _phi_solve!(cache::PhiPadeCache, Dm, Nm, X) + copyto!(cache.Dfact, Dm) + copyto!(cache.rhs, Nm) + linsolve = cache.linsolve + linsolve.A = cache.Dfact + linsolve.b = cache.rhs + sol = LinearSolve.solve!(linsolve) + if sol.retcode == LinearSolve.ReturnCode.Success + copyto!(X, sol.u) return 0 else - fill!(B, NaN) - return Int(F.info) + fill!(X, NaN) + return 1 end end @@ -416,9 +435,7 @@ function _phi_almohy!( _phi_pade_coef!(cache.Ncoef, cache.Dcoef, cache.Amat, m, p) Nm, Dm = _paterson_stockmeyer!(cache, As, m, tau) - copyto!(cache.Dfact, Dm) - copyto!(out[p + 1], Nm) - info = _phi_solve!(cache.Dfact, out[p + 1]) + info = _phi_solve!(cache, Dm, Nm, out[p + 1]) if info == 0 && !all(isfinite, out[p + 1]) # NaN/Inf inputs reach here with info == 0 (the LU does not flag NaN # pivots); report them so integrators can test cache.info[] alone. diff --git a/test/basictests.jl b/test/basictests.jl index fdf4302..faf721c 100644 --- a/test/basictests.jl +++ b/test/basictests.jl @@ -329,19 +329,18 @@ end phi!(out, A, 3) @test out ≈ phi_block_reference(A, 3) - # Reusable workspace: same result, and only the O(n) `lu!` pivot vector is - # allocated once the cache exists (all O(n^2) buffers are reused; the pivot - # vector cannot be preallocated through the public LinearAlgebra API). - lu_alloc_bound(m) = 8m + 200 + # Reusable workspace: same result, and reuse is allocation-free (the linear + # solve runs through the workspace's LinearSolve cache, which owns the LU + # pivot storage). cache = PhiPadeCache(A, 3) out2 = [Matrix{Float64}(undef, n, n) for _ in 1:4] phi!(out2, A, 3; caches = cache) @test out2 ≈ out run_phi!(o, M, c) = phi!(o, M, 3; caches = c) run_phi!(out2, A, cache) # warm up / compile - @test (@allocated run_phi!(out2, A, cache)) <= lu_alloc_bound(n) + @test (@allocated run_phi!(out2, A, cache)) == 0 - # Complex-matrix workspace path, same allocation bound on reuse. + # Complex-matrix workspace path, also allocation-free on reuse. Ac = randn(ComplexF64, 6, 6) ./ 3 cachec = PhiPadeCache(Ac, 2) outc = [Matrix{ComplexF64}(undef, 6, 6) for _ in 1:3] @@ -350,7 +349,7 @@ end @test cachec.info[] == 0 run_phic!(o, M, c) = phi!(o, M, 2; caches = c) run_phic!(outc, Ac, cachec) - @test (@allocated run_phic!(outc, Ac, cachec)) <= lu_alloc_bound(6) + @test (@allocated run_phic!(outc, Ac, cachec)) == 0 # Numerical failure must not throw: NaN input yields NaN-filled outputs and # a nonzero return code in cache.info[], so adaptive integrators can reject From 27be6405d3023f89649bd75fdb5d383d92ea5d52 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Sun, 5 Jul 2026 05:22:51 -0400 Subject: [PATCH 8/8] Retrigger CI: LinearSolve 4.1.0 registered Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Fable 5