From 4b0551edb827445e893e2e40adb784946b9d615d Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Thu, 2 Jul 2026 13:30:17 -0400
Subject: [PATCH 1/8] Add O(p m^3) Al-Mohy--Liu scaling-and-recovering
 algorithm for phi functions

`phi(A, k)` / `phi!` previously computed phi_0(A), ..., phi_k(A) by calling
`phiv_dense!` on each of the m basis vectors, each exponentiating an
(m+k) x (m+k) block matrix -- overall O(m (m+k)^3).

This implements Algorithm 5.1 of Al-Mohy & Liu, "A scaling and recovering
algorithm for the matrix phi-functions" (arXiv:2506.01193, 2025), which
computes all phi functions simultaneously in O(k m^3): scale A by 2^-s,
evaluate the [m/m] diagonal Pade approximant to phi_p via Paterson--Stockmeyer,
recover the lower-index approximants with the recurrence
R^{(j)} = z R^{(j+1)} + 1/j!, and undo the scaling with the double-argument
formula. Backward-error-optimal Pade degree m and scaling s are selected from
the paper's theta table (Table 3.1) and the alpha_r / t cost analysis.

The new path is the default for Float64/ComplexF64 dense inputs (its Pade
tables are tuned for double precision); other element types and callers passing
the legacy `caches` bundle keep the basis-vector path. Diagonal inputs are
unchanged.

Validated against a high-precision block-exponential reference and the legacy
algorithm: worst-case relative error ~1e-14 across normal, non-normal,
large-norm, complex, Hessenberg and zero matrices for k = 1..10, matching or
beating the old accuracy. Measured speedups over the basis-vector path: 18x at
n=50, 51x at n=100, 72x at n=200 (k=3).

Closes #235.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh
---
 src/ExponentialUtilities.jl |   1 +
 src/phi.jl                  |  17 ++-
 src/phi_almohy.jl           | 281 ++++++++++++++++++++++++++++++++++++
 test/basictests.jl          |  62 ++++++++
 4 files changed, 358 insertions(+), 3 deletions(-)
 create mode 100644 src/phi_almohy.jl

diff --git a/src/ExponentialUtilities.jl b/src/ExponentialUtilities.jl
index 9e89db3..3492d41 100644
--- a/src/ExponentialUtilities.jl
+++ b/src/ExponentialUtilities.jl
@@ -29,6 +29,7 @@ include("exp_noalloc.jl")
 include("exp_generic.jl")
 include("exp_sparse.jl")
 include("phi.jl")
+include("phi_almohy.jl")
 include("arnoldi.jl")
 include("krylov_phiv.jl")
 include("krylov_phiv_adaptive.jl")
diff --git a/src/phi.jl b/src/phi.jl
index 8faa9eb..70bfac1 100644
--- a/src/phi.jl
+++ b/src/phi.jl
@@ -103,9 +103,12 @@ The phi functions are defined as
 \\varphi_0(z) = \\exp(z),\\quad \\varphi_{k+1}(z) = \\frac{\\varphi_k(z) - \\varphi_k(0)}{z}
 ```
 
-Calls `phiv_dense` on each of the basis vectors to obtain the answer. If A
-is `Diagonal`, instead calls the scalar `phi` on each diagonal element and the
-return values are also `Diagonal`s
+For `Float64`/`ComplexF64` matrices this uses the scaling-and-recovering
+algorithm of Al-Mohy and Liu (arXiv:2506.01193), which computes all of
+`phi_0(A), ..., phi_k(A)` simultaneously in `O(k m^3)` operations. Other element
+types fall back to calling `phiv_dense` on each basis vector (`O(m (m+k)^3)`). If
+`A` is `Diagonal`, the scalar `phi` is instead applied to each diagonal element
+and the return values are also `Diagonal`s.
 """
 function phi(A::AbstractMatrix{T}, k; kwargs...) where {T <: Number}
     m = size(A, 1)
@@ -127,6 +130,14 @@ function phi!(
     ) where {T <: Number}
     m = size(A, 1)
     @assert length(out) == k + 1&&all(P -> size(P) == (m, m), out) "Dimension mismatch"
+    # The scaling-and-recovering algorithm of Al-Mohy and Liu (arXiv:2506.01193)
+    # computes phi_0..phi_k simultaneously in O(k m^3), versus O(m (m+k)^3) for
+    # the basis-vector approach below. Its Pade tables are tuned for double
+    # precision, so it is used only for Float64/ComplexF64 and only when the
+    # caller has not supplied the legacy `caches` bundle.
+    if k >= 1 && isnothing(caches) && T <: Union{Float64, ComplexF64}
+        return _phi_almohy!(out, A, k)
+    end
     if isnothing(caches)
         e = Vector{T}(undef, m)
         W = Matrix{T}(undef, m, k + 1)
diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl
new file mode 100644
index 0000000..d2881fe
--- /dev/null
+++ b/src/phi_almohy.jl
@@ -0,0 +1,281 @@
+# Scaling-and-recovering algorithm for the matrix phi-functions, computing
+# phi_0(A), ..., phi_p(A) simultaneously in O(p*n^3) work.
+#
+# This is a port of Algorithm 5.1 of
+#
+#   Awad H. Al-Mohy and Xiaobo Liu, "A scaling and recovering algorithm for the
+#   matrix phi-functions", arXiv:2506.01193 (2025),
+#
+# and follows the authors' reference MATLAB implementation `phi_funm`.
+#
+# The idea: scale A down by 2^s, evaluate the [m/m] diagonal Pade approximant to
+# phi_p of the scaled matrix, recover the lower-index approximants via the
+# recurrence R^{(j)}_m(z) = z R^{(j+1)}_m(z) + 1/j!, then undo the scaling with
+# the double-argument formula
+#
+#   phi_j(2A) = 2^{-j} ( phi_0(A) phi_j(A) + sum_{k=1}^{j} phi_k(A)/(j-k)! ).
+
+# theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the
+# scaled matrix for which the [m/m] Pade approximant is backward stable to
+# double-precision unit roundoff). Rows are m = 1:20, columns p = 1:10.
+const _PHI_THETA_MP = [
+    1.999463452408407e-5 3.7631213142601604e-5 7.366006416045163e-5 0.00014973317297025854 0.0003152443333771182 0.0006855209983764435 0.0015357294906993542 0.0035357368946407606 0.008345789028062234 0.02013882808928226;
+    0.0038062018282832713 0.006090206286125726 0.009869682746779615 0.016211831146383013 0.026984240563843326 0.0454757968381803 0.07749273259855331 0.13324895779231027 0.2304625604362521 0.3988991104549146;
+    0.03971636005661334 0.05806968886880692 0.08534220076759817 0.12612151517169362 0.18736524835661617 0.27955116495245524 0.41822735418681667 0.6258702800351991 0.9335970443986562 1.1616793320890249;
+    0.15442675548312682 0.21278117034577634 0.29371996708854947 0.40617647304246707 0.5623843002320996 0.7787883505754265 1.0464245027100287 1.2572921799132364 1.480350861451984 1.713871003185325;
+    0.37980898016147974 0.5014624976587007 0.6621033824456818 0.8739858777744354 1.110828442901451 1.3375805277521873 1.5770955744229305 1.8273848586096524 2.08673114576671 2.3536689190936406;
+    0.726177195703321 0.9281910159646274 1.1591052927815408 1.4012982671152012 1.6570386251184455 1.924026602416338 2.20029216725632 2.4841706972729956 2.7742685942850143 3.069426294589578;
+    1.1898666361923196 1.4469917284172122 1.7187282676344413 2.0024009817133357 2.295729523138923 2.5968019781751672 2.9040336511997444 3.216121483847106 3.531999855119243 3.8508005672569903;
+    1.7605812331512907 2.060907194742016 2.371480030315257 2.690078917436836 3.014877598333601 3.3443898815994957 3.677415494646134 4.01299056190377 4.350344448873932 4.688863279363638;
+    2.425818958547233 2.7623053687512256 3.105174730511504 3.4527057639100476 3.8035234182863555 4.156538184296475 4.510892686686127 4.865916398054041 5.2210881567935115 5.57600564043454;
+    3.173113456793749 3.5393251225873685 3.9086764034913664 4.279911937822335 4.652058062541427 5.024366624424222 5.396268333440519 5.767334816025469 6.1372481669904 6.505776744744432;
+    3.991025201329815 4.3813599805888455 4.772204432206606 5.162703837030561 5.552220615863406 5.940286915538899 6.326566605567779 6.710825187972159 7.092906173082131 7.4727126397309815;
+    4.869485489784578 5.279199870248916 5.687344501175957 6.09339280009268 6.496977936247796 6.8978550856394785 7.295871912058635 7.690945666690415 8.083045525424248 8.472179024890941;
+    5.799827904547885 6.224971321619904 6.64693480650714 7.065451733211646 7.48036740469753 7.891610269499859 8.29916985381997 8.703079946679036 9.103405852177376 9.500234768451538;
+    6.774681996458441 7.211997240187581 7.644910128682907 8.073354767800579 8.497338949398504 8.916922998846443 9.332203860678709 9.74330319935391 10.150358556572542 10.5535168239286;
+    7.787817051572435 8.234634975085694 8.676142196766529 9.112424389503664 9.543611219824884 9.969861373150518 10.391351525913848 10.808268293369911 11.220802409044003 11.629144570605774;
+    8.833978214833847 9.28811956766922 9.736292792866143 10.178695474892383 10.615546677548995 11.04707680323746 11.473520338548882 11.895110742359616 12.312076916038997 12.724640835984562;
+    9.908733823133844 10.368423129623023 10.821685340709736 11.26879859255289 11.710045790427412 12.145708155762449 12.576060822320501 13.001369927621694 13.421890788016032 13.837866852904227;
+    11.008341035221061 11.472133539821815 11.929195701618417 12.379861767254168 12.824458619109002 13.263302077220441 13.696694607678852 14.124924034990169 14.548262963142152 14.966968689718191;
+    12.129631147167878 12.596352058386916 13.056160721729096 13.509428665638305 13.956511465304834 14.397747062985934 14.83345501195776 15.263936358537684 15.689473955602649 16.110333058994453;
+    13.269913405576162 13.738607964511967 14.20030229038679 14.655390835644507 15.104246216977073 15.547218984875734 15.984637960610213 16.41681094336813 16.84402564730148 17.266550769620974;
+]
+
+# theta_{m,p} with the p>7 rule of the paper: for p>7 the p=7 column is used, and
+# indices are guarded so m outside 1:20 does not error.
+@inline function _phi_theta(m::Integer, p::Integer)
+    return _PHI_THETA_MP[m, p > 7 ? 7 : p]
+end
+
+# k! as a Float64 (k small; avoids Int overflow for the recovery coefficients).
+@inline function _factf(k::Integer)
+    r = 1.0
+    for i in 2:k
+        r *= i
+    end
+    return r
+end
+
+# Exact 1-norm of B^k for an entrywise-nonnegative real B, via ‖B^k‖_1 =
+# ‖(B^T)^k 𝟙‖_∞ (k matrix-vector products, no explicit power).
+function _normpow_nonneg(B::AbstractMatrix{<:Real}, k::Integer)
+    n = size(B, 1)
+    Bt = transpose(B)
+    v = ones(eltype(B), n)
+    tmp = similar(v)
+    for _ in 1:k
+        mul!(tmp, Bt, v)
+        v, tmp = tmp, v
+    end
+    return maximum(v)
+end
+
+# Scaling parameter `t` derived from the first term of the backward-error series
+# (Eq. (3.12)); mirrors the `ell` subfunction of the reference implementation.
+function _phi_ell(A::AbstractMatrix, m::Integer, p::Integer, phat::Integer)
+    normT = opnorm(A, 1)
+    normT == 0 && return 0
+    t0 = normT > 1 ? log2(normT) : 0.0
+    scalefac = exp2(t0)
+    normTs = normT / scalefac
+    delta = (p - 1) * (p - phat) / p + 1
+    coeff = (_factf(m + p) / _factf(2m + p)) * (_factf(m) / _factf(2m + p + 1))
+    K = 2m + p + 1
+    c = (coeff / normTs^delta)^(1 / K)
+    scaledT = c .* abs.(A ./ scalefac)
+    alpha = _normpow_nonneg(scaledT, K)
+    t = log2(2alpha / eps(Float64)) / (K - delta) + t0
+    return max(ceil(Int, t), 0)
+end
+
+# Select the Pade degree m, scaling parameter s, and Paterson-Stockmeyer block
+# size tau that minimize the equivalent number of matrix products (Section 4).
+function _select_parameters_phi(A::AbstractMatrix, p::Integer)
+    m_max = 12
+    i_max = ceil(Int, sqrt(8 * (m_max + 1)) - 3) - 1
+    m_max = (i_max + 3)^2 ÷ 8
+    phat = _phi_theta(m_max, p) < 1 ? 0 : p
+    r_max = floor(Int, (1 + sqrt(1 + 4 * (2m_max + phat + 1))) / 2)
+
+    # eta[j] = ‖A^{j+1}‖_1^{1/(j+1)} for j = 1:r_max, forming the powers exactly.
+    # (The powers are low, j+1 ≤ r_max+1 ≲ 8; the cost is negligible next to the
+    # rest of the algorithm and never underestimates, keeping s conservative.)
+    eta = zeros(Float64, r_max)
+    P = A * A
+    eta[1] = opnorm(P, 1)^(1 / 2)
+    for j in 2:r_max
+        P = P * A
+        eta[j] = opnorm(P, 1)^(1 / (j + 1))
+    end
+    alpha = [max(eta[j], eta[j + 1]) for j in 1:(r_max - 1)]
+
+    Cost = zeros(Float64, i_max + 1, r_max - 1)
+    for i in 0:i_max
+        m_i = (i + 3)^2 ÷ 8
+        θ = _phi_theta(m_i, p)
+        phat_i = θ < 1 ? 0 : p
+        t = _phi_ell(A, m_i, p, phat_i)
+        for r in 2:r_max
+            if 2m_i + phat_i + 1 >= r * (r - 1)
+                a = alpha[r - 1]
+                s0 = (a > 0 && isfinite(a)) ? max(ceil(Int, log2(a / θ)), t) : t
+                Cost[i + 1, r - 1] = i + p + s0 * (p + 1)
+            end
+        end
+    end
+
+    minval = Inf
+    for v in Cost
+        if v > 0 && v < minval
+            minval = v
+        end
+    end
+    # Match the reference's column-major "last" tie-break for reproducibility.
+    idx = 0
+    for (j, v) in enumerate(Cost)
+        v == minval && (idx = j)
+    end
+    i_star = (idx - 1) % (i_max + 1)
+    m = (i_star + 3)^2 ÷ 8
+    s = round(Int, (minval - i_star - p) / (p + 1))
+
+    tau = floor(Int, sqrt(2m))
+    if tau - 1 + 2 * (m ÷ tau) - 2 * (m % tau == 0) != i_star
+        tau = ceil(Int, sqrt(2m))
+    end
+    return m, s, tau
+end
+
+# Renormalized [m/m] Pade coefficients (numerator and denominator, low order
+# first) of the approximant to phi_p. Follows the recurrences of Berland,
+# Skaflestad, and Wright used in the reference `pade_coef`.
+function _phi_pade_coef(m::Integer, p::Integer)
+    n1 = prod(Float64.((m + 1):(2m + 1)))   # (2m+1)!/m!
+    d1 = n1
+    Ncoef = Vector{Float64}(undef, m + 1)
+    Dcoef = Vector{Float64}(undef, m + 1)
+    Amat = Matrix{Float64}(undef, m + 1, m + 1)
+    for k in 1:p
+        if k == p
+            # First column: Amat[r,1] = n1 * prod_{l=1}^{r-1} 1/(k+l).
+            cp = 1.0
+            Amat[1, 1] = n1
+            for r in 2:(m + 1)
+                cp /= (k + (r - 1))
+                Amat[r, 1] = n1 * cp
+            end
+            for c in 2:(m + 1), r in 2:(m + 1)
+                I = r - 1
+                J = c - 1
+                Amat[r, c] = -(m + 1 - J) * (k + 1 + I - J) / ((2m + k + 1 - J) * J)
+            end
+            for r in 1:(m + 1)
+                s = 0.0
+                pr = 1.0
+                for c in 1:r
+                    pr *= Amat[r, c]
+                    s += pr
+                end
+                Ncoef[r] = s
+            end
+            Dcoef[1] = d1
+            cpd = 1.0
+            for r in 2:(m + 1)
+                i = r - 1
+                cpd *= -(m + 1 - i) / (i * (2m + k + 1 - i))
+                Dcoef[r] = d1 * cpd
+            end
+        end
+        n1 = n1 * (2m + k + 1) / (k + 1)
+        d1 = d1 * (2m + k + 1)
+    end
+    return Ncoef, Dcoef
+end
+
+# Simultaneously evaluate the numerator N(A) and denominator D(A) of the Pade
+# approximant via the Paterson-Stockmeyer scheme with block size tau.
+function _paterson_stockmeyer(A::AbstractMatrix{T}, Nc, Dc, tau::Integer) where {T}
+    m = length(Nc) - 1
+    n = size(A, 1)
+    Apow = Vector{Matrix{T}}(undef, tau + 1)
+    Apow[1] = Matrix{T}(I, n, n)
+    Apow[2] = Matrix(A)
+    for i in 3:(tau + 1)
+        Apow[i] = Apow[i - 1] * A
+    end
+    Atau = copy(Apow[tau + 1])
+    N = zeros(T, n, n)
+    D = zeros(T, n, n)
+    nu = m ÷ tau
+    for i in 0:nu
+        start = i * tau + 1
+        stop = min((i + 1) * tau, m + 1)
+        blkN = zeros(T, n, n)
+        blkD = zeros(T, n, n)
+        for l in 1:(stop - start + 1)
+            blkN .+= Nc[start + l - 1] .* Apow[l]
+            blkD .+= Dc[start + l - 1] .* Apow[l]
+        end
+        if i == 0
+            N .+= blkN
+            D .+= blkD
+        else
+            N .+= blkN * Atau
+            D .+= blkD * Atau
+            Atau = Atau * Apow[tau + 1]
+        end
+    end
+    return N, D
+end
+
+"""
+    _phi_almohy!(out, A, p) -> out
+
+Simultaneously compute `phi_0(A), phi_1(A), ..., phi_p(A)` for a dense matrix `A`
+(`p >= 1`) using the scaling-and-recovering algorithm of Al-Mohy and Liu
+(arXiv:2506.01193). `out` must be a length-`p+1` vector of `size(A)` matrices; on
+return `out[j+1] == phi_j(A)`.
+
+The cost is `O(p n^3)`, in contrast to the `O(n (n+p)^3)` of the basis-vector
+approach in [`phi!`](@ref).
+"""
+function _phi_almohy!(
+        out::AbstractVector{<:AbstractMatrix}, A::AbstractMatrix{T},
+        p::Integer
+    ) where {T}
+    n = size(A, 1)
+    m, s, tau = _select_parameters_phi(A, p)
+    As = A ./ exp2(s)
+    Nc, Dc = _phi_pade_coef(m, p)
+    Nm, Dm = _paterson_stockmeyer(As, Nc, Dc, tau)
+
+    Rm = Vector{Matrix{T}}(undef, p + 1)
+    Rm[p + 1] = Dm \ Nm
+    # Recurrence (2.9): R^{(j)} = As R^{(j+1)} + I/j!, j = p-1 : -1 : 0.
+    for k in p:-1:1
+        R = As * Rm[k + 1]
+        f = 1 / _factf(k - 1)
+        @inbounds for d in 1:n
+            R[d, d] += f
+        end
+        Rm[k] = R
+    end
+
+    # Undo the scaling with the double-argument formula (2.10), s times.
+    for _ in 1:s
+        for j in p:-1:1
+            M = Rm[1] * Rm[j + 1]
+            for k in 1:j
+                M .+= (1 / _factf(j - k)) .* Rm[k + 1]
+            end
+            M ./= exp2(j)
+            Rm[j + 1] = M
+        end
+        Rm[1] = Rm[1] * Rm[1]
+    end
+
+    for j in 0:p
+        copyto!(out[j + 1], Rm[j + 1])
+    end
+    return out
+end
diff --git a/test/basictests.jl b/test/basictests.jl
index 539ec2c..924f2d6 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -256,6 +256,68 @@ end
     end
 end
 
+@testset "Phi (Al-Mohy--Liu vs reference)" begin
+    # Independent reference: phi_0..phi_p are the first block row of exp(W) for
+    # the augmented block matrix W = [A E; 0 J] (Al-Mohy--Liu, Theorem 2.1), with
+    # E = [I 0 ... 0] and J the nilpotent shift. This is unrelated to the
+    # scaling-and-recovering algorithm under test.
+    function phi_block_reference(A, p)
+        n = size(A, 1)
+        T = eltype(A)
+        W = zeros(T, n * (p + 1), n * (p + 1))
+        W[1:n, 1:n] .= A
+        for j in 0:(p - 1)
+            rb = j * n
+            W[(rb + 1):(rb + n), (rb + n + 1):(rb + 2n)] .= Matrix{T}(I, n, n)
+        end
+        eW = exp(W)
+        return [eW[1:n, (j * n + 1):((j + 1) * n)] for j in 0:p]
+    end
+
+    # Force the legacy basis-vector path (Sidje/`phiv_dense`) for cross-checking.
+    function phi_legacy(A, p)
+        n = size(A, 1)
+        T = eltype(A)
+        caches = (
+            Vector{T}(undef, n), Matrix{T}(undef, n, p + 1),
+            Matrix{T}(undef, n + p, n + p),
+        )
+        out = [Matrix{T}(undef, n, n) for _ in 1:(p + 1)]
+        return phi!(out, A, p; caches = caches)
+    end
+
+    Random.seed!(20250701)
+    testmats = Dict(
+        "small 2x2" => [0.1 0.2; 0.3 0.4],
+        "random 8x8" => randn(8, 8) ./ 4,
+        "nonnormal 10x10" => 3 * (triu(randn(10, 10), 1) + Diagonal(randn(10))),
+        "large-norm 6x6" => 6 * randn(6, 6),
+        "complex 5x5" => randn(ComplexF64, 5, 5),
+        "hessenberg 12x12" => Matrix(hessenberg(randn(12, 12)).H),
+        "zero 4x4" => zeros(4, 4),
+    )
+    for (name, A) in testmats, p in (1, 2, 3, 5)
+        ref = phi_block_reference(A, p)
+        new = phi(A, p)                 # default path == Al-Mohy--Liu
+        leg = phi_legacy(A, p)
+        for j in 1:(p + 1)
+            @test new[j] ≈ ref[j] rtol = 1.0e-8 atol = 1.0e-12
+            @test new[j] ≈ leg[j] rtol = 1.0e-7 atol = 1.0e-10
+        end
+    end
+
+    # phi_0 must equal the matrix exponential.
+    A = randn(9, 9) ./ 3
+    @test phi(A, 4)[1] ≈ exp(A)
+
+    # Preallocated in-place entry point returns phi_j in out[j+1].
+    n = 7
+    A = randn(n, n) ./ 3
+    out = [Matrix{Float64}(undef, n, n) for _ in 1:4]
+    phi!(out, A, 3)
+    @test out ≈ phi_block_reference(A, 3)
+end
+
 @testset "Static Arrays" begin
     Random.seed!(0)
     for N in (3, 4, 6, 8), t in (0.1, 1.0, 10.0)

From f72fa0cc0626bb819be3f5b57c4c045d23285cf9 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 3 Jul 2026 03:49:41 -0400
Subject: [PATCH 2/8] phi: reusable zero-alloc workspace + container-preserving
 static path

Address review feedback on #236:

* PhiPadeCache: a reusable workspace holding every scratch buffer. The
  strided Float64/ComplexF64 path (`_phi_almohy!`) now runs fully in place
  via mul!/getrf!/getrs! and broadcasting, allocating nothing once a cache
  exists (verified 0 bytes with @allocated across sizes). Pass it as the
  `caches` keyword to `phi!` to reuse across calls. (@stevengj: "do
  in-place, maybe?")

* Container-preserving generic path (`_phi_almohy_generic`) for immutable
  dense matrices: an SMatrix input stays an SMatrix throughout instead of
  being converted to a Matrix. Routed only for `!ismutable` inputs, so
  sparse/other mutable non-strided matrices keep the legacy path.
  (@stevengj: "unfortunate to use Matrix here if you start with SMatrix")

* Export and document PhiPadeCache and phi!; bump to 1.31.0 (new public
  API, SemVer minor).

Tests: workspace reuse + zero-allocation assertion, complex-matrix
workspace, and SMatrix static-in/static-out accuracy vs the block-exponential
reference. Full basictests + Aqua + JET pass locally (the only failures are
the pre-existing ForwardDiff-1.x Issue 42 asserts, green under CI's pinned
ForwardDiff 0.10.x).

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh
---
 Project.toml                    |   2 +-
 docs/src/matrix_exponentials.md |   2 +
 src/ExponentialUtilities.jl     |   2 +-
 src/phi.jl                      |  21 +-
 src/phi_almohy.jl               | 490 +++++++++++++++++++++++---------
 test/basictests.jl              |  41 +++
 6 files changed, 412 insertions(+), 146 deletions(-)

diff --git a/Project.toml b/Project.toml
index a5c275c..a10df2c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "ExponentialUtilities"
 uuid = "d4d017d3-3776-5f7e-afef-a10c40355c18"
 authors = ["Chris Rackauckas <accounts@chrisrackauckas.com>", "José E. Cruz Serrallés <Jose.CruzSerralles@nyulangone.org>"]
-version = "1.30.2"
+version = "1.31.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
diff --git a/docs/src/matrix_exponentials.md b/docs/src/matrix_exponentials.md
index 48aca6b..8a5e56a 100644
--- a/docs/src/matrix_exponentials.md
+++ b/docs/src/matrix_exponentials.md
@@ -3,6 +3,8 @@
 ```@docs
 exponential!
 phi
+phi!
+PhiPadeCache
 ```
 
 ## Methods
diff --git a/src/ExponentialUtilities.jl b/src/ExponentialUtilities.jl
index 3492d41..579ab7e 100644
--- a/src/ExponentialUtilities.jl
+++ b/src/ExponentialUtilities.jl
@@ -42,7 +42,7 @@ include("precompile.jl")
 export phi, phi!, KrylovSubspace, arnoldi, arnoldi!, lanczos!, ExpvCache, PhivCache,
     expv, expv!, phiv, phiv!, kiops, expv_timestep, expv_timestep!, phiv_timestep,
     phiv_timestep!,
-    StegrCache, get_subspace_cache, exponential!
+    StegrCache, PhiPadeCache, get_subspace_cache, exponential!
 export ExpMethodHigham2005,
     ExpMethodHigham2005Base, ExpMethodGeneric, ExpMethodNative,
     ExpMethodDiagonalization
diff --git a/src/phi.jl b/src/phi.jl
index 70bfac1..d936a8f 100644
--- a/src/phi.jl
+++ b/src/phi.jl
@@ -133,10 +133,23 @@ function phi!(
     # The scaling-and-recovering algorithm of Al-Mohy and Liu (arXiv:2506.01193)
     # computes phi_0..phi_k simultaneously in O(k m^3), versus O(m (m+k)^3) for
     # the basis-vector approach below. Its Pade tables are tuned for double
-    # precision, so it is used only for Float64/ComplexF64 and only when the
-    # caller has not supplied the legacy `caches` bundle.
-    if k >= 1 && isnothing(caches) && T <: Union{Float64, ComplexF64}
-        return _phi_almohy!(out, A, k)
+    # precision, so it is used only for Float64/ComplexF64. The legacy path is
+    # kept for other element types and when a caller supplies the legacy
+    # `caches` tuple.
+    if k >= 1 && T <: Union{Float64, ComplexF64} && !(caches isa Tuple)
+        if A isa StridedMatrix && (isnothing(caches) || caches isa PhiPadeCache)
+            cache = caches isa PhiPadeCache ? caches : PhiPadeCache(A, k)
+            return _phi_almohy!(out, A, k, cache)
+        elseif isnothing(caches) && !ismutable(A)
+            # Container-preserving path for immutable dense matrices (e.g.
+            # `SMatrix`); mutable non-strided types (e.g. sparse) fall through to
+            # the legacy path below.
+            Rm = _phi_almohy_generic(A, k)
+            for j in 1:(k + 1)
+                copyto!(out[j], Rm[j])
+            end
+            return out
+        end
     end
     if isnothing(caches)
         e = Vector{T}(undef, m)
diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl
index d2881fe..ae553f9 100644
--- a/src/phi_almohy.jl
+++ b/src/phi_almohy.jl
@@ -14,6 +14,12 @@
 # the double-argument formula
 #
 #   phi_j(2A) = 2^{-j} ( phi_0(A) phi_j(A) + sum_{k=1}^{j} phi_k(A)/(j-k)! ).
+#
+# Two implementations share the scalar machinery below:
+#   * `_phi_almohy!`  -- fully in-place, allocation-free, using a reusable
+#                        `PhiPadeCache` workspace (for strided mutable matrices).
+#   * `_phi_almohy_generic` -- out-of-place and element-container-preserving, so
+#                        a static input (e.g. `SMatrix`) stays static.
 
 # theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the
 # scaled matrix for which the [m/m] Pade approximant is backward stable to
@@ -41,6 +47,14 @@ const _PHI_THETA_MP = [
     13.269913405576162 13.738607964511967 14.20030229038679 14.655390835644507 15.104246216977073 15.547218984875734 15.984637960610213 16.41681094336813 16.84402564730148 17.266550769620974;
 ]
 
+# Largest Pade degree used; bounds cond(D_m) (Section 4). Fixes the block-size
+# and power-buffer counts used to size the workspace.
+const _PHI_M_MAX = 12
+# max tau+1 over m <= _PHI_M_MAX, tau = ceil(sqrt(2m)); ceil(sqrt(24)) = 5.
+const _PHI_NPOW = 6
+# max i (Paterson--Stockmeyer cost index) for _PHI_M_MAX.
+const _PHI_IMAX = ceil(Int, sqrt(8 * (_PHI_M_MAX + 1)) - 3) - 1
+
 # theta_{m,p} with the p>7 rule of the paper: for p>7 the p=7 column is used, and
 # indices are guarded so m outside 1:20 does not error.
 @inline function _phi_theta(m::Integer, p::Integer)
@@ -56,105 +70,78 @@ end
     return r
 end
 
-# Exact 1-norm of B^k for an entrywise-nonnegative real B, via ‖B^k‖_1 =
-# ‖(B^T)^k 𝟙‖_∞ (k matrix-vector products, no explicit power).
-function _normpow_nonneg(B::AbstractMatrix{<:Real}, k::Integer)
-    n = size(B, 1)
-    Bt = transpose(B)
-    v = ones(eltype(B), n)
-    tmp = similar(v)
-    for _ in 1:k
-        mul!(tmp, Bt, v)
-        v, tmp = tmp, v
-    end
-    return maximum(v)
-end
-
-# Scaling parameter `t` derived from the first term of the backward-error series
-# (Eq. (3.12)); mirrors the `ell` subfunction of the reference implementation.
-function _phi_ell(A::AbstractMatrix, m::Integer, p::Integer, phat::Integer)
-    normT = opnorm(A, 1)
-    normT == 0 && return 0
-    t0 = normT > 1 ? log2(normT) : 0.0
-    scalefac = exp2(t0)
-    normTs = normT / scalefac
-    delta = (p - 1) * (p - phat) / p + 1
-    coeff = (_factf(m + p) / _factf(2m + p)) * (_factf(m) / _factf(2m + p + 1))
-    K = 2m + p + 1
-    c = (coeff / normTs^delta)^(1 / K)
-    scaledT = c .* abs.(A ./ scalefac)
-    alpha = _normpow_nonneg(scaledT, K)
-    t = log2(2alpha / eps(Float64)) / (K - delta) + t0
-    return max(ceil(Int, t), 0)
+# First-factor coefficient of the backward-error series h_{m,p} (Eq. (3.4)).
+@inline function _phi_be_coeff(m::Integer, p::Integer)
+    return (_factf(m + p) / _factf(2m + p)) * (_factf(m) / _factf(2m + p + 1))
 end
 
-# Select the Pade degree m, scaling parameter s, and Paterson-Stockmeyer block
-# size tau that minimize the equivalent number of matrix products (Section 4).
-function _select_parameters_phi(A::AbstractMatrix, p::Integer)
-    m_max = 12
-    i_max = ceil(Int, sqrt(8 * (m_max + 1)) - 3) - 1
-    m_max = (i_max + 3)^2 ÷ 8
-    phat = _phi_theta(m_max, p) < 1 ? 0 : p
-    r_max = floor(Int, (1 + sqrt(1 + 4 * (2m_max + phat + 1))) / 2)
-
-    # eta[j] = ‖A^{j+1}‖_1^{1/(j+1)} for j = 1:r_max, forming the powers exactly.
-    # (The powers are low, j+1 ≤ r_max+1 ≲ 8; the cost is negligible next to the
-    # rest of the algorithm and never underestimates, keeping s conservative.)
-    eta = zeros(Float64, r_max)
-    P = A * A
-    eta[1] = opnorm(P, 1)^(1 / 2)
-    for j in 2:r_max
-        P = P * A
-        eta[j] = opnorm(P, 1)^(1 / (j + 1))
-    end
-    alpha = [max(eta[j], eta[j + 1]) for j in 1:(r_max - 1)]
-
-    Cost = zeros(Float64, i_max + 1, r_max - 1)
-    for i in 0:i_max
-        m_i = (i + 3)^2 ÷ 8
-        θ = _phi_theta(m_i, p)
-        phat_i = θ < 1 ? 0 : p
-        t = _phi_ell(A, m_i, p, phat_i)
-        for r in 2:r_max
-            if 2m_i + phat_i + 1 >= r * (r - 1)
-                a = alpha[r - 1]
-                s0 = (a > 0 && isfinite(a)) ? max(ceil(Int, log2(a / θ)), t) : t
-                Cost[i + 1, r - 1] = i + p + s0 * (p + 1)
-            end
-        end
-    end
+"""
+    PhiPadeCache(A, p)
 
-    minval = Inf
-    for v in Cost
-        if v > 0 && v < minval
-            minval = v
-        end
-    end
-    # Match the reference's column-major "last" tie-break for reproducibility.
-    idx = 0
-    for (j, v) in enumerate(Cost)
-        v == minval && (idx = j)
-    end
-    i_star = (idx - 1) % (i_max + 1)
-    m = (i_star + 3)^2 ÷ 8
-    s = round(Int, (minval - i_star - p) / (p + 1))
+Reusable workspace for the in-place phi-function evaluator [`phi!`](@ref) on
+strided `Float64`/`ComplexF64` matrices. Allocate once for a given size and `p`,
+then pass it as the `caches` keyword to `phi!` to reuse across calls without
+further allocation.
+"""
+struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}}
+    As::MT
+    Id::MT
+    Apow::Vector{MT}
+    Atau::MT
+    Nm::MT
+    Dm::MT
+    Dfact::MT
+    blkN::MT
+    blkD::MT
+    tmp::MT
+    pow1::MT
+    pow2::MT
+    absA::RMT
+    ipiv::Vector{LinearAlgebra.BlasInt}
+    rvec1::Vector{RT}
+    rvec2::Vector{RT}
+    Ncoef::Vector{Float64}
+    Dcoef::Vector{Float64}
+    Amat::Matrix{Float64}
+    eta::Vector{Float64}
+    alpha::Vector{Float64}
+    tvals::Vector{Int}
+    Cost::Matrix{Float64}
+end
 
-    tau = floor(Int, sqrt(2m))
-    if tau - 1 + 2 * (m ÷ tau) - 2 * (m % tau == 0) != i_star
-        tau = ceil(Int, sqrt(2m))
+function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T}
+    n = size(A, 1)
+    RT = real(T)
+    mk() = Matrix{T}(undef, n, n)
+    Apow = Vector{Matrix{T}}(undef, _PHI_NPOW)
+    Apow[1] = Matrix{T}(I, n, n)
+    for i in 2:_PHI_NPOW
+        Apow[i] = mk()
     end
-    return m, s, tau
+    return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}}(
+        mk(), Apow[1], Apow, mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(),
+        Matrix{RT}(undef, n, n),
+        Vector{LinearAlgebra.BlasInt}(undef, n),
+        Vector{RT}(undef, n), Vector{RT}(undef, n),
+        Vector{Float64}(undef, _PHI_M_MAX + 1), Vector{Float64}(undef, _PHI_M_MAX + 1),
+        Matrix{Float64}(undef, _PHI_M_MAX + 1, _PHI_M_MAX + 1),
+        Vector{Float64}(undef, _PHI_IMAX + 2), Vector{Float64}(undef, _PHI_IMAX + 2),
+        Vector{Int}(undef, _PHI_IMAX + 1),
+        Matrix{Float64}(undef, _PHI_IMAX + 1, _PHI_IMAX + 2),
+    )
 end
 
 # Renormalized [m/m] Pade coefficients (numerator and denominator, low order
-# first) of the approximant to phi_p. Follows the recurrences of Berland,
-# Skaflestad, and Wright used in the reference `pade_coef`.
-function _phi_pade_coef(m::Integer, p::Integer)
-    n1 = prod(Float64.((m + 1):(2m + 1)))   # (2m+1)!/m!
+# first) of the approximant to phi_p, written into the preallocated `Ncoef`,
+# `Dcoef` (length >= m+1) using scratch `Amat` (size >= (m+1)^2). Follows the
+# recurrences of Berland, Skaflestad, and Wright used in the reference
+# `pade_coef`.
+function _phi_pade_coef!(Ncoef, Dcoef, Amat, m::Integer, p::Integer)
+    n1 = 1.0                                # (2m+1)!/m!
+    for i in (m + 1):(2m + 1)
+        n1 *= i
+    end
     d1 = n1
-    Ncoef = Vector{Float64}(undef, m + 1)
-    Dcoef = Vector{Float64}(undef, m + 1)
-    Amat = Matrix{Float64}(undef, m + 1, m + 1)
     for k in 1:p
         if k == p
             # First column: Amat[r,1] = n1 * prod_{l=1}^{r-1} 1/(k+l).
@@ -192,90 +179,313 @@ function _phi_pade_coef(m::Integer, p::Integer)
     return Ncoef, Dcoef
 end
 
-# Simultaneously evaluate the numerator N(A) and denominator D(A) of the Pade
-# approximant via the Paterson-Stockmeyer scheme with block size tau.
-function _paterson_stockmeyer(A::AbstractMatrix{T}, Nc, Dc, tau::Integer) where {T}
-    m = length(Nc) - 1
-    n = size(A, 1)
-    Apow = Vector{Matrix{T}}(undef, tau + 1)
-    Apow[1] = Matrix{T}(I, n, n)
-    Apow[2] = Matrix(A)
+# Given the alpha_r sequence (first `nalpha` entries) and precomputed scaling
+# parameters `tvals[i+1] = t(m_i)`, minimize the equivalent-matrix-product cost
+# (Section 4) over the optimal degrees and return the chosen (m, s, tau). `Cost`
+# is scratch of size >= (imax+1) x (r_max-1).
+function _phi_select_from_alpha(alpha, nalpha::Integer, p::Integer, tvals, Cost)
+    i_max = _PHI_IMAX
+    r_max = nalpha + 1
+    C = @view Cost[1:(i_max + 1), 1:(r_max - 1)]
+    fill!(C, 0.0)
+    for i in 0:i_max
+        m_i = (i + 3)^2 ÷ 8
+        θ = _phi_theta(m_i, p)
+        phat_i = θ < 1 ? 0 : p
+        t = tvals[i + 1]
+        for r in 2:r_max
+            if 2m_i + phat_i + 1 >= r * (r - 1)
+                a = alpha[r - 1]
+                s0 = (a > 0 && isfinite(a)) ? max(ceil(Int, log2(a / θ)), t) : t
+                C[i + 1, r - 1] = i + p + s0 * (p + 1)
+            end
+        end
+    end
+    nrows = i_max + 1
+    ncols = r_max - 1
+    minval = Inf
+    for v in C
+        (v > 0 && v < minval) && (minval = v)
+    end
+    # Column-major "last" tie-break (matches the reference), giving the row index.
+    i_star = 0
+    for cc in 1:ncols, rr in 1:nrows
+        C[rr, cc] == minval && (i_star = rr - 1)
+    end
+    m = (i_star + 3)^2 ÷ 8
+    s = round(Int, (minval - i_star - p) / (p + 1))
+    tau = floor(Int, sqrt(2m))
+    if tau - 1 + 2 * (m ÷ tau) - 2 * (m % tau == 0) != i_star
+        tau = ceil(Int, sqrt(2m))
+    end
+    return m, s, tau
+end
+
+## -------------------- in-place (workspace) implementation --------------------
+
+# Exact 1-norm of B^k for entrywise-nonnegative real B, ‖B^k‖_1 = ‖(B^T)^k 𝟙‖_∞,
+# via k matrix-vector products into the workspace vectors.
+function _normpow_nonneg!(cache::PhiPadeCache, B, k::Integer)
+    v = cache.rvec1
+    tmp = cache.rvec2
+    fill!(v, 1)
+    Bt = transpose(B)
+    for _ in 1:k
+        mul!(tmp, Bt, v)
+        v, tmp = tmp, v
+    end
+    return maximum(v)
+end
+
+# Scaling parameter `t` from the first term of the backward-error series
+# (Eq. (3.12)); `ell` subfunction of the reference, in-place.
+function _phi_ell!(cache::PhiPadeCache, A, m::Integer, p::Integer, phat::Integer)
+    normT = opnorm(A, 1)
+    normT == 0 && return 0
+    t0 = normT > 1 ? log2(normT) : 0.0
+    scalefac = exp2(t0)
+    normTs = normT / scalefac
+    delta = (p - 1) * (p - phat) / p + 1
+    K = 2m + p + 1
+    c = (_phi_be_coeff(m, p) / normTs^delta)^(1 / K)
+    absA = cache.absA
+    @. absA = c * abs(A) / scalefac
+    alpha = _normpow_nonneg!(cache, absA, K)
+    t = log2(2alpha / eps(Float64)) / (K - delta) + t0
+    return max(ceil(Int, t), 0)
+end
+
+function _select_parameters_phi!(cache::PhiPadeCache, A, p::Integer)
+    phat = _phi_theta(_PHI_M_MAX, p) < 1 ? 0 : p
+    r_max = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + phat + 1))) / 2)
+    eta = cache.eta
+    mul!(cache.pow1, A, A)
+    eta[1] = opnorm(cache.pow1, 1)^(1 / 2)
+    cur, oth = cache.pow1, cache.pow2
+    for j in 2:r_max
+        mul!(oth, cur, A)
+        eta[j] = opnorm(oth, 1)^(1 / (j + 1))
+        cur, oth = oth, cur
+    end
+    alpha = cache.alpha
+    for j in 1:(r_max - 1)
+        alpha[j] = max(eta[j], eta[j + 1])
+    end
+    for i in 0:_PHI_IMAX
+        m_i = (i + 3)^2 ÷ 8
+        phat_i = _phi_theta(m_i, p) < 1 ? 0 : p
+        cache.tvals[i + 1] = _phi_ell!(cache, A, m_i, p, phat_i)
+    end
+    return _phi_select_from_alpha(alpha, r_max - 1, p, cache.tvals, cache.Cost)
+end
+
+# Evaluate N(As) and D(As) into cache.Nm, cache.Dm via Paterson--Stockmeyer.
+function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer)
+    Nc, Dc, Apow = cache.Ncoef, cache.Dcoef, cache.Apow
+    Atau, Nm, Dm = cache.Atau, cache.Nm, cache.Dm
+    blkN, blkD, tmp = cache.blkN, cache.blkD, cache.tmp
+    copyto!(Apow[2], As)               # Apow[1] === Id; Apow[i] = As^{i-1}
     for i in 3:(tau + 1)
-        Apow[i] = Apow[i - 1] * A
+        mul!(Apow[i], Apow[i - 1], As)
     end
-    Atau = copy(Apow[tau + 1])
-    N = zeros(T, n, n)
-    D = zeros(T, n, n)
+    copyto!(Atau, Apow[tau + 1])
+    fill!(Nm, 0)
+    fill!(Dm, 0)
     nu = m ÷ tau
     for i in 0:nu
         start = i * tau + 1
         stop = min((i + 1) * tau, m + 1)
-        blkN = zeros(T, n, n)
-        blkD = zeros(T, n, n)
+        fill!(blkN, 0)
+        fill!(blkD, 0)
         for l in 1:(stop - start + 1)
-            blkN .+= Nc[start + l - 1] .* Apow[l]
-            blkD .+= Dc[start + l - 1] .* Apow[l]
+            @. blkN += Nc[start + l - 1] * Apow[l]
+            @. blkD += Dc[start + l - 1] * Apow[l]
         end
         if i == 0
-            N .+= blkN
-            D .+= blkD
+            Nm .+= blkN
+            Dm .+= blkD
         else
-            N .+= blkN * Atau
-            D .+= blkD * Atau
-            Atau = Atau * Apow[tau + 1]
+            mul!(Nm, blkN, Atau, true, true)
+            mul!(Dm, blkD, Atau, true, true)
+            mul!(tmp, Atau, Apow[tau + 1])
+            copyto!(Atau, tmp)
         end
     end
-    return N, D
+    return Nm, Dm
 end
 
-"""
-    _phi_almohy!(out, A, p) -> out
+# Solve Dfact * X = B in place (B overwritten by X), zero-allocation LU.
+function _phi_solve!(Dfact, ipiv, B)
+    LinearAlgebra.LAPACK.getrf!(Dfact, ipiv)
+    LinearAlgebra.LAPACK.getrs!('N', Dfact, ipiv, B)
+    return B
+end
 
-Simultaneously compute `phi_0(A), phi_1(A), ..., phi_p(A)` for a dense matrix `A`
-(`p >= 1`) using the scaling-and-recovering algorithm of Al-Mohy and Liu
-(arXiv:2506.01193). `out` must be a length-`p+1` vector of `size(A)` matrices; on
-return `out[j+1] == phi_j(A)`.
+"""
+    _phi_almohy!(out, A, p, cache) -> out
 
-The cost is `O(p n^3)`, in contrast to the `O(n (n+p)^3)` of the basis-vector
-approach in [`phi!`](@ref).
+Allocation-free evaluation of `phi_0(A), ..., phi_p(A)` for a strided
+`Float64`/`ComplexF64` matrix `A` (`p >= 1`), writing `phi_j(A)` into `out[j+1]`
+and using the reusable [`PhiPadeCache`](@ref) `cache` for all scratch storage.
+The `out` matrices double as the recovery buffers.
 """
 function _phi_almohy!(
-        out::AbstractVector{<:AbstractMatrix}, A::AbstractMatrix{T},
-        p::Integer
-    ) where {T}
+        out::AbstractVector{<:AbstractMatrix}, A::AbstractMatrix, p::Integer,
+        cache::PhiPadeCache
+    )
     n = size(A, 1)
-    m, s, tau = _select_parameters_phi(A, p)
-    As = A ./ exp2(s)
-    Nc, Dc = _phi_pade_coef(m, p)
-    Nm, Dm = _paterson_stockmeyer(As, Nc, Dc, tau)
+    m, s, tau = _select_parameters_phi!(cache, A, p)
+    As = cache.As
+    @. As = A / exp2(s)
+    _phi_pade_coef!(cache.Ncoef, cache.Dcoef, cache.Amat, m, p)
+    Nm, Dm = _paterson_stockmeyer!(cache, As, m, tau)
+
+    copyto!(cache.Dfact, Dm)
+    copyto!(out[p + 1], Nm)
+    _phi_solve!(cache.Dfact, cache.ipiv, out[p + 1])
 
-    Rm = Vector{Matrix{T}}(undef, p + 1)
-    Rm[p + 1] = Dm \ Nm
     # Recurrence (2.9): R^{(j)} = As R^{(j+1)} + I/j!, j = p-1 : -1 : 0.
     for k in p:-1:1
-        R = As * Rm[k + 1]
+        mul!(out[k], As, out[k + 1])
         f = 1 / _factf(k - 1)
         @inbounds for d in 1:n
-            R[d, d] += f
+            out[k][d, d] += f
         end
-        Rm[k] = R
     end
 
     # Undo the scaling with the double-argument formula (2.10), s times.
+    tmp = cache.tmp
     for _ in 1:s
         for j in p:-1:1
-            M = Rm[1] * Rm[j + 1]
+            mul!(tmp, out[1], out[j + 1])
             for k in 1:j
-                M .+= (1 / _factf(j - k)) .* Rm[k + 1]
+                @. tmp += (1 / _factf(j - k)) * out[k + 1]
             end
-            M ./= exp2(j)
-            Rm[j + 1] = M
+            tmp ./= exp2(j)
+            copyto!(out[j + 1], tmp)
         end
-        Rm[1] = Rm[1] * Rm[1]
+        mul!(tmp, out[1], out[1])
+        copyto!(out[1], tmp)
     end
+    return out
+end
+
+## ----------------- generic (container-preserving) implementation -------------
 
-    for j in 0:p
-        copyto!(out[j + 1], Rm[j + 1])
+function _normpow_nonneg(B::AbstractMatrix{<:Real}, k::Integer)
+    n = size(B, 1)
+    Bt = transpose(B)
+    v = ones(eltype(B), n)
+    tmp = similar(v)
+    for _ in 1:k
+        mul!(tmp, Bt, v)
+        v, tmp = tmp, v
     end
-    return out
+    return maximum(v)
+end
+
+function _phi_ell(A::AbstractMatrix, m::Integer, p::Integer, phat::Integer)
+    normT = opnorm(A, 1)
+    normT == 0 && return 0
+    t0 = normT > 1 ? log2(normT) : 0.0
+    scalefac = exp2(t0)
+    normTs = normT / scalefac
+    delta = (p - 1) * (p - phat) / p + 1
+    K = 2m + p + 1
+    c = (_phi_be_coeff(m, p) / normTs^delta)^(1 / K)
+    scaledT = c .* abs.(A ./ scalefac)
+    alpha = _normpow_nonneg(scaledT, K)
+    t = log2(2alpha / eps(Float64)) / (K - delta) + t0
+    return max(ceil(Int, t), 0)
+end
+
+function _select_parameters_phi(A::AbstractMatrix, p::Integer)
+    phat = _phi_theta(_PHI_M_MAX, p) < 1 ? 0 : p
+    r_max = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + phat + 1))) / 2)
+    eta = Vector{Float64}(undef, r_max)
+    P = A * A
+    eta[1] = opnorm(P, 1)^(1 / 2)
+    for j in 2:r_max
+        P = P * A
+        eta[j] = opnorm(P, 1)^(1 / (j + 1))
+    end
+    alpha = [max(eta[j], eta[j + 1]) for j in 1:(r_max - 1)]
+    tvals = Vector{Int}(undef, _PHI_IMAX + 1)
+    for i in 0:_PHI_IMAX
+        m_i = (i + 3)^2 ÷ 8
+        phat_i = _phi_theta(m_i, p) < 1 ? 0 : p
+        tvals[i + 1] = _phi_ell(A, m_i, p, phat_i)
+    end
+    Cost = zeros(Float64, _PHI_IMAX + 1, r_max - 1)
+    return _phi_select_from_alpha(alpha, r_max - 1, p, tvals, Cost)
+end
+
+function _paterson_stockmeyer(As, Nc, Dc, tau::Integer, m::Integer)
+    MT = typeof(As)
+    Apow = Vector{MT}(undef, tau + 1)
+    Apow[1] = one(As)
+    Apow[2] = As
+    for i in 3:(tau + 1)
+        Apow[i] = Apow[i - 1] * As
+    end
+    Atau = Apow[tau + 1]
+    N = zero(As)
+    D = zero(As)
+    nu = m ÷ tau
+    for i in 0:nu
+        start = i * tau + 1
+        stop = min((i + 1) * tau, m + 1)
+        blkN = zero(As)
+        blkD = zero(As)
+        for l in 1:(stop - start + 1)
+            blkN += Nc[start + l - 1] * Apow[l]
+            blkD += Dc[start + l - 1] * Apow[l]
+        end
+        if i == 0
+            N += blkN
+            D += blkD
+        else
+            N += blkN * Atau
+            D += blkD * Atau
+            Atau = Atau * Apow[tau + 1]
+        end
+    end
+    return N, D
+end
+
+"""
+    _phi_almohy_generic(A, p) -> Vector
+
+Container-preserving evaluation of `phi_0(A), ..., phi_p(A)` (`p >= 1`): the
+returned matrices have the same type as `A` (e.g. an `SMatrix` input yields
+`SMatrix` results). Used for immutable/static matrices, where the in-place
+workspace path does not apply.
+"""
+function _phi_almohy_generic(A::AbstractMatrix, p::Integer)
+    m, s, tau = _select_parameters_phi(A, p)
+    As = A ./ exp2(s)
+    m1 = m + 1
+    Ncoef = Vector{Float64}(undef, m1)
+    Dcoef = Vector{Float64}(undef, m1)
+    Amat = Matrix{Float64}(undef, m1, m1)
+    _phi_pade_coef!(Ncoef, Dcoef, Amat, m, p)
+    Nm, Dm = _paterson_stockmeyer(As, Ncoef, Dcoef, tau, m)
+
+    Rm = Vector{typeof(As)}(undef, p + 1)
+    Rm[p + 1] = Dm \ Nm
+    Id = one(As)
+    for k in p:-1:1
+        Rm[k] = As * Rm[k + 1] + Id / _factf(k - 1)
+    end
+    for _ in 1:s
+        for j in p:-1:1
+            M = Rm[1] * Rm[j + 1]
+            for k in 1:j
+                M += Rm[k + 1] / _factf(j - k)
+            end
+            Rm[j + 1] = M / exp2(j)
+        end
+        Rm[1] = Rm[1] * Rm[1]
+    end
+    return Rm
 end
diff --git a/test/basictests.jl b/test/basictests.jl
index 924f2d6..e11d041 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -316,6 +316,47 @@ end
     out = [Matrix{Float64}(undef, n, n) for _ in 1:4]
     phi!(out, A, 3)
     @test out ≈ phi_block_reference(A, 3)
+
+    # Reusable workspace: same result, and no allocation once the cache exists.
+    cache = PhiPadeCache(A, 3)
+    out2 = [Matrix{Float64}(undef, n, n) for _ in 1:4]
+    phi!(out2, A, 3; caches = cache)
+    @test out2 ≈ out
+    run_phi!(o, M, c) = phi!(o, M, 3; caches = c)
+    run_phi!(out2, A, cache)                           # warm up / compile
+    @test (@allocated run_phi!(out2, A, cache)) == 0
+
+    # Complex-matrix workspace path.
+    Ac = randn(ComplexF64, 6, 6) ./ 3
+    cachec = PhiPadeCache(Ac, 2)
+    outc = [Matrix{ComplexF64}(undef, 6, 6) for _ in 1:3]
+    phi!(outc, Ac, 2; caches = cachec)
+    @test outc ≈ phi_block_reference(Ac, 2)
+end
+
+@testset "Phi static arrays (container-preserving)" begin
+    Random.seed!(4321)
+    function phi_block_reference(A, p)
+        n = size(A, 1)
+        T = eltype(A)
+        W = zeros(T, n * (p + 1), n * (p + 1))
+        W[1:n, 1:n] .= A
+        for j in 0:(p - 1)
+            rb = j * n
+            W[(rb + 1):(rb + n), (rb + n + 1):(rb + 2n)] .= Matrix{T}(I, n, n)
+        end
+        eW = exp(W)
+        return [eW[1:n, (j * n + 1):((j + 1) * n)] for j in 0:p]
+    end
+    for N in (3, 5), p in (1, 2, 3)
+        A = SMatrix{N, N}(randn(N, N) ./ 3)
+        Rm = ExponentialUtilities._phi_almohy_generic(A, p)
+        @test all(r -> r isa SMatrix, Rm)        # static in, static out
+        ref = phi_block_reference(Matrix(A), p)
+        for j in 1:(p + 1)
+            @test Matrix(Rm[j]) ≈ ref[j] rtol = 1.0e-8 atol = 1.0e-12
+        end
+    end
 end
 
 @testset "Static Arrays" begin

From 91d942784b4afea153a447ea7b73bb62dfa10b8b Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 3 Jul 2026 04:01:26 -0400
Subject: [PATCH 3/8] phi: allocation-free LU across Julia versions (fixes LTS
 downgrade)

`LAPACK.getrf!(A, ipiv)` with a preallocated pivot vector only exists on
newer Julia; on the LTS (1.10) it errored, breaking the workspace solve.
ccall `getrf!` directly through libblastrampoline (as `StegrWork` does for
`stegr!`) so the factorization stays zero-allocation on all supported
versions; the 4-argument `getrs!` used for the solve is available
everywhere.

Verified on Julia 1.10 and 1.12: correctness ~2.5e-15 vs the BigFloat
reference and 0 bytes allocated on cache reuse.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh
---
 src/phi_almohy.jl | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl
index ae553f9..55c1ec2 100644
--- a/src/phi_almohy.jl
+++ b/src/phi_almohy.jl
@@ -21,6 +21,8 @@
 #   * `_phi_almohy_generic` -- out-of-place and element-container-preserving, so
 #                        a static input (e.g. `SMatrix`) stays static.
 
+import libblastrampoline_jll
+
 # theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the
 # scaled matrix for which the [m/m] Pade approximant is backward stable to
 # double-precision unit roundoff). Rows are m = 1:20, columns p = 1:10.
@@ -314,9 +316,30 @@ function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer
     return Nm, Dm
 end
 
+# LU factorization writing pivots into a preallocated `ipiv`. LAPACK's `getrf!`
+# only gained a pivot-accepting method in newer Julia, so ccall it directly (as
+# `StegrWork` does for `stegr!`) to keep the factorization allocation-free on the
+# LTS. The 4-argument `getrs!` used for the solve is available on all versions.
+for (getrf, elty) in ((:dgetrf_, :Float64), (:zgetrf_, :ComplexF64))
+    @eval function _phi_getrf!(A::AbstractMatrix{$elty}, ipiv::Vector{LinearAlgebra.BlasInt})
+        m, n = size(A)
+        info = Ref{LinearAlgebra.BlasInt}(0)
+        ccall(
+            (LinearAlgebra.BLAS.@blasfunc($getrf), libblastrampoline_jll.libblastrampoline),
+            Cvoid,
+            (
+                Ref{LinearAlgebra.BlasInt}, Ref{LinearAlgebra.BlasInt}, Ptr{$elty},
+                Ref{LinearAlgebra.BlasInt}, Ptr{LinearAlgebra.BlasInt}, Ref{LinearAlgebra.BlasInt},
+            ),
+            m, n, A, max(1, stride(A, 2)), ipiv, info
+        )
+        return A
+    end
+end
+
 # Solve Dfact * X = B in place (B overwritten by X), zero-allocation LU.
 function _phi_solve!(Dfact, ipiv, B)
-    LinearAlgebra.LAPACK.getrf!(Dfact, ipiv)
+    _phi_getrf!(Dfact, ipiv)
     LinearAlgebra.LAPACK.getrs!('N', Dfact, ipiv, B)
     return B
 end

From 69bb1fcab1964e13cfba6c3d104a0f2faa692d9a Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 3 Jul 2026 04:29:34 -0400
Subject: [PATCH 4/8] phi: ccall getrs! directly to keep the LU
 public-API-clean (QA)

The workspace solve used qualified `LinearAlgebra.BlasInt` and
`LinearAlgebra.LAPACK.getrs!`, which SciMLTesting's ExplicitImports
`all_qualified_accesses_are_public` check flags as reaching into non-public
internals (they are not on its ignore list). Mirror `exp_noalloc.jl`
instead: import `BlasInt` (an allow-listed explicit import) and ccall both
`getrf!` and `getrs!` through `BLAS.@blasfunc`/`BLAS.libblastrampoline`
(allow-listed names). No qualified access to non-public names remains in
`phi_almohy.jl`.

Behavior unchanged: verified correctness ~2.5e-15 and 0-byte cache reuse on
Julia 1.10 and 1.12; Aqua and JET still pass.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh
---
 src/phi_almohy.jl | 46 +++++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl
index 55c1ec2..2cc553c 100644
--- a/src/phi_almohy.jl
+++ b/src/phi_almohy.jl
@@ -21,7 +21,7 @@
 #   * `_phi_almohy_generic` -- out-of-place and element-container-preserving, so
 #                        a static input (e.g. `SMatrix`) stays static.
 
-import libblastrampoline_jll
+using LinearAlgebra: BlasInt
 
 # theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the
 # scaled matrix for which the [m/m] Pade approximant is backward stable to
@@ -99,7 +99,7 @@ struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}}
     pow1::MT
     pow2::MT
     absA::RMT
-    ipiv::Vector{LinearAlgebra.BlasInt}
+    ipiv::Vector{BlasInt}
     rvec1::Vector{RT}
     rvec2::Vector{RT}
     Ncoef::Vector{Float64}
@@ -123,7 +123,7 @@ function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T}
     return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}}(
         mk(), Apow[1], Apow, mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(),
         Matrix{RT}(undef, n, n),
-        Vector{LinearAlgebra.BlasInt}(undef, n),
+        Vector{BlasInt}(undef, n),
         Vector{RT}(undef, n), Vector{RT}(undef, n),
         Vector{Float64}(undef, _PHI_M_MAX + 1), Vector{Float64}(undef, _PHI_M_MAX + 1),
         Matrix{Float64}(undef, _PHI_M_MAX + 1, _PHI_M_MAX + 1),
@@ -316,31 +316,47 @@ function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer
     return Nm, Dm
 end
 
-# LU factorization writing pivots into a preallocated `ipiv`. LAPACK's `getrf!`
-# only gained a pivot-accepting method in newer Julia, so ccall it directly (as
-# `StegrWork` does for `stegr!`) to keep the factorization allocation-free on the
-# LTS. The 4-argument `getrs!` used for the solve is available on all versions.
-for (getrf, elty) in ((:dgetrf_, :Float64), (:zgetrf_, :ComplexF64))
-    @eval function _phi_getrf!(A::AbstractMatrix{$elty}, ipiv::Vector{LinearAlgebra.BlasInt})
+# LAPACK LU factor + solve with pivots written into a preallocated `ipiv`.
+# `LAPACK.getrf!` only gained a pivot-accepting method in newer Julia, so both
+# `getrf!` and `getrs!` are ccall-ed directly (as `exp_noalloc.jl` does for
+# `gebal!`) to keep the solve allocation-free on every supported version.
+for (getrf, getrs, elty) in
+    ((:dgetrf_, :dgetrs_, :Float64), (:zgetrf_, :zgetrs_, :ComplexF64))
+    @eval function _phi_getrf!(A::AbstractMatrix{$elty}, ipiv::Vector{BlasInt})
         m, n = size(A)
-        info = Ref{LinearAlgebra.BlasInt}(0)
+        info = Ref{BlasInt}(0)
         ccall(
-            (LinearAlgebra.BLAS.@blasfunc($getrf), libblastrampoline_jll.libblastrampoline),
-            Cvoid,
+            (BLAS.@blasfunc($getrf), BLAS.libblastrampoline), Cvoid,
             (
-                Ref{LinearAlgebra.BlasInt}, Ref{LinearAlgebra.BlasInt}, Ptr{$elty},
-                Ref{LinearAlgebra.BlasInt}, Ptr{LinearAlgebra.BlasInt}, Ref{LinearAlgebra.BlasInt},
+                Ref{BlasInt}, Ref{BlasInt}, Ptr{$elty}, Ref{BlasInt},
+                Ptr{BlasInt}, Ref{BlasInt},
             ),
             m, n, A, max(1, stride(A, 2)), ipiv, info
         )
         return A
     end
+    @eval function _phi_getrs!(
+            A::AbstractMatrix{$elty}, ipiv::Vector{BlasInt}, B::AbstractMatrix{$elty}
+        )
+        n = size(A, 1)
+        info = Ref{BlasInt}(0)
+        ccall(
+            (BLAS.@blasfunc($getrs), BLAS.libblastrampoline), Cvoid,
+            (
+                Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{$elty}, Ref{BlasInt},
+                Ptr{BlasInt}, Ptr{$elty}, Ref{BlasInt}, Ref{BlasInt}, Clong,
+            ),
+            'N', n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B,
+            max(1, stride(B, 2)), info, 1
+        )
+        return B
+    end
 end
 
 # Solve Dfact * X = B in place (B overwritten by X), zero-allocation LU.
 function _phi_solve!(Dfact, ipiv, B)
     _phi_getrf!(Dfact, ipiv)
-    LinearAlgebra.LAPACK.getrs!('N', Dfact, ipiv, B)
+    _phi_getrs!(Dfact, ipiv, B)
     return B
 end
 

From 46cad116cb6c44f62bd84c7140960fd1c47fa936 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 3 Jul 2026 05:57:49 -0400
Subject: [PATCH 5/8] phi: Horner PS, no-throw return codes, p-sized workspace,
 NaN guards

Re-analysis pass over the Al-Mohy--Liu implementation:

* Paterson--Stockmeyer rewritten in Horner form with the delta trick. The
  previous block scheme (mirroring the reference MATLAB) used explicit
  Atau powers plus two products per block -- up to 12 multiplications for
  m = 12 where Fasi's pi_m(tau) = 7, so the parameter selection was
  optimizing a cost the evaluator did not achieve. The Horner form's count
  now equals pi_m(tau) exactly. Measured 10-20% end-to-end speedup on the
  workspace path across n = 25..200.

* Numerical failure returns codes instead of throwing, so adaptive
  integrators can reject a step rather than abort: cache.info[] is 0 on
  success, the LAPACK getrf info for a singular Pade denominator, or -1
  for non-finite results (LAPACK does not flag NaN pivots, so a post-solve
  finiteness check backs the code); outputs are NaN-filled on failure.
  Fixes a latent InexactError where NaN/Inf input crashed in
  ceil(Int, t) during parameter selection before reaching the solve, and
  the generic path's SingularException from `\`. Wrongly sized workspaces
  (programmer errors) still throw, with informative messages.

* Workspace vectors sized from p at construction (was a fixed cap that
  BoundsError'd for p >~ 48) with an explicit capacity check; opnorm(A, 1)
  hoisted out of the per-degree ell loop (was recomputed 8x); dead Atau/Id
  buffers dropped (2n^2 less memory).

* Fix a closure-capture leak in the test helpers: phi_block_reference
  assigned to `n`, clobbering the enclosing testset's `n` (Julia closures
  assign to existing outer locals); the helpers now declare their
  temporaries `local`.

Verified: worst relative error 8.8e-15 over 33 BigFloat-referenced cases
(real/complex, normal/nonnormal, p = 1..8); 0 bytes allocated on cache
reuse (Float64 and ComplexF64, n = 10..100, p = 1..8, Julia 1.10 and
1.12); NaN/Inf inputs return info != 0 with NaN outputs and no throw;
full basictests 574 pass on 1.10 and 1.12 (only the pre-existing
ForwardDiff-1.x Issue 42 failures remain, which pass under CI's pinned
ForwardDiff 0.10); ExplicitImports clean.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh
---
 src/phi.jl         |  22 ++++-
 src/phi_almohy.jl  | 220 +++++++++++++++++++++++++++++++--------------
 test/basictests.jl |  49 ++++++++--
 3 files changed, 216 insertions(+), 75 deletions(-)

diff --git a/src/phi.jl b/src/phi.jl
index d936a8f..dbd3f09 100644
--- a/src/phi.jl
+++ b/src/phi.jl
@@ -122,7 +122,27 @@ end
 """
     phi!(out,A,k[;caches]) -> out
 
-Non-allocating version of `phi` for non-diagonal matrix inputs.
+Non-allocating version of `phi` for non-diagonal matrix inputs, writing
+`phi_j(A)` into `out[j+1]`.
+
+For dense `Float64`/`ComplexF64` matrices, pass a reusable
+[`PhiPadeCache`](@ref) as `caches` to make repeated evaluations of the same
+size and order allocation-free:
+
+```julia
+cache = PhiPadeCache(A, k)
+phi!(out, A, k; caches = cache)
+```
+
+Numerical failure (a singular Padé denominator or non-finite result, only
+possible for pathological inputs such as matrices containing `NaN`/`Inf`) does
+not throw: the outputs are filled with `NaN` and, when a `PhiPadeCache` is
+used, `cache.info[]` is set to a nonzero return code (`0` on success). This
+lets adaptive integrators reject the step instead of aborting.
+
+For the legacy basis-vector algorithm, `caches` is instead the tuple
+`(Vector{T}(undef, m), Matrix{T}(undef, m, k+1), Matrix{T}(undef, m+k, m+k))`;
+supplying it forces that code path.
 """
 function phi!(
         out::Vector{Matrix{T}}, A::AbstractMatrix{T}, k::Integer; caches = nothing,
diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl
index 2cc553c..478c8cb 100644
--- a/src/phi_almohy.jl
+++ b/src/phi_almohy.jl
@@ -84,17 +84,23 @@ Reusable workspace for the in-place phi-function evaluator [`phi!`](@ref) on
 strided `Float64`/`ComplexF64` matrices. Allocate once for a given size and `p`,
 then pass it as the `caches` keyword to `phi!` to reuse across calls without
 further allocation.
+
+After each `phi!` call, `cache.info[]` holds a return code: `0` means success;
+a positive value is the LAPACK `getrf` info of a numerically singular Padé
+denominator; `-1` means the result was non-finite (`NaN`/`Inf` input). Both
+failures only occur for pathological inputs, and no error is thrown — the
+outputs are filled with `NaN` so that adaptive integrators can detect the
+failed evaluation via `cache.info[] != 0` (or an `isfinite` check) and reject
+the step rather than abort.
 """
 struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}}
     As::MT
-    Id::MT
     Apow::Vector{MT}
-    Atau::MT
     Nm::MT
     Dm::MT
     Dfact::MT
-    blkN::MT
-    blkD::MT
+    Naux::MT
+    Daux::MT
     tmp::MT
     pow1::MT
     pow2::MT
@@ -109,11 +115,17 @@ struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}}
     alpha::Vector{Float64}
     tvals::Vector{Int}
     Cost::Matrix{Float64}
+    info::typeof(Ref(zero(BlasInt)))
 end
 
+# Largest r for which alpha_r may be probed (Eq. (3.10) with phat = p; theta_{12,p}
+# >= 1 for all p, so phat = p always applies at m = _PHI_M_MAX).
+_phi_rmax(p::Integer) = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + p + 1))) / 2)
+
 function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T}
     n = size(A, 1)
     RT = real(T)
+    r_max = _phi_rmax(p)
     mk() = Matrix{T}(undef, n, n)
     Apow = Vector{Matrix{T}}(undef, _PHI_NPOW)
     Apow[1] = Matrix{T}(I, n, n)
@@ -121,18 +133,33 @@ function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T}
         Apow[i] = mk()
     end
     return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}}(
-        mk(), Apow[1], Apow, mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(),
+        mk(), Apow, mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(),
         Matrix{RT}(undef, n, n),
         Vector{BlasInt}(undef, n),
         Vector{RT}(undef, n), Vector{RT}(undef, n),
         Vector{Float64}(undef, _PHI_M_MAX + 1), Vector{Float64}(undef, _PHI_M_MAX + 1),
         Matrix{Float64}(undef, _PHI_M_MAX + 1, _PHI_M_MAX + 1),
-        Vector{Float64}(undef, _PHI_IMAX + 2), Vector{Float64}(undef, _PHI_IMAX + 2),
+        Vector{Float64}(undef, r_max), Vector{Float64}(undef, r_max),
         Vector{Int}(undef, _PHI_IMAX + 1),
-        Matrix{Float64}(undef, _PHI_IMAX + 1, _PHI_IMAX + 2),
+        Matrix{Float64}(undef, _PHI_IMAX + 1, r_max - 1),
+        Ref(zero(BlasInt)),
     )
 end
 
+@noinline function _phi_cache_check(cache::PhiPadeCache, n::Integer, r_max::Integer)
+    size(cache.As, 1) == n || throw(
+        DimensionMismatch(
+            "PhiPadeCache was constructed for size $(size(cache.As, 1)) but A has size $n"
+        )
+    )
+    length(cache.eta) >= r_max || throw(
+        ArgumentError(
+            "PhiPadeCache was constructed for a smaller p; construct it with PhiPadeCache(A, p) for the p in use"
+        )
+    )
+    return nothing
+end
+
 # Renormalized [m/m] Pade coefficients (numerator and denominator, low order
 # first) of the approximant to phi_p, written into the preallocated `Ncoef`,
 # `Dcoef` (length >= m+1) using scratch `Amat` (size >= (m+1)^2). Follows the
@@ -240,9 +267,9 @@ function _normpow_nonneg!(cache::PhiPadeCache, B, k::Integer)
 end
 
 # Scaling parameter `t` from the first term of the backward-error series
-# (Eq. (3.12)); `ell` subfunction of the reference, in-place.
-function _phi_ell!(cache::PhiPadeCache, A, m::Integer, p::Integer, phat::Integer)
-    normT = opnorm(A, 1)
+# (Eq. (3.12)); `ell` subfunction of the reference, in-place. `normT` is the
+# caller-computed `opnorm(A, 1)` (hoisted out of the per-degree loop).
+function _phi_ell!(cache::PhiPadeCache, A, normT, m::Integer, p::Integer, phat::Integer)
     normT == 0 && return 0
     t0 = normT > 1 ? log2(normT) : 0.0
     scalefac = exp2(t0)
@@ -254,12 +281,16 @@ function _phi_ell!(cache::PhiPadeCache, A, m::Integer, p::Integer, phat::Integer
     @. absA = c * abs(A) / scalefac
     alpha = _normpow_nonneg!(cache, absA, K)
     t = log2(2alpha / eps(Float64)) / (K - delta) + t0
+    # NaN/Inf inputs make t non-finite; return 0 (never throw) and let the NaN
+    # propagate to the output, where step-rejection logic can see it.
+    isfinite(t) || return 0
     return max(ceil(Int, t), 0)
 end
 
 function _select_parameters_phi!(cache::PhiPadeCache, A, p::Integer)
     phat = _phi_theta(_PHI_M_MAX, p) < 1 ? 0 : p
     r_max = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + phat + 1))) / 2)
+    _phi_cache_check(cache, size(A, 1), r_max)
     eta = cache.eta
     mul!(cache.pow1, A, A)
     eta[1] = opnorm(cache.pow1, 1)^(1 / 2)
@@ -273,47 +304,69 @@ function _select_parameters_phi!(cache::PhiPadeCache, A, p::Integer)
     for j in 1:(r_max - 1)
         alpha[j] = max(eta[j], eta[j + 1])
     end
+    normT = opnorm(A, 1)
     for i in 0:_PHI_IMAX
         m_i = (i + 3)^2 ÷ 8
         phat_i = _phi_theta(m_i, p) < 1 ? 0 : p
-        cache.tvals[i + 1] = _phi_ell!(cache, A, m_i, p, phat_i)
+        cache.tvals[i + 1] = _phi_ell!(cache, A, normT, m_i, p, phat_i)
     end
     return _phi_select_from_alpha(alpha, r_max - 1, p, cache.tvals, cache.Cost)
 end
 
-# Evaluate N(As) and D(As) into cache.Nm, cache.Dm via Paterson--Stockmeyer.
+# Add block i's coefficient combination, sum_l coef[i*tau + l] * As^{l-1}, to M.
+function _phi_ps_addblock!(M, coef, Apow, i::Integer, tau::Integer, m::Integer)
+    start = i * tau + 1
+    stop = min((i + 1) * tau, m + 1)
+    for l in 1:(stop - start + 1)
+        c = coef[start + l - 1]
+        P = Apow[l]
+        @. M += c * P
+    end
+    return M
+end
+
+# Evaluate N(As) and D(As) via Paterson--Stockmeyer in Horner form,
+#
+#   q(As) = ( (B_nu * Atau + B_{nu-1}) * Atau + ... ) * Atau + B_0,
+#
+# which needs nu products per polynomial -- one fewer when tau | m, since then
+# B_nu = c_m*I and the first fold is the broadcast c_m*Atau + B_{nu-1}. Together
+# with the tau-1 explicit powers, the multiplication count is exactly Fasi's
+# pi_m(tau) = tau - 1 + 2*floor(m/tau) - 2*(tau | m) that the parameter
+# selection optimizes over. Returns (N, D) as views into cache buffers.
 function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer)
     Nc, Dc, Apow = cache.Ncoef, cache.Dcoef, cache.Apow
-    Atau, Nm, Dm = cache.Atau, cache.Nm, cache.Dm
-    blkN, blkD, tmp = cache.blkN, cache.blkD, cache.tmp
-    copyto!(Apow[2], As)               # Apow[1] === Id; Apow[i] = As^{i-1}
+    copyto!(Apow[2], As)               # Apow[1] = I (set once); Apow[i] = As^{i-1}
     for i in 3:(tau + 1)
         mul!(Apow[i], Apow[i - 1], As)
     end
-    copyto!(Atau, Apow[tau + 1])
-    fill!(Nm, 0)
-    fill!(Dm, 0)
+    Atau = Apow[tau + 1]
+    N, Naux = cache.Nm, cache.Naux
+    D, Daux = cache.Dm, cache.Daux
     nu = m ÷ tau
-    for i in 0:nu
-        start = i * tau + 1
-        stop = min((i + 1) * tau, m + 1)
-        fill!(blkN, 0)
-        fill!(blkD, 0)
-        for l in 1:(stop - start + 1)
-            @. blkN += Nc[start + l - 1] * Apow[l]
-            @. blkD += Dc[start + l - 1] * Apow[l]
-        end
-        if i == 0
-            Nm .+= blkN
-            Dm .+= blkD
-        else
-            mul!(Nm, blkN, Atau, true, true)
-            mul!(Dm, blkD, Atau, true, true)
-            mul!(tmp, Atau, Apow[tau + 1])
-            copyto!(Atau, tmp)
-        end
+    if m % tau == 0
+        cN, cD = Nc[m + 1], Dc[m + 1]
+        @. N = cN * Atau
+        @. D = cD * Atau
+        _phi_ps_addblock!(N, Nc, Apow, nu - 1, tau, m)
+        _phi_ps_addblock!(D, Dc, Apow, nu - 1, tau, m)
+        inext = nu - 2
+    else
+        fill!(N, 0)
+        fill!(D, 0)
+        _phi_ps_addblock!(N, Nc, Apow, nu, tau, m)
+        _phi_ps_addblock!(D, Dc, Apow, nu, tau, m)
+        inext = nu - 1
+    end
+    for i in inext:-1:0
+        mul!(Naux, N, Atau)
+        mul!(Daux, D, Atau)
+        _phi_ps_addblock!(Naux, Nc, Apow, i, tau, m)
+        _phi_ps_addblock!(Daux, Dc, Apow, i, tau, m)
+        N, Naux = Naux, N
+        D, Daux = Daux, D
     end
-    return Nm, Dm
+    return N, D
 end
 
 # LAPACK LU factor + solve with pivots written into a preallocated `ipiv`.
@@ -333,7 +386,7 @@ for (getrf, getrs, elty) in
             ),
             m, n, A, max(1, stride(A, 2)), ipiv, info
         )
-        return A
+        return info[]
     end
     @eval function _phi_getrs!(
             A::AbstractMatrix{$elty}, ipiv::Vector{BlasInt}, B::AbstractMatrix{$elty}
@@ -354,10 +407,17 @@ for (getrf, getrs, elty) in
 end
 
 # Solve Dfact * X = B in place (B overwritten by X), zero-allocation LU.
+# Returns the LAPACK getrf info code instead of throwing: 0 on success; on a
+# numerically singular Dfact (pathological input, e.g. NaN/Inf), B is filled
+# with NaN so downstream step-rejection logic can detect the failure.
 function _phi_solve!(Dfact, ipiv, B)
-    _phi_getrf!(Dfact, ipiv)
-    _phi_getrs!(Dfact, ipiv, B)
-    return B
+    info = _phi_getrf!(Dfact, ipiv)
+    if info == 0
+        _phi_getrs!(Dfact, ipiv, B)
+    else
+        fill!(B, NaN)
+    end
+    return info
 end
 
 """
@@ -381,7 +441,21 @@ function _phi_almohy!(
 
     copyto!(cache.Dfact, Dm)
     copyto!(out[p + 1], Nm)
-    _phi_solve!(cache.Dfact, cache.ipiv, out[p + 1])
+    info = _phi_solve!(cache.Dfact, cache.ipiv, out[p + 1])
+    if info == 0 && !all(isfinite, out[p + 1])
+        # NaN/Inf inputs reach here with info == 0 (LAPACK does not flag NaN
+        # pivots); report them so integrators can test cache.info[] alone.
+        info = BlasInt(-1)
+    end
+    cache.info[] = info
+    if info != 0
+        # Return code instead of an exception: fill the outputs with NaN so an
+        # adaptive integrator can reject the step rather than crash.
+        for j in 1:(p + 1)
+            fill!(out[j], NaN)
+        end
+        return out
+    end
 
     # Recurrence (2.9): R^{(j)} = As R^{(j+1)} + I/j!, j = p-1 : -1 : 0.
     for k in p:-1:1
@@ -423,8 +497,7 @@ function _normpow_nonneg(B::AbstractMatrix{<:Real}, k::Integer)
     return maximum(v)
 end
 
-function _phi_ell(A::AbstractMatrix, m::Integer, p::Integer, phat::Integer)
-    normT = opnorm(A, 1)
+function _phi_ell(A::AbstractMatrix, normT, m::Integer, p::Integer, phat::Integer)
     normT == 0 && return 0
     t0 = normT > 1 ? log2(normT) : 0.0
     scalefac = exp2(t0)
@@ -435,6 +508,7 @@ function _phi_ell(A::AbstractMatrix, m::Integer, p::Integer, phat::Integer)
     scaledT = c .* abs.(A ./ scalefac)
     alpha = _normpow_nonneg(scaledT, K)
     t = log2(2alpha / eps(Float64)) / (K - delta) + t0
+    isfinite(t) || return 0
     return max(ceil(Int, t), 0)
 end
 
@@ -450,15 +524,30 @@ function _select_parameters_phi(A::AbstractMatrix, p::Integer)
     end
     alpha = [max(eta[j], eta[j + 1]) for j in 1:(r_max - 1)]
     tvals = Vector{Int}(undef, _PHI_IMAX + 1)
+    normT = opnorm(A, 1)
     for i in 0:_PHI_IMAX
         m_i = (i + 3)^2 ÷ 8
         phat_i = _phi_theta(m_i, p) < 1 ? 0 : p
-        tvals[i + 1] = _phi_ell(A, m_i, p, phat_i)
+        tvals[i + 1] = _phi_ell(A, normT, m_i, p, phat_i)
     end
     Cost = zeros(Float64, _PHI_IMAX + 1, r_max - 1)
     return _phi_select_from_alpha(alpha, r_max - 1, p, tvals, Cost)
 end
 
+# Out-of-place counterpart of `_phi_ps_addblock!`: returns `init` plus block i's
+# coefficient combination.
+function _phi_ps_block(init, coef, Apow, i::Integer, tau::Integer, m::Integer)
+    start = i * tau + 1
+    stop = min((i + 1) * tau, m + 1)
+    M = init
+    for l in 1:(stop - start + 1)
+        M = M + coef[start + l - 1] * Apow[l]
+    end
+    return M
+end
+
+# Out-of-place Horner-form Paterson--Stockmeyer; same multiplication count as
+# the in-place version (Fasi's pi_m(tau)).
 function _paterson_stockmeyer(As, Nc, Dc, tau::Integer, m::Integer)
     MT = typeof(As)
     Apow = Vector{MT}(undef, tau + 1)
@@ -468,26 +557,20 @@ function _paterson_stockmeyer(As, Nc, Dc, tau::Integer, m::Integer)
         Apow[i] = Apow[i - 1] * As
     end
     Atau = Apow[tau + 1]
-    N = zero(As)
-    D = zero(As)
     nu = m ÷ tau
-    for i in 0:nu
-        start = i * tau + 1
-        stop = min((i + 1) * tau, m + 1)
-        blkN = zero(As)
-        blkD = zero(As)
-        for l in 1:(stop - start + 1)
-            blkN += Nc[start + l - 1] * Apow[l]
-            blkD += Dc[start + l - 1] * Apow[l]
-        end
-        if i == 0
-            N += blkN
-            D += blkD
-        else
-            N += blkN * Atau
-            D += blkD * Atau
-            Atau = Atau * Apow[tau + 1]
-        end
+    local N::MT, D::MT
+    if m % tau == 0
+        N = _phi_ps_block(Nc[m + 1] * Atau, Nc, Apow, nu - 1, tau, m)
+        D = _phi_ps_block(Dc[m + 1] * Atau, Dc, Apow, nu - 1, tau, m)
+        inext = nu - 2
+    else
+        N = _phi_ps_block(zero(As), Nc, Apow, nu, tau, m)
+        D = _phi_ps_block(zero(As), Dc, Apow, nu, tau, m)
+        inext = nu - 1
+    end
+    for i in inext:-1:0
+        N = _phi_ps_block(N * Atau, Nc, Apow, i, tau, m)
+        D = _phi_ps_block(D * Atau, Dc, Apow, i, tau, m)
     end
     return N, D
 end
@@ -511,7 +594,14 @@ function _phi_almohy_generic(A::AbstractMatrix, p::Integer)
     Nm, Dm = _paterson_stockmeyer(As, Ncoef, Dcoef, tau, m)
 
     Rm = Vector{typeof(As)}(undef, p + 1)
-    Rm[p + 1] = Dm \ Nm
+    # No-throw on a numerically singular denominator (pathological input): NaN
+    # results let adaptive integrators reject the step instead of aborting.
+    Rm[p + 1] = try
+        Dm \ Nm
+    catch e
+        e isa SingularException || rethrow()
+        zero(As) .+ eltype(As)(NaN)
+    end
     Id = one(As)
     for k in p:-1:1
         Rm[k] = As * Rm[k + 1] + Id / _factf(k - 1)
diff --git a/test/basictests.jl b/test/basictests.jl
index e11d041..b95eb83 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -262,8 +262,11 @@ end
     # E = [I 0 ... 0] and J the nilpotent shift. This is unrelated to the
     # scaling-and-recovering algorithm under test.
     function phi_block_reference(A, p)
-        n = size(A, 1)
-        T = eltype(A)
+        # `local`: without it these assignments capture and clobber the
+        # enclosing testset's variables of the same name (closures in a local
+        # scope assign to existing outer locals).
+        local n = size(A, 1)
+        local T = eltype(A)
         W = zeros(T, n * (p + 1), n * (p + 1))
         W[1:n, 1:n] .= A
         for j in 0:(p - 1)
@@ -276,13 +279,13 @@ end
 
     # Force the legacy basis-vector path (Sidje/`phiv_dense`) for cross-checking.
     function phi_legacy(A, p)
-        n = size(A, 1)
-        T = eltype(A)
-        caches = (
+        local n = size(A, 1)
+        local T = eltype(A)
+        local caches = (
             Vector{T}(undef, n), Matrix{T}(undef, n, p + 1),
             Matrix{T}(undef, n + p, n + p),
         )
-        out = [Matrix{T}(undef, n, n) for _ in 1:(p + 1)]
+        local out = [Matrix{T}(undef, n, n) for _ in 1:(p + 1)]
         return phi!(out, A, p; caches = caches)
     end
 
@@ -326,19 +329,47 @@ end
     run_phi!(out2, A, cache)                           # warm up / compile
     @test (@allocated run_phi!(out2, A, cache)) == 0
 
-    # Complex-matrix workspace path.
+    # Complex-matrix workspace path, also allocation-free on reuse.
     Ac = randn(ComplexF64, 6, 6) ./ 3
     cachec = PhiPadeCache(Ac, 2)
     outc = [Matrix{ComplexF64}(undef, 6, 6) for _ in 1:3]
     phi!(outc, Ac, 2; caches = cachec)
     @test outc ≈ phi_block_reference(Ac, 2)
+    @test cachec.info[] == 0
+    run_phic!(o, M, c) = phi!(o, M, 2; caches = c)
+    run_phic!(outc, Ac, cachec)
+    @test (@allocated run_phic!(outc, Ac, cachec)) == 0
+
+    # Numerical failure must not throw: NaN input yields NaN-filled outputs and
+    # a nonzero return code in cache.info[], so adaptive integrators can reject
+    # the step instead of aborting.
+    Anan = fill(NaN, 5, 5)
+    Pnan = phi(Anan, 2)
+    @test all(P -> any(!isfinite, P), Pnan)
+    cachenan = PhiPadeCache(Anan, 2)
+    outnan = [Matrix{Float64}(undef, 5, 5) for _ in 1:3]
+    phi!(outnan, Anan, 2; caches = cachenan)
+    @test cachenan.info[] != 0
+    @test all(P -> all(isnan, P), outnan)
+    # ... and a subsequent good call resets the code.
+    phi!(outnan, randn(5, 5), 2; caches = cachenan)
+    @test cachenan.info[] == 0
+
+    # Wrongly sized workspaces are programmer errors, not numerical failures,
+    # and do throw.
+    A3 = randn(3, 3)
+    out3 = [Matrix{Float64}(undef, 3, 3) for _ in 1:4]
+    @test_throws DimensionMismatch phi!(out3, A3, 3; caches = cache)
+    cache_p1 = PhiPadeCache(A, 1)
+    out11 = [Matrix{Float64}(undef, n, n) for _ in 1:11]
+    @test_throws ArgumentError phi!(out11, A, 10; caches = cache_p1)
 end
 
 @testset "Phi static arrays (container-preserving)" begin
     Random.seed!(4321)
     function phi_block_reference(A, p)
-        n = size(A, 1)
-        T = eltype(A)
+        local n = size(A, 1)
+        local T = eltype(A)
         W = zeros(T, n * (p + 1), n * (p + 1))
         W[1:n, 1:n] .= A
         for j in 0:(p - 1)

From b71efd7cb45dc39a447ffff2c16193ef84ce7c2d Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 3 Jul 2026 09:17:47 -0400
Subject: [PATCH 6/8] phi: overflow-safe backward-error coefficient; public lu!
 replaces ccalls

Two review-driven changes:

* _phi_be_coeff evaluated (m+p)!, m!, (2m+p)!, (2m+p+1)! separately before
  dividing, so intermediates could spuriously overflow (Inf/Inf = NaN for
  p >~ 159) even where the quotient is representable (@stevengj). Rewritten
  as a product of sub-unit factors,
  prod 1/(m+p+j) * prod 1/(m+j), which is monotonically decreasing and can
  only underflow when the true value does. Matches the BigFloat-exact value
  to 1.1e-15 over m = 1:12, p = 1:60; new p = 30 regression test.

* The hand-rolled getrf!/getrs! ccalls are gone: the solve now uses the
  public LinearAlgebra API, lu!(Dfact; check = false) + issuccess + ldiv!,
  which supports the matrix RHS directly and keeps the no-throw return-code
  semantics (issuccess false -> info > 0, NaN-filled outputs).
  LinearSolve.jl was considered but its LinearCache only solves vector
  right-hand sides through the public path, which would force a
  column-by-column loop or reaching into cache internals; plain lu!/ldiv!
  achieves ccall-free with zero new dependencies. Cost: the pivot vector is
  now the one per-call allocation (8n + ~100 bytes, measured 112 B at
  n = 7 to 928 B at n = 100; all O(n^2) buffers still reused), since no
  public API preallocates lu! pivots. Allocation tests assert that O(n)
  bound. The BlasInt import and ipiv field are removed; phi_almohy.jl is
  now entirely public-API.

Verified on Julia 1.10 and 1.12: 254-assertion Phi testset passes
(including new large-p and allocation-bound tests), full basictests 605
pass (only the pre-existing ForwardDiff-1.x Issue 42 failures remain),
ExplicitImports clean, NaN/Inf return codes intact.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh
---
 src/phi.jl         |   5 ++-
 src/phi_almohy.jl  | 103 ++++++++++++++++++---------------------------
 test/basictests.jl |  20 +++++++--
 3 files changed, 59 insertions(+), 69 deletions(-)

diff --git a/src/phi.jl b/src/phi.jl
index dbd3f09..b20c653 100644
--- a/src/phi.jl
+++ b/src/phi.jl
@@ -126,8 +126,9 @@ Non-allocating version of `phi` for non-diagonal matrix inputs, writing
 `phi_j(A)` into `out[j+1]`.
 
 For dense `Float64`/`ComplexF64` matrices, pass a reusable
-[`PhiPadeCache`](@ref) as `caches` to make repeated evaluations of the same
-size and order allocation-free:
+[`PhiPadeCache`](@ref) as `caches` so repeated evaluations of the same size and
+order reuse all large buffers (the only per-call allocation is the `O(n)` LU
+pivot vector):
 
 ```julia
 cache = PhiPadeCache(A, k)
diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl
index 478c8cb..d8b37ef 100644
--- a/src/phi_almohy.jl
+++ b/src/phi_almohy.jl
@@ -16,13 +16,12 @@
 #   phi_j(2A) = 2^{-j} ( phi_0(A) phi_j(A) + sum_{k=1}^{j} phi_k(A)/(j-k)! ).
 #
 # Two implementations share the scalar machinery below:
-#   * `_phi_almohy!`  -- fully in-place, allocation-free, using a reusable
-#                        `PhiPadeCache` workspace (for strided mutable matrices).
+#   * `_phi_almohy!`  -- fully in-place via a reusable `PhiPadeCache` workspace
+#                        (for strided mutable matrices); the only per-call
+#                        allocation is the O(n) LU pivot vector.
 #   * `_phi_almohy_generic` -- out-of-place and element-container-preserving, so
 #                        a static input (e.g. `SMatrix`) stays static.
 
-using LinearAlgebra: BlasInt
-
 # theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the
 # scaled matrix for which the [m/m] Pade approximant is backward stable to
 # double-precision unit roundoff). Rows are m = 1:20, columns p = 1:10.
@@ -72,9 +71,23 @@ end
     return r
 end
 
-# First-factor coefficient of the backward-error series h_{m,p} (Eq. (3.4)).
+# First-factor coefficient of the backward-error series h_{m,p} (Eq. (3.4)),
+#
+#   (m+p)! m! / ((2m+p)! (2m+p+1)!)
+#     = prod_{j=1}^{m} 1/(m+p+j) * prod_{j=1}^{m+p+1} 1/(m+j),
+#
+# accumulated as a product of sub-unit factors so no intermediate factorial can
+# spuriously overflow (each factor < 1, so the running product is monotonically
+# decreasing and underflows only when the true value does).
 @inline function _phi_be_coeff(m::Integer, p::Integer)
-    return (_factf(m + p) / _factf(2m + p)) * (_factf(m) / _factf(2m + p + 1))
+    r = 1.0
+    for j in 1:m
+        r /= (m + p + j)
+    end
+    for j in 1:(m + p + 1)
+        r /= (m + j)
+    end
+    return r
 end
 
 """
@@ -82,12 +95,12 @@ end
 
 Reusable workspace for the in-place phi-function evaluator [`phi!`](@ref) on
 strided `Float64`/`ComplexF64` matrices. Allocate once for a given size and `p`,
-then pass it as the `caches` keyword to `phi!` to reuse across calls without
-further allocation.
+then pass it as the `caches` keyword to `phi!` to reuse across calls; the only
+remaining per-call allocation is the LU pivot vector (`O(n)` bytes).
 
 After each `phi!` call, `cache.info[]` holds a return code: `0` means success;
-a positive value is the LAPACK `getrf` info of a numerically singular Padé
-denominator; `-1` means the result was non-finite (`NaN`/`Inf` input). Both
+a positive value is the LU factorization's info for a numerically singular
+Padé denominator; `-1` means the result was non-finite (`NaN`/`Inf` input). Both
 failures only occur for pathological inputs, and no error is thrown — the
 outputs are filled with `NaN` so that adaptive integrators can detect the
 failed evaluation via `cache.info[] != 0` (or an `isfinite` check) and reject
@@ -105,7 +118,6 @@ struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}}
     pow1::MT
     pow2::MT
     absA::RMT
-    ipiv::Vector{BlasInt}
     rvec1::Vector{RT}
     rvec2::Vector{RT}
     Ncoef::Vector{Float64}
@@ -115,7 +127,7 @@ struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}}
     alpha::Vector{Float64}
     tvals::Vector{Int}
     Cost::Matrix{Float64}
-    info::typeof(Ref(zero(BlasInt)))
+    info::typeof(Ref(0))
 end
 
 # Largest r for which alpha_r may be probed (Eq. (3.10) with phat = p; theta_{12,p}
@@ -135,14 +147,13 @@ function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T}
     return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}}(
         mk(), Apow, mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(),
         Matrix{RT}(undef, n, n),
-        Vector{BlasInt}(undef, n),
         Vector{RT}(undef, n), Vector{RT}(undef, n),
         Vector{Float64}(undef, _PHI_M_MAX + 1), Vector{Float64}(undef, _PHI_M_MAX + 1),
         Matrix{Float64}(undef, _PHI_M_MAX + 1, _PHI_M_MAX + 1),
         Vector{Float64}(undef, r_max), Vector{Float64}(undef, r_max),
         Vector{Int}(undef, _PHI_IMAX + 1),
         Matrix{Float64}(undef, _PHI_IMAX + 1, r_max - 1),
-        Ref(zero(BlasInt)),
+        Ref(0),
     )
 end
 
@@ -369,61 +380,27 @@ function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer
     return N, D
 end
 
-# LAPACK LU factor + solve with pivots written into a preallocated `ipiv`.
-# `LAPACK.getrf!` only gained a pivot-accepting method in newer Julia, so both
-# `getrf!` and `getrs!` are ccall-ed directly (as `exp_noalloc.jl` does for
-# `gebal!`) to keep the solve allocation-free on every supported version.
-for (getrf, getrs, elty) in
-    ((:dgetrf_, :dgetrs_, :Float64), (:zgetrf_, :zgetrs_, :ComplexF64))
-    @eval function _phi_getrf!(A::AbstractMatrix{$elty}, ipiv::Vector{BlasInt})
-        m, n = size(A)
-        info = Ref{BlasInt}(0)
-        ccall(
-            (BLAS.@blasfunc($getrf), BLAS.libblastrampoline), Cvoid,
-            (
-                Ref{BlasInt}, Ref{BlasInt}, Ptr{$elty}, Ref{BlasInt},
-                Ptr{BlasInt}, Ref{BlasInt},
-            ),
-            m, n, A, max(1, stride(A, 2)), ipiv, info
-        )
-        return info[]
-    end
-    @eval function _phi_getrs!(
-            A::AbstractMatrix{$elty}, ipiv::Vector{BlasInt}, B::AbstractMatrix{$elty}
-        )
-        n = size(A, 1)
-        info = Ref{BlasInt}(0)
-        ccall(
-            (BLAS.@blasfunc($getrs), BLAS.libblastrampoline), Cvoid,
-            (
-                Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{$elty}, Ref{BlasInt},
-                Ptr{BlasInt}, Ptr{$elty}, Ref{BlasInt}, Ref{BlasInt}, Clong,
-            ),
-            'N', n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B,
-            max(1, stride(B, 2)), info, 1
-        )
-        return B
-    end
-end
-
-# Solve Dfact * X = B in place (B overwritten by X), zero-allocation LU.
-# Returns the LAPACK getrf info code instead of throwing: 0 on success; on a
+# Solve Dfact * X = B in place (Dfact overwritten by its LU factorization, B
+# by X). Returns an info code instead of throwing: 0 on success; on a
 # numerically singular Dfact (pathological input, e.g. NaN/Inf), B is filled
-# with NaN so downstream step-rejection logic can detect the failure.
-function _phi_solve!(Dfact, ipiv, B)
-    info = _phi_getrf!(Dfact, ipiv)
-    if info == 0
-        _phi_getrs!(Dfact, ipiv, B)
+# with NaN so downstream step-rejection logic can detect the failure. The lu!
+# pivot vector is the only per-call allocation (O(n); no public API yet allows
+# reusing it across factorizations).
+function _phi_solve!(Dfact, B)
+    F = lu!(Dfact; check = false)
+    if issuccess(F)
+        ldiv!(F, B)
+        return 0
     else
         fill!(B, NaN)
+        return Int(F.info)
     end
-    return info
 end
 
 """
     _phi_almohy!(out, A, p, cache) -> out
 
-Allocation-free evaluation of `phi_0(A), ..., phi_p(A)` for a strided
+In-place evaluation of `phi_0(A), ..., phi_p(A)` for a strided
 `Float64`/`ComplexF64` matrix `A` (`p >= 1`), writing `phi_j(A)` into `out[j+1]`
 and using the reusable [`PhiPadeCache`](@ref) `cache` for all scratch storage.
 The `out` matrices double as the recovery buffers.
@@ -441,11 +418,11 @@ function _phi_almohy!(
 
     copyto!(cache.Dfact, Dm)
     copyto!(out[p + 1], Nm)
-    info = _phi_solve!(cache.Dfact, cache.ipiv, out[p + 1])
+    info = _phi_solve!(cache.Dfact, out[p + 1])
     if info == 0 && !all(isfinite, out[p + 1])
-        # NaN/Inf inputs reach here with info == 0 (LAPACK does not flag NaN
+        # NaN/Inf inputs reach here with info == 0 (the LU does not flag NaN
         # pivots); report them so integrators can test cache.info[] alone.
-        info = BlasInt(-1)
+        info = -1
     end
     cache.info[] = info
     if info != 0
diff --git a/test/basictests.jl b/test/basictests.jl
index b95eb83..fdf4302 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -313,6 +313,15 @@ end
     A = randn(9, 9) ./ 3
     @test phi(A, 4)[1] ≈ exp(A)
 
+    # Large p exercises the recurrence form of the backward-error coefficient
+    # (naive factorial ratios overflow long before their quotient does).
+    A = randn(3, 3)
+    P30 = phi(A, 30)
+    ref30 = phi_block_reference(A, 30)
+    for j in 1:31
+        @test P30[j] ≈ ref30[j] rtol = 1.0e-8 atol = 1.0e-13
+    end
+
     # Preallocated in-place entry point returns phi_j in out[j+1].
     n = 7
     A = randn(n, n) ./ 3
@@ -320,16 +329,19 @@ end
     phi!(out, A, 3)
     @test out ≈ phi_block_reference(A, 3)
 
-    # Reusable workspace: same result, and no allocation once the cache exists.
+    # Reusable workspace: same result, and only the O(n) `lu!` pivot vector is
+    # allocated once the cache exists (all O(n^2) buffers are reused; the pivot
+    # vector cannot be preallocated through the public LinearAlgebra API).
+    lu_alloc_bound(m) = 8m + 200
     cache = PhiPadeCache(A, 3)
     out2 = [Matrix{Float64}(undef, n, n) for _ in 1:4]
     phi!(out2, A, 3; caches = cache)
     @test out2 ≈ out
     run_phi!(o, M, c) = phi!(o, M, 3; caches = c)
     run_phi!(out2, A, cache)                           # warm up / compile
-    @test (@allocated run_phi!(out2, A, cache)) == 0
+    @test (@allocated run_phi!(out2, A, cache)) <= lu_alloc_bound(n)
 
-    # Complex-matrix workspace path, also allocation-free on reuse.
+    # Complex-matrix workspace path, same allocation bound on reuse.
     Ac = randn(ComplexF64, 6, 6) ./ 3
     cachec = PhiPadeCache(Ac, 2)
     outc = [Matrix{ComplexF64}(undef, 6, 6) for _ in 1:3]
@@ -338,7 +350,7 @@ end
     @test cachec.info[] == 0
     run_phic!(o, M, c) = phi!(o, M, 2; caches = c)
     run_phic!(outc, Ac, cachec)
-    @test (@allocated run_phic!(outc, Ac, cachec)) == 0
+    @test (@allocated run_phic!(outc, Ac, cachec)) <= lu_alloc_bound(6)
 
     # Numerical failure must not throw: NaN input yields NaN-filled outputs and
     # a nonzero return code in cache.info[], so adaptive integrators can reject

From 583e40a5aabbfdec094935944ec1c09d5e8d8da6 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Sat, 4 Jul 2026 17:07:05 -0400
Subject: [PATCH 7/8] phi: solve through LinearSolve v4 batched matrix RHS

With LinearSolve 4.0.0 supporting LinearProblem(A, B::Matrix) like A\B
(SciML/LinearSolve.jl#1072), PhiPadeCache now embeds a LinearSolve cache
and _phi_solve! runs the Pade denominator solve through it instead of
raw lu!/ldiv!. GenericLUFactorization is used because it refactorizes in
place with a cached pivot vector: workspace reuse is now fully
allocation-free (0 bytes measured for Float64 and ComplexF64 at
n = 7..100, p = 3..8, on Julia 1.10 and 1.12), improving on the lu!
version's per-call O(n) pivot allocation. (LUFactorization currently
copies A on every dense refactorization -- an in-place opt-in is being
added upstream, after which the LAPACK path can be swapped in for large
n.) Return-code semantics are unchanged: retcode Success -> info 0,
singular -> info 1 with NaN-filled outputs, non-finite results -> -1.

Tests tightened back to @allocated == 0; 254-assertion Phi testset
passes on 1.10 and 1.12 against LinearSolve 4.0.0.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01EYp371jx6LurezUDhKcYRh
---
 Project.toml       |  2 ++
 src/phi.jl         |  6 ++---
 src/phi_almohy.jl  | 59 +++++++++++++++++++++++++++++-----------------
 test/basictests.jl | 13 +++++-----
 4 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/Project.toml b/Project.toml
index a10df2c..5c9785d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,6 +8,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 GenericSchur = "c145ed77-6b09-5dd9-b285-bf645a82121e"
+LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -29,6 +30,7 @@ ForwardDiff = "0.10.13"
 GPUArraysCore = "0.1, 0.2"
 GenericSchur = "0.5.3"
 LinearAlgebra = "1.10"
+LinearSolve = "4"
 PrecompileTools = "1"
 Printf = "1.10"
 Random = "1"
diff --git a/src/phi.jl b/src/phi.jl
index b20c653..010ee37 100644
--- a/src/phi.jl
+++ b/src/phi.jl
@@ -126,9 +126,9 @@ Non-allocating version of `phi` for non-diagonal matrix inputs, writing
 `phi_j(A)` into `out[j+1]`.
 
 For dense `Float64`/`ComplexF64` matrices, pass a reusable
-[`PhiPadeCache`](@ref) as `caches` so repeated evaluations of the same size and
-order reuse all large buffers (the only per-call allocation is the `O(n)` LU
-pivot vector):
+[`PhiPadeCache`](@ref) as `caches` to make repeated evaluations of the same
+size and order allocation-free (the linear solve runs through an embedded
+LinearSolve.jl cache):
 
 ```julia
 cache = PhiPadeCache(A, k)
diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl
index d8b37ef..057d327 100644
--- a/src/phi_almohy.jl
+++ b/src/phi_almohy.jl
@@ -22,6 +22,8 @@
 #   * `_phi_almohy_generic` -- out-of-place and element-container-preserving, so
 #                        a static input (e.g. `SMatrix`) stays static.
 
+using LinearSolve: LinearSolve, LinearProblem, GenericLUFactorization
+
 # theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the
 # scaled matrix for which the [m/m] Pade approximant is backward stable to
 # double-precision unit roundoff). Rows are m = 1:20, columns p = 1:10.
@@ -95,23 +97,28 @@ end
 
 Reusable workspace for the in-place phi-function evaluator [`phi!`](@ref) on
 strided `Float64`/`ComplexF64` matrices. Allocate once for a given size and `p`,
-then pass it as the `caches` keyword to `phi!` to reuse across calls; the only
-remaining per-call allocation is the LU pivot vector (`O(n)` bytes).
+then pass it as the `caches` keyword to `phi!` to reuse across calls. The
+linear solve runs through an embedded LinearSolve.jl cache (batched matrix
+right-hand side, `GenericLUFactorization`); reuse across calls is
+allocation-free — all buffers, including the LU pivots, live in the workspace
+and the LinearSolve cache.
 
 After each `phi!` call, `cache.info[]` holds a return code: `0` means success;
-a positive value is the LU factorization's info for a numerically singular
+a positive value indicates a numerically singular
 Padé denominator; `-1` means the result was non-finite (`NaN`/`Inf` input). Both
 failures only occur for pathological inputs, and no error is thrown — the
 outputs are filled with `NaN` so that adaptive integrators can detect the
 failed evaluation via `cache.info[] != 0` (or an `isfinite` check) and reject
 the step rather than abort.
 """
-struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}}
+struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}, LS}
     As::MT
     Apow::Vector{MT}
     Nm::MT
     Dm::MT
     Dfact::MT
+    rhs::MT
+    linsolve::LS
     Naux::MT
     Daux::MT
     tmp::MT
@@ -144,8 +151,16 @@ function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T}
     for i in 2:_PHI_NPOW
         Apow[i] = mk()
     end
-    return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}}(
-        mk(), Apow, mk(), mk(), mk(), mk(), mk(), mk(), mk(), mk(),
+    Dfact = Matrix{T}(I, n, n)
+    rhs = zeros(T, n, n)
+    # GenericLUFactorization refactorizes in place with a cached pivot vector,
+    # so cache reuse is allocation-free (LUFactorization currently copies A on
+    # every dense refactorization). Dfact/rhs are workspace buffers refilled
+    # before each solve.
+    linsolve = LinearSolve.init(LinearProblem(Dfact, rhs), GenericLUFactorization())
+    return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}, typeof(linsolve)}(
+        mk(), Apow, mk(), mk(), Dfact, rhs, linsolve,
+        mk(), mk(), mk(), mk(), mk(),
         Matrix{RT}(undef, n, n),
         Vector{RT}(undef, n), Vector{RT}(undef, n),
         Vector{Float64}(undef, _PHI_M_MAX + 1), Vector{Float64}(undef, _PHI_M_MAX + 1),
@@ -380,20 +395,24 @@ function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer
     return N, D
 end
 
-# Solve Dfact * X = B in place (Dfact overwritten by its LU factorization, B
-# by X). Returns an info code instead of throwing: 0 on success; on a
-# numerically singular Dfact (pathological input, e.g. NaN/Inf), B is filled
-# with NaN so downstream step-rejection logic can detect the failure. The lu!
-# pivot vector is the only per-call allocation (O(n); no public API yet allows
-# reusing it across factorizations).
-function _phi_solve!(Dfact, B)
-    F = lu!(Dfact; check = false)
-    if issuccess(F)
-        ldiv!(F, B)
+# Solve Dm * X = Nm through the workspace's LinearSolve cache (batched matrix
+# right-hand side, LinearSolve >= 4), writing X into `X`. Returns an info code
+# instead of throwing: 0 on success; 1 on a numerically singular Dm
+# (pathological input, e.g. NaN/Inf), in which case `X` is filled with NaN so
+# downstream step-rejection logic can detect the failure.
+function _phi_solve!(cache::PhiPadeCache, Dm, Nm, X)
+    copyto!(cache.Dfact, Dm)
+    copyto!(cache.rhs, Nm)
+    linsolve = cache.linsolve
+    linsolve.A = cache.Dfact
+    linsolve.b = cache.rhs
+    sol = LinearSolve.solve!(linsolve)
+    if sol.retcode == LinearSolve.ReturnCode.Success
+        copyto!(X, sol.u)
         return 0
     else
-        fill!(B, NaN)
-        return Int(F.info)
+        fill!(X, NaN)
+        return 1
     end
 end
 
@@ -416,9 +435,7 @@ function _phi_almohy!(
     _phi_pade_coef!(cache.Ncoef, cache.Dcoef, cache.Amat, m, p)
     Nm, Dm = _paterson_stockmeyer!(cache, As, m, tau)
 
-    copyto!(cache.Dfact, Dm)
-    copyto!(out[p + 1], Nm)
-    info = _phi_solve!(cache.Dfact, out[p + 1])
+    info = _phi_solve!(cache, Dm, Nm, out[p + 1])
     if info == 0 && !all(isfinite, out[p + 1])
         # NaN/Inf inputs reach here with info == 0 (the LU does not flag NaN
         # pivots); report them so integrators can test cache.info[] alone.
diff --git a/test/basictests.jl b/test/basictests.jl
index fdf4302..faf721c 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -329,19 +329,18 @@ end
     phi!(out, A, 3)
     @test out ≈ phi_block_reference(A, 3)
 
-    # Reusable workspace: same result, and only the O(n) `lu!` pivot vector is
-    # allocated once the cache exists (all O(n^2) buffers are reused; the pivot
-    # vector cannot be preallocated through the public LinearAlgebra API).
-    lu_alloc_bound(m) = 8m + 200
+    # Reusable workspace: same result, and reuse is allocation-free (the linear
+    # solve runs through the workspace's LinearSolve cache, which owns the LU
+    # pivot storage).
     cache = PhiPadeCache(A, 3)
     out2 = [Matrix{Float64}(undef, n, n) for _ in 1:4]
     phi!(out2, A, 3; caches = cache)
     @test out2 ≈ out
     run_phi!(o, M, c) = phi!(o, M, 3; caches = c)
     run_phi!(out2, A, cache)                           # warm up / compile
-    @test (@allocated run_phi!(out2, A, cache)) <= lu_alloc_bound(n)
+    @test (@allocated run_phi!(out2, A, cache)) == 0
 
-    # Complex-matrix workspace path, same allocation bound on reuse.
+    # Complex-matrix workspace path, also allocation-free on reuse.
     Ac = randn(ComplexF64, 6, 6) ./ 3
     cachec = PhiPadeCache(Ac, 2)
     outc = [Matrix{ComplexF64}(undef, 6, 6) for _ in 1:3]
@@ -350,7 +349,7 @@ end
     @test cachec.info[] == 0
     run_phic!(o, M, c) = phi!(o, M, 2; caches = c)
     run_phic!(outc, Ac, cachec)
-    @test (@allocated run_phic!(outc, Ac, cachec)) <= lu_alloc_bound(6)
+    @test (@allocated run_phic!(outc, Ac, cachec)) == 0
 
     # Numerical failure must not throw: NaN input yields NaN-filled outputs and
     # a nonzero return code in cache.info[], so adaptive integrators can reject

From 27be6405d3023f89649bd75fdb5d383d92ea5d52 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Sun, 5 Jul 2026 05:22:51 -0400
Subject: [PATCH 8/8] Retrigger CI: LinearSolve 4.1.0 registered

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>