diff --git a/Project.toml b/Project.toml
index a5c275c8..5c9785dc 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,13 +1,14 @@
 name = "ExponentialUtilities"
 uuid = "d4d017d3-3776-5f7e-afef-a10c40355c18"
 authors = ["Chris Rackauckas <accounts@chrisrackauckas.com>", "José E. Cruz Serrallés <Jose.CruzSerralles@nyulangone.org>"]
-version = "1.30.2"
+version = "1.31.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 GenericSchur = "c145ed77-6b09-5dd9-b285-bf645a82121e"
+LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -29,6 +30,7 @@ ForwardDiff = "0.10.13"
 GPUArraysCore = "0.1, 0.2"
 GenericSchur = "0.5.3"
 LinearAlgebra = "1.10"
+LinearSolve = "4"
 PrecompileTools = "1"
 Printf = "1.10"
 Random = "1"
diff --git a/docs/src/matrix_exponentials.md b/docs/src/matrix_exponentials.md
index 48aca6b9..8a5e56a9 100644
--- a/docs/src/matrix_exponentials.md
+++ b/docs/src/matrix_exponentials.md
@@ -3,6 +3,8 @@
 ```@docs
 exponential!
 phi
+phi!
+PhiPadeCache
 ```
 
 ## Methods
diff --git a/src/ExponentialUtilities.jl b/src/ExponentialUtilities.jl
index 9e89db38..579ab7e1 100644
--- a/src/ExponentialUtilities.jl
+++ b/src/ExponentialUtilities.jl
@@ -29,6 +29,7 @@ include("exp_noalloc.jl")
 include("exp_generic.jl")
 include("exp_sparse.jl")
 include("phi.jl")
+include("phi_almohy.jl")
 include("arnoldi.jl")
 include("krylov_phiv.jl")
 include("krylov_phiv_adaptive.jl")
@@ -41,7 +42,7 @@ include("precompile.jl")
 export phi, phi!, KrylovSubspace, arnoldi, arnoldi!, lanczos!, ExpvCache, PhivCache,
     expv, expv!, phiv, phiv!, kiops, expv_timestep, expv_timestep!, phiv_timestep,
     phiv_timestep!,
-    StegrCache, get_subspace_cache, exponential!
+    StegrCache, PhiPadeCache, get_subspace_cache, exponential!
 export ExpMethodHigham2005,
     ExpMethodHigham2005Base, ExpMethodGeneric, ExpMethodNative,
     ExpMethodDiagonalization
diff --git a/src/phi.jl b/src/phi.jl
index 8faa9eb2..010ee37c 100644
--- a/src/phi.jl
+++ b/src/phi.jl
@@ -103,9 +103,12 @@ The phi functions are defined as
 \\varphi_0(z) = \\exp(z),\\quad \\varphi_{k+1}(z) = \\frac{\\varphi_k(z) - \\varphi_k(0)}{z}
 ```
 
-Calls `phiv_dense` on each of the basis vectors to obtain the answer. If A
-is `Diagonal`, instead calls the scalar `phi` on each diagonal element and the
-return values are also `Diagonal`s
+For `Float64`/`ComplexF64` matrices this uses the scaling-and-recovering
+algorithm of Al-Mohy and Liu (arXiv:2506.01193), which computes all of
+`phi_0(A), ..., phi_k(A)` simultaneously in `O(k m^3)` operations. Other element
+types fall back to calling `phiv_dense` on each basis vector (`O(m (m+k)^3)`). If
+`A` is `Diagonal`, the scalar `phi` is instead applied to each diagonal element
+and the return values are also `Diagonal`s.
 """
 function phi(A::AbstractMatrix{T}, k; kwargs...) where {T <: Number}
     m = size(A, 1)
@@ -119,7 +122,28 @@ end
 """
     phi!(out,A,k[;caches]) -> out
 
-Non-allocating version of `phi` for non-diagonal matrix inputs.
+Non-allocating version of `phi` for non-diagonal matrix inputs, writing
+`phi_j(A)` into `out[j+1]`.
+
+For dense `Float64`/`ComplexF64` matrices, pass a reusable
+[`PhiPadeCache`](@ref) as `caches` to make repeated evaluations of the same
+size and order allocation-free (the linear solve runs through an embedded
+LinearSolve.jl cache):
+
+```julia
+cache = PhiPadeCache(A, k)
+phi!(out, A, k; caches = cache)
+```
+
+Numerical failure (a singular Padé denominator or non-finite result, only
+possible for pathological inputs such as matrices containing `NaN`/`Inf`) does
+not throw: the outputs are filled with `NaN` and, when a `PhiPadeCache` is
+used, `cache.info[]` is set to a nonzero return code (`0` on success). This
+lets adaptive integrators reject the step instead of aborting.
+
+For the legacy basis-vector algorithm, `caches` is instead the tuple
+`(Vector{T}(undef, m), Matrix{T}(undef, m, k+1), Matrix{T}(undef, m+k, m+k))`;
+supplying it forces that code path.
 """
 function phi!(
         out::Vector{Matrix{T}}, A::AbstractMatrix{T}, k::Integer; caches = nothing,
@@ -127,6 +151,27 @@ function phi!(
     ) where {T <: Number}
     m = size(A, 1)
     @assert length(out) == k + 1&&all(P -> size(P) == (m, m), out) "Dimension mismatch"
+    # The scaling-and-recovering algorithm of Al-Mohy and Liu (arXiv:2506.01193)
+    # computes phi_0..phi_k simultaneously in O(k m^3), versus O(m (m+k)^3) for
+    # the basis-vector approach below. Its Pade tables are tuned for double
+    # precision, so it is used only for Float64/ComplexF64. The legacy path is
+    # kept for other element types and when a caller supplies the legacy
+    # `caches` tuple.
+    if k >= 1 && T <: Union{Float64, ComplexF64} && !(caches isa Tuple)
+        if A isa StridedMatrix && (isnothing(caches) || caches isa PhiPadeCache)
+            cache = caches isa PhiPadeCache ? caches : PhiPadeCache(A, k)
+            return _phi_almohy!(out, A, k, cache)
+        elseif isnothing(caches) && !ismutable(A)
+            # Container-preserving path for immutable dense matrices (e.g.
+            # `SMatrix`); mutable non-strided types (e.g. sparse) fall through to
+            # the legacy path below.
+            Rm = _phi_almohy_generic(A, k)
+            for j in 1:(k + 1)
+                copyto!(out[j], Rm[j])
+            end
+            return out
+        end
+    end
     if isnothing(caches)
         e = Vector{T}(undef, m)
         W = Matrix{T}(undef, m, k + 1)
diff --git a/src/phi_almohy.jl b/src/phi_almohy.jl
new file mode 100644
index 00000000..057d3278
--- /dev/null
+++ b/src/phi_almohy.jl
@@ -0,0 +1,614 @@
+# Scaling-and-recovering algorithm for the matrix phi-functions, computing
+# phi_0(A), ..., phi_p(A) simultaneously in O(p*n^3) work.
+#
+# This is a port of Algorithm 5.1 of
+#
+#   Awad H. Al-Mohy and Xiaobo Liu, "A scaling and recovering algorithm for the
+#   matrix phi-functions", arXiv:2506.01193 (2025),
+#
+# and follows the authors' reference MATLAB implementation `phi_funm`.
+#
+# The idea: scale A down by 2^s, evaluate the [m/m] diagonal Pade approximant to
+# phi_p of the scaled matrix, recover the lower-index approximants via the
+# recurrence R^{(j)}_m(z) = z R^{(j+1)}_m(z) + 1/j!, then undo the scaling with
+# the double-argument formula
+#
+#   phi_j(2A) = 2^{-j} ( phi_0(A) phi_j(A) + sum_{k=1}^{j} phi_k(A)/(j-k)! ).
+#
+# Two implementations share the scalar machinery below:
+#   * `_phi_almohy!`  -- fully in-place via a reusable `PhiPadeCache` workspace
+#                        (for strided mutable matrices); the only per-call
+#                        allocation is the O(n) LU pivot vector.
+#   * `_phi_almohy_generic` -- out-of-place and element-container-preserving, so
+#                        a static input (e.g. `SMatrix`) stays static.
+
+using LinearSolve: LinearSolve, LinearProblem, GenericLUFactorization
+
+# theta[m, p] = theta_{m,p} from Table 3.1 of the paper (largest 1-norm of the
+# scaled matrix for which the [m/m] Pade approximant is backward stable to
+# double-precision unit roundoff). Rows are m = 1:20, columns p = 1:10.
+const _PHI_THETA_MP = [
+    1.999463452408407e-5 3.7631213142601604e-5 7.366006416045163e-5 0.00014973317297025854 0.0003152443333771182 0.0006855209983764435 0.0015357294906993542 0.0035357368946407606 0.008345789028062234 0.02013882808928226;
+    0.0038062018282832713 0.006090206286125726 0.009869682746779615 0.016211831146383013 0.026984240563843326 0.0454757968381803 0.07749273259855331 0.13324895779231027 0.2304625604362521 0.3988991104549146;
+    0.03971636005661334 0.05806968886880692 0.08534220076759817 0.12612151517169362 0.18736524835661617 0.27955116495245524 0.41822735418681667 0.6258702800351991 0.9335970443986562 1.1616793320890249;
+    0.15442675548312682 0.21278117034577634 0.29371996708854947 0.40617647304246707 0.5623843002320996 0.7787883505754265 1.0464245027100287 1.2572921799132364 1.480350861451984 1.713871003185325;
+    0.37980898016147974 0.5014624976587007 0.6621033824456818 0.8739858777744354 1.110828442901451 1.3375805277521873 1.5770955744229305 1.8273848586096524 2.08673114576671 2.3536689190936406;
+    0.726177195703321 0.9281910159646274 1.1591052927815408 1.4012982671152012 1.6570386251184455 1.924026602416338 2.20029216725632 2.4841706972729956 2.7742685942850143 3.069426294589578;
+    1.1898666361923196 1.4469917284172122 1.7187282676344413 2.0024009817133357 2.295729523138923 2.5968019781751672 2.9040336511997444 3.216121483847106 3.531999855119243 3.8508005672569903;
+    1.7605812331512907 2.060907194742016 2.371480030315257 2.690078917436836 3.014877598333601 3.3443898815994957 3.677415494646134 4.01299056190377 4.350344448873932 4.688863279363638;
+    2.425818958547233 2.7623053687512256 3.105174730511504 3.4527057639100476 3.8035234182863555 4.156538184296475 4.510892686686127 4.865916398054041 5.2210881567935115 5.57600564043454;
+    3.173113456793749 3.5393251225873685 3.9086764034913664 4.279911937822335 4.652058062541427 5.024366624424222 5.396268333440519 5.767334816025469 6.1372481669904 6.505776744744432;
+    3.991025201329815 4.3813599805888455 4.772204432206606 5.162703837030561 5.552220615863406 5.940286915538899 6.326566605567779 6.710825187972159 7.092906173082131 7.4727126397309815;
+    4.869485489784578 5.279199870248916 5.687344501175957 6.09339280009268 6.496977936247796 6.8978550856394785 7.295871912058635 7.690945666690415 8.083045525424248 8.472179024890941;
+    5.799827904547885 6.224971321619904 6.64693480650714 7.065451733211646 7.48036740469753 7.891610269499859 8.29916985381997 8.703079946679036 9.103405852177376 9.500234768451538;
+    6.774681996458441 7.211997240187581 7.644910128682907 8.073354767800579 8.497338949398504 8.916922998846443 9.332203860678709 9.74330319935391 10.150358556572542 10.5535168239286;
+    7.787817051572435 8.234634975085694 8.676142196766529 9.112424389503664 9.543611219824884 9.969861373150518 10.391351525913848 10.808268293369911 11.220802409044003 11.629144570605774;
+    8.833978214833847 9.28811956766922 9.736292792866143 10.178695474892383 10.615546677548995 11.04707680323746 11.473520338548882 11.895110742359616 12.312076916038997 12.724640835984562;
+    9.908733823133844 10.368423129623023 10.821685340709736 11.26879859255289 11.710045790427412 12.145708155762449 12.576060822320501 13.001369927621694 13.421890788016032 13.837866852904227;
+    11.008341035221061 11.472133539821815 11.929195701618417 12.379861767254168 12.824458619109002 13.263302077220441 13.696694607678852 14.124924034990169 14.548262963142152 14.966968689718191;
+    12.129631147167878 12.596352058386916 13.056160721729096 13.509428665638305 13.956511465304834 14.397747062985934 14.83345501195776 15.263936358537684 15.689473955602649 16.110333058994453;
+    13.269913405576162 13.738607964511967 14.20030229038679 14.655390835644507 15.104246216977073 15.547218984875734 15.984637960610213 16.41681094336813 16.84402564730148 17.266550769620974;
+]
+
+# Largest Pade degree used; bounds cond(D_m) (Section 4). Fixes the block-size
+# and power-buffer counts used to size the workspace.
+const _PHI_M_MAX = 12
+# max tau+1 over m <= _PHI_M_MAX, tau = ceil(sqrt(2m)); ceil(sqrt(24)) = 5.
+const _PHI_NPOW = 6
+# max i (Paterson--Stockmeyer cost index) for _PHI_M_MAX.
+const _PHI_IMAX = ceil(Int, sqrt(8 * (_PHI_M_MAX + 1)) - 3) - 1
+
+# theta_{m,p} with the p>7 rule of the paper: for p>7 the p=7 column is used, and
+# indices are guarded so m outside 1:20 does not error.
+@inline function _phi_theta(m::Integer, p::Integer)
+    return _PHI_THETA_MP[m, p > 7 ? 7 : p]
+end
+
+# k! as a Float64 (k small; avoids Int overflow for the recovery coefficients).
+@inline function _factf(k::Integer)
+    r = 1.0
+    for i in 2:k
+        r *= i
+    end
+    return r
+end
+
+# First-factor coefficient of the backward-error series h_{m,p} (Eq. (3.4)),
+#
+#   (m+p)! m! / ((2m+p)! (2m+p+1)!)
+#     = prod_{j=1}^{m} 1/(m+p+j) * prod_{j=1}^{m+p+1} 1/(m+j),
+#
+# accumulated as a product of sub-unit factors so no intermediate factorial can
+# spuriously overflow (each factor < 1, so the running product is monotonically
+# decreasing and underflows only when the true value does).
+@inline function _phi_be_coeff(m::Integer, p::Integer)
+    r = 1.0
+    for j in 1:m
+        r /= (m + p + j)
+    end
+    for j in 1:(m + p + 1)
+        r /= (m + j)
+    end
+    return r
+end
+
+"""
+    PhiPadeCache(A, p)
+
+Reusable workspace for the in-place phi-function evaluator [`phi!`](@ref) on
+strided `Float64`/`ComplexF64` matrices. Allocate once for a given size and `p`,
+then pass it as the `caches` keyword to `phi!` to reuse across calls. The
+linear solve runs through an embedded LinearSolve.jl cache (batched matrix
+right-hand side, `GenericLUFactorization`); reuse across calls is
+allocation-free — all buffers, including the LU pivots, live in the workspace
+and the LinearSolve cache.
+
+After each `phi!` call, `cache.info[]` holds a return code: `0` means success;
+a positive value indicates a numerically singular
+Padé denominator; `-1` means the result was non-finite (`NaN`/`Inf` input). Both
+failures only occur for pathological inputs, and no error is thrown — the
+outputs are filled with `NaN` so that adaptive integrators can detect the
+failed evaluation via `cache.info[] != 0` (or an `isfinite` check) and reject
+the step rather than abort.
+"""
+struct PhiPadeCache{T, RT, MT <: AbstractMatrix{T}, RMT <: AbstractMatrix{RT}, LS}
+    As::MT
+    Apow::Vector{MT}
+    Nm::MT
+    Dm::MT
+    Dfact::MT
+    rhs::MT
+    linsolve::LS
+    Naux::MT
+    Daux::MT
+    tmp::MT
+    pow1::MT
+    pow2::MT
+    absA::RMT
+    rvec1::Vector{RT}
+    rvec2::Vector{RT}
+    Ncoef::Vector{Float64}
+    Dcoef::Vector{Float64}
+    Amat::Matrix{Float64}
+    eta::Vector{Float64}
+    alpha::Vector{Float64}
+    tvals::Vector{Int}
+    Cost::Matrix{Float64}
+    info::typeof(Ref(0))
+end
+
+# Largest r for which alpha_r may be probed (Eq. (3.10) with phat = p; theta_{12,p}
+# >= 1 for all p, so phat = p always applies at m = _PHI_M_MAX).
+_phi_rmax(p::Integer) = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + p + 1))) / 2)
+
+function PhiPadeCache(A::AbstractMatrix{T}, p::Integer) where {T}
+    n = size(A, 1)
+    RT = real(T)
+    r_max = _phi_rmax(p)
+    mk() = Matrix{T}(undef, n, n)
+    Apow = Vector{Matrix{T}}(undef, _PHI_NPOW)
+    Apow[1] = Matrix{T}(I, n, n)
+    for i in 2:_PHI_NPOW
+        Apow[i] = mk()
+    end
+    Dfact = Matrix{T}(I, n, n)
+    rhs = zeros(T, n, n)
+    # GenericLUFactorization refactorizes in place with a cached pivot vector,
+    # so cache reuse is allocation-free (LUFactorization currently copies A on
+    # every dense refactorization). Dfact/rhs are workspace buffers refilled
+    # before each solve.
+    linsolve = LinearSolve.init(LinearProblem(Dfact, rhs), GenericLUFactorization())
+    return PhiPadeCache{T, RT, Matrix{T}, Matrix{RT}, typeof(linsolve)}(
+        mk(), Apow, mk(), mk(), Dfact, rhs, linsolve,
+        mk(), mk(), mk(), mk(), mk(),
+        Matrix{RT}(undef, n, n),
+        Vector{RT}(undef, n), Vector{RT}(undef, n),
+        Vector{Float64}(undef, _PHI_M_MAX + 1), Vector{Float64}(undef, _PHI_M_MAX + 1),
+        Matrix{Float64}(undef, _PHI_M_MAX + 1, _PHI_M_MAX + 1),
+        Vector{Float64}(undef, r_max), Vector{Float64}(undef, r_max),
+        Vector{Int}(undef, _PHI_IMAX + 1),
+        Matrix{Float64}(undef, _PHI_IMAX + 1, r_max - 1),
+        Ref(0),
+    )
+end
+
+@noinline function _phi_cache_check(cache::PhiPadeCache, n::Integer, r_max::Integer)
+    size(cache.As, 1) == n || throw(
+        DimensionMismatch(
+            "PhiPadeCache was constructed for size $(size(cache.As, 1)) but A has size $n"
+        )
+    )
+    length(cache.eta) >= r_max || throw(
+        ArgumentError(
+            "PhiPadeCache was constructed for a smaller p; construct it with PhiPadeCache(A, p) for the p in use"
+        )
+    )
+    return nothing
+end
+
+# Renormalized [m/m] Pade coefficients (numerator and denominator, low order
+# first) of the approximant to phi_p, written into the preallocated `Ncoef`,
+# `Dcoef` (length >= m+1) using scratch `Amat` (size >= (m+1)^2). Follows the
+# recurrences of Berland, Skaflestad, and Wright used in the reference
+# `pade_coef`.
+function _phi_pade_coef!(Ncoef, Dcoef, Amat, m::Integer, p::Integer)
+    n1 = 1.0                                # (2m+1)!/m!
+    for i in (m + 1):(2m + 1)
+        n1 *= i
+    end
+    d1 = n1
+    for k in 1:p
+        if k == p
+            # First column: Amat[r,1] = n1 * prod_{l=1}^{r-1} 1/(k+l).
+            cp = 1.0
+            Amat[1, 1] = n1
+            for r in 2:(m + 1)
+                cp /= (k + (r - 1))
+                Amat[r, 1] = n1 * cp
+            end
+            for c in 2:(m + 1), r in 2:(m + 1)
+                I = r - 1
+                J = c - 1
+                Amat[r, c] = -(m + 1 - J) * (k + 1 + I - J) / ((2m + k + 1 - J) * J)
+            end
+            for r in 1:(m + 1)
+                s = 0.0
+                pr = 1.0
+                for c in 1:r
+                    pr *= Amat[r, c]
+                    s += pr
+                end
+                Ncoef[r] = s
+            end
+            Dcoef[1] = d1
+            cpd = 1.0
+            for r in 2:(m + 1)
+                i = r - 1
+                cpd *= -(m + 1 - i) / (i * (2m + k + 1 - i))
+                Dcoef[r] = d1 * cpd
+            end
+        end
+        n1 = n1 * (2m + k + 1) / (k + 1)
+        d1 = d1 * (2m + k + 1)
+    end
+    return Ncoef, Dcoef
+end
+
+# Given the alpha_r sequence (first `nalpha` entries) and precomputed scaling
+# parameters `tvals[i+1] = t(m_i)`, minimize the equivalent-matrix-product cost
+# (Section 4) over the optimal degrees and return the chosen (m, s, tau). `Cost`
+# is scratch of size >= (imax+1) x (r_max-1).
+function _phi_select_from_alpha(alpha, nalpha::Integer, p::Integer, tvals, Cost)
+    i_max = _PHI_IMAX
+    r_max = nalpha + 1
+    C = @view Cost[1:(i_max + 1), 1:(r_max - 1)]
+    fill!(C, 0.0)
+    for i in 0:i_max
+        m_i = (i + 3)^2 ÷ 8
+        θ = _phi_theta(m_i, p)
+        phat_i = θ < 1 ? 0 : p
+        t = tvals[i + 1]
+        for r in 2:r_max
+            if 2m_i + phat_i + 1 >= r * (r - 1)
+                a = alpha[r - 1]
+                s0 = (a > 0 && isfinite(a)) ? max(ceil(Int, log2(a / θ)), t) : t
+                C[i + 1, r - 1] = i + p + s0 * (p + 1)
+            end
+        end
+    end
+    nrows = i_max + 1
+    ncols = r_max - 1
+    minval = Inf
+    for v in C
+        (v > 0 && v < minval) && (minval = v)
+    end
+    # Column-major "last" tie-break (matches the reference), giving the row index.
+    i_star = 0
+    for cc in 1:ncols, rr in 1:nrows
+        C[rr, cc] == minval && (i_star = rr - 1)
+    end
+    m = (i_star + 3)^2 ÷ 8
+    s = round(Int, (minval - i_star - p) / (p + 1))
+    tau = floor(Int, sqrt(2m))
+    if tau - 1 + 2 * (m ÷ tau) - 2 * (m % tau == 0) != i_star
+        tau = ceil(Int, sqrt(2m))
+    end
+    return m, s, tau
+end
+
+## -------------------- in-place (workspace) implementation --------------------
+
+# Exact 1-norm of B^k for entrywise-nonnegative real B, ‖B^k‖_1 = ‖(B^T)^k 𝟙‖_∞,
+# via k matrix-vector products into the workspace vectors.
+function _normpow_nonneg!(cache::PhiPadeCache, B, k::Integer)
+    v = cache.rvec1
+    tmp = cache.rvec2
+    fill!(v, 1)
+    Bt = transpose(B)
+    for _ in 1:k
+        mul!(tmp, Bt, v)
+        v, tmp = tmp, v
+    end
+    return maximum(v)
+end
+
+# Scaling parameter `t` from the first term of the backward-error series
+# (Eq. (3.12)); `ell` subfunction of the reference, in-place. `normT` is the
+# caller-computed `opnorm(A, 1)` (hoisted out of the per-degree loop).
+function _phi_ell!(cache::PhiPadeCache, A, normT, m::Integer, p::Integer, phat::Integer)
+    normT == 0 && return 0
+    t0 = normT > 1 ? log2(normT) : 0.0
+    scalefac = exp2(t0)
+    normTs = normT / scalefac
+    delta = (p - 1) * (p - phat) / p + 1
+    K = 2m + p + 1
+    c = (_phi_be_coeff(m, p) / normTs^delta)^(1 / K)
+    absA = cache.absA
+    @. absA = c * abs(A) / scalefac
+    alpha = _normpow_nonneg!(cache, absA, K)
+    t = log2(2alpha / eps(Float64)) / (K - delta) + t0
+    # NaN/Inf inputs make t non-finite; return 0 (never throw) and let the NaN
+    # propagate to the output, where step-rejection logic can see it.
+    isfinite(t) || return 0
+    return max(ceil(Int, t), 0)
+end
+
+function _select_parameters_phi!(cache::PhiPadeCache, A, p::Integer)
+    phat = _phi_theta(_PHI_M_MAX, p) < 1 ? 0 : p
+    r_max = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + phat + 1))) / 2)
+    _phi_cache_check(cache, size(A, 1), r_max)
+    eta = cache.eta
+    mul!(cache.pow1, A, A)
+    eta[1] = opnorm(cache.pow1, 1)^(1 / 2)
+    cur, oth = cache.pow1, cache.pow2
+    for j in 2:r_max
+        mul!(oth, cur, A)
+        eta[j] = opnorm(oth, 1)^(1 / (j + 1))
+        cur, oth = oth, cur
+    end
+    alpha = cache.alpha
+    for j in 1:(r_max - 1)
+        alpha[j] = max(eta[j], eta[j + 1])
+    end
+    normT = opnorm(A, 1)
+    for i in 0:_PHI_IMAX
+        m_i = (i + 3)^2 ÷ 8
+        phat_i = _phi_theta(m_i, p) < 1 ? 0 : p
+        cache.tvals[i + 1] = _phi_ell!(cache, A, normT, m_i, p, phat_i)
+    end
+    return _phi_select_from_alpha(alpha, r_max - 1, p, cache.tvals, cache.Cost)
+end
+
+# Add block i's coefficient combination, sum_l coef[i*tau + l] * As^{l-1}, to M.
+function _phi_ps_addblock!(M, coef, Apow, i::Integer, tau::Integer, m::Integer)
+    start = i * tau + 1
+    stop = min((i + 1) * tau, m + 1)
+    for l in 1:(stop - start + 1)
+        c = coef[start + l - 1]
+        P = Apow[l]
+        @. M += c * P
+    end
+    return M
+end
+
+# Evaluate N(As) and D(As) via Paterson--Stockmeyer in Horner form,
+#
+#   q(As) = ( (B_nu * Atau + B_{nu-1}) * Atau + ... ) * Atau + B_0,
+#
+# which needs nu products per polynomial -- one fewer when tau | m, since then
+# B_nu = c_m*I and the first fold is the broadcast c_m*Atau + B_{nu-1}. Together
+# with the tau-1 explicit powers, the multiplication count is exactly Fasi's
+# pi_m(tau) = tau - 1 + 2*floor(m/tau) - 2*(tau | m) that the parameter
+# selection optimizes over. Returns (N, D) as views into cache buffers.
+function _paterson_stockmeyer!(cache::PhiPadeCache, As, m::Integer, tau::Integer)
+    Nc, Dc, Apow = cache.Ncoef, cache.Dcoef, cache.Apow
+    copyto!(Apow[2], As)               # Apow[1] = I (set once); Apow[i] = As^{i-1}
+    for i in 3:(tau + 1)
+        mul!(Apow[i], Apow[i - 1], As)
+    end
+    Atau = Apow[tau + 1]
+    N, Naux = cache.Nm, cache.Naux
+    D, Daux = cache.Dm, cache.Daux
+    nu = m ÷ tau
+    if m % tau == 0
+        cN, cD = Nc[m + 1], Dc[m + 1]
+        @. N = cN * Atau
+        @. D = cD * Atau
+        _phi_ps_addblock!(N, Nc, Apow, nu - 1, tau, m)
+        _phi_ps_addblock!(D, Dc, Apow, nu - 1, tau, m)
+        inext = nu - 2
+    else
+        fill!(N, 0)
+        fill!(D, 0)
+        _phi_ps_addblock!(N, Nc, Apow, nu, tau, m)
+        _phi_ps_addblock!(D, Dc, Apow, nu, tau, m)
+        inext = nu - 1
+    end
+    for i in inext:-1:0
+        mul!(Naux, N, Atau)
+        mul!(Daux, D, Atau)
+        _phi_ps_addblock!(Naux, Nc, Apow, i, tau, m)
+        _phi_ps_addblock!(Daux, Dc, Apow, i, tau, m)
+        N, Naux = Naux, N
+        D, Daux = Daux, D
+    end
+    return N, D
+end
+
+# Solve Dm * X = Nm through the workspace's LinearSolve cache (batched matrix
+# right-hand side, LinearSolve >= 4), writing X into `X`. Returns an info code
+# instead of throwing: 0 on success; 1 on a numerically singular Dm
+# (pathological input, e.g. NaN/Inf), in which case `X` is filled with NaN so
+# downstream step-rejection logic can detect the failure.
+function _phi_solve!(cache::PhiPadeCache, Dm, Nm, X)
+    copyto!(cache.Dfact, Dm)
+    copyto!(cache.rhs, Nm)
+    linsolve = cache.linsolve
+    linsolve.A = cache.Dfact
+    linsolve.b = cache.rhs
+    sol = LinearSolve.solve!(linsolve)
+    if sol.retcode == LinearSolve.ReturnCode.Success
+        copyto!(X, sol.u)
+        return 0
+    else
+        fill!(X, NaN)
+        return 1
+    end
+end
+
+"""
+    _phi_almohy!(out, A, p, cache) -> out
+
+In-place evaluation of `phi_0(A), ..., phi_p(A)` for a strided
+`Float64`/`ComplexF64` matrix `A` (`p >= 1`), writing `phi_j(A)` into `out[j+1]`
+and using the reusable [`PhiPadeCache`](@ref) `cache` for all scratch storage.
+The `out` matrices double as the recovery buffers.
+"""
+function _phi_almohy!(
+        out::AbstractVector{<:AbstractMatrix}, A::AbstractMatrix, p::Integer,
+        cache::PhiPadeCache
+    )
+    n = size(A, 1)
+    m, s, tau = _select_parameters_phi!(cache, A, p)
+    As = cache.As
+    @. As = A / exp2(s)
+    _phi_pade_coef!(cache.Ncoef, cache.Dcoef, cache.Amat, m, p)
+    Nm, Dm = _paterson_stockmeyer!(cache, As, m, tau)
+
+    info = _phi_solve!(cache, Dm, Nm, out[p + 1])
+    if info == 0 && !all(isfinite, out[p + 1])
+        # NaN/Inf inputs reach here with info == 0 (the LU does not flag NaN
+        # pivots); report them so integrators can test cache.info[] alone.
+        info = -1
+    end
+    cache.info[] = info
+    if info != 0
+        # Return code instead of an exception: fill the outputs with NaN so an
+        # adaptive integrator can reject the step rather than crash.
+        for j in 1:(p + 1)
+            fill!(out[j], NaN)
+        end
+        return out
+    end
+
+    # Recurrence (2.9): R^{(j)} = As R^{(j+1)} + I/j!, j = p-1 : -1 : 0.
+    for k in p:-1:1
+        mul!(out[k], As, out[k + 1])
+        f = 1 / _factf(k - 1)
+        @inbounds for d in 1:n
+            out[k][d, d] += f
+        end
+    end
+
+    # Undo the scaling with the double-argument formula (2.10), s times.
+    tmp = cache.tmp
+    for _ in 1:s
+        for j in p:-1:1
+            mul!(tmp, out[1], out[j + 1])
+            for k in 1:j
+                @. tmp += (1 / _factf(j - k)) * out[k + 1]
+            end
+            tmp ./= exp2(j)
+            copyto!(out[j + 1], tmp)
+        end
+        mul!(tmp, out[1], out[1])
+        copyto!(out[1], tmp)
+    end
+    return out
+end
+
+## ----------------- generic (container-preserving) implementation -------------
+
+function _normpow_nonneg(B::AbstractMatrix{<:Real}, k::Integer)
+    n = size(B, 1)
+    Bt = transpose(B)
+    v = ones(eltype(B), n)
+    tmp = similar(v)
+    for _ in 1:k
+        mul!(tmp, Bt, v)
+        v, tmp = tmp, v
+    end
+    return maximum(v)
+end
+
+function _phi_ell(A::AbstractMatrix, normT, m::Integer, p::Integer, phat::Integer)
+    normT == 0 && return 0
+    t0 = normT > 1 ? log2(normT) : 0.0
+    scalefac = exp2(t0)
+    normTs = normT / scalefac
+    delta = (p - 1) * (p - phat) / p + 1
+    K = 2m + p + 1
+    c = (_phi_be_coeff(m, p) / normTs^delta)^(1 / K)
+    scaledT = c .* abs.(A ./ scalefac)
+    alpha = _normpow_nonneg(scaledT, K)
+    t = log2(2alpha / eps(Float64)) / (K - delta) + t0
+    isfinite(t) || return 0
+    return max(ceil(Int, t), 0)
+end
+
+function _select_parameters_phi(A::AbstractMatrix, p::Integer)
+    phat = _phi_theta(_PHI_M_MAX, p) < 1 ? 0 : p
+    r_max = floor(Int, (1 + sqrt(1 + 4 * (2 * _PHI_M_MAX + phat + 1))) / 2)
+    eta = Vector{Float64}(undef, r_max)
+    P = A * A
+    eta[1] = opnorm(P, 1)^(1 / 2)
+    for j in 2:r_max
+        P = P * A
+        eta[j] = opnorm(P, 1)^(1 / (j + 1))
+    end
+    alpha = [max(eta[j], eta[j + 1]) for j in 1:(r_max - 1)]
+    tvals = Vector{Int}(undef, _PHI_IMAX + 1)
+    normT = opnorm(A, 1)
+    for i in 0:_PHI_IMAX
+        m_i = (i + 3)^2 ÷ 8
+        phat_i = _phi_theta(m_i, p) < 1 ? 0 : p
+        tvals[i + 1] = _phi_ell(A, normT, m_i, p, phat_i)
+    end
+    Cost = zeros(Float64, _PHI_IMAX + 1, r_max - 1)
+    return _phi_select_from_alpha(alpha, r_max - 1, p, tvals, Cost)
+end
+
+# Out-of-place counterpart of `_phi_ps_addblock!`: returns `init` plus block i's
+# coefficient combination.
+function _phi_ps_block(init, coef, Apow, i::Integer, tau::Integer, m::Integer)
+    start = i * tau + 1
+    stop = min((i + 1) * tau, m + 1)
+    M = init
+    for l in 1:(stop - start + 1)
+        M = M + coef[start + l - 1] * Apow[l]
+    end
+    return M
+end
+
+# Out-of-place Horner-form Paterson--Stockmeyer; same multiplication count as
+# the in-place version (Fasi's pi_m(tau)).
+function _paterson_stockmeyer(As, Nc, Dc, tau::Integer, m::Integer)
+    MT = typeof(As)
+    Apow = Vector{MT}(undef, tau + 1)
+    Apow[1] = one(As)
+    Apow[2] = As
+    for i in 3:(tau + 1)
+        Apow[i] = Apow[i - 1] * As
+    end
+    Atau = Apow[tau + 1]
+    nu = m ÷ tau
+    local N::MT, D::MT
+    if m % tau == 0
+        N = _phi_ps_block(Nc[m + 1] * Atau, Nc, Apow, nu - 1, tau, m)
+        D = _phi_ps_block(Dc[m + 1] * Atau, Dc, Apow, nu - 1, tau, m)
+        inext = nu - 2
+    else
+        N = _phi_ps_block(zero(As), Nc, Apow, nu, tau, m)
+        D = _phi_ps_block(zero(As), Dc, Apow, nu, tau, m)
+        inext = nu - 1
+    end
+    for i in inext:-1:0
+        N = _phi_ps_block(N * Atau, Nc, Apow, i, tau, m)
+        D = _phi_ps_block(D * Atau, Dc, Apow, i, tau, m)
+    end
+    return N, D
+end
+
+"""
+    _phi_almohy_generic(A, p) -> Vector
+
+Container-preserving evaluation of `phi_0(A), ..., phi_p(A)` (`p >= 1`): the
+returned matrices have the same type as `A` (e.g. an `SMatrix` input yields
+`SMatrix` results). Used for immutable/static matrices, where the in-place
+workspace path does not apply.
+"""
+function _phi_almohy_generic(A::AbstractMatrix, p::Integer)
+    m, s, tau = _select_parameters_phi(A, p)
+    As = A ./ exp2(s)
+    m1 = m + 1
+    Ncoef = Vector{Float64}(undef, m1)
+    Dcoef = Vector{Float64}(undef, m1)
+    Amat = Matrix{Float64}(undef, m1, m1)
+    _phi_pade_coef!(Ncoef, Dcoef, Amat, m, p)
+    Nm, Dm = _paterson_stockmeyer(As, Ncoef, Dcoef, tau, m)
+
+    Rm = Vector{typeof(As)}(undef, p + 1)
+    # No-throw on a numerically singular denominator (pathological input): NaN
+    # results let adaptive integrators reject the step instead of aborting.
+    Rm[p + 1] = try
+        Dm \ Nm
+    catch e
+        e isa SingularException || rethrow()
+        zero(As) .+ eltype(As)(NaN)
+    end
+    Id = one(As)
+    for k in p:-1:1
+        Rm[k] = As * Rm[k + 1] + Id / _factf(k - 1)
+    end
+    for _ in 1:s
+        for j in p:-1:1
+            M = Rm[1] * Rm[j + 1]
+            for k in 1:j
+                M += Rm[k + 1] / _factf(j - k)
+            end
+            Rm[j + 1] = M / exp2(j)
+        end
+        Rm[1] = Rm[1] * Rm[1]
+    end
+    return Rm
+end
diff --git a/test/basictests.jl b/test/basictests.jl
index 539ec2cc..faf721c1 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -256,6 +256,151 @@ end
     end
 end
 
+@testset "Phi (Al-Mohy--Liu vs reference)" begin
+    # Independent reference: phi_0..phi_p are the first block row of exp(W) for
+    # the augmented block matrix W = [A E; 0 J] (Al-Mohy--Liu, Theorem 2.1), with
+    # E = [I 0 ... 0] and J the nilpotent shift. This is unrelated to the
+    # scaling-and-recovering algorithm under test.
+    function phi_block_reference(A, p)
+        # `local`: without it these assignments capture and clobber the
+        # enclosing testset's variables of the same name (closures in a local
+        # scope assign to existing outer locals).
+        local n = size(A, 1)
+        local T = eltype(A)
+        W = zeros(T, n * (p + 1), n * (p + 1))
+        W[1:n, 1:n] .= A
+        for j in 0:(p - 1)
+            rb = j * n
+            W[(rb + 1):(rb + n), (rb + n + 1):(rb + 2n)] .= Matrix{T}(I, n, n)
+        end
+        eW = exp(W)
+        return [eW[1:n, (j * n + 1):((j + 1) * n)] for j in 0:p]
+    end
+
+    # Force the legacy basis-vector path (Sidje/`phiv_dense`) for cross-checking.
+    function phi_legacy(A, p)
+        local n = size(A, 1)
+        local T = eltype(A)
+        local caches = (
+            Vector{T}(undef, n), Matrix{T}(undef, n, p + 1),
+            Matrix{T}(undef, n + p, n + p),
+        )
+        local out = [Matrix{T}(undef, n, n) for _ in 1:(p + 1)]
+        return phi!(out, A, p; caches = caches)
+    end
+
+    Random.seed!(20250701)
+    testmats = Dict(
+        "small 2x2" => [0.1 0.2; 0.3 0.4],
+        "random 8x8" => randn(8, 8) ./ 4,
+        "nonnormal 10x10" => 3 * (triu(randn(10, 10), 1) + Diagonal(randn(10))),
+        "large-norm 6x6" => 6 * randn(6, 6),
+        "complex 5x5" => randn(ComplexF64, 5, 5),
+        "hessenberg 12x12" => Matrix(hessenberg(randn(12, 12)).H),
+        "zero 4x4" => zeros(4, 4),
+    )
+    for (name, A) in testmats, p in (1, 2, 3, 5)
+        ref = phi_block_reference(A, p)
+        new = phi(A, p)                 # default path == Al-Mohy--Liu
+        leg = phi_legacy(A, p)
+        for j in 1:(p + 1)
+            @test new[j] ≈ ref[j] rtol = 1.0e-8 atol = 1.0e-12
+            @test new[j] ≈ leg[j] rtol = 1.0e-7 atol = 1.0e-10
+        end
+    end
+
+    # phi_0 must equal the matrix exponential.
+    A = randn(9, 9) ./ 3
+    @test phi(A, 4)[1] ≈ exp(A)
+
+    # Large p exercises the recurrence form of the backward-error coefficient
+    # (naive factorial ratios overflow long before their quotient does).
+    A = randn(3, 3)
+    P30 = phi(A, 30)
+    ref30 = phi_block_reference(A, 30)
+    for j in 1:31
+        @test P30[j] ≈ ref30[j] rtol = 1.0e-8 atol = 1.0e-13
+    end
+
+    # Preallocated in-place entry point returns phi_j in out[j+1].
+    n = 7
+    A = randn(n, n) ./ 3
+    out = [Matrix{Float64}(undef, n, n) for _ in 1:4]
+    phi!(out, A, 3)
+    @test out ≈ phi_block_reference(A, 3)
+
+    # Reusable workspace: same result, and reuse is allocation-free (the linear
+    # solve runs through the workspace's LinearSolve cache, which owns the LU
+    # pivot storage).
+    cache = PhiPadeCache(A, 3)
+    out2 = [Matrix{Float64}(undef, n, n) for _ in 1:4]
+    phi!(out2, A, 3; caches = cache)
+    @test out2 ≈ out
+    run_phi!(o, M, c) = phi!(o, M, 3; caches = c)
+    run_phi!(out2, A, cache)                           # warm up / compile
+    @test (@allocated run_phi!(out2, A, cache)) == 0
+
+    # Complex-matrix workspace path, also allocation-free on reuse.
+    Ac = randn(ComplexF64, 6, 6) ./ 3
+    cachec = PhiPadeCache(Ac, 2)
+    outc = [Matrix{ComplexF64}(undef, 6, 6) for _ in 1:3]
+    phi!(outc, Ac, 2; caches = cachec)
+    @test outc ≈ phi_block_reference(Ac, 2)
+    @test cachec.info[] == 0
+    run_phic!(o, M, c) = phi!(o, M, 2; caches = c)
+    run_phic!(outc, Ac, cachec)
+    @test (@allocated run_phic!(outc, Ac, cachec)) == 0
+
+    # Numerical failure must not throw: NaN input yields NaN-filled outputs and
+    # a nonzero return code in cache.info[], so adaptive integrators can reject
+    # the step instead of aborting.
+    Anan = fill(NaN, 5, 5)
+    Pnan = phi(Anan, 2)
+    @test all(P -> any(!isfinite, P), Pnan)
+    cachenan = PhiPadeCache(Anan, 2)
+    outnan = [Matrix{Float64}(undef, 5, 5) for _ in 1:3]
+    phi!(outnan, Anan, 2; caches = cachenan)
+    @test cachenan.info[] != 0
+    @test all(P -> all(isnan, P), outnan)
+    # ... and a subsequent good call resets the code.
+    phi!(outnan, randn(5, 5), 2; caches = cachenan)
+    @test cachenan.info[] == 0
+
+    # Wrongly sized workspaces are programmer errors, not numerical failures,
+    # and do throw.
+    A3 = randn(3, 3)
+    out3 = [Matrix{Float64}(undef, 3, 3) for _ in 1:4]
+    @test_throws DimensionMismatch phi!(out3, A3, 3; caches = cache)
+    cache_p1 = PhiPadeCache(A, 1)
+    out11 = [Matrix{Float64}(undef, n, n) for _ in 1:11]
+    @test_throws ArgumentError phi!(out11, A, 10; caches = cache_p1)
+end
+
+@testset "Phi static arrays (container-preserving)" begin
+    Random.seed!(4321)
+    function phi_block_reference(A, p)
+        local n = size(A, 1)
+        local T = eltype(A)
+        W = zeros(T, n * (p + 1), n * (p + 1))
+        W[1:n, 1:n] .= A
+        for j in 0:(p - 1)
+            rb = j * n
+            W[(rb + 1):(rb + n), (rb + n + 1):(rb + 2n)] .= Matrix{T}(I, n, n)
+        end
+        eW = exp(W)
+        return [eW[1:n, (j * n + 1):((j + 1) * n)] for j in 0:p]
+    end
+    for N in (3, 5), p in (1, 2, 3)
+        A = SMatrix{N, N}(randn(N, N) ./ 3)
+        Rm = ExponentialUtilities._phi_almohy_generic(A, p)
+        @test all(r -> r isa SMatrix, Rm)        # static in, static out
+        ref = phi_block_reference(Matrix(A), p)
+        for j in 1:(p + 1)
+            @test Matrix(Rm[j]) ≈ ref[j] rtol = 1.0e-8 atol = 1.0e-12
+        end
+    end
+end
+
 @testset "Static Arrays" begin
     Random.seed!(0)
     for N in (3, 4, 6, 8), t in (0.1, 1.0, 10.0)