diff --git a/Project.toml b/Project.toml
index 548b834d..d4babfbb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -66,4 +66,5 @@ SparseArrays = "1"
 SpecialFunctions = "2"
 StaticArrays = "1"
 TOML = "1"
-julia = "1.9"
+# Require >=1.11 so that resize! has the shrink=false option
+julia = "1.11"
diff --git a/src/api.jl b/src/api.jl
index 26cbccf1..dc10a2be 100644
--- a/src/api.jl
+++ b/src/api.jl
@@ -358,6 +358,27 @@ function volume_potential(; op, target, source::Quadrature, compression, correct
             correction.maxdist,
             interpolation_order,
         )
+    elseif correction.method == :ldim
+        loc = target === source ? :inside : correction.target_location
+        μ = _green_multiplier(loc)
+        green_multiplier = fill(μ, length(target))
+        shift = Val(true)
+        form = get(correction, :form, :contraction)
+        δV = local_vdim_correction(
+            op,
+            eltype(V),
+            target,
+            source,
+            correction.mesh,
+            correction.bdry_nodes;
+            green_multiplier,
+            correction.maxdist,
+            correction.interpolation_order,
+            correction.quadrature_order,
+            correction.meshsize,
+            shift,
+            form,
+        )
     else
         error("Unknown correction method. Available options: $CORRECTION_METHODS")
     end
diff --git a/src/mesh.jl b/src/mesh.jl
index feb5acb7..fa0ce531 100644
--- a/src/mesh.jl
+++ b/src/mesh.jl
@@ -51,11 +51,11 @@ for a given mesh).
 function elements end
 
 """
-    element_types(msh::AbstractMesh)
+    elements(msh::AbstractMesh,E::DataType)
 
-Return the element types present in the `msh`.
+Return an iterator for all elements of type `E` on a mesh `msh`.
 """
-function element_types end
+elements(msh::AbstractMesh, E::DataType) = ElementIterator{E, typeof(msh)}(msh)
 
 """
     struct Mesh{N,T} <: AbstractMesh{N,T}
@@ -608,6 +608,69 @@ function connectivity(msh::SubMesh, E::DataType)
     return map(t -> g2l[t], view(msh.parent.etype2mat[E], :, eltags))
 end
 
+"""
+    Domain(f::Function, msh::AbstractMesh)
+
+Call `Domain(f, ents)` on `ents = entities(msh).`
+"""
+Domain(f::Function, msh::AbstractMesh) = Domain(f, entities(msh))
+
+"""
+    topological_neighbors(msh::LagrangeMesh, k=1)
+
+Return the `k` neighbors of each element in `msh`. The one-neighbors are the
+elements that share a common vertex with the element, `k` neighbors are the
+one-neighbors of the `k-1` neighbors.
+
+This function returns a dictionary where the key is an `(Eᵢ,i)` tuple denoting
+the element `i` of type `E` in the mesh, and the value is a set of tuples
+`(Eⱼ,j)` where `Eⱼ` is the type of the element and `j` its index.
+"""
+function topological_neighbors(msh::AbstractMesh, k = 1)
+    # dictionary mapping a node index to all elements containing it. Note
+    # that the elements are stored as a tuple (type, index)
+    T = Tuple{DataType, Int}
+    node2els = OrderedDict{Int, Vector{T}}()
+    for E in element_types(msh)
+        mat = connectivity(msh, E)::Matrix{Int} # connectivity matrix
+        np, Nel = size(mat)
+        for n in 1:Nel
+            for i in 1:np
+                idx = mat[i, n]
+                els = get!(node2els, idx, Vector{T}())
+                push!(els, (E, n))
+            end
+        end
+    end
+    # now revert the map to get the neighbors
+    one_neighbors = OrderedDict{T, Set{T}}()
+    for (_, els) in node2els
+        for el in els
+            nei = get!(one_neighbors, el, Set{T}())
+            for el′ in els
+                push!(nei, el′)
+            end
+        end
+    end
+    # Recursively compute the neighbors from the one-neighbors
+    if k > 1
+        k_neighbors = deepcopy(one_neighbors)
+    else
+        k_neighbors = one_neighbors
+    end
+    while k > 1
+        # update neighborhood of each element
+        for el in keys(one_neighbors)
+            knn = k_neighbors[el]
+            for el′ in copy(knn)
+                union!(knn, one_neighbors[el′])
+            end
+        end
+        k -= 1
+    end
+    return k_neighbors
+end
+
 """
     near_interaction_list(X,Y::AbstractMesh; tol)
 
@@ -643,12 +706,9 @@ end
     return inrange(balltree, centers, tol)
 end
 
-"""
-    Domain(f::Function, msh::AbstractMesh)
+function viz_elements(args...; kwargs...) end
 
-Call `Domain(f, ents)` on `ents = entities(msh).`
-"""
-Domain(f::Function, msh::AbstractMesh) = Domain(f, entities(msh))
+function viz_elements_bords(args...; kwargs...) end
 
 function node2etags(msh)
     # dictionary mapping a node index to all elements containing it. Note
diff --git a/src/nystrom.jl b/src/nystrom.jl
index 86af35fc..8462f4e6 100644
--- a/src/nystrom.jl
+++ b/src/nystrom.jl
@@ -76,15 +76,18 @@ kernel(iop::IntegralOperator) = iop.kernel
 target(iop::IntegralOperator) = iop.target
 source(iop::IntegralOperator) = iop.source
 
-function IntegralOperator(k, X, Y::Quadrature = X)
+function IntegralOperator(k, X, Y = X)
     # check that all entities in the quadrature are of the same dimension
-    if !allequal(geometric_dimension(ent) for ent in entities(Y))
-        msg = "entities in the target quadrature have different geometric dimensions"
-        throw(ArgumentError(msg))
+    if Y isa Quadrature && !isnothing(Y.mesh)
+        if !allequal(geometric_dimension(ent) for ent in entities(Y))
+            msg = "entities in the target quadrature have different geometric dimensions"
+            throw(ArgumentError(msg))
+        end
     end
     T = return_type(k, eltype(X), eltype(Y))
-    msg = """IntegralOperator of nonbits being created: $T"""
-    isbitstype(T) || (@warn msg)
+    # FIXME This cripples performance for local VDIM
+    #msg = """IntegralOperator of nonbits being created: $T"""
+    #isbitstype(T) || (@warn msg)
     return IntegralOperator{T, typeof(k), typeof(X), typeof(Y)}(k, X, Y)
 end
 
@@ -108,16 +111,27 @@ function assemble_matrix(iop::IntegralOperator; threads = true)
     else
         Array{T}(undef, m, n)
     end
-    K = kernel(iop)
+    return assemble_matrix!(out, iop; threads)
+end
+
+"""
+    assemble_matrix!(out, iop::IntegralOperator; threads = true)
+
+In-place version of [`assemble_matrix`](@ref): fill `out` (an `m × n`
+`AbstractMatrix`, which may be a `view` into a larger preallocated buffer) with
+the dense representation of `iop`. No allocation of the output is performed.
+"""
+function assemble_matrix!(out, iop::IntegralOperator; threads = true)
+    @assert size(out) == size(iop) "`out` must have size $(size(iop)), got $(size(out))"
     # function barrier
-    _assemble_matrix!(out, K, iop.target, iop.source, threads)
+    _assemble_matrix!(out, kernel(iop), iop.target, iop.source, threads)
     return out
 end
 
 @noinline function _assemble_matrix!(out, K, X, Y::Quadrature, threads)
     @usethreads threads for j in 1:length(Y)
         for i in 1:length(X)
-            out[i, j] = K(X[i], Y[j]) * weight(Y[j])
+            @inbounds out[i, j] = K(X[i], Y[j]) * weight(Y[j])
         end
     end
     return out
diff --git a/src/quadrature.jl b/src/quadrature.jl
index 40588f0e..b42363f3 100644
--- a/src/quadrature.jl
+++ b/src/quadrature.jl
@@ -56,6 +56,8 @@ function Base.show(io::IO, q::QuadratureNode)
     return print(io, "-- weight: $(q.weight)")
 end
 
+const Maybe{T} = Union{T, Nothing}
+
 """
     struct Quadrature{N,T} <: AbstractVector{QuadratureNode{N,T}}
 
@@ -63,7 +65,7 @@ A collection of [`QuadratureNode`](@ref)s used to integrate over an
 [`AbstractMesh`](@ref).
 """
 struct Quadrature{N, T} <: AbstractVector{QuadratureNode{N, T}}
-    mesh::AbstractMesh{N, T}
+    mesh::Maybe{AbstractMesh{N, T}}
     etype2qrule::OrderedDict{DataType, ReferenceQuadrature}
     qnodes::Vector{QuadratureNode{N, T}}
     etype2qtags::OrderedDict{DataType, Matrix{Int}}
@@ -75,7 +77,10 @@ Base.getindex(quad::Quadrature, i) = quad.qnodes[i]
 Base.setindex!(quad::Quadrature, q, i) = (quad.qnodes[i] = q)
 
 qnodes(quad::Quadrature) = quad.qnodes
-mesh(quad::Quadrature) = quad.mesh
+function mesh(quad::Quadrature)
+    isnothing(quad.mesh) && error("The Quadrature has no mesh!")
+    return quad.mesh
+end
 etype2qtags(quad::Quadrature, E) = quad.etype2qtags[E]
 
 quadrature_rule(quad::Quadrature, E) = quad.etype2qrule[E]
@@ -86,7 +91,7 @@ function Base.show(io::IO, quad::Quadrature)
 end
 
 """
-    Quadrature(msh::AbstractMesh, etype2qrule::Dict)
+    Quadrature(msh::AbstractMesh, etype2qrule::OrderedDict)
     Quadrature(msh::AbstractMesh, qrule::ReferenceQuadrature)
     Quadrature(msh::AbstractMesh; qorder)
 
@@ -119,6 +124,57 @@ function Quadrature(msh::AbstractMesh{N, T}, etype2qrule::OrderedDict) where {N,
     return quad
 end
 
+# Quadrature constructor for list of volume elements for local vdim
+function Quadrature(
+        ::Type{T},
+        elementlist::AbstractVector{E},
+        etype2qrule::OrderedDict{DataType, Q},
+        qrule::Q;
+        center::SVector{N, Float64} = zero(SVector{N, Float64}),
+        scale::Float64 = 1.0,
+    ) where {N, T, E, Q}
+    # initialize mesh with empty fields
+    quad = Quadrature{N, T}(
+        nothing,
+        etype2qrule,
+        QuadratureNode{N, T}[],
+        OrderedDict{DataType, Matrix{Int}}(),
+    )
+    ori = ones(Int64, length(elementlist))
+    # loop element types and generate quadrature for each
+    _build_quadrature!(quad, elementlist, ori, qrule; center, scale)
+
+    # check for entities with negative orientation and flip normal vectors if
+    # present
+    #for ent in entities(msh)
+    #    if (sign(tag(ent)) < 0) && (N - geometric_dimension(ent) == 1)
+    #        @debug "Flipping normals of $ent"
+    #        tags = dom2qtags(quad, Domain(ent))
+    #        for i in tags
+    #            quad[i] = flip_normal(quad[i])
+    #        end
+    #    end
+    #end
+    return quad
+end
+
+# In-place refill of a local-vdim Quadrature, reusing its qnodes buffer.
+# `empty!` retains capacity; the `shrink=false` sizehint in `_build_quadrature!`
+# then grows it only at new high-water marks, so steady state allocates no qnodes.
+function build_local_quadrature!(
+        quad::Quadrature{N, T},
+        elementlist::AbstractVector{E},
+        qrule::ReferenceQuadrature;
+        center::SVector{N, Float64} = zero(SVector{N, Float64}),
+        scale::Float64 = 1.0,
+    ) where {N, T, E}
+    empty!(quad.qnodes)
+    empty!(quad.etype2qtags)
+    ori = ones(Int, length(elementlist))
+    _build_quadrature!(quad, elementlist, ori, qrule; center, scale)
+    return quad
+end
+
 function Quadrature(msh::AbstractMesh{N, T}, qrule::ReferenceQuadrature) where {N, T}
     etype2qrule = OrderedDict(E => qrule for E in element_types(msh))
     return Quadrature(msh, etype2qrule)
@@ -134,14 +190,18 @@ end
         quad::Quadrature{N, T},
         els::AbstractVector{E},
         orientation::Vector{Int},
-        qrule::ReferenceQuadrature,
-    ) where {N, T, E}
+        qrule::ReferenceQuadrature;
+        center::SVector{N, Float64} = zero(SVector{N, Float64}),
+        scale::Float64 = 1.0,
+    ) where {E, N, T}
+    x̂, ŵ = qrule() # nodes and weights on reference element
     x̂ = map(x̂ -> T.(x̂), qcoords(qrule))
     ŵ = map(ŵ -> T.(ŵ), qweights(qrule))
     num_nodes = length(ŵ)
     M = geometric_dimension(domain(E))
     codim = N - M
     istart = length(quad.qnodes) + 1
+    sizehint!(quad.qnodes, length(els) * length(x̂); shrink = false)
     @assert length(els) == length(orientation)
     for (s, el) in zip(orientation, els)
         # and all qnodes for that element
@@ -151,7 +211,7 @@ end
             μ = _integration_measure(jac)
             w = μ * ŵi
             ν = codim == 1 ? T.(s * _normal(jac)) : nothing
-            qnode = QuadratureNode(T.(x), T.(w), ν)
+            qnode = QuadratureNode(T.((x - center) / scale), T.(w / scale^M), ν)
             push!(quad.qnodes, qnode)
         end
     end
diff --git a/src/reference_integration.jl b/src/reference_integration.jl
index 2696a2e1..4c2a64a9 100644
--- a/src/reference_integration.jl
+++ b/src/reference_integration.jl
@@ -169,17 +169,20 @@ struct Gauss{D, N} <: ReferenceQuadrature{D}
     function Gauss(; domain, order)
         domain == :segment && (domain = ReferenceLine())
         domain == :triangle && (domain = ReferenceTriangle())
-        domain == :tetrahedron && (domain = ReferenceTetrahedron())
-        msg = "quadrature of order $order not available for $domain"
+        domain == :tetrehedron && (domain = ReferenceTetrahedron())
         if domain isa ReferenceLine
             # TODO: support Gauss-Legendre quadratures of arbitrary order
-            order == 13 || error(msg)
+            order == 13 || error("quadrature of order $order not available for $domain")
             n = 7
         elseif domain isa ReferenceTriangle
-            haskey(TRIANGLE_GAUSS_ORDER_TO_NPTS, order) || error(msg)
+            if !haskey(TRIANGLE_GAUSS_ORDER_TO_NPTS, order)
+                error("quadrature of order $order not available for $domain")
+            end
             n = TRIANGLE_GAUSS_ORDER_TO_NPTS[order]
         elseif domain isa ReferenceTetrahedron
-            haskey(TETRAHEDRON_GAUSS_ORDER_TO_NPTS, order) || error(msg)
+            if !haskey(TETRAHEDRON_GAUSS_ORDER_TO_NPTS, order)
+                error("quadrature of order $order not available for $domain")
+            end
             n = TETRAHEDRON_GAUSS_ORDER_TO_NPTS[order]
         else
             error(
@@ -272,12 +275,18 @@ struct VioreanuRokhlin{D, N} <: ReferenceQuadrature{D}
         domain == :triangle && (domain = ReferenceTriangle())
         domain == :tetrahedron && (domain = ReferenceTetrahedron())
         if domain isa ReferenceTriangle
-            msg = "VioreanuRokhlin quadrature of order $order not available for ReferenceTriangle"
-            haskey(TRIANGLE_VR_ORDER_TO_NPTS, order) || error(msg)
+            if !haskey(TRIANGLE_VR_ORDER_TO_NPTS, order)
+                error(
+                    "VioreanuRokhlin quadrature of order $order not available for ReferenceTriangle",
+                )
+            end
             n = TRIANGLE_VR_ORDER_TO_NPTS[order]
         elseif domain isa ReferenceTetrahedron
-            msg = "VioreanuRokhlin quadrature of order $order not available for ReferenceTetrahedron"
-            haskey(TETRAHEDRON_VR_ORDER_TO_NPTS, order) || error(msg)
+            if !haskey(TETRAHEDRON_VR_ORDER_TO_NPTS, order)
+                error(
+                    "VioreanuRokhlin quadrature of order $order not available for ReferenceTetrahedron",
+                )
+            end
             n = TETRAHEDRON_VR_ORDER_TO_NPTS[order]
         else
             error(
diff --git a/src/reference_interpolation.jl b/src/reference_interpolation.jl
index 0cd710a2..36e11e2c 100644
--- a/src/reference_interpolation.jl
+++ b/src/reference_interpolation.jl
@@ -360,6 +360,7 @@ const LagrangeCube = LagrangeElement{ReferenceCube}
 
 """
     vertices_idxs(el::LagrangeElement)
+    vertices_idxs(::Type{LagrangeElement})
 
 The indices of the nodes in `el` that define the vertices of the element.
 """
@@ -402,20 +403,25 @@ vertices(el::LagrangeElement) = view(vals(el), vertices_idxs(el))
 
 The indices of the nodes in `el` that define the boundary of the element.
 """
-function boundary_idxs(el::LagrangeLine)
-    return 1, length(vals(el))
+
+function boundary_idxs(::Type{<:LagrangeLine})
+    return 1, 2
 end
 
-function boundary_idxs(el::LagrangeTriangle)
-    I = vertices_idxs(el)
+function boundary_idxs(T::Type{<:LagrangeTriangle})
+    I = vertices_idxs(T)
     return (I[1], I[2]), (I[2], I[3]), (I[3], I[1])
 end
 
-function boundary_idxs(el::LagrangeSquare)
-    I = vertices_idxs(el)
+function boundary_idxs(T::Type{<:LagrangeSquare})
+    I = vertices_idxs(T)
     return (I[1], I[2]), (I[2], I[3]), (I[3], I[4]), (I[4], I[1])
 end
 
+function boundary_idxs(::Type{<:LagrangeTetrahedron{4}})
+    return (3, 2, 1), (1, 4, 3), (2, 3, 4), (1, 2, 4)
+end
+
 # generic ℚₖ elements for ReferenceHyperCube
 function reference_nodes(T::Type{<:LagrangeElement{ReferenceHyperCube{D}, Np}}) where {D, Np}
     n = order(T) + 1
@@ -608,3 +614,43 @@ function lagrange_basis(::Type{LagrangeElement{D, N, T}}) where {D, N, T}
     vals = svector(i -> svector(j -> i == j, N), N)
     return LagrangeElement{D}(vals)
 end
+
+function boundarynd(::Type{T}, els, msh) where {T}
+    bdi = Inti.boundary_idxs(T)
+    nedges = length(els) * length(bdi)
+    edgelist = Vector{SVector{length(bdi[1]), Int64}}(undef, nedges)
+    edgelist_unsrt = Vector{SVector{length(bdi[1]), Int64}}(undef, nedges)
+    bords = Vector{MVector{length(bdi[1]), Int64}}(undef, length(bdi))
+    for i in 1:length(bdi)
+        bords[i] = MVector{length(bdi[1]), Int64}(undef)
+    end
+    j = 1
+    for ii in els
+        for k in 1:length(bdi)
+            for jjj in 1:length(bdi[k])
+                bords[k][jjj] = Inti.connectivity(msh, T)[bdi[k][jjj], ii]
+            end
+        end
+        for q in bords
+            edgelist_unsrt[j] = q[:]
+            edgelist[j] = sort!(q)
+            j += 1
+        end
+    end
+    I = sortperm(edgelist)
+    uniqlist = Int64[]
+    sizehint!(uniqlist, length(els))
+    i = 1
+    while i <= length(edgelist) - 1
+        if isequal(edgelist[I[i]], edgelist[I[i + 1]])
+            i += 1
+        else
+            push!(uniqlist, I[i])
+        end
+        i += 1
+    end
+    if !isequal(edgelist[I[end - 1]], edgelist[I[end]])
+        push!(uniqlist, I[end])
+    end
+    return edgelist_unsrt[uniqlist]
+end
diff --git a/src/utils.jl b/src/utils.jl
index 355f8d22..0d018524 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -198,12 +198,12 @@ function _normalize_compression(compression, target, source)
 end
 
 function _normalize_correction(correction, target, source)
-    methods = (:dim, :adaptive, :none)
+    methods = (:dim, :ldim, :adaptive, :none)
     # check that method is valid
     correction.method ∈ methods ||
         error("Unknown correction.method $(correction.method). Available options: $methods")
     # set default values if absent
-    if correction.method == :dim
+    if correction.method == :dim || correction.method == :ldim
         haskey(correction, :target_location) &&
             target === source &&
             correction.target_location != :on &&
@@ -219,6 +219,10 @@ function _normalize_correction(correction, target, source)
         haskey(correction, :maxdist) ||
             target === source ||
             @warn("missing maxdist field in correction: setting to Inf")
+        if correction.method == :ldim
+            haskey(correction, :mesh) ||
+                error("missing mesh information needed for local dim")
+        end
         correction = merge(
             (maxdist = Inf, interpolation_order = nothing, center = nothing),
             correction,
diff --git a/src/vdim.jl b/src/vdim.jl
index 88243f30..dae303af 100644
--- a/src/vdim.jl
+++ b/src/vdim.jl
@@ -27,6 +27,7 @@ See [anderson2024fast](@cite) for more details on the method.
 - `shift`: a boolean indicating whether the basis functions should be shifted
   and rescaled to each element.
 """
+
 function vdim_correction(
         op,
         target,
@@ -112,7 +113,7 @@ function vdim_correction(
                 for k in 1:nq
                     push!(Is, i)
                     push!(Js, jglob[k])
-                    push!(Vs, wei[k])
+                    push!(Vs, -wei[k])
                 end
             end
         end
@@ -128,6 +129,360 @@ function vdim_correction(
     return δV
 end
 
+# function barrier for type stability purposes
+function build_vander(vals_trg, pts, PFE_p, c, r)
+    tmp = Vector{Float64}(undef, length(c))
+    for i in 1:length(pts)
+        tmp .= (pts[i].coords - c) / r
+        ElementaryPDESolutions.fast_evaluate!(view(vals_trg, :, i), tmp, PFE_p)
+    end
+    return vals_trg
+end
+
+function _scaled_operator(op::AbstractDifferentialOperator{N}, scale) where {N}
+    if op isa Helmholtz
+        return Helmholtz(; k = scale * op.k, dim = N)
+    elseif op isa Laplace
+        return op
+    else
+        error("Unsupported operator for stabilized Local VDIM")
+    end
+end
+
+function _lowfreq_operator(op::AbstractDifferentialOperator{N}) where {N}
+    if op isa Helmholtz
+        return Laplace(; dim = N)
+    elseif op isa Laplace
+        return op
+    else
+        error("Unsupported operator for stabilized Local VDIM")
+    end
+end
+
+function local_vdim_correction(
+        op,
+        ::Type{Eltype},
+        target,
+        source::Quadrature,
+        mesh::AbstractMesh,
+        bdry_nodes;
+        green_multiplier::Vector{<:Real},
+        interpolation_order = nothing,
+        quadrature_order = nothing,
+        meshsize = 1.0,
+        maxdist = Inf,
+        center = nothing,
+        shift::Val{SHIFT} = Val(false),
+        form::Symbol = :analytic,
+    ) where {SHIFT, Eltype}
+    SHIFT || error("unsupported local VDIM without shifting")
+    # `form` selects how the low-frequency local correction `δV = -(quad - exact)`
+    # forms its `quad - exact` difference (both give the same sparse δV, columns
+    # on the element's own nodes):
+    #   :contraction — build the naive patch operator by contracting the literal
+    #                  single-layer kernel entries against the basis values
+    #                  (`Rq = Gmat*Bmat`) and subtract the exact operator `R`.
+    #                  Uses the same kernel function (and `G(x,x)=0` convention)
+    #                  as the dense/FMM forward map, so no analytic self-node
+    #                  term is needed. This code path shows the math more
+    #                  cleanly, but is much slower.
+    #   :analytic    — obtain `quad - exact` directly from the rescaled-coordinate
+    #                  Laplace expansion (eqs. (3.11)/(4.1)), reusing the scaled
+    #                  single/double/volume operators already assembled by
+    #                  `_local_vdim_auxiliary_quantities`. Mirrors the
+    #                  manuscript, but must add an analytic self-node term to
+    #                  account for the `G(x,x)=0` convention of the operator
+    #                  being corrected, that depends on `H(z=0)`.
+    form in (:contraction, :analytic) ||
+        error("unknown local VDIM correction form: $form (expected :contraction or :analytic)")
+    # variables for debugging the condition properties of the method
+    vander_cond = vander_norm = rhs_norm = res_norm = shift_norm = -Inf
+    # figure out if we are dealing with a scalar or vector PDE
+    m, n = length(target), length(source)
+    N = ambient_dimension(op)
+    @assert ambient_dimension(source) == N "vdim only works for volume potentials"
+    m, n = length(target), length(source)
+    # a reasonable interpolation_order if not provided
+    isnothing(interpolation_order) &&
+        (interpolation_order = maximum(order, values(source.etype2qrule)))
+
+    # Helmholtz PDE operator in x̂ coordinates where x = scale * x̂
+    if op isa Helmholtz
+        s = meshsize
+    elseif op isa Laplace
+        s = 1.0
+    else
+        error("not implemented")
+    end
+    op_hat = _scaled_operator(op, s)
+    op_lowfreq = _lowfreq_operator(op)
+
+    dict_near = etype_to_nearest_points(target, source; maxdist)
+    bdry_kdtree = KDTree(bdry_nodes)
+    # compute sparse correction
+    Is = Int[]
+    Js = Int[]
+    Vs = Eltype[]
+    for (E, qtags) in source.etype2qtags
+        els = elements(source.mesh, E)
+        near_list = dict_near[E]
+        nq, ne = size(qtags)
+        @assert length(near_list) == ne
+        sizehint!(Is, ne * nq * nq)
+        sizehint!(Js, ne * nq * nq)
+        sizehint!(Vs, ne * nq * nq)
+        num_basis = binomial(interpolation_order + N, N)
+
+        bdry_qorder = 2 * quadrature_order
+        if N == 3
+            bdry_qrule = _qrule_for_reference_shape(ReferenceSimplex{2}(), bdry_qorder)
+            bdry_etype2qrule = OrderedDict(ReferenceSimplex{2} => bdry_qrule)
+        else
+            bdry_qrule = _qrule_for_reference_shape(ReferenceHyperCube{1}(), bdry_qorder)
+            bdry_etype2qrule = OrderedDict(ReferenceHyperCube{1} => bdry_qrule)
+        end
+        vol_qrule = VioreanuRokhlin(; domain = domain(E), order = quadrature_order)
+        vol_etype2qrule = OrderedDict(E => vol_qrule)
+
+        topo_neighs = 1
+        neighbors = topological_neighbors(mesh, topo_neighs)
+
+        # Parallelize the element loop across threads.  The work is split into
+        # one chunk per thread; each chunk owns *private* scratch that must NOT
+        # be shared between threads: a `LocalVDIMWorkspace`, the in-place local
+        # quadrature buffers (`Yvol`/`Ybdry`), and the Vandermonde scratch
+        # (`L̃`/`vals_trg`).  Each chunk accumulates into its own (Is, Js, Vs)
+        # buffers, which are concatenated into the global ones after the
+        # threaded region (so no locking is needed in the hot loop).
+        nchunks = max(1, min(Threads.nthreads(), ne))
+        chunks = collect(Iterators.partition(1:ne, cld(ne, nchunks)))
+        chunk_Is = [Int[] for _ in chunks]
+        chunk_Js = [Int[] for _ in chunks]
+        chunk_Vs = [Eltype[] for _ in chunks]
+        # Per-thread polynomial fast-evaluators.  Each carries an internal
+        # mutable `cfg` (FixedPolynomials.JacobianConfig) written on every
+        # `fast_evaluate!`, so it MUST NOT be shared between threads.  Build a
+        # private set per chunk *serially* here (the construction goes through
+        # DynamicPolynomials' `@polyvar`, which is itself not thread-safe).  The
+        # accompanying multiindices / monomials_indices are read-only data.
+        chunk_pfe_lowfreq =
+            [polynomial_solutions_local_vdim(op_lowfreq, interpolation_order + 4) for _ in chunks]
+        chunk_pfe =
+            [polynomial_solutions_local_vdim(op_hat, interpolation_order) for _ in chunks]
+        Threads.@threads for ci in eachindex(chunks)
+            # --- per-thread (per-chunk) private scratch; never shared ---
+            L̃ = Matrix{Float64}(undef, nq, num_basis)
+            vals_trg = Matrix{Float64}(undef, num_basis, nq)
+            # preallocated local quadratures, reused across this chunk's
+            # elements so the qnodes buffers are not reallocated every iteration
+            Yvol = Quadrature{N, Float64}(
+                nothing,
+                vol_etype2qrule,
+                QuadratureNode{N, Float64}[],
+                OrderedDict{DataType, Matrix{Int}}(),
+            )
+            Ybdry = Quadrature{N, Float64}(
+                nothing,
+                bdry_etype2qrule,
+                QuadratureNode{N, Float64}[],
+                OrderedDict{DataType, Matrix{Int}}(),
+            )
+            # reusable scratch for the per-element auxiliary quantities
+            ws = LocalVDIMWorkspace{Eltype, N}()
+            # this chunk's private fast-evaluators (built serially above)
+            PFE_p_lowfreq, PFE_P_lowfreq, multiindices_lowfreq, monomials_indices_lowfreq =
+                chunk_pfe_lowfreq[ci]
+            PFE_p, PFE_P, multiindices, monomials_indices = chunk_pfe[ci]
+            # chunk-local correction triplets, merged after the threaded region
+            lIs = chunk_Is[ci]
+            lJs = chunk_Js[ci]
+            lVs = chunk_Vs[ci]
+            for n in chunks[ci]
+                # indices of nodes in element `n`
+                isempty(near_list[n]) && continue
+                c, r, diam = translation_and_scaling(els[n])
+                # Low-frequency (kr small) elements use the Laplace-based expansion
+                # of Section 3.1/4; otherwise the high-frequency rescaling of
+                # Section 3.2 with s = meshsize (so that k*s = O(1) is fixed and
+                # s/r is bounded from above and below) is stable.
+
+                # Run Laplace through low-frequency path too, for stability.
+                if op isa Laplace || (op isa Helmholtz && r * op.k < 5*10^(-2))
+                    lowfreq = true
+                    Yvol, Ybdry, need_layer_corr, els_idxs = _local_vdim_construct_local_quadratures(
+                        N,
+                        mesh,
+                        neighbors,
+                        n,
+                        c,
+                        r,
+                        diam,
+                        bdry_kdtree,
+                        Yvol,
+                        Ybdry,
+                        bdry_qrule,
+                        vol_qrule,
+                    )
+                    if form === :analytic
+                        # R holds the physical (quad - exact) difference directly,
+                        # from the rescaled expansion + analytic self-node term.
+                        R = _lowfreq_vdim_cancellation_quantities(
+                            op,
+                            op_lowfreq,
+                            c,
+                            r,
+                            num_basis,
+                            PFE_p_lowfreq,
+                            PFE_P_lowfreq,
+                            multiindices,
+                            multiindices_lowfreq,
+                            monomials_indices,
+                            monomials_indices_lowfreq,
+                            target[near_list[n]],
+                            green_multiplier[near_list[n]],
+                            Yvol,
+                            Ybdry,
+                            diam,
+                            need_layer_corr,
+                            ws,
+                        )
+                    else
+                        # R holds the exact local operator; the naive quad side is
+                        # built by contraction in the assembly below.
+                        R = _lowfreq_vdim_auxiliary_quantities(
+                            op,
+                            op_lowfreq,
+                            c,
+                            r,
+                            num_basis,
+                            PFE_p_lowfreq,
+                            PFE_P_lowfreq,
+                            multiindices,
+                            multiindices_lowfreq,
+                            monomials_indices,
+                            monomials_indices_lowfreq,
+                            target[near_list[n]],
+                            green_multiplier[near_list[n]],
+                            Yvol,
+                            Ybdry,
+                            diam,
+                            need_layer_corr
+                        )
+                    end
+                else
+                    lowfreq = false
+                    Yvol, Ybdry, need_layer_corr, els_idxs = _local_vdim_construct_local_quadratures(
+                        N,
+                        mesh,
+                        neighbors,
+                        n,
+                        c,
+                        s,
+                        diam,
+                        bdry_kdtree,
+                        Yvol,
+                        Ybdry,
+                        bdry_qrule,
+                        vol_qrule,
+                    )
+                    R, b = _local_vdim_auxiliary_quantities(
+                        op_hat,
+                        c,
+                        s,
+                        PFE_p,
+                        PFE_P,
+                        target[near_list[n]],
+                        green_multiplier[near_list[n]],
+                        Yvol,
+                        Ybdry,
+                        diam,
+                        need_layer_corr,
+                        ws,
+                    )
+                end
+                jglob = @view qtags[:, n]
+                L̃ .= transpose(build_vander(vals_trg, view(source, jglob), PFE_p, c, r))
+                Linv = pinv(L̃)
+                if lowfreq
+                    # eq. (2.10): δV = -(quad - exact) applied through the
+                    # interpolant of element `n`, with columns on the element's own
+                    # nodes only (and including the V_N[f - fₙ] remainder). How the
+                    # `quad - exact` difference `Δ` is formed depends on `form`.
+                    if form === :analytic
+                        # R already holds the physical (quad - exact) directly.
+                        Δ = R
+                    else
+                        # `R` holds the exact local operator V_exact,N[p_β]; the quad
+                        # side is obtained by contracting the naive operator entries
+                        # against the basis values, using the same kernel function as
+                        # the dense assembly (and consistent with the FMM), including
+                        # its zero convention at coincident points — so it matches the
+                        # operator being corrected by construction.
+                        #
+                        # The naive-quadrature contraction is the matrix product
+                        # Rq = Gmat * Bmat, with
+                        #   Gmat[ii, j] = G(xᵢ, yⱼ)   (near target × patch node)
+                        #   Bmat[j,  β] = wⱼ pβ(yⱼ)   (patch node × monomial)
+                        Gker = SingleLayerKernel(op)
+                        ntarg = length(near_list[n])
+                        nsrc = nq * length(els_idxs)
+                        Gmat = Matrix{eltype(R)}(undef, ntarg, nsrc)
+                        Bmat = Matrix{Float64}(undef, nsrc, num_basis)
+                        pvals = Vector{Float64}(undef, num_basis)
+                        ytmp = Vector{Float64}(undef, N)
+                        jcol = 0
+                        for el_idx in els_idxs
+                            for j in @view qtags[:, el_idx]
+                                jcol += 1
+                                yq = source[j]
+                                ytmp .= (coords(yq) - c) / r
+                                ElementaryPDESolutions.fast_evaluate!(pvals, ytmp, PFE_p)
+                                w = yq.weight
+                                @inbounds for β in 1:num_basis
+                                    Bmat[jcol, β] = w * pvals[β]
+                                end
+                                @inbounds for (ii, i) in enumerate(near_list[n])
+                                    Gmat[ii, jcol] = Gker(target[i], yq)
+                                end
+                            end
+                        end
+                        Δ = Gmat * Bmat - R
+                    end
+                    wei = transpose(Linv) * transpose(Δ)
+                    append!(lIs, repeat(near_list[n]; inner = nq))
+                    append!(lJs, repeat(jglob; outer = length(near_list[n])))
+                    append!(lVs, -wei)
+                else
+                    S = ws.Sdiagvec
+                    resize!(S, length(multiindices))
+                    S .= s^N * (s / r) .^ (abs.(multiindices))
+                    R .*= transpose(S)
+                    wei = transpose(Linv) * transpose(R)
+                    # δV = -(quad - exact)
+                    append!(lIs, repeat(near_list[n]; inner = nq))
+                    append!(lJs, repeat(jglob; outer = length(near_list[n])))
+                    append!(lVs, -wei)
+                end
+            end
+        end
+        # merge per-chunk correction triplets into the global buffers
+        for ci in eachindex(chunks)
+            append!(Is, chunk_Is[ci])
+            append!(Js, chunk_Js[ci])
+            append!(Vs, chunk_Vs[ci])
+        end
+    end
+    @debug """Condition properties of vdim correction:
+    |-- max interp. matrix condition: $vander_cond
+    |-- max norm of source term:      $rhs_norm
+    |-- max residual error:           $res_norm
+    |-- max interp. matrix norm :     $vander_norm
+    |-- max shift norm :              $shift_norm
+    """
+    δV = sparse(Is, Js, Vs, m, n)
+    return δV
+end
+
 function change_of_basis(multiindices, p, c, r)
     nbasis = length(multiindices)
     P = zeros(nbasis, nbasis)
@@ -151,6 +506,7 @@ function translation_and_scaling(el::LagrangeTriangle)
     l1 = norm(vertices[1] - vertices[2])
     l2 = norm(vertices[2] - vertices[3])
     l3 = norm(vertices[3] - vertices[1])
+    diam = max(l1, l2, l3)
     if ((l1^2 + l2^2 >= l3^2) && (l2^2 + l3^2 >= l1^2) && (l3^2 + l1^2 > l2^2))
         acuteright = true
     else
@@ -163,7 +519,7 @@ function translation_and_scaling(el::LagrangeTriangle)
         Cp = vertices[3] - vertices[1]
         Dp = 2 * (Bp[1] * Cp[2] - Bp[2] * Cp[1])
         Upx = 1 / Dp * (Cp[2] * (Bp[1]^2 + Bp[2]^2) - Bp[2] * (Cp[1]^2 + Cp[2]^2))
-        Upy = 1 / Dp * (Bp[1] * (Cp[1]^2 + Cp[2]^2) - Cp[2] * (Bp[1]^2 + Bp[2]^2))
+        Upy = 1 / Dp * (Bp[1] * (Cp[1]^2 + Cp[2]^2) - Cp[1] * (Bp[1]^2 + Bp[2]^2))
         Up = SVector{2}(Upx, Upy)
         r = norm(Up)
         c = Up + vertices[1]
@@ -179,7 +535,7 @@ function translation_and_scaling(el::LagrangeTriangle)
             r = l3 / 2
         end
     end
-    return c, r
+    return c, r, diam
 end
 
 function translation_and_scaling(el::ParametricElement{ReferenceSimplex{3}})
@@ -207,6 +563,7 @@ function translation_and_scaling(el::LagrangeTetrahedron)
     d = norm(vertices[3] - vertices[2])
     e = norm(vertices[3] - vertices[1])
     f = norm(vertices[2] - vertices[1])
+    diam = max(a, b, c, d, e, f)
     f² = f^2
     a² = a^2
     b² = b^2
@@ -254,7 +611,683 @@ function translation_and_scaling(el::LagrangeTetrahedron)
             R = f / 2
         end
     end
-    return center, R
+    return center, R, diam
+end
+
+# function barrier for type stability purposes
+_newbord_line(vtxs) = LagrangeLine(SVector{2}(vtxs))
+
+# function barrier for type stability purposes
+_newbord_tri(vtxs) = LagrangeElement{ReferenceSimplex{2}}(SVector{3}(vtxs))
+
+function _local_vdim_construct_local_quadratures(
+        N,
+        mesh,
+        neighbors,
+        el,
+        center,
+        scale,
+        diam,
+        bdry_kdtree,
+        Yvol,
+        Ybdry,
+        bdry_qrule,
+        vol_qrule
+    )
+    # construct the local region
+    Etype = first(element_types(mesh))
+    el_neighs = neighbors[(Etype, el)]
+
+    T = first(el_neighs)[1]
+    els_idxs = [i[2] for i in collect(el_neighs)]
+    els_list = mesh.etype2els[Etype][els_idxs]
+
+    loc_bdry = boundarynd(T, els_idxs, mesh)
+    # TODO handle curved boundary of Γ??
+    if N == 2
+        bords = LagrangeElement{ReferenceHyperCube{N - 1}, 2, SVector{N, Float64}}[]
+    else
+        bords = LagrangeElement{ReferenceSimplex{N - 1}, 3, SVector{N, Float64}}[]
+    end
+
+    for idxs in loc_bdry
+        vtxs = nodes(mesh)[idxs]
+        if N == 2
+            bord = _newbord_line(vtxs)
+        else
+            bord = _newbord_tri(vtxs)
+        end
+        push!(bords, bord)
+    end
+
+    # Check if we need to do near-singular layer potential evaluation
+    vertices = mesh.etype2els[Etype][el].vals[vertices_idxs(Etype)]
+    need_layer_corr = sum(inrangecount(bdry_kdtree, vertices, diam / 2)) > 0
+
+    # Now begin working in x̂ coordinates where x = scale * x̂
+
+    # build O(h) volume neighbors, reusing the preallocated quadrature buffers
+    build_local_quadrature!(Yvol, els_list, vol_qrule; center, scale)
+    build_local_quadrature!(Ybdry, bords, bdry_qrule; center, scale)
+
+    return Yvol, Ybdry, need_layer_corr, els_idxs
+end
+
+# Reusable scratch buffers for LVDIM.  Each field is a flat Vector whose
+# capacity is retained across calls (growing via `resize!` only at new
+# high-water marks), so after the first few elements no per-element allocation
+# of these buffers occurs. `T` is the kernel/matrix eltype.
+struct LocalVDIMWorkspace{T, N}
+    Xshift::Vector{SVector{N, Float64}}
+    Smat::Vector{T}
+    Dmat::Vector{T}
+    Vmat::Vector{T}
+    Θ::Vector{T}
+    b::Vector{Float64}
+    γ₀B::Vector{Float64}
+    γ₁B::Vector{Float64}
+    P::Vector{Float64}
+    grad::Vector{Float64}
+    gm::Vector{Float64}
+    Sdiagvec::Vector{Float64}
+end
+
+function LocalVDIMWorkspace{T, N}() where {T, N}
+    return LocalVDIMWorkspace{T, N}(
+        SVector{N, Float64}[],
+        T[], T[], T[], T[],
+        Float64[], Float64[], Float64[], Float64[], Float64[],
+        Float64[], Float64[],
+    )
+end
+
+# Return a `dims`-shaped, BLAS-strided view backed by `buf`, growing `buf` only
+# when a larger block is needed. `reshape(view(buf, 1:len), dims)` is a packed
+# `StridedArray`, so the result stays on the BLAS path in `mul!`.
+@inline function _ws_reshape(buf::Vector, dims::Dims)
+    len = prod(dims)
+    length(buf) < len && resize!(buf, len)
+    return reshape(view(buf, 1:len), dims)
+end
+_ws_reshape(buf::Vector, dims::Integer...) = _ws_reshape(buf, dims)
+
+function _local_vdim_auxiliary_quantities(
+        op::AbstractDifferentialOperator{N},
+        center,
+        scale,
+        PFE_p,
+        PFE_P,
+        X,
+        μ,
+        Yvol,
+        Ybdry,
+        diam,
+        need_layer_corr,
+        ws::LocalVDIMWorkspace{T, N},
+    ) where {T, N}
+    # TODO handle derivative case
+    G = SingleLayerKernel(op)
+    dG = DoubleLayerKernel(op)
+    num_basis = length(PFE_P)
+    num_targets = length(X)
+    nb = length(Ybdry)
+    nv = length(Yvol)
+
+    Xshift = ws.Xshift
+    resize!(Xshift, num_targets)
+    @inbounds for i in 1:num_targets
+        Xshift[i] = (coords(X[i]) - center) / scale
+    end
+    Sop = IntegralOperator(G, Xshift, Ybdry)
+    Dop = IntegralOperator(dG, Xshift, Ybdry)
+    Vop = IntegralOperator(G, Xshift, Yvol)
+    Smat = _ws_reshape(ws.Smat, num_targets, nb)
+    Dmat = _ws_reshape(ws.Dmat, num_targets, nb)
+    Vmat = _ws_reshape(ws.Vmat, num_targets, nv)
+    assemble_matrix!(Smat, Sop; threads = false)
+    assemble_matrix!(Dmat, Dop; threads = false)
+    assemble_matrix!(Vmat, Vop; threads = false)
+    Smap = LinearMap(Smat)
+    Dmap = LinearMap(Dmat)
+    if need_layer_corr
+        green_multiplier = resize!(ws.gm, num_targets)
+        copyto!(green_multiplier, μ)
+        δS, δD = bdim_correction(
+            op,
+            Xshift,
+            Ybdry,
+            Smat,
+            Dmat;
+            green_multiplier,
+            maxdist = diam / scale,
+            derivative = false,
+        )
+
+        #Smat += δS
+        Smap += LinearMap(δS)
+        #Dmat += δD
+        Dmap += LinearMap(δD)
+    end
+
+    b = _ws_reshape(ws.b, nv, num_basis)
+    γ₁B = _ws_reshape(ws.γ₁B, nb, num_basis)
+    γ₀B = _ws_reshape(ws.γ₀B, nb, num_basis)
+    P = _ws_reshape(ws.P, num_targets, num_basis)
+    grad = _ws_reshape(ws.grad, num_basis, N, nb)
+
+    for i in 1:nv
+        ElementaryPDESolutions.fast_evaluate!(view(b, i, :), Yvol[i].coords, PFE_p)
+    end
+    for i in 1:num_targets
+        ElementaryPDESolutions.fast_evaluate!(view(P, i, :), Xshift[i], PFE_P)
+    end
+    for i in 1:nb
+        ElementaryPDESolutions.fast_evaluate_with_jacobian!(
+            view(γ₀B, i, :),
+            view(grad, :, :, i),
+            Ybdry[i].coords,
+            PFE_P,
+        )
+    end
+    for i in 1:nb
+        for j in 1:num_basis
+            γ₁B[i, j] = 0
+            for k in 1:N
+                γ₁B[i, j] += grad[j, k, i] * Ybdry[i].normal[k] #nrml_bdry_vec[i][k]
+            end
+        end
+    end
+
+    Θ = _ws_reshape(ws.Θ, num_targets, num_basis)
+    fill!(Θ, zero(T))
+    # Compute Θ <-- S * γ₁B - D * γ₀B + V * b + σ * B(x) using in-place matvec
+    for n in 1:num_basis
+        @views mul!(Θ[:, n], Smap, γ₁B[:, n])
+        @views mul!(Θ[:, n], Dmap, γ₀B[:, n], -1, 1)
+        @views mul!(Θ[:, n], Vmat, b[:, n], 1, 1)
+        for i in 1:num_targets
+            Θ[i, n] += μ[i] * P[i, n]
+        end
+    end
+    if op isa Helmholtz && N == 3
+        Θ .*= 1 / scale
+    end
+    return Θ, b
+end
+
+"""
+    _local_vdim_exact_quantities(op, center, scale, PFE_p, PFE_P, X, μ, Yvol, Ybdry, diam, need_layer_corr)
+
+Exact local volume potentials on the rescaled patch: return `(E, b)` where
+`E[i, n] = Ṽ_exact[pₙ](x̃ᵢ)` is the exact volume potential of the monomial `pₙ`
+over the patch, evaluated via Green's identity (no naive patch quadrature is
+involved), and `b` holds the monomial values at the patch quadrature nodes.
+"""
+function _local_vdim_exact_quantities(
+        op::AbstractDifferentialOperator{N},
+        center,
+        scale,
+        PFE_p,
+        PFE_P,
+        X,
+        μ,
+        Yvol,
+        Ybdry,
+        diam,
+        need_layer_corr
+    ) where {N}
+    G = SingleLayerKernel(op)
+    dG = DoubleLayerKernel(op)
+    Xshift = [(coords(q) - center) / scale for q in X]
+    Sop = IntegralOperator(G, Xshift, Ybdry)
+    Dop = IntegralOperator(dG, Xshift, Ybdry)
+    Smat = assemble_matrix(Sop)
+    Dmat = assemble_matrix(Dop)
+    if need_layer_corr
+        green_multiplier = collect(Float64, μ)
+        δS, δD = bdim_correction(
+            op,
+            Xshift,
+            Ybdry,
+            Smat,
+            Dmat;
+            green_multiplier,
+            maxdist = diam / scale,
+            derivative = false,
+        )
+        Smat += δS
+        Dmat += δD
+    end
+
+    num_basis = length(PFE_P)
+    num_targets = length(X)
+    b = Matrix{Float64}(undef, length(Yvol), num_basis)
+    γ₁B = Matrix{Float64}(undef, length(Ybdry), num_basis)
+    γ₀B = Matrix{Float64}(undef, length(Ybdry), num_basis)
+    P = Matrix{Float64}(undef, length(X), num_basis)
+    grad = Array{Float64}(undef, num_basis, N, length(Ybdry))
+
+    for i in 1:length(Yvol)
+        ElementaryPDESolutions.fast_evaluate!(view(b, i, :), Yvol[i].coords, PFE_p)
+    end
+    for i in 1:length(X)
+        ElementaryPDESolutions.fast_evaluate!(view(P, i, :), Xshift[i], PFE_P)
+    end
+    for i in 1:length(Ybdry)
+        ElementaryPDESolutions.fast_evaluate_with_jacobian!(
+            view(γ₀B, i, :),
+            view(grad, :, :, i),
+            Ybdry[i].coords,
+            PFE_P,
+        )
+    end
+    for i in 1:length(Ybdry)
+        for j in 1:num_basis
+            γ₁B[i, j] = 0
+            for k in 1:N
+                γ₁B[i, j] += grad[j, k, i] * Ybdry[i].normal[k]
+            end
+        end
+    end
+
+    # Green's identity, valid for any target location relative to the patch:
+    # Ṽ_exact[pₙ](x̃) = -S[γ₁Pₙ](x̃) + D[γ₀Pₙ](x̃) - μ(x̃) Pₙ(x̃)
+    # (μ = -1 inside the patch, 0 outside, -1/2 on its smooth boundary)
+    E = zeros(eltype(Sop), num_targets, num_basis)
+    for n in 1:num_basis
+        @views mul!(E[:, n], Smat, γ₁B[:, n], -1, 0)
+        @views mul!(E[:, n], Dmat, γ₀B[:, n], 1, 1)
+        for i in 1:num_targets
+            E[i, n] -= μ[i] * P[i, n]
+        end
+    end
+    return E, b
+end
+
+function _lowfreq_vdim_auxiliary_quantities(
+        op::Laplace{2},
+        op_lowfreq::Laplace{2},
+        center,
+        scale,
+        num_basis,
+        PFE_p_lowfreq,
+        PFE_P_lowfreq,
+        multiindices,
+        multiindices_lowfreq,
+        monomials_indices,
+        monomials_indices_lowfreq,
+        X,
+        μ,
+        Yvol,
+        Ybdry,
+        diam,
+        need_layer_corr
+    )
+    E, b = _local_vdim_exact_quantities(
+        op_lowfreq,
+        center,
+        scale,
+        PFE_p_lowfreq,
+        PFE_P_lowfreq,
+        X,
+        μ,
+        Yvol,
+        Ybdry,
+        diam,
+        need_layer_corr
+    )
+    num_targets = length(X)
+
+    # Exact local operator via eq. (4.1):
+    # V_N[p_β](x) = scale² (Ṽ_exact[p̃_β](x̃) - log(scale)/(2π) ∫_Ñ p̃_β dỹ),
+    # with ∫_Ñ p̃_β evaluated by the local quadrature rule.
+    R = Matrix{eltype(E)}(undef, num_targets, num_basis)
+    for n in 1:num_basis
+        β = multiindices[n]
+        col = monomials_indices_lowfreq[β]
+        intp = sum(Yvol[j].weight * b[j, col] for j in 1:length(Yvol))
+        for i in 1:num_targets
+            R[i, n] = scale^2 * (E[i, col] - 1 / (2π) * log(scale) * intp)
+        end
+    end
+    return R
+end
+
+function _lowfreq_vdim_auxiliary_quantities(
+        op::Helmholtz{2},
+        op_lowfreq::Laplace{2},
+        center,
+        scale,
+        num_basis,
+        PFE_p_lowfreq,
+        PFE_P_lowfreq,
+        multiindices,
+        multiindices_lowfreq,
+        monomials_indices,
+        monomials_indices_lowfreq,
+        X,
+        μ,
+        Yvol,
+        Ybdry,
+        diam,
+        need_layer_corr
+    )
+    Xshift = [(coords(q) - center) / scale for q in X]
+    # Exact Laplace potentials on the rescaled patch
+    E, b = _local_vdim_exact_quantities(
+        op_lowfreq,
+        center,
+        scale,
+        PFE_p_lowfreq,
+        PFE_P_lowfreq,
+        X,
+        μ,
+        Yvol,
+        Ybdry,
+        diam,
+        need_layer_corr,
+    )
+
+    num_targets = length(X)
+    R = zeros(ComplexF64, num_targets, num_basis)
+    kr2 = (op.k * scale)^2
+    γ = 0.5772156649015328606
+
+    # Smooth part H of the kernel split G_kr = -1/(2π) log|x̃-ỹ| J₀(z) + H(z),
+    # z² = (k*scale)²|x̃-ỹ|² (cf. eqs. (3.4)-(3.6)), with both series truncated
+    # at O(z⁶); the entries carry the local quadrature weights so that Hmat*b
+    # approximates ∫_Ñ H(x̃,ỹ) p̃_β(ỹ) dỹ.
+    Hmat = Matrix{ComplexF64}(undef, num_targets, length(Yvol))
+    for i in 1:num_targets
+        for j in 1:length(Yvol)
+            z2 =
+                kr2 * (
+                (Xshift[i][1] - Yvol[j].coords[1])^2 +
+                    (Xshift[i][2] - Yvol[j].coords[2])^2
+            )
+            bessj0 = 0.0
+            for m in 0:3
+                bessj0 += (-1)^m * (z2 / 4)^m / factorial(m)^2
+            end
+            ytail =
+                z2 / 4 - (1 + 1 / 2) * (z2 / 4)^2 / 4 +
+                (1 + 1 / 2 + 1 / 3) * (z2 / 4)^3 / 36
+            Hmat[i, j] =
+                ((im / 4 - 1 / (2π) * (γ + 1 / 2 * log(kr2 / 4))) * bessj0 - 1 / (2π) * ytail) *
+                Yvol[j].weight
+        end
+    end
+
+    # Exact local operator via eqs. (3.11)-(3.12): the log-kernel part is the
+    # P_J⁽¹⁾ combination of exact Laplace potentials of nearby monomials, the
+    # smooth part is ∫_Ñ H p̃_β computed with the local quadrature.
+    for n in 1:num_basis
+        beta = multiindices[n]
+        beta10 = beta + MultiIndex((1, 0))
+        beta01 = beta + MultiIndex((0, 1))
+        beta20 = beta + MultiIndex((2, 0))
+        beta02 = beta + MultiIndex((0, 2))
+        # N=2 terms
+        beta11 = beta + MultiIndex((1, 1))
+        beta30 = beta + MultiIndex((3, 0))
+        beta21 = beta + MultiIndex((2, 1))
+        beta12 = beta + MultiIndex((1, 2))
+        beta03 = beta + MultiIndex((0, 3))
+        beta40 = beta + MultiIndex((4, 0))
+        beta22 = beta + MultiIndex((2, 2))
+        beta04 = beta + MultiIndex((0, 4))
+        for j in 1:num_targets
+            x1t = Xshift[j][1]
+            x2t = Xshift[j][2]
+            R[j, n] =
+                scale^2 * (
+                (1 - 1 / 4 * kr2 * (x1t^2 + x2t^2)) * E[j, monomials_indices_lowfreq[beta]] +
+                    1 / 2 * kr2 * x1t * factorial(beta10) / factorial(beta) * E[j, monomials_indices_lowfreq[beta10]] +
+                    1 / 2 * kr2 * x2t * factorial(beta01) / factorial(beta) * E[j, monomials_indices_lowfreq[beta01]] -
+                    1 / 4 * kr2 * factorial(beta20) / factorial(beta) * E[j, monomials_indices_lowfreq[beta20]] -
+                    1 / 4 * kr2 * factorial(beta02) / factorial(beta) * E[j, monomials_indices_lowfreq[beta02]]
+            )
+            R[j, n] += scale^2 * kr2^2 / 64.0 * (
+                (x1t^2 + x2t^2)^2 * E[j, monomials_indices_lowfreq[beta]] +
+                -4*x1t*(x1t^2 + x2t^2)*factorial(beta10)/factorial(beta) * E[j, monomials_indices_lowfreq[beta10]] +
+                -4*x2t*(x1t^2 + x2t^2)*factorial(beta01)/factorial(beta) * E[j, monomials_indices_lowfreq[beta01]] +
+                (6*x1t^2 + 2*x2t^2)*factorial(beta20)/factorial(beta) * E[j, monomials_indices_lowfreq[beta20]] +
+                8*x1t*x2t * factorial(beta11)/factorial(beta) * E[j, monomials_indices_lowfreq[beta11]] +
+                (2*x1t^2 + 6*x2t^2)*factorial(beta02)/factorial(beta) * E[j, monomials_indices_lowfreq[beta02]] +
+                -4*x1t*factorial(beta30)/factorial(beta) * E[j, monomials_indices_lowfreq[beta30]] +
+                -4*x2t*factorial(beta21)/factorial(beta) * E[j, monomials_indices_lowfreq[beta21]] +
+                -4*x1t*factorial(beta12)/factorial(beta) * E[j, monomials_indices_lowfreq[beta12]] +
+                -4*x2t*factorial(beta03)/factorial(beta) * E[j, monomials_indices_lowfreq[beta03]] +
+                factorial(beta40)/factorial(beta) * E[j, monomials_indices_lowfreq[beta40]] +
+                2*factorial(beta22)/factorial(beta) * E[j, monomials_indices_lowfreq[beta22]] +
+                factorial(beta04)/factorial(beta) * E[j, monomials_indices_lowfreq[beta04]]
+            )
+        end
+        @views R[:, n] .+= scale^2 .* (Hmat * b[:, monomials_indices_lowfreq[beta]])
+    end
+    return R
+end
+
+"""
+    _lowfreq_vdim_cancellation_quantities(op, op_lowfreq, center, scale, num_basis, ...)
+
+Return the physical *(naive quad - exact)* local volume operator `R[i, β] =
+V_quad,N[p_β](xᵢ) - V_exact,N[p_β](xᵢ)` for the low-frequency path, formed
+analytically from the rescaled-coordinate Laplace expansion rather than by an
+explicit kernel contraction (see [`local_vdim_correction`](@ref) `form` kwarg).
+
+The key reuse is that `_local_vdim_auxiliary_quantities` already returns
+`Θ = V_scaled·b - E_scaled`, i.e. the scaled (naive quad - exact) Laplace
+potentials of the padded monomial basis. The log(scale)/(2π)∫p̃ term of eq.
+(4.1) (Laplace) and the smooth ∫H p̃ term of eq. (3.12) (Helmholtz) cancel
+between quad and exact, leaving only an analytic self-node term that accounts
+for the `G(x,x)=0` convention of the operator being corrected.
+"""
+function _lowfreq_vdim_cancellation_quantities(
+        op::Laplace{2},
+        op_lowfreq::Laplace{2},
+        center,
+        scale,
+        num_basis,
+        PFE_p_lowfreq,
+        PFE_P_lowfreq,
+        multiindices,
+        multiindices_lowfreq,
+        monomials_indices,
+        monomials_indices_lowfreq,
+        X,
+        μ,
+        Yvol,
+        Ybdry,
+        diam,
+        need_layer_corr,
+        ws::LocalVDIMWorkspace,
+    )
+    Θ, b = _local_vdim_auxiliary_quantities(
+        op_lowfreq,
+        center,
+        scale,
+        PFE_p_lowfreq,
+        PFE_P_lowfreq,
+        X,
+        μ,
+        Yvol,
+        Ybdry,
+        diam,
+        need_layer_corr,
+        ws,
+    )
+    num_targets = length(X)
+    # quad - exact = scale² Θ; the log(scale)/(2π)∫p̃ term cancels.
+    R = Matrix{eltype(Θ)}(undef, num_targets, num_basis)
+    for n in 1:num_basis
+        col = monomials_indices_lowfreq[multiindices[n]]
+        for i in 1:num_targets
+            R[i, n] = scale^2 * Θ[i, col]
+        end
+    end
+    # Self-node term: the physical naive operator uses G(x,x) = 0, omitting the
+    # -log(scale)/(2π) shift the rescaling applies at the coincident node.
+    Xshift = [(coords(q) - center) / scale for q in X]
+    for i in 1:num_targets
+        for j in 1:length(Yvol)
+            if norm(Xshift[i] - Yvol[j].coords) ≤ SAME_POINT_TOLERANCE
+                coef = 1 / (2π) * log(scale) * Yvol[j].weight * scale^2
+                for n in 1:num_basis
+                    R[i, n] += coef * b[j, monomials_indices_lowfreq[multiindices[n]]]
+                end
+            end
+        end
+    end
+    return R
+end
+
+function _lowfreq_vdim_cancellation_quantities(
+        op::Laplace{3},
+        op_lowfreq::Laplace{3},
+        center,
+        scale,
+        num_basis,
+        PFE_p_lowfreq,
+        PFE_P_lowfreq,
+        multiindices,
+        multiindices_lowfreq,
+        monomials_indices,
+        monomials_indices_lowfreq,
+        X,
+        μ,
+        Yvol,
+        Ybdry,
+        diam,
+        need_layer_corr,
+        ws::LocalVDIMWorkspace,
+    )
+    Θ, b = _local_vdim_auxiliary_quantities(
+        op_lowfreq,
+        center,
+        scale,
+        PFE_p_lowfreq,
+        PFE_P_lowfreq,
+        X,
+        μ,
+        Yvol,
+        Ybdry,
+        diam,
+        need_layer_corr,
+        ws,
+    )
+    num_targets = length(X)
+    # quad - exact = scale² Θ;
+    R = Matrix{eltype(Θ)}(undef, num_targets, num_basis)
+    for n in 1:num_basis
+        col = monomials_indices_lowfreq[multiindices[n]]
+        for i in 1:num_targets
+            R[i, n] = scale^2 * Θ[i, col]
+        end
+    end
+    return R
+end
+
+function _lowfreq_vdim_cancellation_quantities(
+        op::Helmholtz{2},
+        op_lowfreq::Laplace{2},
+        center,
+        scale,
+        num_basis,
+        PFE_p_lowfreq,
+        PFE_P_lowfreq,
+        multiindices,
+        multiindices_lowfreq,
+        monomials_indices,
+        monomials_indices_lowfreq,
+        X,
+        μ,
+        Yvol,
+        Ybdry,
+        diam,
+        need_layer_corr,
+        ws::LocalVDIMWorkspace,
+    )
+    Θ, b = _local_vdim_auxiliary_quantities(
+        op_lowfreq,
+        center,
+        scale,
+        PFE_p_lowfreq,
+        PFE_P_lowfreq,
+        X,
+        μ,
+        Yvol,
+        Ybdry,
+        diam,
+        need_layer_corr,
+        ws,
+    )
+    Xshift = [(coords(q) - center) / scale for q in X]
+    num_targets = length(X)
+    R = zeros(ComplexF64, num_targets, num_basis)
+    kr2 = (op.k * scale)^2
+    γ = 0.5772156649015328606
+
+    # quad - exact: the P_J⁽¹⁾ combination (eq. (3.11)) of the Laplace
+    # (quad - exact) Θ of nearby monomials. The smooth ∫H p̃ part cancels
+    # between quad and exact except at the self node, handled below.
+    for n in 1:num_basis
+        beta = multiindices[n]
+        beta10 = beta + MultiIndex((1, 0))
+        beta01 = beta + MultiIndex((0, 1))
+        beta20 = beta + MultiIndex((2, 0))
+        beta02 = beta + MultiIndex((0, 2))
+        # N=2 terms
+        beta11 = beta + MultiIndex((1, 1))
+        beta30 = beta + MultiIndex((3, 0))
+        beta21 = beta + MultiIndex((2, 1))
+        beta12 = beta + MultiIndex((1, 2))
+        beta03 = beta + MultiIndex((0, 3))
+        beta40 = beta + MultiIndex((4, 0))
+        beta22 = beta + MultiIndex((2, 2))
+        beta04 = beta + MultiIndex((0, 4))
+        for j in 1:num_targets
+            x1t = Xshift[j][1]
+            x2t = Xshift[j][2]
+            R[j, n] =
+                scale^2 * (
+                (1 - 1 / 4 * kr2 * (x1t^2 + x2t^2)) * Θ[j, monomials_indices_lowfreq[beta]] +
+                    1 / 2 * kr2 * x1t * factorial(beta10) / factorial(beta) * Θ[j, monomials_indices_lowfreq[beta10]] +
+                    1 / 2 * kr2 * x2t * factorial(beta01) / factorial(beta) * Θ[j, monomials_indices_lowfreq[beta01]] -
+                    1 / 4 * kr2 * factorial(beta20) / factorial(beta) * Θ[j, monomials_indices_lowfreq[beta20]] -
+                    1 / 4 * kr2 * factorial(beta02) / factorial(beta) * Θ[j, monomials_indices_lowfreq[beta02]]
+            )
+            R[j, n] += scale^2 * kr2^2 / 64.0 * (
+                (x1t^2 + x2t^2)^2 * Θ[j, monomials_indices_lowfreq[beta]] +
+                -4*x1t*(x1t^2 + x2t^2)*factorial(beta10)/factorial(beta) * Θ[j, monomials_indices_lowfreq[beta10]] +
+                -4*x2t*(x1t^2 + x2t^2)*factorial(beta01)/factorial(beta) * Θ[j, monomials_indices_lowfreq[beta01]] +
+                (6*x1t^2 + 2*x2t^2)*factorial(beta20)/factorial(beta) * Θ[j, monomials_indices_lowfreq[beta20]] +
+                8*x1t*x2t * factorial(beta11)/factorial(beta) * Θ[j, monomials_indices_lowfreq[beta11]] +
+                (2*x1t^2 + 6*x2t^2)*factorial(beta02)/factorial(beta) * Θ[j, monomials_indices_lowfreq[beta02]] +
+                -4*x1t*factorial(beta30)/factorial(beta) * Θ[j, monomials_indices_lowfreq[beta30]] +
+                -4*x2t*factorial(beta21)/factorial(beta) * Θ[j, monomials_indices_lowfreq[beta21]] +
+                -4*x1t*factorial(beta12)/factorial(beta) * Θ[j, monomials_indices_lowfreq[beta12]] +
+                -4*x2t*factorial(beta03)/factorial(beta) * Θ[j, monomials_indices_lowfreq[beta03]] +
+                factorial(beta40)/factorial(beta) * Θ[j, monomials_indices_lowfreq[beta40]] +
+                2*factorial(beta22)/factorial(beta) * Θ[j, monomials_indices_lowfreq[beta22]] +
+                factorial(beta04)/factorial(beta) * Θ[j, monomials_indices_lowfreq[beta04]]
+            )
+        end
+    end
+    # Self-node term: physical naive drops H(0) = i/4 - (γ + log(kr/2))/(2π)
+    # at the coincident node (G_k(x,x) = 0).
+    H0 = im / 4 - 1 / (2π) * (γ + 1 / 2 * log(kr2 / 4))
+    for i in 1:num_targets
+        for j in 1:length(Yvol)
+            if norm(Xshift[i] - Yvol[j].coords) ≤ SAME_POINT_TOLERANCE
+                w = Yvol[j].weight * scale^2
+                for n in 1:num_basis
+                    R[i, n] -= H0 * w * b[j, monomials_indices_lowfreq[multiindices[n]]]
+                end
+            end
+        end
+    end
+    return R
 end
 
 function _vdim_auxiliary_quantities(
@@ -275,11 +1308,11 @@ function _vdim_auxiliary_quantities(
     γ₀B = [f(q) for q in Γ, f in P]
     γ₁B = [f(q) for q in Γ, f in γ₁P]
     Θ = zeros(eltype(Vop), num_targets, num_basis)
-    # Compute Θ <-- S * γ₁B - D * γ₀B - V * b + σ * B(x) using in-place matvec
+    # Compute Θ <-- S * γ₁B - D * γ₀B + V * b + σ * B(x) using in-place matvec
     for n in 1:num_basis
         @views mul!(Θ[:, n], Sop, γ₁B[:, n])
         @views mul!(Θ[:, n], Dop, γ₀B[:, n], -1, 1)
-        @views mul!(Θ[:, n], Vop, b[:, n], -1, 1)
+        @views mul!(Θ[:, n], Vop, b[:, n], 1, 1)
         for i in 1:num_targets
             Θ[i, n] += μ[i] * P[n](X[i])
         end
@@ -308,6 +1341,41 @@ function vdim_mesh_center(msh::AbstractMesh)
     end
     return xc / M
 end
+"""
+    polynomial_solutions_local_vdim(op, order)
+
+For every monomial term `pₙ` of degree `order`, compute a polynomial `Pₙ` such
+that `ℒ[Pₙ] = pₙ`, where `ℒ` is the differential operator `op`.
+This function returns `{pₙ,Pₙ,γ₁Pₙ}`, where `γ₁Pₙ` is the generalized Neumann
+trace of `Pₙ`.
+"""
+function polynomial_solutions_local_vdim(op::AbstractDifferentialOperator, order::Integer)
+    N = ambient_dimension(op)
+    # create empty arrays to store the monomials, solutions, and traces. For the
+    # neumann trace, we try to infer the concrete return type instead of simply
+    # having a vector of `Function`.
+    monomials = Vector{ElementaryPDESolutions.Polynomial{N, Float64}}()
+    poly_solutions = Vector{ElementaryPDESolutions.Polynomial{N, Float64}}()
+    multiindices = Vector{MultiIndex{N}}()
+    # iterate over N-tuples going from 0 to order
+    for I in Iterators.product(ntuple(i -> 0:order, N)...)
+        sum(I) > order && continue
+        # define the monomial basis functions, and the corresponding solutions.
+        # TODO: adapt this to vectorial case
+        p = ElementaryPDESolutions.Polynomial(I => 1 / factorial(MultiIndex(I)))
+        P = polynomial_solution(op, p)
+        push!(multiindices, MultiIndex(I))
+        push!(monomials, p)
+        push!(poly_solutions, P)
+    end
+    monomials_indices = Dict(multiindices .=> 1:length(multiindices))
+
+    PFE_monomials = ElementaryPDESolutions.assemble_fastevaluator(monomials, Float64)
+    PFE_polysolutions =
+        ElementaryPDESolutions.assemble_fastevaluator(poly_solutions, Float64)
+
+    return PFE_monomials, PFE_polysolutions, multiindices, monomials_indices
+end
 
 """
     polynomial_solutions_vdim(op, order[, center])
@@ -354,27 +1422,34 @@ function polynomial_solutions_vdim(
         return (q) -> f(coords(q) - center)
     end
     neumann_shift = map(neumann_traces) do f
-        return (q) -> f((coords = q.coords - center, normal = q.normal))
+        # return (q) -> f((coords = q.coords - center, normal = q.normal))
+        return (q) -> f(coords(q) - center, normal(q))
     end
     return monomial_shift, dirchlet_shift, neumann_shift, multiindices
     # return monomials, dirchlet_traces, neumann_traces, multiindices
 end
 
-# dispatch to the correct solver in ElementaryPDESolutions
+# TODO/FIXME: This is fixed in `main` but we have not yet merged that into this branch
+# Dispatch to the correct solver in ElementaryPDESolutions.
+# CONVENTION: `polynomial_solution(op, p)` returns `P` with `L[P] = -p`. The
+# Green-identity assemblies in (global and local) VDIM rely on this sign:
+# V[p](x) = -(S[γ₁P](x) - D[γ₀P](x) + μ(x)P(x)). All operators must follow the
+# same convention; note `solve_laplace(Q)` and `solve_helmholtz(Q; k)` both
+# return solutions with L[P] = +Q, hence the `-p` below.
 function polynomial_solution(::Laplace, p::ElementaryPDESolutions.Polynomial)
-    P = ElementaryPDESolutions.solve_laplace(p)
+    P = ElementaryPDESolutions.solve_laplace(-p)
     return ElementaryPDESolutions.convert_coefs(P, Float64)
 end
 
 function polynomial_solution(op::Helmholtz, p::ElementaryPDESolutions.Polynomial)
     k = op.k
-    P = ElementaryPDESolutions.solve_helmholtz(p; k)
+    P = ElementaryPDESolutions.solve_helmholtz(-p; k)
     return ElementaryPDESolutions.convert_coefs(P, Float64)
 end
 
 function polynomial_solution(op::Yukawa, p::ElementaryPDESolutions.Polynomial)
     k = im * op.λ
-    P = ElementaryPDESolutions.solve_helmholtz(p; k)
+    P = ElementaryPDESolutions.solve_helmholtz(-p; k)
     return ElementaryPDESolutions.convert_coefs(P, Float64)
 end
 
@@ -387,7 +1462,8 @@ end
 
 function _normal_derivative(P::ElementaryPDESolutions.Polynomial{N, T}) where {N, T}
     ∇P = ElementaryPDESolutions.gradient(P)
-    return (q) -> dot(normal(q), ∇P(coords(q)))
+    # return (q) -> dot(normal(q), ∇P(coords(q)))
+    return (x, n) -> dot(n, ∇P(x))
 end
 
 function (∇P::NTuple{N, <:ElementaryPDESolutions.Polynomial})(x) where {N}
diff --git a/test/Project.toml b/test/Project.toml
index 3280b59f..97f2bce8 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,7 +1,11 @@
 [deps]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+DynamicPolynomials = "7c1d4256-1411-5781-91ec-d7bc3513ac07"
+ElementaryPDESolutions = "88a69b33-da0f-4502-8c8c-d680cf4d883b"
 FMM2D = "2d63477d-9690-4b75-bcc1-c3461d43fecc"
 FMM3D = "1e13804c-f9b7-11ea-0ef0-29f3b1745df8"
+FixedPolynomials = "3ff9d00c-e216-4acf-8d13-1445c3c4bece"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 Gmsh = "705231aa-382f-11e9-3f0c-b7cb4346fdeb"
diff --git a/test/boundary_test.jl b/test/boundary_test.jl
new file mode 100644
index 00000000..0c47cc54
--- /dev/null
+++ b/test/boundary_test.jl
@@ -0,0 +1,84 @@
+using Test
+using LinearAlgebra
+using Inti
+using Random
+using Meshes
+import GLMakie
+
+include("test_utils.jl")
+Random.seed!(1)
+
+N = 2
+t = :interior
+pde = Inti.Laplace(; dim = N)
+Inti.clear_entities!()
+Ω, msh = gmsh_disk(; center = [0.0, 0.0], rx = 1.0, ry = 1.0, meshsize = 0.2, order = 2)
+# Ω, msh = gmsh_ball(; center = [0.0, 0.0, 0.0], radius=1.0, meshsize = 0.2)
+Γ = Inti.external_boundary(Ω)
+
+k = 2
+Γ_msh = msh[Γ]
+Ω_msh = msh[Ω]
+test_msh = Ω_msh
+nei = Inti.topological_neighbors(test_msh, 1)
+E = first(Inti.element_types(test_msh))
+function viz_neighbors(i, msh)
+    k, v = nei[i]
+    E, idx = k
+    el = Inti.elements(msh, E)[idx]
+    fig, _, _ = viz(el; color = 0, showsegments = true)
+    for (E, i) in v
+        el = Inti.elements(msh, E)[i]
+        viz!(el; color = 1 / 2, showsegments = true, alpha = 0.2)
+    end
+    return display(fig)
+end
+
+#function viz_elements(els, msh)
+#    Els = [Inti.elements(msh, E)[i] for (E, i) in els]
+#    fig, _, _ = viz(Els)
+#    viz!(msh; color = 0, showsegments = true,alpha=0.3)
+#    display(fig)
+#end
+#
+#function viz_elements_bords(Ei, els, bords, msh)
+#    ell = collect(Ei[(E, 1)])[1]
+#    el = Inti.elements(msh, ell[1])[ell[2]]
+#    fig, _, _ = viz(msh; color = 0, showsegments = true,alpha=0.3)
+#    viz!(el; color = 0, showsegments = true,alpha=0.5)
+#    for (E, i) in els
+#        el = Inti.elements(msh, E)[i]
+#        viz!(el; showsegments = true, alpha=0.7)
+#    end
+#    viz!(bords;color=4,showsegments = false,segmentsize=5,segmentcolor=4)
+#    display(fig)
+#end
+
+# el_in_set(el, set) = any(x->sort(x) == sort(el), set)
+
+I = 3
+test_els = union(copy(nei[(E, I)]))
+els = Inti.elements(test_msh, E)
+#test_els = union(copy(nei[(E,1)]), nei[(E,2)])
+#test_els = union(copy(nei[(E,1)]), nei[(E,2)], nei[(E,3)], nei[(E,4)])
+Inti.viz_elements(test_els, test_msh)
+
+components = Inti.connected_components(test_els, nei)
+
+test_els = copy(nei[(E, I)])
+BD = Inti.boundarynd(test_els, test_msh)
+# bords = [Inti.nodes(test_msh)[abs(i)] for i in BD]
+
+bords = Inti.LagrangeElement[]
+for idxs in BD
+    vtxs = Inti.nodes(Ω_msh)[idxs]
+    bord = Inti.LagrangeLine(vtxs...)
+    push!(bords, bord)
+end
+
+els_idxs = [i[2] for i in collect(test_els)]
+E = collect(test_els)[1][1]
+els_list = test_msh.etype2els[E][els_idxs]
+newquad = Inti.Quadrature(test_msh, els_list; qorder = 2)
+
+Inti.viz_elements_bords(nei, test_els, (E, I), bords, test_msh)
diff --git a/test/convergence/vdim_potential_script.jl b/test/convergence/vdim_potential_script.jl
index b7815324..a9043661 100644
--- a/test/convergence/vdim_potential_script.jl
+++ b/test/convergence/vdim_potential_script.jl
@@ -6,7 +6,7 @@ using Gmsh
 using LinearAlgebra
 using HMatrices
 using FMMLIB2D
-using CairoMakie
+using GLMakie
 
 function domain_and_mesh(; meshsize, meshorder = 1)
     Inti.clear_entities!()
@@ -31,18 +31,14 @@ interpolation_order = 4
 VR_qorder = Inti.Triangle_VR_interpolation_order_to_quadrature_order(interpolation_order)
 bdry_qorder = 2 * VR_qorder
 
-tmesh = @elapsed begin
-    Ω, msh = domain_and_mesh(; meshsize)
-end
-@info "Mesh generation time: $tmesh"
-
+Ω, msh = domain_and_mesh(; meshsize)
 Γ = Inti.external_boundary(Ω)
 #Γₕ = msh[Γ]
 #Ωₕ = msh[Ω]
 
 ψ = (t) -> [cos(2 * π * t), sin(2 * π * t)]
 θ = 3 # smoothness order of curved elements
-crvmsh = Inti.curve_mesh(msh, ψ, θ, 500 * Int(1 / meshsize))
+crvmsh = Inti.curve_mesh(msh, ψ, θ; patch_sample_num = 500 * Int(1 / meshsize))
 Ωₕ = view(crvmsh, Ω)
 Γₕ = view(crvmsh, Γ)
 #
@@ -54,7 +50,6 @@ tquad = @elapsed begin
     Q = Inti.VioreanuRokhlin(; domain = :triangle, order = VR_qorder)
     dict = Dict(E => Q for E in Inti.element_types(Ωₕ))
     Ωₕ_quad = Inti.Quadrature(Ωₕ, dict)
-    # Ωₕ_quad = Inti.Quadrature(Ωₕ; qorder = qorders[1])
     Γₕ_quad = Inti.Quadrature(Γₕ; qorder = bdry_qorder)
 end
 @info "Quadrature generation time: $tquad"
diff --git a/test/lvdim_test.jl b/test/lvdim_test.jl
new file mode 100644
index 00000000..de356841
--- /dev/null
+++ b/test/lvdim_test.jl
@@ -0,0 +1,197 @@
+# # Testing local vdim
+
+using DynamicPolynomials
+using FixedPolynomials
+using Inti
+using StaticArrays
+using Gmsh
+using LinearAlgebra
+using HMatrices
+using FMMLIB2D
+using Meshes
+using DataStructures
+
+#meshsize = 0.001/8
+#meshsize = 0.125/8
+meshsize = 0.125 / 2 /2 / 2 / 4 / 2
+interpolation_order = 2
+VR_qorder = Inti.Triangle_VR_interpolation_order_to_quadrature_order(3)
+#VR_qorder = Inti.Triangle_VR_interpolation_order_to_quadrature_order(interpolation_order)
+bdry_qorder = 7
+
+function gmsh_disk(; name, meshsize, order = 1, center = (0, 0), paxis = (2, 1))
+    return try
+        gmsh.initialize()
+        gmsh.option.setNumber("General.Terminal", 0)
+        gmsh.model.add("circle-mesh")
+        gmsh.option.setNumber("Mesh.MeshSizeMax", meshsize)
+        gmsh.option.setNumber("Mesh.MeshSizeMin", meshsize)
+        gmsh.model.occ.addDisk(center[1], center[2], 0, paxis[1], paxis[2])
+        gmsh.model.occ.synchronize()
+        gmsh.model.mesh.generate(2)
+        gmsh.model.mesh.setOrder(order)
+        gmsh.write(name)
+    finally
+        gmsh.finalize()
+    end
+end
+
+name = joinpath(@__DIR__, "disk.msh")
+gmsh_disk(; meshsize, order = 1, name, paxis = (1, 1))
+#gmsh_disk(; meshsize, order = 1, name, paxis = (meshsize * 20, meshsize * 10))
+
+Inti.clear_entities!() # empty the entity cache
+msh = Inti.import_mesh(name; dim = 2)
+Ω = Inti.Domain(e -> Inti.geometric_dimension(e) == 2, Inti.entities(msh))
+Γ = Inti.boundary(Ω)
+
+Ωₕ = msh[Ω]
+Γₕ = msh[Γ]
+Ωₕ_Sub = view(msh, Ω)
+Γₕ_Sub = view(msh, Γ)
+
+tquad = @elapsed begin
+    # Use VDIM with the Vioreanu-Rokhlin quadrature rule for Ωₕ
+    Q = Inti.VioreanuRokhlin(; domain = :triangle, order = VR_qorder)
+    dict = OrderedDict(E => Q for E in Inti.element_types(Ωₕ))
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, dict)
+    Ωₕ_Sub_quad = Inti.Quadrature(Ωₕ_Sub, dict)
+    # Ωₕ_quad = Inti.Quadrature(Ωₕ; qorder = qorders[1])
+    Γₕ_quad = Inti.Quadrature(Γₕ; qorder = bdry_qorder)
+    Γₕ_Sub_quad = Inti.Quadrature(Γₕ_Sub; qorder = bdry_qorder)
+end
+@info "Quadrature generation time: $tquad"
+
+#k = 0.1 / meshsize
+#k = 1.0
+k = 32.0
+op = k == 0 ? Inti.Laplace(; dim = 2) : Inti.Helmholtz(; dim = 2, k)
+
+## Boundary operators
+tbnd = @elapsed begin
+    S_b2d, D_b2d = Inti.single_double_layer(;
+        op,
+        target = Ωₕ_quad,
+        source = Γₕ_quad,
+        compression = (method = :fmm, tol = 1.0e-14),
+        correction = (method = :dim, maxdist = 5 * meshsize, target_location = :inside),
+    )
+end
+@info "Boundary operators time: $tbnd"
+
+## Volume potentials
+tvol = @elapsed begin
+V_d2d = Inti.volume_potential(;
+    op,
+    target = Ωₕ_quad,
+    source = Ωₕ_quad,
+    compression = (method = :fmm, tol = 1.0e-14),
+    correction = (
+        method = :ldim,
+        mesh = Ωₕ,
+        interpolation_order,
+        quadrature_order = VR_qorder,
+        bdry_nodes = Γₕ.nodes,
+        maxdist = 5 * meshsize,
+        meshsize = meshsize,
+        boundary = Γₕ_quad,
+        form = :analytic,
+    ),
+)
+end
+@info "Volume potential time: $tvol"
+
+using ElementaryPDESolutions
+import ElementaryPDESolutions: Polynomial
+
+k0 = 1.1
+θ = (cos(π / 3), sin(π / 3))
+#u = (x) -> exp(im * k0 * dot(x, θ))
+#du = (x, n) -> im * k0 * dot(θ, n) * exp(im * k0 * dot(x, θ))
+u = (x) -> cos(k0 * dot(x, θ))
+du = (x, n) -> -k0 * dot(θ, n) * sin(k0 * dot(x, θ))
+f = (x) -> -1 * (k^2 - k0^2) * u(x)
+
+#I = (2, 0)
+# `L` must match the major semi-axis passed to `gmsh_disk` so that f = (x/L)²
+# is O(1) on the domain; otherwise norm(er, Inf) is inflated by the constant
+# 1/L² while the relative error is unaffected.
+#L = 1.0
+#f = Polynomial(I => 1 / L^2)
+#u = if k == 0
+#    convert_coefs(ElementaryPDESolutions.solve_laplace(-f), Float64)
+#else
+#    convert_coefs(ElementaryPDESolutions.solve_helmholtz(-f; k), ComplexF64)
+#end
+#gradu = ElementaryPDESolutions.gradient(u)
+#du = (x, n) -> map(zip(gradu, 1:length(n))) do v
+#
+#    p = v[1](x)
+#    normal = n[v[2]]
+#    return p[1]*normal[1] + p[2]*normal[2]
+#
+#end
+#function du(x, n)
+#    accum = 0.0
+#    for i in 1:length(n)
+#        accum+= gradu[i](x) * n[i]
+#    end
+#    return accum
+#end
+
+
+#s  = 4
+#u  = (x) -> 1 / (k^2 - k0^2) * exp(im * k0 * dot(x, θ)) + 1 / (k^2 - 4 * s) * exp(-s * norm(x)^2)
+#du = (x, n) -> im * k0 * dot(θ, n) / (k^2 - k0^2) * exp(im * k0 * dot(x, θ)) - 2 * s / (k^2 - 4 * s) * dot(x, n) * exp(-s * norm(x)^2)
+#f  = (x) -> exp(im * k0 * dot(x, θ)) + 1 / (k^2 - 4 * s) * (4 * s^2 * norm(x)^2 - 4 * s + k^2) * exp(-s * norm(x)^2)
+
+u_d = map(q -> u(q.coords), Ωₕ_quad)
+u_b = map(q -> u(q.coords), Γₕ_quad)
+du_b = map(q -> du(q.coords, q.normal), Γₕ_quad)
+f_d = map(q -> f(q.coords), Ωₕ_quad)
+
+vref = u_d + D_b2d * u_b - S_b2d * du_b
+#vref = -u_d - D_b2d * u_b + S_b2d * du_b
+vapprox = V_d2d * f_d
+er = vref - vapprox
+
+ndofs = length(er)
+
+@show ndofs, meshsize, k, norm(er, Inf), norm(er, Inf) / norm(vref, Inf)
+
+## ---- exterior evaluation just outside Γ ----
+δoff = 0.5 * meshsize
+ext_pts = [q.coords + δoff * q.normal for q in Γₕ_quad]
+
+S_b2e, D_b2e = Inti.single_double_layer(;
+    op,
+    target = ext_pts,
+    source = Γₕ_quad,
+    compression = (method = :fmm, tol = 1.0e-14),
+    correction = (method = :dim, maxdist = 5 * meshsize, target_location = :outside),
+)
+
+V_d2e = Inti.volume_potential(;
+    op,
+    target = ext_pts,
+    source = Ωₕ_quad,
+    compression = (method = :fmm, tol = 1.0e-14),
+    correction = (
+        method = :ldim,
+        mesh = Ωₕ,
+        interpolation_order,
+        quadrature_order = VR_qorder,
+        bdry_nodes = Γₕ.nodes,
+        maxdist = 5 * meshsize,
+        meshsize = meshsize,
+        boundary = Γₕ_quad,
+        target_location = :outside,
+        form = :analytic,
+    ),
+)
+
+# Green identity for exterior targets: μ = 0, so the u term drops out
+vref_ext = D_b2e * u_b - S_b2e * du_b
+vapprox_ext = V_d2e * f_d
+er_ext = vref_ext - vapprox_ext
+@show norm(er_ext, Inf), norm(er_ext, Inf) / norm(vref_ext, Inf)
diff --git a/test/lvdim_test_3d.jl b/test/lvdim_test_3d.jl
new file mode 100644
index 00000000..99f02bbd
--- /dev/null
+++ b/test/lvdim_test_3d.jl
@@ -0,0 +1,113 @@
+# # High-order convergence of vdim
+
+using DynamicPolynomials
+using FixedPolynomials
+using Inti
+using Meshes
+using StaticArrays
+using Gmsh
+using LinearAlgebra
+using HMatrices
+using FMM3D
+using GLMakie
+using DataStructures
+
+meshsize = 0.2
+interpolation_order = 2
+VR_qorder = Inti.Tetrahedron_VR_interpolation_order_to_quadrature_order(interpolation_order+1)
+bdry_qorder = 2 * VR_qorder
+
+function gmsh_sphere(; order = 1, name, meshsize)
+    return try
+        gmsh.initialize()
+        gmsh.option.setNumber("General.Terminal", 0)
+        gmsh.model.add("sphere-mesh")
+        gmsh.option.setNumber("Mesh.MeshSizeMax", meshsize)
+        gmsh.option.setNumber("Mesh.MeshSizeMin", meshsize)
+        gmsh.model.occ.addSphere(0, 0, 0, 1)
+        gmsh.model.occ.synchronize()
+        gmsh.model.mesh.generate()
+        gmsh.model.mesh.setOrder(order)
+        gmsh.write(name)
+    finally
+        gmsh.finalize()
+    end
+end
+
+name = joinpath(@__DIR__, "sphere.msh")
+gmsh_sphere(; order = 1, name, meshsize)
+
+Inti.clear_entities!() # empty the entity cache
+msh = Inti.import_mesh(name; dim = 3)
+Ω = Inti.Domain(e -> Inti.geometric_dimension(e) == 3, Inti.entities(msh))
+Γ = Inti.boundary(Ω)
+
+Ωₕ = msh[Ω]
+Γₕ = msh[Γ]
+
+tquad = @elapsed begin
+    # Use VDIM with the Vioreanu-Rokhlin quadrature rule for Ωₕ
+    Q = Inti.VioreanuRokhlin(; domain = :tetrahedron, order = VR_qorder)
+    dict = OrderedDict(E => Q for E in Inti.element_types(Ωₕ))
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, dict)
+    # Ωₕ_quad = Inti.Quadrature(Ωₕ; qorder = qorders[1])
+    Γₕ_quad = Inti.Quadrature(Γₕ; qorder = bdry_qorder)
+end
+@info "Quadrature generation time: $tquad"
+
+k0 = π
+k = 1.0
+θ = (sin(π / 3) * cos(π / 3), sin(π / 3) * sin(π / 3), cos(π / 3))
+#u  = (x) -> exp(im * k0 * dot(x, θ))
+#du = (x,n) -> im * k0 * dot(θ, n) * exp(im * k0 * dot(x, θ))
+u = (x) -> cos(k0 * dot(x, θ))
+du = (x, n) -> -k0 * dot(θ, n) * sin(k0 * dot(x, θ))
+f = (x) -> (k^2 - k0^2) * u(x)
+
+u_d = map(q -> u(q.coords), Ωₕ_quad)
+u_b = map(q -> u(q.coords), Γₕ_quad)
+du_b = map(q -> du(q.coords, q.normal), Γₕ_quad)
+f_d = map(q -> f(q.coords), Ωₕ_quad)
+
+op = k == 0 ? Inti.Laplace(; dim = 3) : Inti.Helmholtz(; dim = 3, k)
+
+## Boundary operators
+tbnd = @elapsed begin
+    S_b2d, D_b2d = Inti.single_double_layer(;
+        op,
+        target = Ωₕ_quad,
+        source = Γₕ_quad,
+        compression = (method = :fmm, tol = 1.0e-8),
+        correction = (method = :dim, maxdist = 5 * meshsize, target_location = :inside),
+    )
+end
+@info "Boundary operators time: $tbnd"
+
+## Volume potentials
+tvol = @elapsed begin
+    V_d2d = Inti.volume_potential(;
+        op,
+        target = Ωₕ_quad,
+        source = Ωₕ_quad,
+        compression = (method = :fmm, tol = 1.0e-8),
+        correction = (
+            method = :ldim,
+            interpolation_order,
+            quadrature_order = VR_qorder,
+            mesh = Ωₕ,
+            bdry_nodes = Γₕ.nodes,
+            maxdist = 5 * meshsize,
+            meshsize = meshsize,
+            form = :analytic,
+        ),
+    )
+end
+@info "Volume potential time: $tvol"
+
+vref = -u_d - D_b2d * u_b + S_b2d * du_b
+vapprox = V_d2d * f_d
+er = vref - vapprox
+
+ndofs = length(er)
+
+@show ndofs, meshsize, norm(er, Inf)
diff --git a/test/poly_test.jl b/test/poly_test.jl
new file mode 100644
index 00000000..671e1faa
--- /dev/null
+++ b/test/poly_test.jl
@@ -0,0 +1,77 @@
+using Inti
+using StaticArrays
+using LinearAlgebra
+import DynamicPolynomials: @polyvar
+using FixedPolynomials
+
+#Run with npts = 1000000
+function poly_test(npts)
+    #npts = 10000
+    pde = Inti.Laplace(; dim = 2)
+    interpolation_order = 4
+    p, P, γ₁P, multiindices = Inti.polynomial_solutions_vdim(pde, interpolation_order)
+
+    @polyvar x y
+
+    N = 2
+    PolArray = Array{FixedPolynomials.Polynomial{Float64}}(undef, length(P))
+
+    tsetup = @elapsed begin
+        for (polind, ElemPolySolsPols) in enumerate(P)
+            pp = ElemPolySolsPols.f.order2coeff
+            exp_data = Matrix{Int64}(undef, N, length(pp))
+            coeff_data = Vector{Float64}(undef, length(pp))
+            for (i, pol) in enumerate(pp)
+                exp_data[:, i] = [q for q in pol[1]]
+                coeff_data[i] = pol[2]
+            end
+            PolArray[polind] =
+                FixedPolynomials.Polynomial(exp_data, coeff_data, [:x, :y])
+        end
+        PolSystem = System(PolArray)
+        pts = Vector{Vector{Float64}}(undef, npts)
+        pts[1] = [0, 0]
+        cfg = JacobianConfig(PolSystem, pts[1])
+    end
+    @info "FixedPolynomials.jl setup time: $tsetup"
+
+    pts = Vector{Vector{Float64}}(undef, npts)
+    for i in 1:npts
+        pts[i] = rand(2)
+    end
+    cfg = JacobianConfig(PolSystem, pts[1])
+    res1 = Matrix{Float64}(undef, length(PolArray), length(pts))
+    res2 = Matrix{Float64}(undef, length(PolArray), length(pts))
+    res3 = Vector{MVector{length(PolArray), Float64}}(undef, npts)
+
+    cfg = JacobianConfig(PolSystem, pts[1])
+    u = Vector{Float64}(undef, length(PolArray))
+    tfixed = @elapsed begin
+        for i in 1:npts
+            evaluate!(view(res1, :, i), PolSystem, pts[i], cfg)
+        end
+    end
+    @info "FixedPolynomials.jl time: $tfixed"
+    evaluator = (xx) -> evaluate(PolSystem, xx, cfg)
+    tbroadcast = @elapsed begin
+        res3 .= evaluator.(pts)
+    end
+    @info "FixedPolynomials.jl w/ broadcast time: $tbroadcast"
+    tregular = @elapsed begin
+        for i in 1:length(P)
+            res2[i, :] .= P[i].f.(pts)
+        end
+    end
+    @info "ElementaryPDESolutions.jl time: $tregular"
+
+    # Evaluate Jacobian
+    u = Matrix{Float64}(undef, length(P), npts)
+    U = Array{Float64}(undef, length(P), 2, npts)
+    tjacob = @elapsed begin
+        for i in 1:npts
+            evaluate_and_jacobian!(view(u, :, i), view(U, :, :, i), PolSystem, pts[i], cfg)
+        end
+    end
+    @info "FixedPolynomials.jl Jacobian+Eval time: $tjacob"
+    return res1, res2, res3, tfixed, tregular, tbroadcast, tjacob
+end
diff --git a/test/test_utils.jl b/test/test_utils.jl
index 3dfe77fc..80dd3305 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -1,7 +1,7 @@
 using Inti
 using Gmsh
 
-function gmsh_disk(; center, rx, ry, meshsize)
+function gmsh_disk(; center, rx, ry, meshsize, order = 1)
     msh = try
         gmsh.initialize(String[], false)
         gmsh.option.setNumber("General.Verbosity", 2)
@@ -12,6 +12,31 @@ function gmsh_disk(; center, rx, ry, meshsize)
         gmsh.model.occ.addDisk(center[1], center[2], 0, rx, ry)
         gmsh.model.occ.synchronize()
         gmsh.model.mesh.generate(2)
+        gmsh.model.mesh.setOrder(order)
+        Inti.import_mesh(; dim = 2)
+    finally
+        gmsh.finalize()
+    end
+    Ω = Inti.Domain(Inti.entities(msh)) do e
+        return Inti.geometric_dimension(e) == 2
+    end
+    return Ω, msh
+end
+
+function gmsh_disks(disks; meshsize, order = 1)
+    msh = try
+        gmsh.initialize()
+        gmsh.option.setNumber("General.Verbosity", 2)
+        gmsh.model.add("disk")
+        # set max and min meshsize to meshsize
+        gmsh.option.setNumber("Mesh.MeshSizeMax", meshsize)
+        gmsh.option.setNumber("Mesh.MeshSizeMin", meshsize)
+        for (center, rx, ry) in disks
+            gmsh.model.occ.addDisk(center[1], center[2], 0, rx, ry)
+        end
+        gmsh.model.occ.synchronize()
+        gmsh.model.mesh.generate(2)
+        gmsh.model.mesh.setOrder(order)
         Inti.import_mesh(; dim = 2)
     finally
         gmsh.finalize()