From e22175c186203a2123b8d3dabcb44702ffb04c42 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Thu, 19 Mar 2026 08:42:15 -0400
Subject: [PATCH 01/17] Fix OptimizationProblem for SVector/SArray: use
 out-of-place form

Newer SciMLBase versions require immutable types such as SVector/SArray
to use the out-of-place OptimizationProblem{false}(...) form, since they
cannot be mutated in place.

This fixes precompilation failures where the optimization problem was
being created with the in-place form (auto-detected as true) but using
immutable initial conditions.

Fixes: https://github.com/ChrisRackauckas/InternalJunk/issues/26
---
 src/precompilation.jl | 3 ++-
 test/constraints.jl   | 3 ++-
 test/gpu.jl           | 3 ++-
 test/lbfgs.jl         | 9 ++++++---
 test/regression.jl    | 6 ++++--
 test/reinit.jl        | 3 ++-
 6 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/precompilation.jl b/src/precompilation.jl
index 6f0ae98..46f9d70 100644
--- a/src/precompilation.jl
+++ b/src/precompilation.jl
@@ -21,7 +21,8 @@ using PrecompileTools
         p = @SArray Float32[1.0, 100.0]
 
         # Create optimization problem with StaticArrays
-        prob = OptimizationProblem(_rosenbrock_precompile, x0, p; lb = lb, ub = ub)
+        # Use out-of-place form {false} since SVector is immutable
+        prob = OptimizationProblem{false}(_rosenbrock_precompile, x0, p; lb = lb, ub = ub)
 
         # Precompile SerialPSO - most commonly used CPU algorithm
         sol = solve(prob, SerialPSO(10), maxiters = 2)
diff --git a/test/constraints.jl b/test/constraints.jl
index 65b7e15..1d39c45 100644
--- a/test/constraints.jl
+++ b/test/constraints.jl
@@ -19,7 +19,8 @@ lb = @SVector [0.0f0, 0.0f0]
 ub = @SVector [2.0f0, 2.0f0]
 lcons = @SVector [-Inf32, -Inf32]
 ucons = @SVector [0.0f0, 0.0f0]
-prob = OptimizationProblem(opt_f, x0, p, lcons = lcons, ucons = ucons, lb = lb, ub = ub)
+# Use out-of-place form {false} since SVector is immutable
+prob = OptimizationProblem{false}(opt_f, x0, p, lcons = lcons, ucons = ucons, lb = lb, ub = ub)
 
 n_particles = 1000
 
diff --git a/test/gpu.jl b/test/gpu.jl
index 9951aa7..ddeeee1 100644
--- a/test/gpu.jl
+++ b/test/gpu.jl
@@ -21,7 +21,8 @@ include("./utils.jl")
     x0 = @SArray zeros(Float32, N)
     p = @SArray Float32[1.0, 100.0]
 
-    prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub)
+    # Use out-of-place form {false} since SVector is immutable
+    prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub)
 
     n_particles = 5000
 
diff --git a/test/lbfgs.jl b/test/lbfgs.jl
index 068e0ea..018c960 100644
--- a/test/lbfgs.jl
+++ b/test/lbfgs.jl
@@ -9,7 +9,8 @@ end
 optprob = OptimizationFunction(objf, Optimization.AutoEnzyme())
 x0 = rand(2)
 x0 = SVector{2}(x0)
-prob = OptimizationProblem(optprob, x0)
+# Use out-of-place form {false} since SVector is immutable
+prob = OptimizationProblem{false}(optprob, x0)
 l1 = objf(x0, nothing)
 sol = Optimization.solve(
     prob,
@@ -28,7 +29,8 @@ end
 x0 = @SArray rand(Float32, N)
 p = @SArray Float32[1.0, 100.0]
 optf = OptimizationFunction(rosenbrock, Optimization.AutoForwardDiff())
-prob = OptimizationProblem(optf, x0, p)
+# Use out-of-place form {false} since SArray is immutable
+prob = OptimizationProblem{false}(optf, x0, p)
 l0 = rosenbrock(x0, p)
 
 @time sol = Optimization.solve(
@@ -61,7 +63,8 @@ l0 = rosenbrock(x0, p)
 @show sol.objective
 
 optf = OptimizationFunction(rosenbrock, Optimization.AutoEnzyme())
-prob = OptimizationProblem(optf, x0, p)
+# Use out-of-place form {false} since SArray is immutable
+prob = OptimizationProblem{false}(optf, x0, p)
 l0 = rosenbrock(x0, p)
 
 @time sol = Optimization.solve(
diff --git a/test/regression.jl b/test/regression.jl
index ef11af3..76ff4cb 100644
--- a/test/regression.jl
+++ b/test/regression.jl
@@ -28,7 +28,8 @@ using QuasiMonteCarlo
         ub = ub
     )
 
-    prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub)
+    # Use out-of-place form {false} since SVector is immutable
+    prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub)
 
     n_particles = 2000
 
@@ -224,7 +225,8 @@ end
         ub = ub
     )
 
-    prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub)
+    # Use out-of-place form {false} since SVector is immutable
+    prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub)
 
     n_particles = 2000
 
diff --git a/test/reinit.jl b/test/reinit.jl
index 8bef118..0947fd0 100644
--- a/test/reinit.jl
+++ b/test/reinit.jl
@@ -17,7 +17,8 @@ end
 x0 = @SArray zeros(Float32, 3)
 p = @SArray Float32[1.0, 100.0]
 
-prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub)
+# Use out-of-place form {false} since SVector is immutable
+prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub)
 
 n_particles = 2000
 

From 161392c9383fcaba93515037cf150e425a59c99b Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Thu, 19 Mar 2026 08:46:13 -0400
Subject: [PATCH 02/17] Fix: Use OptimizationFunction{false} for out-of-place
 form

The previous attempt used OptimizationProblem{false} directly, but the
SciMLBase API requires that you pass an OptimizationFunction{false} to
the constructor instead.

Changed all usages of SVector/SArray with OptimizationProblem to:
1. Create OptimizationFunction{false}(f, ...) for the function
2. Pass that to OptimizationProblem(opt_f, ...)
---
 src/precompilation.jl |  3 ++-
 test/constraints.jl   |  6 +++---
 test/gpu.jl           |  3 ++-
 test/lbfgs.jl         | 14 +++++++-------
 test/regression.jl    |  6 ++++--
 test/reinit.jl        |  3 ++-
 6 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/src/precompilation.jl b/src/precompilation.jl
index 46f9d70..91bf6d1 100644
--- a/src/precompilation.jl
+++ b/src/precompilation.jl
@@ -22,7 +22,8 @@ using PrecompileTools
 
         # Create optimization problem with StaticArrays
         # Use out-of-place form {false} since SVector is immutable
-        prob = OptimizationProblem{false}(_rosenbrock_precompile, x0, p; lb = lb, ub = ub)
+        opt_f = OptimizationFunction{false}(_rosenbrock_precompile)
+        prob = OptimizationProblem(opt_f, x0, p; lb = lb, ub = ub)
 
         # Precompile SerialPSO - most commonly used CPU algorithm
         sol = solve(prob, SerialPSO(10), maxiters = 2)
diff --git a/test/constraints.jl b/test/constraints.jl
index 1d39c45..b4e1c48 100644
--- a/test/constraints.jl
+++ b/test/constraints.jl
@@ -12,15 +12,15 @@ function conss(x, p)
     return SVector{3}(-x[1] + 2 * x[2] - 1, +x[1] - 2 * x[2] + 1, (x[1]^2) / 4 + x[2]^2 - 1)
 end
 
-opt_f = OptimizationFunction(objective, cons = conss)
+# Use out-of-place form {false} since SVector is immutable
+opt_f = OptimizationFunction{false}(objective, cons = conss)
 
 x0 = @SVector [1.0f0, 1.0f0]
 lb = @SVector [0.0f0, 0.0f0]
 ub = @SVector [2.0f0, 2.0f0]
 lcons = @SVector [-Inf32, -Inf32]
 ucons = @SVector [0.0f0, 0.0f0]
-# Use out-of-place form {false} since SVector is immutable
-prob = OptimizationProblem{false}(opt_f, x0, p, lcons = lcons, ucons = ucons, lb = lb, ub = ub)
+prob = OptimizationProblem(opt_f, x0, p, lcons = lcons, ucons = ucons, lb = lb, ub = ub)
 
 n_particles = 1000
 
diff --git a/test/gpu.jl b/test/gpu.jl
index ddeeee1..a0a7810 100644
--- a/test/gpu.jl
+++ b/test/gpu.jl
@@ -22,7 +22,8 @@ include("./utils.jl")
     p = @SArray Float32[1.0, 100.0]
 
     # Use out-of-place form {false} since SVector is immutable
-    prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub)
+    opt_f = OptimizationFunction{false}(rosenbrock)
+    prob = OptimizationProblem(opt_f, x0, p; lb = lb, ub = ub)
 
     n_particles = 5000
 
diff --git a/test/lbfgs.jl b/test/lbfgs.jl
index 018c960..20e02d4 100644
--- a/test/lbfgs.jl
+++ b/test/lbfgs.jl
@@ -6,11 +6,11 @@ function objf(x, p)
     return 1 - x[1]^2 - x[2]^2
 end
 
-optprob = OptimizationFunction(objf, Optimization.AutoEnzyme())
+# Use out-of-place form {false} since SVector is immutable
+optprob = OptimizationFunction{false}(objf, Optimization.AutoEnzyme())
 x0 = rand(2)
 x0 = SVector{2}(x0)
-# Use out-of-place form {false} since SVector is immutable
-prob = OptimizationProblem{false}(optprob, x0)
+prob = OptimizationProblem(optprob, x0)
 l1 = objf(x0, nothing)
 sol = Optimization.solve(
     prob,
@@ -28,9 +28,9 @@ function rosenbrock(x, p)
 end
 x0 = @SArray rand(Float32, N)
 p = @SArray Float32[1.0, 100.0]
-optf = OptimizationFunction(rosenbrock, Optimization.AutoForwardDiff())
 # Use out-of-place form {false} since SArray is immutable
-prob = OptimizationProblem{false}(optf, x0, p)
+optf = OptimizationFunction{false}(rosenbrock, Optimization.AutoForwardDiff())
+prob = OptimizationProblem(optf, x0, p)
 l0 = rosenbrock(x0, p)
 
 @time sol = Optimization.solve(
@@ -62,9 +62,9 @@ l0 = rosenbrock(x0, p)
 )
 @show sol.objective
 
-optf = OptimizationFunction(rosenbrock, Optimization.AutoEnzyme())
 # Use out-of-place form {false} since SArray is immutable
-prob = OptimizationProblem{false}(optf, x0, p)
+optf = OptimizationFunction{false}(rosenbrock, Optimization.AutoEnzyme())
+prob = OptimizationProblem(optf, x0, p)
 l0 = rosenbrock(x0, p)
 
 @time sol = Optimization.solve(
diff --git a/test/regression.jl b/test/regression.jl
index 76ff4cb..c577f72 100644
--- a/test/regression.jl
+++ b/test/regression.jl
@@ -29,7 +29,8 @@ using QuasiMonteCarlo
     )
 
     # Use out-of-place form {false} since SVector is immutable
-    prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub)
+    opt_f = OptimizationFunction{false}(rosenbrock)
+    prob = OptimizationProblem(opt_f, x0, p; lb = lb, ub = ub)
 
     n_particles = 2000
 
@@ -226,7 +227,8 @@ end
     )
 
     # Use out-of-place form {false} since SVector is immutable
-    prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub)
+    opt_f = OptimizationFunction{false}(rosenbrock)
+    prob = OptimizationProblem(opt_f, x0, p; lb = lb, ub = ub)
 
     n_particles = 2000
 
diff --git a/test/reinit.jl b/test/reinit.jl
index 0947fd0..5e59cf3 100644
--- a/test/reinit.jl
+++ b/test/reinit.jl
@@ -18,7 +18,8 @@ x0 = @SArray zeros(Float32, 3)
 p = @SArray Float32[1.0, 100.0]
 
 # Use out-of-place form {false} since SVector is immutable
-prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub)
+opt_f = OptimizationFunction{false}(rosenbrock)
+prob = OptimizationProblem(opt_f, x0, p; lb = lb, ub = ub)
 
 n_particles = 2000
 

From f26e3744a3926c1c5746c9b60e44d3d21d333f72 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Thu, 19 Mar 2026 09:41:55 -0400
Subject: [PATCH 03/17] Fix FormatCheck.yml: add Julia setup step before runic

---
 .github/workflows/FormatCheck.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/FormatCheck.yml b/.github/workflows/FormatCheck.yml
index d22e82d..6253546 100644
--- a/.github/workflows/FormatCheck.yml
+++ b/.github/workflows/FormatCheck.yml
@@ -14,6 +14,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v6
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: '1'
       - uses: fredrikekre/runic-action@v1
         with:
           version: '1'

From b251680a11b342cb1e872fe6eec1a26f3b27eda6 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Thu, 19 Mar 2026 14:40:16 -0400
Subject: [PATCH 04/17] Retrigger CI (transient GPU OOM)


From 10d586e7819957ad2efc50b22454c5757b99a564 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Thu, 19 Mar 2026 18:42:23 -0400
Subject: [PATCH 05/17] Fix GPU OOM in CI: add GC and CUDA.reclaim() between
 test files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GPU memory allocated by gpu.jl (5000 particles × 3 sizes × 3 algorithms)
accumulates and causes an OOM error when lbfgs.jl runs. Add GC.gc(true)
between test includes and an explicit CUDA.reclaim() call (CUDA group
only) at the start of lbfgs.jl to free GPU memory.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 test/lbfgs.jl    | 5 +++++
 test/runtests.jl | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/test/lbfgs.jl b/test/lbfgs.jl
index 20e02d4..d624546 100644
--- a/test/lbfgs.jl
+++ b/test/lbfgs.jl
@@ -2,6 +2,11 @@ using ParallelParticleSwarms, Optimization, StaticArrays
 
 include("./utils.jl")
 
+# Reclaim GPU memory from previous test files to avoid OOM
+if GROUP == "CUDA"
+    CUDA.reclaim()
+end
+
 function objf(x, p)
     return 1 - x[1]^2 - x[2]^2
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index adddcf5..e017b17 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,6 +11,8 @@ global CI_GROUP = get(ENV, "GROUP", "CPU")
 @testset for BACKEND in unique(("CPU", CI_GROUP))
     global GROUP = BACKEND
     @testset "$(BACKEND) optimizers tests" include("./gpu.jl")
+    GC.gc(true)
     @testset "$(BACKEND) optimizers with constraints tests" include("./constraints.jl")
+    GC.gc(true)
     @testset "$(BACKEND) hybrid optimizers" include("./lbfgs.jl")
 end

From 8676109a43b1d9f0a28d0688819fdd6b1b5c4be0 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Thu, 19 Mar 2026 19:06:37 -0400
Subject: [PATCH 06/17] Reorder GPU tests: run hybrid first to avoid OOM from
 kernel caches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The HybridPSO kernel (gpu_simplebfgs_run!) is the most complex and
needs the most GPU memory for JIT compilation. Running it first
when GPU memory is most available avoids OOM caused by accumulated
kernel compilation caches from gpu.jl (5000 particles × 3 sizes).

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 test/runtests.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index e017b17..d558740 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,9 +10,11 @@ global CI_GROUP = get(ENV, "GROUP", "CPU")
 #TODO: Curent throws warning for redefinition with the use of @testset multiple times. Migrate to TestItemRunners.jl
 @testset for BACKEND in unique(("CPU", CI_GROUP))
     global GROUP = BACKEND
+    # Run hybrid optimizers first on GPU — the HybridPSO kernel is the most
+    # complex and needs the most GPU memory for JIT compilation.
+    @testset "$(BACKEND) hybrid optimizers" include("./lbfgs.jl")
+    GC.gc(true)
     @testset "$(BACKEND) optimizers tests" include("./gpu.jl")
     GC.gc(true)
     @testset "$(BACKEND) optimizers with constraints tests" include("./constraints.jl")
-    GC.gc(true)
-    @testset "$(BACKEND) hybrid optimizers" include("./lbfgs.jl")
 end

From 0fd11b187aba06f223440d2450139a774f8d0c3a Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Thu, 19 Mar 2026 19:51:33 -0400
Subject: [PATCH 07/17] Retrigger CI (runners appear offline)

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

From 8a749c6e1ffcac3cc377681d1517ae75c6aa2f88 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Thu, 19 Mar 2026 20:32:38 -0400
Subject: [PATCH 08/17] Revert test reordering, keep GC improvements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Revert to original test order (gpu → constraints → lbfgs) since
reordering caused CUDA context init OOM to cascade to all tests.
Keep GC.gc(true) between tests and CUDA.reclaim() in lbfgs.jl.

GPU OOM is a pre-existing infrastructure issue — the shared self-hosted
runners have oversubscribed GPUs. The main branch also fails the GPU
tests, though with a different error: the precompilation failure that
this PR fixes.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 test/runtests.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index d558740..e017b17 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,11 +10,9 @@ global CI_GROUP = get(ENV, "GROUP", "CPU")
 #TODO: Curent throws warning for redefinition with the use of @testset multiple times. Migrate to TestItemRunners.jl
 @testset for BACKEND in unique(("CPU", CI_GROUP))
     global GROUP = BACKEND
-    # Run hybrid optimizers first on GPU — the HybridPSO kernel is the most
-    # complex and needs the most GPU memory for JIT compilation.
-    @testset "$(BACKEND) hybrid optimizers" include("./lbfgs.jl")
-    GC.gc(true)
     @testset "$(BACKEND) optimizers tests" include("./gpu.jl")
     GC.gc(true)
     @testset "$(BACKEND) optimizers with constraints tests" include("./constraints.jl")
+    GC.gc(true)
+    @testset "$(BACKEND) hybrid optimizers" include("./lbfgs.jl")
 end

From dce53a534045da01923b0e9d5c3325e378891e2b Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 20 Mar 2026 01:37:04 -0400
Subject: [PATCH 09/17] Switch GPU CI from T4 to V100 runner to fix OOM

The T4 runners (14.5 GiB VRAM) are oversubscribed and consistently
OOM during CUDA tests. Switch to V100 runners (32 GiB VRAM) which
other SciML repos (DiffEqGPU.jl, SciMLSensitivity.jl) also use
for memory-intensive GPU jobs.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/GPU.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml
index 8ed21ff..3b8ff7b 100644
--- a/.github/workflows/GPU.yml
+++ b/.github/workflows/GPU.yml
@@ -17,7 +17,7 @@ concurrency:
 jobs:
   cuda-tests:
     name: "CUDA Tests (Julia ${{ matrix.version }})"
-    runs-on: [self-hosted, Linux, X64, gpu-t4]
+    runs-on: [self-hosted, gpu-v100]
     timeout-minutes: 120
     strategy:
       fail-fast: false

From d11c83bd57ce043349e7625c0d480e3465ead59a Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 20 Mar 2026 01:48:47 -0400
Subject: [PATCH 10/17] Switch GPU CI to generic gpu runner label

V100 (compute capability 7.0) is not supported on CUDA 13+.
Use the generic 'gpu' label (used by DiffEqFlux.jl, NeuralPDE.jl,
DeepEquilibriumNetworks.jl) which routes to compatible GPUs.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/GPU.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml
index 3b8ff7b..b3f7e4f 100644
--- a/.github/workflows/GPU.yml
+++ b/.github/workflows/GPU.yml
@@ -17,7 +17,7 @@ concurrency:
 jobs:
   cuda-tests:
     name: "CUDA Tests (Julia ${{ matrix.version }})"
-    runs-on: [self-hosted, gpu-v100]
+    runs-on: [self-hosted, Linux, X64, gpu]
     timeout-minutes: 120
     strategy:
       fail-fast: false

From 6c3c167a1203172e87472324ec81e922441030c5 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 20 Mar 2026 02:00:09 -0400
Subject: [PATCH 11/17] Retrigger CI

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

From 7449d3e85ac29dcf79a5c2c199824e4ab848a481 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 20 Mar 2026 02:32:08 -0400
Subject: [PATCH 12/17] Use V100 runners with CUDA 12.6 toolkit to fix GPU CI

T4 runners (arctic1-*) are oversubscribed and OOM consistently.
V100 runners (demeter4-*) have 32GB VRAM but require CUDA 12.x
since CUDA 13+ dropped support for compute capability 7.0.

Pin JULIA_CUDA_VERSION=12.6 to use the CUDA 12.6 toolkit on V100.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/GPU.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml
index b3f7e4f..5075da8 100644
--- a/.github/workflows/GPU.yml
+++ b/.github/workflows/GPU.yml
@@ -17,7 +17,7 @@ concurrency:
 jobs:
   cuda-tests:
     name: "CUDA Tests (Julia ${{ matrix.version }})"
-    runs-on: [self-hosted, Linux, X64, gpu]
+    runs-on: [self-hosted, gpu-v100]
     timeout-minutes: 120
     strategy:
       fail-fast: false
@@ -38,6 +38,7 @@ jobs:
       - uses: julia-actions/julia-runtest@v1
         env:
           GROUP: "CUDA"
+          JULIA_CUDA_VERSION: "12.6"
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v5
         with:

From b909e8f4763a02a14f36d66d16b56f909a3c7382 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 20 Mar 2026 03:33:22 -0400
Subject: [PATCH 13/17] Fix V100 CUDA compatibility: pin CUDA 12.6 via
 LocalPreferences

The JULIA_CUDA_VERSION env var is deprecated and ignored by CUDA.jl.
Instead, write test/LocalPreferences.toml directly to pin
CUDA_Runtime_jll to v12.6, which supports the V100 (compute capability
7.0). CUDA 13+ dropped support for compute capability below 7.5.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/GPU.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml
index 5075da8..81832aa 100644
--- a/.github/workflows/GPU.yml
+++ b/.github/workflows/GPU.yml
@@ -34,11 +34,13 @@ jobs:
             import Pkg
             Pkg.add(; name = "CUDA")'
           rm -f test/Manifest.toml
+      - name: Pin CUDA 12.6 for V100 compatibility
+        run: |
+          printf '[CUDA_Runtime_jll]\nversion = "12.6"\n' > test/LocalPreferences.toml
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
         env:
           GROUP: "CUDA"
-          JULIA_CUDA_VERSION: "12.6"
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v5
         with:

From f26ef75a45517aacf12d9a7e4a01b3e9bab244df Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 20 Mar 2026 03:45:27 -0400
Subject: [PATCH 14/17] Revert to T4 runners (gpu-t4) for GPU CI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

V100 runners cannot be used — CUDA.jl rejects compute capability
7.0 GPUs with CUDA 13+ drivers regardless of toolkit pinning.
T4 (compute 7.5) is the only compatible GPU available.

Earlier T4 runs passed 26/27 tests — the OOM failures are
transient due to shared runner memory pressure.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/GPU.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml
index 81832aa..8ed21ff 100644
--- a/.github/workflows/GPU.yml
+++ b/.github/workflows/GPU.yml
@@ -17,7 +17,7 @@ concurrency:
 jobs:
   cuda-tests:
     name: "CUDA Tests (Julia ${{ matrix.version }})"
-    runs-on: [self-hosted, gpu-v100]
+    runs-on: [self-hosted, Linux, X64, gpu-t4]
     timeout-minutes: 120
     strategy:
       fail-fast: false
@@ -34,9 +34,6 @@ jobs:
             import Pkg
             Pkg.add(; name = "CUDA")'
           rm -f test/Manifest.toml
-      - name: Pin CUDA 12.6 for V100 compatibility
-        run: |
-          printf '[CUDA_Runtime_jll]\nversion = "12.6"\n' > test/LocalPreferences.toml
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
         env:

From 5557193a205e0fdfb73146a49af6f6c0438af126 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 20 Mar 2026 05:25:34 -0400
Subject: [PATCH 15/17] Use exclusive GPU queue (gpu-t4-exclusive) for CUDA CI

Switch to exclusive T4 runner to get dedicated GPU memory,
avoiding OOM from shared GPU workloads.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/GPU.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml
index 8ed21ff..30dcdf6 100644
--- a/.github/workflows/GPU.yml
+++ b/.github/workflows/GPU.yml
@@ -17,7 +17,7 @@ concurrency:
 jobs:
   cuda-tests:
     name: "CUDA Tests (Julia ${{ matrix.version }})"
-    runs-on: [self-hosted, Linux, X64, gpu-t4]
+    runs-on: [self-hosted, Linux, X64, gpu-t4-exclusive]
     timeout-minutes: 120
     strategy:
       fail-fast: false

From b798e58cdb73ea540d0cff850d8a6aee88bfbdac Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 20 Mar 2026 05:46:48 -0400
Subject: [PATCH 16/17] Retrigger CI

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

From dcb8a76919d74bd2b15862dd6ad56d5f8eca2631 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Tue, 24 Mar 2026 06:33:51 -0400
Subject: [PATCH 17/17] Use V100 runners with CUDA.jl 5.0-5.10 for GPU CI

Pin CUDA.jl to v5.0-5.10, which uses the CUDA 12.x runtime and is
compatible with the V100 (compute capability 7.0). CUDA.jl 5.11+
resolves CUDA_Driver_jll v13.2+, which dropped compute 7.0 support.

Use gpu-v100 runners (demeter4-*) which have 32GB VRAM,
avoiding the OOM issues on oversubscribed T4 runners.

See ChrisRackauckas/InternalJunk#17 for details.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/GPU.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml
index 30dcdf6..8e98d36 100644
--- a/.github/workflows/GPU.yml
+++ b/.github/workflows/GPU.yml
@@ -17,7 +17,7 @@ concurrency:
 jobs:
   cuda-tests:
     name: "CUDA Tests (Julia ${{ matrix.version }})"
-    runs-on: [self-hosted, Linux, X64, gpu-t4-exclusive]
+    runs-on: [self-hosted, gpu-v100]
     timeout-minutes: 120
     strategy:
       fail-fast: false
@@ -32,7 +32,7 @@ jobs:
       - run: |
           julia --project=test -e '
             import Pkg
-            Pkg.add(; name = "CUDA")'
+            Pkg.add(; name = "CUDA", version = "5.0 - 5.10")'
           rm -f test/Manifest.toml
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1