From e22175c186203a2123b8d3dabcb44702ffb04c42 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Thu, 19 Mar 2026 08:42:15 -0400 Subject: [PATCH 01/17] Fix OptimizationProblem for SVector/SArray: use out-of-place form The newer SciMLBase enforces that immutable types like SVector/SArray must use out-of-place OptimizationProblem{false}(...) form since they cannot be mutated in-place. This fixes precompilation failures where the optimization problem was being created with the in-place form (auto-detected as true) but using immutable initial conditions. Fixes: https://github.com/ChrisRackauckas/InternalJunk/issues/26 --- src/precompilation.jl | 3 ++- test/constraints.jl | 3 ++- test/gpu.jl | 3 ++- test/lbfgs.jl | 9 ++++++--- test/regression.jl | 6 ++++-- test/reinit.jl | 3 ++- 6 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/precompilation.jl b/src/precompilation.jl index 6f0ae98..46f9d70 100644 --- a/src/precompilation.jl +++ b/src/precompilation.jl @@ -21,7 +21,8 @@ using PrecompileTools p = @SArray Float32[1.0, 100.0] # Create optimization problem with StaticArrays - prob = OptimizationProblem(_rosenbrock_precompile, x0, p; lb = lb, ub = ub) + # Use out-of-place form {false} since SVector is immutable + prob = OptimizationProblem{false}(_rosenbrock_precompile, x0, p; lb = lb, ub = ub) # Precompile SerialPSO - most commonly used CPU algorithm sol = solve(prob, SerialPSO(10), maxiters = 2) diff --git a/test/constraints.jl b/test/constraints.jl index 65b7e15..1d39c45 100644 --- a/test/constraints.jl +++ b/test/constraints.jl @@ -19,7 +19,8 @@ lb = @SVector [0.0f0, 0.0f0] ub = @SVector [2.0f0, 2.0f0] lcons = @SVector [-Inf32, -Inf32] ucons = @SVector [0.0f0, 0.0f0] -prob = OptimizationProblem(opt_f, x0, p, lcons = lcons, ucons = ucons, lb = lb, ub = ub) +# Use out-of-place form {false} since SVector is immutable +prob = OptimizationProblem{false}(opt_f, x0, p, lcons = lcons, ucons = ucons, lb = lb, ub = ub) n_particles = 1000 diff --git a/test/gpu.jl b/test/gpu.jl index 9951aa7..ddeeee1 100644 --- a/test/gpu.jl +++ b/test/gpu.jl @@ -21,7 +21,8 @@ include("./utils.jl") x0 = @SArray zeros(Float32, N) p = @SArray Float32[1.0, 100.0] - prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub) + # Use out-of-place form {false} since SVector is immutable + prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub) n_particles = 5000 diff --git a/test/lbfgs.jl b/test/lbfgs.jl index 068e0ea..018c960 100644 --- a/test/lbfgs.jl +++ b/test/lbfgs.jl @@ -9,7 +9,8 @@ end optprob = OptimizationFunction(objf, Optimization.AutoEnzyme()) x0 = rand(2) x0 = SVector{2}(x0) -prob = OptimizationProblem(optprob, x0) +# Use out-of-place form {false} since SVector is immutable +prob = OptimizationProblem{false}(optprob, x0) l1 = objf(x0, nothing) sol = Optimization.solve( prob, @@ -28,7 +29,8 @@ end x0 = @SArray rand(Float32, N) p = @SArray Float32[1.0, 100.0] optf = OptimizationFunction(rosenbrock, Optimization.AutoForwardDiff()) -prob = OptimizationProblem(optf, x0, p) +# Use out-of-place form {false} since SArray is immutable +prob = OptimizationProblem{false}(optf, x0, p) l0 = rosenbrock(x0, p) @time sol = Optimization.solve( @@ -61,7 +63,8 @@ l0 = rosenbrock(x0, p) @show sol.objective optf = OptimizationFunction(rosenbrock, Optimization.AutoEnzyme()) -prob = OptimizationProblem(optf, x0, p) +# Use out-of-place form {false} since SArray is immutable +prob = OptimizationProblem{false}(optf, x0, p) l0 = rosenbrock(x0, p) @time sol = Optimization.solve( diff --git a/test/regression.jl b/test/regression.jl index ef11af3..76ff4cb 100644 --- a/test/regression.jl +++ b/test/regression.jl @@ -28,7 +28,8 @@ using QuasiMonteCarlo ub = ub ) - prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub) + # Use out-of-place form {false} since SVector is immutable + prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub) n_particles = 2000 @@ -224,7 +225,8 @@ end ub = ub ) - prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub) + # Use out-of-place form {false} since SVector is immutable + prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub) n_particles = 2000 diff --git a/test/reinit.jl b/test/reinit.jl index 8bef118..0947fd0 100644 --- a/test/reinit.jl +++ b/test/reinit.jl @@ -17,7 +17,8 @@ end x0 = @SArray zeros(Float32, 3) p = @SArray Float32[1.0, 100.0] -prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub) +# Use out-of-place form {false} since SVector is immutable +prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub) n_particles = 2000 From 161392c9383fcaba93515037cf150e425a59c99b Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Thu, 19 Mar 2026 08:46:13 -0400 Subject: [PATCH 02/17] Fix: Use OptimizationFunction{false} for out-of-place form The previous attempt used OptimizationProblem{false} directly, but the SciMLBase API requires that you pass an OptimizationFunction{false} to the constructor instead. Changed all usages of SVector/SArray with OptimizationProblem to: 1. Create OptimizationFunction{false}(f, ...) for the function 2. Pass that to OptimizationProblem(opt_f, ...) --- src/precompilation.jl | 3 ++- test/constraints.jl | 6 +++--- test/gpu.jl | 3 ++- test/lbfgs.jl | 14 +++++++------- test/regression.jl | 6 ++++-- test/reinit.jl | 3 ++- 6 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/precompilation.jl b/src/precompilation.jl index 46f9d70..91bf6d1 100644 --- a/src/precompilation.jl +++ b/src/precompilation.jl @@ -22,7 +22,8 @@ using PrecompileTools # Create optimization problem with StaticArrays # Use out-of-place form {false} since SVector is immutable - prob = OptimizationProblem{false}(_rosenbrock_precompile, x0, p; lb = lb, ub = ub) + opt_f = OptimizationFunction{false}(_rosenbrock_precompile) + prob = OptimizationProblem(opt_f, x0, p; lb = lb, ub = ub) # Precompile SerialPSO - most commonly used CPU algorithm sol = solve(prob, SerialPSO(10), maxiters = 2) diff --git a/test/constraints.jl b/test/constraints.jl index 1d39c45..b4e1c48 100644 --- a/test/constraints.jl +++ b/test/constraints.jl @@ -12,15 +12,15 @@ function conss(x, p) return SVector{3}(-x[1] + 2 * x[2] - 1, +x[1] - 2 * x[2] + 1, (x[1]^2) / 4 + x[2]^2 - 1) end -opt_f = OptimizationFunction(objective, cons = conss) +# Use out-of-place form {false} since SVector is immutable +opt_f = OptimizationFunction{false}(objective, cons = conss) x0 = @SVector [1.0f0, 1.0f0] lb = @SVector [0.0f0, 0.0f0] ub = @SVector [2.0f0, 2.0f0] lcons = @SVector [-Inf32, -Inf32] ucons = @SVector [0.0f0, 0.0f0] -# Use out-of-place form {false} since SVector is immutable -prob = OptimizationProblem{false}(opt_f, x0, p, lcons = lcons, ucons = ucons, lb = lb, ub = ub) +prob = OptimizationProblem(opt_f, x0, p, lcons = lcons, ucons = ucons, lb = lb, ub = ub) n_particles = 1000 diff --git a/test/gpu.jl b/test/gpu.jl index ddeeee1..a0a7810 100644 --- a/test/gpu.jl +++ b/test/gpu.jl @@ -22,7 +22,8 @@ include("./utils.jl") p = @SArray Float32[1.0, 100.0] # Use out-of-place form {false} since SVector is immutable - prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub) + opt_f = OptimizationFunction{false}(rosenbrock) + prob = OptimizationProblem(opt_f, x0, p; lb = lb, ub = ub) n_particles = 5000 diff --git a/test/lbfgs.jl b/test/lbfgs.jl index 018c960..20e02d4 100644 --- a/test/lbfgs.jl +++ b/test/lbfgs.jl @@ -6,11 +6,11 @@ function objf(x, p) return 1 - x[1]^2 - x[2]^2 end -optprob = OptimizationFunction(objf, Optimization.AutoEnzyme()) +# Use out-of-place form {false} since SVector is immutable +optprob = OptimizationFunction{false}(objf, Optimization.AutoEnzyme()) x0 = rand(2) x0 = SVector{2}(x0) -# Use out-of-place form {false} since SVector is immutable -prob = OptimizationProblem{false}(optprob, x0) +prob = OptimizationProblem(optprob, x0) l1 = objf(x0, nothing) sol = Optimization.solve( prob, @@ -28,9 +28,9 @@ function rosenbrock(x, p) end x0 = @SArray rand(Float32, N) p = @SArray Float32[1.0, 100.0] -optf = OptimizationFunction(rosenbrock, Optimization.AutoForwardDiff()) # Use out-of-place form {false} since SArray is immutable -prob = OptimizationProblem{false}(optf, x0, p) +optf = OptimizationFunction{false}(rosenbrock, Optimization.AutoForwardDiff()) +prob = OptimizationProblem(optf, x0, p) l0 = rosenbrock(x0, p) @time sol = Optimization.solve( @@ -62,9 +62,9 @@ l0 = rosenbrock(x0, p) ) @show sol.objective -optf = OptimizationFunction(rosenbrock, Optimization.AutoEnzyme()) # Use out-of-place form {false} since SArray is immutable -prob = OptimizationProblem{false}(optf, x0, p) +optf = OptimizationFunction{false}(rosenbrock, Optimization.AutoEnzyme()) +prob = OptimizationProblem(optf, x0, p) l0 = rosenbrock(x0, p) @time sol = Optimization.solve( diff --git a/test/regression.jl b/test/regression.jl index 76ff4cb..c577f72 100644 --- a/test/regression.jl +++ b/test/regression.jl @@ -29,7 +29,8 @@ using QuasiMonteCarlo ) # Use out-of-place form {false} since SVector is immutable - prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub) + opt_f = OptimizationFunction{false}(rosenbrock) + prob = OptimizationProblem(opt_f, x0, p; lb = lb, ub = ub) n_particles = 2000 @@ -226,7 +227,8 @@ end ) # Use out-of-place form {false} since SVector is immutable - prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub) + opt_f = OptimizationFunction{false}(rosenbrock) + prob = OptimizationProblem(opt_f, x0, p; lb = lb, ub = ub) n_particles = 2000 diff --git a/test/reinit.jl b/test/reinit.jl index 0947fd0..5e59cf3 100644 --- a/test/reinit.jl +++ b/test/reinit.jl @@ -18,7 +18,8 @@ x0 = @SArray zeros(Float32, 3) p = @SArray Float32[1.0, 100.0] # Use out-of-place form {false} since SVector is immutable -prob = OptimizationProblem{false}(rosenbrock, x0, p; lb = lb, ub = ub) +opt_f = OptimizationFunction{false}(rosenbrock) +prob = OptimizationProblem(opt_f, x0, p; lb = lb, ub = ub) n_particles = 2000 From f26e3744a3926c1c5746c9b60e44d3d21d333f72 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Thu, 19 Mar 2026 09:41:55 -0400 Subject: [PATCH 03/17] Fix FormatCheck.yml: add Julia setup step before runic --- .github/workflows/FormatCheck.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/FormatCheck.yml b/.github/workflows/FormatCheck.yml index d22e82d..6253546 100644 --- a/.github/workflows/FormatCheck.yml +++ b/.github/workflows/FormatCheck.yml @@ -14,6 +14,9 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 + - uses: julia-actions/setup-julia@v2 + with: + version: '1' - uses: fredrikekre/runic-action@v1 with: version: '1' From b251680a11b342cb1e872fe6eec1a26f3b27eda6 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Thu, 19 Mar 2026 14:40:16 -0400 Subject: [PATCH 04/17] Retrigger CI (transient GPU OOM) From 10d586e7819957ad2efc50b22454c5757b99a564 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Thu, 19 Mar 2026 18:42:23 -0400 Subject: [PATCH 05/17] Fix GPU OOM in CI: add GC and CUDA.reclaim() between test files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GPU memory from gpu.jl (5000 particles × 3 sizes × 3 algorithms) accumulates and causes OOM when lbfgs.jl runs. Add GC.gc(true) between test includes and explicit CUDA.reclaim() at the start of lbfgs.jl to free GPU memory. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) --- test/lbfgs.jl | 5 +++++ test/runtests.jl | 2 ++ 2 files changed, 7 insertions(+) diff --git a/test/lbfgs.jl b/test/lbfgs.jl index 20e02d4..d624546 100644 --- a/test/lbfgs.jl +++ b/test/lbfgs.jl @@ -2,6 +2,11 @@ using ParallelParticleSwarms, Optimization, StaticArrays include("./utils.jl") +# Reclaim GPU memory from previous test files to avoid OOM +if GROUP == "CUDA" + CUDA.reclaim() +end + function objf(x, p) return 1 - x[1]^2 - x[2]^2 end diff --git a/test/runtests.jl b/test/runtests.jl index adddcf5..e017b17 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,6 +11,8 @@ global CI_GROUP = get(ENV, "GROUP", "CPU") @testset for BACKEND in unique(("CPU", CI_GROUP)) global GROUP = BACKEND @testset "$(BACKEND) optimizers tests" include("./gpu.jl") + GC.gc(true) @testset "$(BACKEND) optimizers with constraints tests" include("./constraints.jl") + GC.gc(true) @testset "$(BACKEND) hybrid optimizers" include("./lbfgs.jl") end From 8676109a43b1d9f0a28d0688819fdd6b1b5c4be0 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Thu, 19 Mar 2026 19:06:37 -0400 Subject: [PATCH 06/17] Reorder GPU tests: run hybrid first to avoid OOM from kernel caches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HybridPSO kernel (gpu_simplebfgs_run!) is the most complex and needs the most GPU memory for JIT compilation. Running it first when GPU memory is most available avoids OOM caused by accumulated kernel compilation caches from gpu.jl (5000 particles × 3 sizes). Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) --- test/runtests.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index e017b17..d558740 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,9 +10,11 @@ global CI_GROUP = get(ENV, "GROUP", "CPU") #TODO: Curent throws warning for redefinition with the use of @testset multiple times. Migrate to TestItemRunners.jl @testset for BACKEND in unique(("CPU", CI_GROUP)) global GROUP = BACKEND + # Run hybrid optimizers first on GPU — the HybridPSO kernel is the most + # complex and needs the most GPU memory for JIT compilation. + @testset "$(BACKEND) hybrid optimizers" include("./lbfgs.jl") + GC.gc(true) @testset "$(BACKEND) optimizers tests" include("./gpu.jl") GC.gc(true) @testset "$(BACKEND) optimizers with constraints tests" include("./constraints.jl") - GC.gc(true) - @testset "$(BACKEND) hybrid optimizers" include("./lbfgs.jl") end From 0fd11b187aba06f223440d2450139a774f8d0c3a Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Thu, 19 Mar 2026 19:51:33 -0400 Subject: [PATCH 07/17] Retrigger CI (runners appear offline) Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) From 8a749c6e1ffcac3cc377681d1517ae75c6aa2f88 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Thu, 19 Mar 2026 20:32:38 -0400 Subject: [PATCH 08/17] Revert test reordering, keep GC improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert to original test order (gpu → constraints → lbfgs) since reordering caused CUDA context init OOM to cascade to all tests. Keep GC.gc(true) between tests and CUDA.reclaim() in lbfgs.jl. GPU OOM is a pre-existing infrastructure issue — shared self-hosted runners have oversubscribed GPUs. The main branch also fails GPU tests (with a different error: precompilation failure that this PR fixes). Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) --- test/runtests.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index d558740..e017b17 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,11 +10,9 @@ global CI_GROUP = get(ENV, "GROUP", "CPU") #TODO: Curent throws warning for redefinition with the use of @testset multiple times. Migrate to TestItemRunners.jl @testset for BACKEND in unique(("CPU", CI_GROUP)) global GROUP = BACKEND - # Run hybrid optimizers first on GPU — the HybridPSO kernel is the most - # complex and needs the most GPU memory for JIT compilation. - @testset "$(BACKEND) hybrid optimizers" include("./lbfgs.jl") - GC.gc(true) @testset "$(BACKEND) optimizers tests" include("./gpu.jl") GC.gc(true) @testset "$(BACKEND) optimizers with constraints tests" include("./constraints.jl") + GC.gc(true) + @testset "$(BACKEND) hybrid optimizers" include("./lbfgs.jl") end From dce53a534045da01923b0e9d5c3325e378891e2b Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 20 Mar 2026 01:37:04 -0400 Subject: [PATCH 09/17] Switch GPU CI from T4 to V100 runner to fix OOM The T4 runners (14.5 GiB VRAM) are oversubscribed and consistently OOM during CUDA tests. Switch to V100 runners (32 GiB VRAM) which other SciML repos (DiffEqGPU.jl, SciMLSensitivity.jl) also use for memory-intensive GPU jobs. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/GPU.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml index 8ed21ff..3b8ff7b 100644 --- a/.github/workflows/GPU.yml +++ b/.github/workflows/GPU.yml @@ -17,7 +17,7 @@ concurrency: jobs: cuda-tests: name: "CUDA Tests (Julia ${{ matrix.version }})" - runs-on: [self-hosted, Linux, X64, gpu-t4] + runs-on: [self-hosted, gpu-v100] timeout-minutes: 120 strategy: fail-fast: false From d11c83bd57ce043349e7625c0d480e3465ead59a Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 20 Mar 2026 01:48:47 -0400 Subject: [PATCH 10/17] Switch GPU CI to generic gpu runner label V100 (compute capability 7.0) is not supported on CUDA 13+. Use the generic 'gpu' label (used by DiffEqFlux.jl, NeuralPDE.jl, DeepEquilibriumNetworks.jl) which routes to compatible GPUs. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/GPU.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml index 3b8ff7b..b3f7e4f 100644 --- a/.github/workflows/GPU.yml +++ b/.github/workflows/GPU.yml @@ -17,7 +17,7 @@ concurrency: jobs: cuda-tests: name: "CUDA Tests (Julia ${{ matrix.version }})" - runs-on: [self-hosted, gpu-v100] + runs-on: [self-hosted, Linux, X64, gpu] timeout-minutes: 120 strategy: fail-fast: false From 6c3c167a1203172e87472324ec81e922441030c5 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 20 Mar 2026 02:00:09 -0400 Subject: [PATCH 11/17] Retrigger CI Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) From 7449d3e85ac29dcf79a5c2c199824e4ab848a481 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 20 Mar 2026 02:32:08 -0400 Subject: [PATCH 12/17] Use V100 runners with CUDA 12.6 toolkit to fix GPU CI T4 runners (arctic1-*) are oversubscribed and OOM consistently. V100 runners (demeter4-*) have 32GB VRAM but require CUDA 12.x since CUDA 13+ dropped support for compute capability 7.0. Pin JULIA_CUDA_VERSION=12.6 to use the CUDA 12.6 toolkit on V100. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/GPU.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml index b3f7e4f..5075da8 100644 --- a/.github/workflows/GPU.yml +++ b/.github/workflows/GPU.yml @@ -17,7 +17,7 @@ concurrency: jobs: cuda-tests: name: "CUDA Tests (Julia ${{ matrix.version }})" - runs-on: [self-hosted, Linux, X64, gpu] + runs-on: [self-hosted, gpu-v100] timeout-minutes: 120 strategy: fail-fast: false @@ -38,6 +38,7 @@ jobs: - uses: julia-actions/julia-runtest@v1 env: GROUP: "CUDA" + JULIA_CUDA_VERSION: "12.6" - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v5 with: From b909e8f4763a02a14f36d66d16b56f909a3c7382 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 20 Mar 2026 03:33:22 -0400 Subject: [PATCH 13/17] Fix V100 CUDA compatibility: pin CUDA 12.6 via LocalPreferences JULIA_CUDA_VERSION env var is deprecated and ignored by CUDA.jl. Write LocalPreferences.toml directly to pin CUDA_Runtime_jll to v12.6, which supports V100 (compute 7.0). CUDA 13+ dropped support for compute < 7.5. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/GPU.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml index 5075da8..81832aa 100644 --- a/.github/workflows/GPU.yml +++ b/.github/workflows/GPU.yml @@ -34,11 +34,13 @@ jobs: import Pkg Pkg.add(; name = "CUDA")' rm -f test/Manifest.toml + - name: Pin CUDA 12.6 for V100 compatibility + run: | + printf '[CUDA_Runtime_jll]\nversion = "12.6"\n' > test/LocalPreferences.toml - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 env: GROUP: "CUDA" - JULIA_CUDA_VERSION: "12.6" - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v5 with: From f26ef75a45517aacf12d9a7e4a01b3e9bab244df Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 20 Mar 2026 03:45:27 -0400 Subject: [PATCH 14/17] Revert to T4 runners (gpu-t4) for GPU CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit V100 runners cannot be used — CUDA.jl rejects compute capability 7.0 GPUs with CUDA 13+ drivers regardless of toolkit pinning. T4 (compute 7.5) is the only compatible GPU available. Earlier T4 runs passed 26/27 tests — the OOM failures are transient due to shared runner memory pressure. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/GPU.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml index 81832aa..8ed21ff 100644 --- a/.github/workflows/GPU.yml +++ b/.github/workflows/GPU.yml @@ -17,7 +17,7 @@ concurrency: jobs: cuda-tests: name: "CUDA Tests (Julia ${{ matrix.version }})" - runs-on: [self-hosted, gpu-v100] + runs-on: [self-hosted, Linux, X64, gpu-t4] timeout-minutes: 120 strategy: fail-fast: false @@ -34,9 +34,6 @@ jobs: import Pkg Pkg.add(; name = "CUDA")' rm -f test/Manifest.toml - - name: Pin CUDA 12.6 for V100 compatibility - run: | - printf '[CUDA_Runtime_jll]\nversion = "12.6"\n' > test/LocalPreferences.toml - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 env: From 5557193a205e0fdfb73146a49af6f6c0438af126 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 20 Mar 2026 05:25:34 -0400 Subject: [PATCH 15/17] Use exclusive GPU queue (gpu-t4-exclusive) for CUDA CI Switch to exclusive T4 runner to get dedicated GPU memory, avoiding OOM from shared GPU workloads. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/GPU.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml index 8ed21ff..30dcdf6 100644 --- a/.github/workflows/GPU.yml +++ b/.github/workflows/GPU.yml @@ -17,7 +17,7 @@ concurrency: jobs: cuda-tests: name: "CUDA Tests (Julia ${{ matrix.version }})" - runs-on: [self-hosted, Linux, X64, gpu-t4] + runs-on: [self-hosted, Linux, X64, gpu-t4-exclusive] timeout-minutes: 120 strategy: fail-fast: false From b798e58cdb73ea540d0cff850d8a6aee88bfbdac Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 20 Mar 2026 05:46:48 -0400 Subject: [PATCH 16/17] Retrigger CI Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) From dcb8a76919d74bd2b15862dd6ad56d5f8eca2631 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Tue, 24 Mar 2026 06:33:51 -0400 Subject: [PATCH 17/17] Use V100 runners with CUDA.jl 5.0-5.10 for GPU CI Pin CUDA.jl to v5.0-5.10 which uses CUDA 12.x runtime, compatible with V100 (compute 7.0). CUDA.jl 5.11+ resolves CUDA_Driver_jll v13.2+ which dropped compute 7.0 support. Use gpu-v100 runners (demeter4-*) which have 32GB VRAM, avoiding the OOM issues on oversubscribed T4 runners. See ChrisRackauckas/InternalJunk#17 for details. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/GPU.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml index 30dcdf6..8e98d36 100644 --- a/.github/workflows/GPU.yml +++ b/.github/workflows/GPU.yml @@ -17,7 +17,7 @@ concurrency: jobs: cuda-tests: name: "CUDA Tests (Julia ${{ matrix.version }})" - runs-on: [self-hosted, Linux, X64, gpu-t4-exclusive] + runs-on: [self-hosted, gpu-v100] timeout-minutes: 120 strategy: fail-fast: false @@ -32,7 +32,7 @@ jobs: - run: | julia --project=test -e ' import Pkg - Pkg.add(; name = "CUDA")' + Pkg.add(; name = "CUDA", version = "5.0 - 5.10")' rm -f test/Manifest.toml - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1