From b3fac5976a65c0cff1b14f910016f7e3430e534c Mon Sep 17 00:00:00 2001
From: James Le Houx <37665786+jameslehoux@users.noreply.github.com>
Date: Sat, 13 Jun 2026 09:08:00 +0000
Subject: [PATCH 1/2] notebooks: skip standalone SMG/PFMG on GPU instead of
 subprocessing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous subprocess-isolation attempt backfired: spawning a CUDA-
initialising child per combo while the parent kernel still holds the GPU
caused even pcg to abort (MPI_Abort, exit 6) from device-memory contention,
and did not reliably keep the kernel alive.

Revert to the proven in-process loop and instead drop only the two solvers
that actually hard-abort on a GPU build — standalone smg and pfmg (no Krylov
wrapper). In the original report every other combo up to bicgstab ran fine
in-process; standalone SMG was the sole kernel-killer. As preconditioners
(pcg+smg, flexgmres+pfmg) these multigrids are capped at one V-cycle and stay.
On CPU builds nothing is skipped. Soft non-convergence is still caught and
shown as a FAILED row.
---
 notebooks/profiling_and_tuning.ipynb | 127 +++++++++++----------------
 1 file changed, 49 insertions(+), 78 deletions(-)

diff --git a/notebooks/profiling_and_tuning.ipynb b/notebooks/profiling_and_tuning.ipynb
index 571e6ab..000a65e 100644
--- a/notebooks/profiling_and_tuning.ipynb
+++ b/notebooks/profiling_and_tuning.ipynb
@@ -127,7 +127,7 @@
     "\n",
     "**The default solver (`solver=\"auto\"`, since v4.2.20) is MLMG** — it scales near-linearly (see §4) and has the lowest memory footprint of any option. The bake-off below confirms it's also the fastest on this geometry. The runner-up among HYPRE configurations is `flexgmres+pfmg`; `pcg+smg` is the safe fallback if MLMG ever stalls.\n",
     "\n",
-    "> **Robustness note.** Standalone structured-multigrid solvers (`smg`, `pfmg`) can abort at the C++ level on a GPU build — a CUDA out-of-memory or HYPRE kernel error that would otherwise kill the notebook kernel and wipe every result. Each combo below therefore runs in its own subprocess, so a hard crash is recorded as a red `FAILED` bar instead of taking the session down with it."
+    "> **Robustness note.** Standalone structured-multigrid solvers (`smg`, `pfmg` with no Krylov wrapper) can abort at the C++ level on a GPU build — a CUDA out-of-memory or HYPRE kernel error that kills the notebook kernel uncatchably. They are diagnostic-only and stall on masked porous rows in any case, so the cell below skips them on GPU builds (they still run on CPU, where they are stable). Every other combo either converges or fails with a catchable Python exception, shown as a red `FAILED` bar."
    ]
   },
   {
@@ -136,7 +136,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import sys, os, json, subprocess, tempfile\n",
+    "import sys\n",
     "\n",
     "# Sanity: make sure the porespy datasets cell (§2) has run. The restructure\n",
     "# moved cells around; if you opened the notebook and jumped here directly,\n",
@@ -164,89 +164,60 @@
     "    (\"mlmg\",           \"mlmg\",      None),\n",
     "]\n",
     "\n",
-    "# --- Crash isolation ---------------------------------------------------------\n",
-    "# Some HYPRE configurations — notably *standalone* SMG/PFMG on a GPU build —\n",
-    "# can abort at the C++ level (CUDA out-of-memory or a HYPRE kernel error).\n",
-    "# That aborts the whole process, killing the Python kernel with no traceback\n",
-    "# and no way to catch it from the `except` below; df_solvers and every later\n",
-    "# section die with it.\n",
+    "# --- Skip the uncatchable crashers on GPU ------------------------------------\n",
+    "# *Standalone* structured multigrid (smg/pfmg with no Krylov wrapper) drives\n",
+    "# many full cyclic-reduction V-cycles to tolerance. On a GPU build that path\n",
+    "# can abort at the C++ level — a CUDA out-of-memory or a HYPRE kernel error —\n",
+    "# which kills the Python kernel outright, with no traceback and nothing the\n",
+    "# `except` below can catch. (As a *preconditioner*, e.g. pcg+smg, SMG is capped\n",
+    "# at one V-cycle per iteration and is fine — hence we keep those.)\n",
     "#\n",
-    "# To stay robust we run each combo in its own short-lived subprocess. A hard\n",
-    "# abort then only takes down that child — the parent records it as FAILED and\n",
-    "# moves on, and the crashed child's CUDA context is reclaimed cleanly on exit.\n",
-    "# The cost is one fresh AMReX/HYPRE init per combo (a few seconds); cheap\n",
-    "# insurance for a diagnostic that deliberately probes failing solvers.\n",
-    "_data_path = os.path.join(tempfile.gettempdir(), \"oi_bakeoff_data.npy\")\n",
-    "np.save(_data_path, data_medium)\n",
-    "\n",
-    "_CHILD = r\"\"\"\n",
-    "import sys, json, time, numpy as np, openimpala as oi\n",
-    "data = np.load(sys.argv[1])\n",
-    "solver, precond, mgs = sys.argv[2], sys.argv[3], int(sys.argv[4])\n",
-    "kwargs = dict(phase=0, direction=\"z\", solver=solver, max_grid_size=mgs, verbose=0)\n",
-    "if precond != \"none\":\n",
-    "    kwargs[\"preconditioner\"] = precond\n",
-    "with oi.Session():\n",
-    "    t0 = time.perf_counter()\n",
-    "    res = oi.tortuosity(data, **kwargs)\n",
-    "    dt = time.perf_counter() - t0\n",
-    "print(\"RESULT \" + json.dumps({\n",
-    "    \"t\": dt, \"iters\": int(res.iterations),\n",
-    "    \"tau\": float(res.tortuosity), \"ok\": bool(res.solver_converged),\n",
-    "}))\n",
-    "\"\"\"\n",
-    "\n",
-    "\n",
-    "def run_combo(solver, precond, max_grid_size=32, timeout=300):\n",
-    "    \"\"\"Run one solver in an isolated process. A crash, timeout, or\n",
-    "    non-convergence all come back as ok=False instead of killing this kernel.\"\"\"\n",
-    "    try:\n",
-    "        proc = subprocess.run(\n",
-    "            [sys.executable, \"-c\", _CHILD, _data_path, solver,\n",
-    "             precond or \"none\", str(max_grid_size)],\n",
-    "            capture_output=True, text=True, timeout=timeout,\n",
-    "        )\n",
-    "    except subprocess.TimeoutExpired:\n",
-    "        return {\"t\": np.nan, \"iters\": np.nan, \"tau\": np.nan, \"ok\": False,\n",
-    "                \"error\": f\"timeout (>{timeout}s)\"}\n",
-    "\n",
-    "    for line in proc.stdout.splitlines():\n",
-    "        if line.startswith(\"RESULT \"):\n",
-    "            r = json.loads(line[len(\"RESULT \"):])\n",
-    "            r[\"error\"] = None\n",
-    "            return r\n",
-    "\n",
-    "    # No RESULT line → the child died before printing. Decode why.\n",
-    "    if proc.returncode < 0:\n",
-    "        import signal\n",
-    "        try:\n",
-    "            sig = signal.Signals(-proc.returncode).name\n",
-    "        except ValueError:\n",
-    "            sig = f\"signal {-proc.returncode}\"\n",
-    "        reason = f\"crashed ({sig}) — likely CUDA OOM / HYPRE abort\"\n",
-    "    else:\n",
-    "        tail = (proc.stderr.strip().splitlines() or [\"no stderr\"])[-1]\n",
-    "        reason = f\"exit {proc.returncode}: {tail[:120]}\"\n",
-    "    return {\"t\": np.nan, \"iters\": np.nan, \"tau\": np.nan, \"ok\": False,\n",
-    "            \"error\": reason}\n",
-    "\n",
+    "# These standalone solvers are diagnostic-only and stall on the masked\n",
+    "# (inactive) rows of porous media anyway, so we drop them on GPU. On CPU they\n",
+    "# are stable and stay in for completeness. Everything else either converges or\n",
+    "# raises a catchable Python exception, shown below as a red FAILED bar.\n",
+    "if is_gpu_build:\n",
+    "    _unsafe = {(\"smg\", None), (\"pfmg\", None)}\n",
+    "    combos = [c for c in combos if (c[1], c[2]) not in _unsafe]\n",
+    "    print(\"GPU build detected — skipping standalone SMG/PFMG \"\n",
+    "          \"(they hard-abort on GPU; use them as preconditioners instead).\")\n",
     "\n",
     "print(f\"Running bake-off on {data_medium.shape[0]}³ porespy data... \"\n",
-    "      f\"({len(combos)} solver/preconditioner combinations, each isolated)\",\n",
-    "      flush=True)\n",
+    "      f\"({len(combos)} solver/preconditioner combinations)\", flush=True)\n",
     "\n",
     "records = []\n",
     "for label, s, pc in combos:\n",
+    "    kwargs = dict(phase=0, direction=\"z\", solver=s, max_grid_size=32, verbose=0)\n",
+    "    if pc is not None:\n",
+    "        kwargs[\"preconditioner\"] = pc\n",
+    "\n",
+    "    # Flush before each call: a C++-level AMReX::Abort or CUDA OOM kills\n",
+    "    # the kernel without unwinding Python, so anything we haven't printed\n",
+    "    # already is lost. Live progress also makes it obvious *which* solver\n",
+    "    # killed the cell if one of them does.\n",
     "    sys.stdout.flush()\n",
-    "    r = run_combo(s, pc, max_grid_size=32)\n",
-    "    records.append({\"label\": label, \"solver\": s, \"precond\": pc,\n",
-    "                    \"t\": r[\"t\"], \"iters\": r[\"iters\"], \"tau\": r[\"tau\"],\n",
-    "                    \"ok\": r[\"ok\"]})\n",
-    "    if r[\"ok\"]:\n",
-    "        print(f\"  {label:18s}  t={r['t']:6.2f}s  iters={int(r['iters']):4d}  \"\n",
-    "              f\"tau={r['tau']:.4f}\", flush=True)\n",
-    "    else:\n",
-    "        print(f\"  {label:18s}  FAILED — {r['error']}\", flush=True)\n",
+    "\n",
+    "    try:\n",
+    "        t0 = time.perf_counter()\n",
+    "        res = oi.tortuosity(data_medium, **kwargs)\n",
+    "        dt = time.perf_counter() - t0\n",
+    "        records.append({\"label\": label, \"solver\": s, \"precond\": pc, \"t\": dt,\n",
+    "                        \"iters\": res.iterations, \"tau\": res.tortuosity,\n",
+    "                        \"ok\": res.solver_converged})\n",
+    "        print(f\"  {label:18s}  t={dt:6.2f}s  iters={res.iterations:4d}  \"\n",
+    "              f\"tau={res.tortuosity:.4f}\", flush=True)\n",
+    "    except TypeError as e:\n",
+    "        if \"preconditioner\" in str(e) and pc is not None:\n",
+    "            print(f\"  {label:18s}  SKIP — wheel predates preconditioner plumbing\",\n",
+    "                  flush=True)\n",
+    "            records.append({\"label\": label, \"solver\": s, \"precond\": pc,\n",
+    "                            \"t\": np.nan, \"iters\": np.nan, \"tau\": np.nan, \"ok\": False})\n",
+    "            continue\n",
+    "        raise\n",
+    "    except Exception as e:\n",
+    "        records.append({\"label\": label, \"solver\": s, \"precond\": pc,\n",
+    "                        \"t\": np.nan, \"iters\": np.nan, \"tau\": np.nan, \"ok\": False})\n",
+    "        print(f\"  {label:18s}  FAILED — {e}\", flush=True)\n",
     "\n",
     "df_solvers = pd.DataFrame(records)\n",
     "df_solvers\n"

From ec73a14c572d0b992accd41d898f879316fe419c Mon Sep 17 00:00:00 2001
From: James Le Houx <37665786+jameslehoux@users.noreply.github.com>
Date: Sat, 13 Jun 2026 09:47:56 +0000
Subject: [PATCH 2/2] fix(mlmg): use managed memory for EB mask to stop GPU
 segfault
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TortuosityMLMG::solve() passed a GPU DeviceVector pointer to ActiveMaskIF,
the implicit function consumed by EB2::Build. AMReX evaluates that implicit
function on the host while generating the EB geometry, so on a GPU build the
host code dereferenced a device pointer and segfaulted (SIGSEGV/MPI_ABORT
errorcode 11) at the very first EB step — before the solve began, for every
geometry, with the GPU otherwise idle.

Because MLMG is the default solver (solver="auto"), this broke the entire
openimpala-cuda wheel for all geometries. CPU-only CI never caught it: on a
CPU build the mask pointer is host memory, so the host-side IF evaluation is
valid and the tTortuosityMLMG tests pass.

Allocate the mask as amrex::Gpu::ManagedVector so the single pointer is valid
on both host and device, regardless of where AMReX evaluates the IF. On CPU
builds ManagedVector degrades to an ordinary host allocation, so behaviour is
unchanged there.

Needs validation on real GPU hardware: rebuild the CUDA wheel and re-run
solver="mlmg" on a T4 (and the profiling notebook bake-off).
---
 src/props/TortuosityMLMG.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/props/TortuosityMLMG.cpp b/src/props/TortuosityMLMG.cpp
index 5839e9f..4fe6921 100644
--- a/src/props/TortuosityMLMG.cpp
+++ b/src/props/TortuosityMLMG.cpp
@@ -156,15 +156,17 @@ bool TortuosityMLMG::solve() {
         });
     }
 
-#ifdef AMREX_USE_GPU
-    amrex::Gpu::DeviceVector<int> device_mask(total_cells);
-    amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, host_mask.data(),
-                          host_mask.data() + total_cells, device_mask.data());
+    // EB2::Build evaluates the implicit function (ActiveMaskIF, above) on the
+    // HOST while generating the geometry, and that IF dereferences this mask.
+    // On a GPU build a plain DeviceVector pointer would therefore be read from
+    // CPU code and segfault before the solve even starts. Use managed (unified)
+    // memory: a single pointer that is valid on both host and device, so the IF
+    // works wherever AMReX chooses to evaluate it. (On CPU-only builds
+    // ManagedVector degrades to an ordinary host allocation.)
+    amrex::Gpu::ManagedVector<int> managed_mask(total_cells);
+    std::copy(host_mask.begin(), host_mask.end(), managed_mask.begin());
     amrex::Gpu::streamSynchronize();
-    const int* mask_data_ptr = device_mask.data();
-#else
-    const int* mask_data_ptr = host_mask.data();
-#endif
+    const int* mask_data_ptr = managed_mask.data();
 
     // -----------------------------------------------------------------
     // Step 2: build EB from the mask.