diff --git a/notebooks/profiling_and_tuning.ipynb b/notebooks/profiling_and_tuning.ipynb index 571e6ab..000a65e 100644 --- a/notebooks/profiling_and_tuning.ipynb +++ b/notebooks/profiling_and_tuning.ipynb @@ -127,7 +127,7 @@ "\n", "**The default solver (`solver=\"auto\"`, since v4.2.20) is MLMG** — it scales near-linearly (see §4) and has the lowest memory footprint of any option. The bake-off below confirms it's also the fastest on this geometry. The runner-up among HYPRE configurations is `flexgmres+pfmg`; `pcg+smg` is the safe fallback if MLMG ever stalls.\n", "\n", - "> **Robustness note.** Standalone structured-multigrid solvers (`smg`, `pfmg`) can abort at the C++ level on a GPU build — a CUDA out-of-memory or HYPRE kernel error that would otherwise kill the notebook kernel and wipe every result. Each combo below therefore runs in its own subprocess, so a hard crash is recorded as a red `FAILED` bar instead of taking the session down with it." + "> **Robustness note.** Standalone structured-multigrid solvers (`smg`, `pfmg` with no Krylov wrapper) can abort at the C++ level on a GPU build — a CUDA out-of-memory or HYPRE kernel error that kills the notebook kernel uncatchably. They are diagnostic-only and stall on masked porous rows in any case, so the cell below skips them on GPU builds (they still run on CPU, where they are stable). Every other combo either converges or fails with a catchable Python exception, shown as a red `FAILED` bar." ] }, { @@ -136,7 +136,7 @@ "metadata": {}, "outputs": [], "source": [ - "import sys, os, json, subprocess, tempfile\n", + "import sys\n", "\n", "# Sanity: make sure the porespy datasets cell (§2) has run. The restructure\n", "# moved cells around; if you opened the notebook and jumped here directly,\n", @@ -164,89 +164,60 @@ " (\"mlmg\", \"mlmg\", None),\n", "]\n", "\n", - "# --- Crash isolation ---------------------------------------------------------\n", - "# Some HYPRE configurations — notably *standalone* SMG/PFMG on a GPU build —\n", - "# can abort at the C++ level (CUDA out-of-memory or a HYPRE kernel error).\n", - "# That aborts the whole process, killing the Python kernel with no traceback\n", - "# and no way to catch it from the `except` below; df_solvers and every later\n", - "# section die with it.\n", + "# --- Skip the uncatchable crashers on GPU ------------------------------------\n", + "# *Standalone* structured multigrid (smg/pfmg with no Krylov wrapper) drives\n", + "# many full cyclic-reduction V-cycles to tolerance. On a GPU build that path\n", + "# can abort at the C++ level — a CUDA out-of-memory or a HYPRE kernel error —\n", + "# which kills the Python kernel outright, with no traceback and nothing the\n", + "# `except` below can catch. (As a *preconditioner*, e.g. pcg+smg, SMG is capped\n", + "# at one V-cycle per iteration and is fine — hence we keep those.)\n", "#\n", - "# To stay robust we run each combo in its own short-lived subprocess. A hard\n", - "# abort then only takes down that child — the parent records it as FAILED and\n", - "# moves on, and the crashed child's CUDA context is reclaimed cleanly on exit.\n", - "# The cost is one fresh AMReX/HYPRE init per combo (a few seconds); cheap\n", - "# insurance for a diagnostic that deliberately probes failing solvers.\n", - "_data_path = os.path.join(tempfile.gettempdir(), \"oi_bakeoff_data.npy\")\n", - "np.save(_data_path, data_medium)\n", - "\n", - "_CHILD = r\"\"\"\n", - "import sys, json, time, numpy as np, openimpala as oi\n", - "data = np.load(sys.argv[1])\n", - "solver, precond, mgs = sys.argv[2], sys.argv[3], int(sys.argv[4])\n", - "kwargs = dict(phase=0, direction=\"z\", solver=solver, max_grid_size=mgs, verbose=0)\n", - "if precond != \"none\":\n", - " kwargs[\"preconditioner\"] = precond\n", - "with oi.Session():\n", - " t0 = time.perf_counter()\n", - " res = oi.tortuosity(data, **kwargs)\n", - " dt = time.perf_counter() - t0\n", - "print(\"RESULT \" + json.dumps({\n", - " \"t\": dt, \"iters\": int(res.iterations),\n", - " \"tau\": float(res.tortuosity), \"ok\": bool(res.solver_converged),\n", - "}))\n", - "\"\"\"\n", - "\n", - "\n", - "def run_combo(solver, precond, max_grid_size=32, timeout=300):\n", - " \"\"\"Run one solver in an isolated process. A crash, timeout, or\n", - " non-convergence all come back as ok=False instead of killing this kernel.\"\"\"\n", - " try:\n", - " proc = subprocess.run(\n", - " [sys.executable, \"-c\", _CHILD, _data_path, solver,\n", - " precond or \"none\", str(max_grid_size)],\n", - " capture_output=True, text=True, timeout=timeout,\n", - " )\n", - " except subprocess.TimeoutExpired:\n", - " return {\"t\": np.nan, \"iters\": np.nan, \"tau\": np.nan, \"ok\": False,\n", - " \"error\": f\"timeout (>{timeout}s)\"}\n", - "\n", - " for line in proc.stdout.splitlines():\n", - " if line.startswith(\"RESULT \"):\n", - " r = json.loads(line[len(\"RESULT \"):])\n", - " r[\"error\"] = None\n", - " return r\n", - "\n", - " # No RESULT line → the child died before printing. Decode why.\n", - " if proc.returncode < 0:\n", - " import signal\n", - " try:\n", - " sig = signal.Signals(-proc.returncode).name\n", - " except ValueError:\n", - " sig = f\"signal {-proc.returncode}\"\n", - " reason = f\"crashed ({sig}) — likely CUDA OOM / HYPRE abort\"\n", - " else:\n", - " tail = (proc.stderr.strip().splitlines() or [\"no stderr\"])[-1]\n", - " reason = f\"exit {proc.returncode}: {tail[:120]}\"\n", - " return {\"t\": np.nan, \"iters\": np.nan, \"tau\": np.nan, \"ok\": False,\n", - " \"error\": reason}\n", - "\n", + "# These standalone solvers are diagnostic-only and stall on the masked\n", + "# (inactive) rows of porous media anyway, so we drop them on GPU. On CPU they\n", + "# are stable and stay in for completeness. Everything else either converges or\n", + "# raises a catchable Python exception, shown below as a red FAILED bar.\n", + "if is_gpu_build:\n", + " _unsafe = {(\"smg\", None), (\"pfmg\", None)}\n", + " combos = [c for c in combos if (c[1], c[2]) not in _unsafe]\n", + " print(\"GPU build detected — skipping standalone SMG/PFMG \"\n", + " \"(they hard-abort on GPU; use them as preconditioners instead).\")\n", "\n", "print(f\"Running bake-off on {data_medium.shape[0]}³ porespy data... \"\n", - " f\"({len(combos)} solver/preconditioner combinations, each isolated)\",\n", - " flush=True)\n", + " f\"({len(combos)} solver/preconditioner combinations)\", flush=True)\n", "\n", "records = []\n", "for label, s, pc in combos:\n", + " kwargs = dict(phase=0, direction=\"z\", solver=s, max_grid_size=32, verbose=0)\n", + " if pc is not None:\n", + " kwargs[\"preconditioner\"] = pc\n", + "\n", + " # Flush before each call: a C++-level AMReX::Abort or CUDA OOM kills\n", + " # the kernel without unwinding Python, so anything we haven't printed\n", + " # already is lost. Live progress also makes it obvious *which* solver\n", + " # killed the cell if one of them does.\n", " sys.stdout.flush()\n", - " r = run_combo(s, pc, max_grid_size=32)\n", - " records.append({\"label\": label, \"solver\": s, \"precond\": pc,\n", - " \"t\": r[\"t\"], \"iters\": r[\"iters\"], \"tau\": r[\"tau\"],\n", - " \"ok\": r[\"ok\"]})\n", - " if r[\"ok\"]:\n", - " print(f\" {label:18s} t={r['t']:6.2f}s iters={int(r['iters']):4d} \"\n", - " f\"tau={r['tau']:.4f}\", flush=True)\n", - " else:\n", - " print(f\" {label:18s} FAILED — {r['error']}\", flush=True)\n", + "\n", + " try:\n", + " t0 = time.perf_counter()\n", + " res = oi.tortuosity(data_medium, **kwargs)\n", + " dt = time.perf_counter() - t0\n", + " records.append({\"label\": label, \"solver\": s, \"precond\": pc, \"t\": dt,\n", + " \"iters\": res.iterations, \"tau\": res.tortuosity,\n", + " \"ok\": res.solver_converged})\n", + " print(f\" {label:18s} t={dt:6.2f}s iters={res.iterations:4d} \"\n", + " f\"tau={res.tortuosity:.4f}\", flush=True)\n", + " except TypeError as e:\n", + " if \"preconditioner\" in str(e) and pc is not None:\n", + " print(f\" {label:18s} SKIP — wheel predates preconditioner plumbing\",\n", + " flush=True)\n", + " records.append({\"label\": label, \"solver\": s, \"precond\": pc,\n", + " \"t\": np.nan, \"iters\": np.nan, \"tau\": np.nan, \"ok\": False})\n", + " continue\n", + " raise\n", + " except Exception as e:\n", + " records.append({\"label\": label, \"solver\": s, \"precond\": pc,\n", + " \"t\": np.nan, \"iters\": np.nan, \"tau\": np.nan, \"ok\": False})\n", + " print(f\" {label:18s} FAILED — {e}\", flush=True)\n", "\n", "df_solvers = pd.DataFrame(records)\n", "df_solvers\n" diff --git a/src/props/TortuosityMLMG.cpp b/src/props/TortuosityMLMG.cpp index 5839e9f..4fe6921 100644 --- a/src/props/TortuosityMLMG.cpp +++ b/src/props/TortuosityMLMG.cpp @@ -156,15 +156,17 @@ bool TortuosityMLMG::solve() { }); } -#ifdef AMREX_USE_GPU - amrex::Gpu::DeviceVector device_mask(total_cells); - amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, host_mask.data(), - host_mask.data() + total_cells, device_mask.data()); + // EB2::Build evaluates the implicit function (ActiveMaskIF, above) on the + // HOST while generating the geometry, and that IF dereferences this mask. + // On a GPU build a plain DeviceVector pointer would therefore be read from + // CPU code and segfault before the solve even starts. Use managed (unified) + // memory: a single pointer that is valid on both host and device, so the IF + // works wherever AMReX chooses to evaluate it. (On CPU-only builds + // ManagedVector degrades to an ordinary host allocation.) + amrex::Gpu::ManagedVector managed_mask(total_cells); + std::copy(host_mask.begin(), host_mask.end(), managed_mask.begin()); amrex::Gpu::streamSynchronize(); - const int* mask_data_ptr = device_mask.data(); -#else - const int* mask_data_ptr = host_mask.data(); -#endif + const int* mask_data_ptr = managed_mask.data(); // ----------------------------------------------------------------- // Step 2: build EB from the mask.