From 3fdf0fe0ab7e813f3956b3b7d8c5b4eefefec7a2 Mon Sep 17 00:00:00 2001 From: Alex Kaszynski Date: Fri, 19 Jun 2026 12:51:22 -0600 Subject: [PATCH] perf(data-interp): default-on threading for cell<->point data interpolation Wrap the per-output-element vtkSMPTools::For loops of vtkCellDataToPointData (5 dispatch sites over numPts) and vtkPointDataToCellData (2 sites over numCells) in fvtk::RunSafeFilterParallel so they thread under the audited 4-thread-capped default policy instead of the Sequential backend. Each output element is an independent average of read-only input: a point gathers from its incident cells (via the deterministically-built cell links) and a cell gathers from its points, each summed in a fixed link/point order and written to its own pre-sized slot. No shared scatter, no cross-element state => bit-identical under any thread count and to serial stock VTK. Only the For compute loops are wrapped; the cell-links build is left outside the scope. Adds a >THRESHOLD (sphere 700x700 ~490k pts / ~980k cells) determinism case parametrized over cd2pd/pd2cd. Byte-exactness vs stock is covered by the existing op_cell2point / op_point2cell / op_point2cell_ugrid bitexact cases. Co-Authored-By: Claude Opus 4.8 --- Filters/Core/vtkCellDataToPointData.cxx | 11 ++-- Filters/Core/vtkPointDataToCellData.cxx | 6 +- tests/bitexact/test_smp_determinism.py | 82 +++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 7 deletions(-) diff --git a/Filters/Core/vtkCellDataToPointData.cxx b/Filters/Core/vtkCellDataToPointData.cxx index 3b116ea9..abd9695b 100644 --- a/Filters/Core/vtkCellDataToPointData.cxx +++ b/Filters/Core/vtkCellDataToPointData.cxx @@ -11,6 +11,7 @@ #include "vtkCellTypeUtilities.h" #include "vtkDataArrayRange.h" #include "vtkDataSet.h" +#include "vtkFVTKSMPDefaults.h" // fvtk: opt into default multithreading (bit-exact) #include "vtkIdList.h" #include "vtkInformation.h" #include "vtkInformationVector.h" @@ -81,13 +82,13 @@ void FastUnstructuredDataACL( if (auto staticCellLinks = vtkStaticCellLinks::SafeDownCast(links)) { UnstructuredDataCD2PD cd2pd(numPts, cfl, pd, staticCellLinks); - vtkSMPTools::For(0, numPts, cd2pd); + fvtk::RunSafeFilterParallel([&]() { vtkSMPTools::For(0, numPts, cd2pd); }); } else // vtkCellLinks { auto cellLinks = vtkCellLinks::SafeDownCast(links); UnstructuredDataCD2PD cd2pd(numPts, cfl, pd, cellLinks); - vtkSMPTools::For(0, numPts, cd2pd); + fvtk::RunSafeFilterParallel([&]() { vtkSMPTools::For(0, numPts, cd2pd); }); } } @@ -108,7 +109,7 @@ void FastUnstructuredDataSCLT( TCellLinks cellLinks; cellLinks.BuildLinks(input); UnstructuredDataCD2PD cd2pd(numberOfPoints, cfl, pd, &cellLinks); - vtkSMPTools::For(0, numberOfPoints, cd2pd); + fvtk::RunSafeFilterParallel([&]() { vtkSMPTools::For(0, numberOfPoints, cd2pd); }); } #ifdef VTK_USE_64BIT_IDS else if (linksType == vtkAbstractCellLinks::STATIC_CELL_LINKS_UINT) @@ -117,7 +118,7 @@ void FastUnstructuredDataSCLT( TCellLinks cellLinks; cellLinks.BuildLinks(input); UnstructuredDataCD2PD cd2pd(numberOfPoints, cfl, pd, &cellLinks); - vtkSMPTools::For(0, numberOfPoints, cd2pd); + fvtk::RunSafeFilterParallel([&]() { vtkSMPTools::For(0, numberOfPoints, cd2pd); }); } #endif else @@ -126,7 +127,7 @@ void FastUnstructuredDataSCLT( TCellLinks cellLinks; cellLinks.BuildLinks(input); UnstructuredDataCD2PD cd2pd(numberOfPoints, cfl, pd, &cellLinks); - vtkSMPTools::For(0, numberOfPoints, cd2pd); + fvtk::RunSafeFilterParallel([&]() { vtkSMPTools::For(0, numberOfPoints, cd2pd); }); } } diff --git a/Filters/Core/vtkPointDataToCellData.cxx b/Filters/Core/vtkPointDataToCellData.cxx index 5eb810e7..1c3a59f3 100644 --- a/Filters/Core/vtkPointDataToCellData.cxx +++ b/Filters/Core/vtkPointDataToCellData.cxx @@ -8,6 +8,7 @@ #include "vtkCellData.h" #include "vtkDataArray.h" #include "vtkDataSet.h" +#include "vtkFVTKSMPDefaults.h" // fvtk: opt into default multithreading (bit-exact) #include "vtkIdList.h" #include "vtkInformation.h" #include "vtkInformationVector.h" @@ -338,7 +339,8 @@ struct PointDataToCellDataCategoricalWorker vtkPointDataToCellData* filter) { PointDataToCellDataCategoricalFunctor pd2cd(input, inPD, outCD, scalars, filter); - vtkSMPTools::For(0, input->GetNumberOfCells(), pd2cd); + fvtk::RunSafeFilterParallel( + [&]() { vtkSMPTools::For(0, input->GetNumberOfCells(), pd2cd); }); } }; } // End anonymous namespace @@ -504,7 +506,7 @@ int vtkPointDataToCellData::RequestData( { // Thread the process PointDataToCellDataFunctor pd2cd(input, inPD, outCD, this); - vtkSMPTools::For(0, numCells, pd2cd); + fvtk::RunSafeFilterParallel([&]() { vtkSMPTools::For(0, numCells, pd2cd); }); } // Create a threaded fast path for categorical data. else diff --git a/tests/bitexact/test_smp_determinism.py b/tests/bitexact/test_smp_determinism.py index 1859e9a6..f541bc85 100644 --- a/tests/bitexact/test_smp_determinism.py +++ b/tests/bitexact/test_smp_determinism.py @@ -251,3 +251,85 @@ def test_large_transform_threaded_path_is_deterministic( f"large {kind} transform digest at {nthreads} threads ({got[:16]}) != " f"1-thread reference ({ref[:16]}) -- threading is non-deterministic" ) + + +# A large-mesh script that forces the threaded cell<->point data-interpolation +# paths: vtkCellDataToPointData (For over numPts, each point averages its incident +# cells via read-only cell links) and vtkPointDataToCellData (For over numCells, +# each cell averages its points). Both parallelize with grain = vtkSMPTools:: +# THRESHOLD, so per-suite cases (<10k) run serial; this drives ~490k pts / ~980k +# cells so the multithreaded branch is actually exercised. Each output element is +# an independent average of read-only input => bit-exact under any thread count. +# argv[1] selects "cd2pd" or "pd2cd". +_LARGE_INTERP_SCRIPT = r""" +import os, sys, hashlib +sys.path = [p for p in sys.path if p not in ("", os.getcwd())] +import numpy as np +from vtkmodules.util.numpy_support import numpy_to_vtk, vtk_to_numpy +from vtkmodules.vtkFiltersSources import vtkSphereSource +from vtkmodules.vtkFiltersCore import ( + vtkTriangleFilter, vtkCellDataToPointData, vtkPointDataToCellData) + +kind = sys.argv[1] +s = vtkSphereSource(); s.SetThetaResolution(700); s.SetPhiResolution(700) +t = vtkTriangleFilter(); t.SetInputConnection(s.GetOutputPort()); t.Update() +poly = t.GetOutput() +npts = poly.GetNumberOfPoints(); ncells = poly.GetNumberOfCells() +assert npts > 100000 and ncells > 100000, (npts, ncells) + +if kind == "cd2pd": + # non-trivial per-cell scalar (float64) -> averaged onto points + poly.GetCellData().SetScalars( + numpy_to_vtk(np.linspace(-1, 1, ncells).astype(np.float64), deep=1)) + f = vtkCellDataToPointData(); f.SetInputData(poly); f.Update() + arr = f.GetOutput().GetPointData().GetScalars() +elif kind == "pd2cd": + poly.GetPointData().SetScalars( + numpy_to_vtk(np.linspace(-1, 1, npts).astype(np.float64), deep=1)) + f = vtkPointDataToCellData(); f.SetInputData(poly); f.Update() + arr = f.GetOutput().GetCellData().GetScalars() +else: + raise SystemExit("unknown kind " + kind) + +h = hashlib.sha256() +h.update(np.ascontiguousarray(vtk_to_numpy(arr)).tobytes()) +print(h.hexdigest()) +""" + +INTERP_KINDS = ["cd2pd", "pd2cd"] + + +@pytest.fixture(scope="module") +def large_interp_digests(tmp_path_factory): + fvtk_py = os.environ.get("BITEXACT_FVTK_PY") + if not fvtk_py: + pytest.skip("BITEXACT_FVTK_PY not set.") + script = tmp_path_factory.mktemp("large_interp") / "large_interp.py" + script.write_text(_LARGE_INTERP_SCRIPT) + digests = {k: {} for k in INTERP_KINDS} + for kind in INTERP_KINDS: + for n in THREAD_COUNTS: + proc = subprocess.run( + [fvtk_py, str(script), kind], + env=_env(os.environ.get("BITEXACT_FVTK_LDLP", ""), n), + capture_output=True, + text=True, + ) + assert proc.returncode == 0, f"{kind} @ {n} threads:\n{proc.stderr}" + digests[kind][n] = proc.stdout.strip().splitlines()[-1] + return digests + + +@pytest.mark.parametrize("kind", INTERP_KINDS) +@pytest.mark.parametrize("nthreads", [n for n in THREAD_COUNTS if n != 1]) +def test_large_interp_threaded_path_is_deterministic( + large_interp_digests, kind, nthreads +): + """The >THRESHOLD cell<->point data-interpolation paths (genuinely + multithreaded) hash identically at 1 vs N threads.""" + ref = large_interp_digests[kind][1] + got = large_interp_digests[kind][nthreads] + assert got == ref, ( + f"large {kind} digest at {nthreads} threads ({got[:16]}) != " + f"1-thread reference ({ref[:16]}) -- threading is non-deterministic" + )