diff --git a/Common/Core/vtkFVTKSMPDefaults.cxx b/Common/Core/vtkFVTKSMPDefaults.cxx
index 98565728..36ae050c 100644
--- a/Common/Core/vtkFVTKSMPDefaults.cxx
+++ b/Common/Core/vtkFVTKSMPDefaults.cxx
@@ -29,6 +29,25 @@ bool DefaultThreadingDisabledByEnv()
 }
 }
 
+//------------------------------------------------------------------------------
+// Opt-in NON-EXACT fast mode. Default OFF: filters whose threaded path is not
+// byte-exact (e.g. order-relaxed topology emission) stay serial unless the user
+// opts in. Read live from the FVTK_FAST env var (which fvtk.EnableFast() sets),
+// so it can be toggled at runtime. Truthy: 1/on/true/yes (any case).
+bool FastModeEnabled()
+{
+  // ASCII case-insensitive match, so any casing ("Yes", "tRuE") is truthy as
+  // documented above and agrees with Python's fvtk.IsFastEnabled(), which
+  // lower-cases the value. Unset, empty, or unrecognized values mean OFF.
+  const char* const v = std::getenv("FVTK_FAST");
+  const auto matches = [v](const char* word) {
+    const char* s = v;
+    for (; *s && ((*s >= 'A' && *s <= 'Z') ? *s - 'A' + 'a' : *s) == *word; ++s, ++word) {}
+    return *s == '\0' && *word == '\0';
+  };
+  return v && (matches("1") || matches("on") || matches("true") || matches("yes"));
+}
+
 //------------------------------------------------------------------------------
 // Precedence (first match wins):
 //   1. opt-out env FVTK_SMP_DEFAULT=0/off/serial  -> stay Sequential (serial).
diff --git a/Common/Core/vtkFVTKSMPDefaults.h b/Common/Core/vtkFVTKSMPDefaults.h
index a16c5327..e1c00365 100644
--- a/Common/Core/vtkFVTKSMPDefaults.h
+++ b/Common/Core/vtkFVTKSMPDefaults.h
@@ -79,6 +79,31 @@ inline void RunSafeFilterParallel(Body&& body)
   vtkSMPTools::LocalScope(GetSafeFilterThreadingConfig(), std::forward<Body>(body));
 }
 
+/**
+ * True when the opt-in NON-EXACT fast mode is enabled (env FVTK_FAST, set by the
+ * Python fvtk.EnableFast()). Default OFF. Read live so it can be toggled at
+ * runtime. Filters whose threaded path is not byte-exact gate on this.
+ */
+VTKCOMMONCORE_EXPORT bool FastModeEnabled();
+
+/**
+ * Fast-mode-gated variant of RunSafeFilterParallel(). If FastModeEnabled() is
+ * true, @p body is forwarded to RunSafeFilterParallel(); otherwise (the default)
+ * @p body is invoked directly, without that threading scope, keeping the filter
+ * byte-exact vs stock. Use this -- not RunSafeFilterParallel() -- for regions
+ * whose output is NOT byte-exact (e.g. cell order depends on thread scheduling).
+ */
+template <typename Body>
+inline void RunFastFilterParallel(Body&& body)
+{
+  if (FastModeEnabled())
+  {
+    RunSafeFilterParallel(std::forward<Body>(body));
+    return;
+  }
+  body();
+}
+
 VTK_ABI_NAMESPACE_END
 } // namespace fvtk
 
diff --git a/Filters/Core/vtk3DLinearGridPlaneCutter.cxx b/Filters/Core/vtk3DLinearGridPlaneCutter.cxx
index 4501dae0..ba21ba91 100644
--- a/Filters/Core/vtk3DLinearGridPlaneCutter.cxx
+++ b/Filters/Core/vtk3DLinearGridPlaneCutter.cxx
@@ -22,6 +22,7 @@
 #include "vtkPlane.h"
 #include "vtkPointData.h"
 #include "vtkPolyData.h"
+#include "vtkFVTKSMPDefaults.h"
 #include "vtkSMPThreadLocalObject.h"
 #include "vtkSMPTools.h"
 #include "vtkStaticEdgeLocatorTemplate.h"
@@ -50,7 +51,7 @@ vtkCxxSetObjectMacro(vtk3DLinearGridPlaneCutter, Plane, vtkPlane);
   {                                                                                                \
     if (!_seq)                                                                                     \
     {                                                                                              \
-      vtkSMPTools::For(0, _num, _op);                                                              \
+      fvtk::RunFastFilterParallel([&]() { vtkSMPTools::For(0, _num, _op); });                                                              \
     }                                                                                              \
     else                                                                                           \
     {                                                                                              \
@@ -63,7 +64,7 @@ vtkCxxSetObjectMacro(vtk3DLinearGridPlaneCutter, Plane, vtkPlane);
   {                                                                                                \
     if (!_seq)                                                                                     \
     {                                                                                              \
-      vtkSMPTools::For(0, _num, _op);                                                              \
+      fvtk::RunFastFilterParallel([&]() { vtkSMPTools::For(0, _num, _op); });                                                              \
     }                                                                                              \
     else                                                                                           \
     {                                                                                              \
diff --git a/Filters/Core/vtkContour3DLinearGrid.cxx b/Filters/Core/vtkContour3DLinearGrid.cxx
index a7c4f887..e7e4cb2f 100644
--- a/Filters/Core/vtkContour3DLinearGrid.cxx
+++ b/Filters/Core/vtkContour3DLinearGrid.cxx
@@ -23,6 +23,7 @@
 #include "vtkObjectFactory.h"
 #include "vtkPointData.h"
 #include "vtkPolyData.h"
+#include "vtkFVTKSMPDefaults.h"
 #include "vtkSMPTools.h"
 #include "vtkSmartPointer.h"
 #include "vtkSpanSpace.h"
@@ -59,7 +60,7 @@ vtkCxxSetObjectMacro(vtkContour3DLinearGrid, ScalarTree, vtkScalarTree);
   {                                                                                                \
     if (!_seq)                                                                                     \
     {                                                                                              \
-      vtkSMPTools::For(0, _num, _op);                                                              \
+      fvtk::RunFastFilterParallel([&]() { vtkSMPTools::For(0, _num, _op); });                                                              \
     }                                                                                              \
     else                                                                                           \
     {                                                                                              \
@@ -72,7 +73,7 @@ vtkCxxSetObjectMacro(vtkContour3DLinearGrid, ScalarTree, vtkScalarTree);
   {                                                                                                \
     if (!_seq)                                                                                     \
     {                                                                                              \
-      vtkSMPTools::For(0, _num, _op);                                                              \
+      fvtk::RunFastFilterParallel([&]() { vtkSMPTools::For(0, _num, _op); });                                                              \
     }                                                                                              \
     else                                                                                           \
     {                                                                                              \
@@ -255,11 +256,11 @@ struct ContourCellsBase
     // Copy points output to VTK structures. Only point coordinates are
     // copied for now; later we'll define the triangle topology.
     ProducePoints producePts(localPts, localPtOffsets, this->NewPts);
-    EXECUTE_SMPFOR(this->Filter->GetSequentialProcessing(), this->NumThreadsUsed, producePts);
+    EXECUTE_SMPFOR((this->Filter->GetSequentialProcessing() || this->Filter->GetComputeNormals()), this->NumThreadsUsed, producePts);
 
     // Now produce the output triangles (topology) for this contour n parallel
     ProduceTriangles produceTris(this->TotalTris, this->NewPolys);
-    EXECUTE_SMPFOR(this->Filter->GetSequentialProcessing(), this->NumTris, produceTris);
+    EXECUTE_SMPFOR((this->Filter->GetSequentialProcessing() || this->Filter->GetComputeNormals()), this->NumTris, produceTris);
   } // Reduce
 };  // ContourCellsBase
 
@@ -452,14 +453,14 @@ struct ProcessFastPathWorker
       TContourCellsST contour(
         filter, inPts, outPts, scalars, cellIter, isoValue, st, tris, totalPts, totalTris);
       EXECUTE_REDUCED_SMPFOR(
-        filter->GetSequentialProcessing(), contour.NumBatches, contour, numThreads);
+        (filter->GetSequentialProcessing() || filter->GetComputeNormals()), contour.NumBatches, contour, numThreads);
     }
     else
     {
       using TContourCells = ContourCells<TInputPointsArray, TOutputPointsArray, TScalarsArray>;
       TContourCells contour(
         filter, inPts, outPts, scalars, cellIter, isoValue, tris, totalPts, totalTris);
-      EXECUTE_REDUCED_SMPFOR(filter->GetSequentialProcessing(), numCells, contour, numThreads);
+      EXECUTE_REDUCED_SMPFOR((filter->GetSequentialProcessing() || filter->GetComputeNormals()), numCells, contour, numThreads);
     }
   }
 };
@@ -619,7 +620,7 @@ struct ExtractEdgesBase
     this->Edges =
       new EdgeTuple<IDType, EdgeDataType<IDType>>[3 * this->NumTris]; // three edges per triangle
     ProduceEdges<IDType> produceEdges(localEdges, localTriOffsets, this->Edges, this->Filter);
-    EXECUTE_SMPFOR(this->Filter->GetSequentialProcessing(), this->NumThreadsUsed, produceEdges);
+    EXECUTE_SMPFOR((this->Filter->GetSequentialProcessing() || this->Filter->GetComputeNormals()), this->NumThreadsUsed, produceEdges);
   } // Reduce
 };  // ExtractEdgesBase
 
@@ -818,7 +819,7 @@ struct ExtractEdgesWorker
       TExtractEdgesST extractEdges(
         filter, scalars, cellIter, isoValue, st, newPolys, totalTris, originalCellIds);
       EXECUTE_REDUCED_SMPFOR(
-        filter->GetSequentialProcessing(), extractEdges.NumBatches, extractEdges, numThreads);
+        (filter->GetSequentialProcessing() || filter->GetComputeNormals()), extractEdges.NumBatches, extractEdges, numThreads);
       numTris = extractEdges.NumTris;
       mergeEdges = extractEdges.Edges;
     }
@@ -827,7 +828,7 @@ struct ExtractEdgesWorker
       using TExtractEdges = ExtractEdges<TIds, TScalarArray>;
       TExtractEdges extractEdges(
         filter, scalars, cellIter, isoValue, newPolys, totalTris, originalCellIds);
-      EXECUTE_REDUCED_SMPFOR(filter->GetSequentialProcessing(), numCells, extractEdges, numThreads);
+      EXECUTE_REDUCED_SMPFOR((filter->GetSequentialProcessing() || filter->GetComputeNormals()), numCells, extractEdges, numThreads);
       numTris = extractEdges.NumTris;
       mergeEdges = extractEdges.Edges;
     }
@@ -983,7 +984,7 @@ struct ProduceMergedPointsWorker
   {
     ProduceMergedPoints<TInputPointsArray, TOutputPointsArray, TIds> produceMergedPoints(
       filter, inputPointsArray, outputPointsArray, mergeArray, offsets, totalPoints);
-    EXECUTE_SMPFOR(filter->GetSequentialProcessing(), numPts, produceMergedPoints);
+    EXECUTE_SMPFOR((filter->GetSequentialProcessing() || filter->GetComputeNormals()), numPts, produceMergedPoints);
   }
 };
 
@@ -1121,7 +1122,7 @@ int ProcessMerged(vtkContour3DLinearGrid* filter, vtkPoints* inPts, vtkPoints* o
   // Generate triangles.
   ProduceMergedTriangles<TIds> produceTris(
     mergeEdges, offsets, numTris, newPolys, totalPts, totalTris, filter);
-  EXECUTE_SMPFOR(filter->GetSequentialProcessing(), numPts, produceTris);
+  EXECUTE_SMPFOR((filter->GetSequentialProcessing() || filter->GetComputeNormals()), numPts, produceTris);
   numThreads = nt;
 
   // Generate points (one per unique edge)
@@ -1159,7 +1160,7 @@ int ProcessMerged(vtkContour3DLinearGrid* filter, vtkPoints* inPts, vtkPoints* o
       pointArrays->Realloc(totalPts + numPts);
     }
     ProducePointAttributes<TIds> interpolate(mergeEdges, offsets, pointArrays, totalPts, filter);
-    EXECUTE_SMPFOR(filter->GetSequentialProcessing(), numPts, interpolate);
+    EXECUTE_SMPFOR((filter->GetSequentialProcessing() || filter->GetComputeNormals()), numPts, interpolate);
 
     // interpolate cell data
     if (totalTris <= 0) // first contour value generating output
@@ -1172,7 +1173,7 @@ int ProcessMerged(vtkContour3DLinearGrid* filter, vtkPoints* inPts, vtkPoints* o
       cellArrays->Realloc(totalTris + numTris);
     }
     ProduceCellAttributes<TIds> interpolateCell(originalCellIds, cellArrays, totalTris, filter);
-    EXECUTE_SMPFOR(filter->GetSequentialProcessing(), numTris, interpolateCell);
+    EXECUTE_SMPFOR((filter->GetSequentialProcessing() || filter->GetComputeNormals()), numTris, interpolateCell);
   }
 
   // Clean up
@@ -1246,7 +1247,7 @@ vtkSmartPointer<vtkFloatArray> GenerateTriNormals(
 
   // Execute functor over all triangles
   ComputeCellNormals computeNormals(pts, tris, n, filter);
-  EXECUTE_SMPFOR(filter->GetSequentialProcessing(), numTris, computeNormals);
+  EXECUTE_SMPFOR((filter->GetSequentialProcessing() || filter->GetComputeNormals()), numTris, computeNormals);
 
   return cellNormals;
 }
@@ -1331,7 +1332,7 @@ void GeneratePointNormals(vtkPoints* pts, vtkCellArray* tris, vtkFloatArray* cel
 
   // Process all points, averaging normals
   AverageNormals<TId> average(&links, triN, ptN, filter);
-  EXECUTE_SMPFOR(filter->GetSequentialProcessing(), numPts, average);
+  EXECUTE_SMPFOR((filter->GetSequentialProcessing() || filter->GetComputeNormals()), numPts, average);
 
   // Clean up and get out
   pd->SetNormals(ptNormals);
diff --git a/Wrapping/Python/fvtk/__init__.py.in b/Wrapping/Python/fvtk/__init__.py.in
index f5ebba89..c4736207 100644
--- a/Wrapping/Python/fvtk/__init__.py.in
+++ b/Wrapping/Python/fvtk/__init__.py.in
@@ -111,6 +111,33 @@ def _load_fvtk_static():
 __all__ = [
     @_fvtk_all@
 ]
+#------------------------------------------------------------------------------
+# Opt-in NON-EXACT fast mode (fvtk extension).
+#
+# Some fvtk filters have a multithreaded fast path whose output is NOT byte-exact
+# vs stock VTK -- e.g. order-relaxed topology emission, where the SAME cells are
+# produced but their ORDER (and thus the raw connectivity bytes) depends on thread
+# scheduling. These stay OFF by default so fvtk is a byte-exact drop-in. Call
+# EnableFast() to opt in; points / point-data / the cell set stay correct, only
+# cell ORDER becomes non-deterministic. Backed by the FVTK_FAST env var, which the
+# native filters read live, so this can be toggled at runtime.
+import os as _os
+
+def EnableFast():
+    """Opt in to fvtk's non-exact multithreaded fast paths (see module docs)."""
+    _os.environ["FVTK_FAST"] = "1"
+
+def DisableFast():
+    """Turn the non-exact fast paths back off (the default)."""
+    _os.environ.pop("FVTK_FAST", None)
+
+def IsFastEnabled():
+    """True if the non-exact fast mode is currently enabled."""
+    return _os.environ.get("FVTK_FAST", "").lower() in ("1", "on", "true", "yes")
+
+# Expose the fast-mode toggles for `from fvtk import *` and discoverability.
+__all__ += ["EnableFast", "DisableFast", "IsFastEnabled"]
+
 #------------------------------------------------------------------------------
 # get the version
 __version__ = "@VTK_MAJOR_VERSION@.@VTK_MINOR_VERSION@.@VTK_BUILD_VERSION@"
diff --git a/tests/bitexact/compare.py b/tests/bitexact/compare.py
index b71584c7..6530ba58 100644
--- a/tests/bitexact/compare.py
+++ b/tests/bitexact/compare.py
@@ -33,7 +33,102 @@ def _ulp_distance(x, y):
     return int(np.abs(xi - yi).max()) if xi.size else 0
 
 
-def compare_case(stock_dir, fvtk_dir, key):
+# Tags in their canonical vtkPolyData cell-data global-index order. Cell data is
+# indexed across all cells as verts, then lines, then polys, then strips; the
+# canonical sort must preserve this partition and only reorder WITHIN each group.
+_POLY_TAGS = ("verts", "lines", "polys", "strips")
+
+
+def _cell_records(arrays):
+    """Reconstruct a per-cell canonical sort key list + the global cell order.
+
+    Returns (keys, perm) where ``keys`` is the list of canonical per-cell keys in
+    GLOBAL cell-data index order, and ``perm`` is an argsort (a permutation of
+    range(numCells)) that orders cells canonically while preserving the VTK group
+    partition. Cells are keyed by (group_rank, connectivity-tuple) for polydata or
+    (cell_type, connectivity-tuple) for unstructured grids -- connectivity is point
+    IDs, which are directly comparable because points stay strictly identical.
+    Returns None if the array set has no recognizable topology.
+    """
+    names = set(arrays.files) if hasattr(arrays, "files") else set(arrays)
+
+    def conn_off(tag):
+        ck, ok = (f"conn:{tag}", f"off:{tag}") if tag else ("conn", "off")
+        if ck in names and ok in names:
+            return np.asarray(arrays[ck]).astype(np.int64), np.asarray(arrays[ok]).astype(np.int64)
+        return None, None
+
+    keys = []
+    # Unstructured grid: single conn/off plus celltypes.
+    if "conn" in names and "off" in names:
+        conn, off = conn_off(None)
+        ctypes = np.asarray(arrays["celltypes"]).astype(np.int64) if "celltypes" in names else None
+        for i in range(len(off) - 1):
+            cell = tuple(conn[off[i]:off[i + 1]].tolist())
+            rank = int(ctypes[i]) if ctypes is not None else 0
+            keys.append((rank, len(cell), cell))
+    else:
+        # PolyData: grouped verts|lines|polys|strips.
+        any_topo = False
+        for rank, tag in enumerate(_POLY_TAGS):
+            conn, off = conn_off(tag)
+            if conn is None:
+                continue
+            any_topo = True
+            for i in range(len(off) - 1):
+                cell = tuple(conn[off[i]:off[i + 1]].tolist())
+                keys.append((rank, len(cell), cell))
+        if not any_topo:
+            return None
+    # Stable argsort: preserve group partition (rank leads the key), order within.
+    perm = sorted(range(len(keys)), key=lambda i: keys[i])
+    return keys, perm
+
+
+def _compare_order_relaxed(a, b):
+    """Order-invariant mesh equality: points + point-data strict; cells compared
+    as a multiset carrying their cell-data. Returns (ok, per_array_detail)."""
+    per = {}
+    ok = True
+    names = sorted(set(a.files) & set(b.files))
+    # 1) points + point-data: STRICT (points stay identical, so pd indices align).
+    for name in names:
+        if name == "points" or name.startswith("pd:"):
+            x, y = a[name], b[name]
+            eq = bool(x.shape == y.shape and x.dtype == y.dtype and np.array_equal(x, y))
+            per[name] = {"equal": eq, "mode": "strict", "dtype": str(x.dtype)}
+            ok &= eq
+    # 2) cells: canonicalize both sides, compare keys (connectivity multiset).
+    ra, rb = _cell_records(a), _cell_records(b)
+    if ra is None or rb is None:
+        per["__cells__"] = {"equal": False, "reason": "no topology to canonicalize"}
+        return False, per
+    ka, pa = ra
+    kb, pb = rb
+    keys_eq = bool(len(ka) == len(kb) and [ka[i] for i in pa] == [kb[i] for i in pb])
+    per["__cells__"] = {"equal": keys_eq, "mode": "order-relaxed", "ncells": len(ka)}
+    ok &= keys_eq
+    # 3) cell-data: reorder each cd:* array by the canonical perm, compare (values
+    #    travel with their cell). Width-relaxed for integer cell-data.
+    for name in names:
+        if not name.startswith("cd:"):
+            continue
+        x, y = a[name], b[name]
+        if x.shape[0] != len(pa) or y.shape[0] != len(pb) or x.shape[1:] != y.shape[1:]:
+            per[name] = {"equal": False, "mode": "order-relaxed", "reason": "shape"}
+            ok = False
+            continue
+        xs, ys = x[pa], y[pb]
+        if xs.dtype.kind in "iu" and ys.dtype.kind in "iu":
+            eq = bool(np.array_equal(xs.astype(np.int64), ys.astype(np.int64)))
+        else:
+            eq = bool(xs.dtype == ys.dtype and np.array_equal(xs, ys))
+        per[name] = {"equal": eq, "mode": "order-relaxed", "dtype": str(x.dtype)}
+        ok &= eq
+    return ok, per
+
+
+def compare_case(stock_dir, fvtk_dir, key, order_relaxed=False):
     """Return (ok: bool, detail: dict) for a single case key."""
     sp = os.path.join(stock_dir, key + ".npz")
     fp = os.path.join(fvtk_dir, key + ".npz")
@@ -48,6 +143,12 @@ def compare_case(stock_dir, fvtk_dir, key):
             "only_stock": sorted(names_a - names_b),
             "only_fvtk": sorted(names_b - names_a),
         }
+    if order_relaxed:
+        # Order-relaxed mesh equality: same points/point-data (strict) and the
+        # same multiset of cells carrying their cell-data, but cell ORDER may
+        # differ (e.g. thread-batched topology emission). See _compare_order_relaxed.
+        ok, per = _compare_order_relaxed(a, b)
+        return ok, {"arrays": per, "order_relaxed": True}
     per_array = {}
     ok = True
     for name in sorted(names_a):
@@ -117,6 +218,9 @@ def compare_all(stock_dir, fvtk_dir):
                 "group": cs.get("group"),
             }
             continue
-        ok, detail = compare_case(stock_dir, fvtk_dir, key)
-        cases[key] = {"ok": ok, "detail": detail, "group": cs.get("group")}
+        # A case is order-relaxed if EITHER manifest marks it so (both should agree).
+        order_relaxed = bool(cs.get("order_relaxed") or cf.get("order_relaxed"))
+        ok, detail = compare_case(stock_dir, fvtk_dir, key, order_relaxed=order_relaxed)
+        cases[key] = {"ok": ok, "detail": detail, "group": cs.get("group"),
+                      "order_relaxed": order_relaxed}
     return {"provenance": prov, "cases": cases, "keys": keys}
diff --git a/tests/bitexact/ops.py b/tests/bitexact/ops.py
index bf1a886b..da94bfe3 100644
--- a/tests/bitexact/ops.py
+++ b/tests/bitexact/ops.py
@@ -1881,6 +1881,51 @@ def op_cutter(dtype, size):
     return cut.GetOutput()
 
 
+def op_cutter_linear(dtype, size):
+    # Plane cut of a LARGE linear hex unstructured grid with triangle generation
+    # ON (the default). vtkCutter routes this to vtk3DLinearGridPlaneCutter -- the
+    # threaded fast path that fvtk runs under the OPT-IN non-exact fast mode.
+    #
+    # Fast mode is gated by the FVTK_FAST env var (the fvtk.EnableFast() Python
+    # API just sets this). We set it here so the fvtk side actually threads; stock
+    # VTK ignores the variable, so it still produces the sequential reference. The
+    # mesh is sized so the parallel vtkSMPTools::For batch-splits, so the threaded
+    # triangle emission reorders cells relative to the sequential reference. Output
+    # points + interpolated point scalars + the (constant) plane normal are
+    # thread-INVARIANT; only cell EMISSION ORDER differs. Hence this op is compared
+    # ORDER-RELAXED: same points/point-data (strict) and the same multiset of
+    # triangles carrying their cell-data, cell order negotiable.
+    os.environ["FVTK_FAST"] = "1"  # fvtk: opt in to the threaded cutter; stock: ignored
+    p = vtkPlane()
+    c = (size - 1) / 2.0
+    p.SetOrigin(c, c, c)
+    p.SetNormal(1, 1, 0)
+    cut = vtkCutter()
+    cut.SetInputData(make_hex_ugrid(size, dtype))
+    cut.SetCutFunction(p)
+    cut.SetValue(0, 0.0)  # GenerateTriangles ON (default) -> linear-grid fast path
+    cut.Update()
+    return cut.GetOutput()
+
+
+def op_contour_linear(dtype, size):
+    # Isocontour of a LARGE linear hex unstructured grid with ComputeNormals OFF.
+    # vtkContourFilter routes a linear UG to vtkContour3DLinearGrid -- the threaded
+    # fast path fvtk runs under the OPT-IN non-exact fast mode (FVTK_FAST, set by
+    # fvtk.EnableFast()). With ComputeNormals OFF the merge path produces
+    # thread-INVARIANT points + interpolated point scalars; only triangle EMISSION
+    # ORDER differs, so the case is compared ORDER-RELAXED. ComputeNormals ON is
+    # NOT order-relaxable (normal averaging is reduction-order-dependent) and the
+    # filter keeps it serial / byte-exact -- this op deliberately leaves it off.
+    os.environ["FVTK_FAST"] = "1"  # fvtk: opt in to the threaded contour; stock: ignored
+    c = vtkContourFilter()
+    c.SetInputData(make_hex_ugrid(size, dtype))
+    c.SetComputeNormals(0)
+    c.SetValue(0, 0.25 * (size ** 2))
+    c.Update()
+    return c.GetOutput()
+
+
 def op_cutter_polydata(dtype, size):
     # vtkCutter on a vtkPolyData (triangle sphere) with GenerateTriangles OFF.
     # A polydata input that is NOT eligible for the plane-cutter fast path routes
@@ -2356,6 +2401,12 @@ def op_ply_roundtrip_ascii(dtype, size):
     "tube_vec": dict(fn=op_tube_vec, group="filter", dtypes=["float32", "float64"], sizes=[16, 32]),
     "gradient": dict(fn=op_gradient, group="filter", dtypes=["float32", "float64"], sizes=[16, 24]),
     "cutter": dict(fn=op_cutter, group="filter", dtypes=["float64"], sizes=[8, 12]),
+    # Large linear-grid plane cut: threaded vtk3DLinearGridPlaneCutter, ORDER-RELAXED
+    # (same points/point-data + same triangle multiset; cell order may permute).
+    "cutter_linear": dict(fn=op_cutter_linear, group="filter", dtypes=["float32", "float64"], sizes=[30, 40], order_relaxed=True),
+    # Large linear-grid isocontour (ComputeNormals OFF): threaded vtkContour3DLinearGrid,
+    # ORDER-RELAXED. Normals-ON stays serial/byte-exact (reduction-order-dependent).
+    "contour_linear": dict(fn=op_contour_linear, group="filter", dtypes=["float32", "float64"], sizes=[30, 40], order_relaxed=True),
     "cutter_polydata": dict(fn=op_cutter_polydata, group="filter", dtypes=["float64"], sizes=[12, 20]),
     "cutter_polydata_bycell": dict(fn=op_cutter_polydata_bycell, group="filter", dtypes=["float64"], sizes=[12, 20]),
     "cellcenters": dict(fn=op_cellcenters, group="filter", dtypes=["float32", "float64"], sizes=[8, 12]),
diff --git a/tests/bitexact/run_ops.py b/tests/bitexact/run_ops.py
index e5838262..a27e549b 100644
--- a/tests/bitexact/run_ops.py
+++ b/tests/bitexact/run_ops.py
@@ -69,6 +69,7 @@ def main():
                 "dtype": dtype_name,
                 "size": size,
                 "group": ops.OPS[op_name]["group"],
+                "order_relaxed": bool(ops.OPS[op_name].get("order_relaxed", False)),
                 "n_arrays": len(arrays),
                 "sha256": array_sha(arrays),
                 "arrays": {k: list(np.asarray(v).shape) for k, v in arrays.items()},
diff --git a/tests/bitexact/test_bitexact.py b/tests/bitexact/test_bitexact.py
index cb6376db..d0350d03 100644
--- a/tests/bitexact/test_bitexact.py
+++ b/tests/bitexact/test_bitexact.py
@@ -45,13 +45,17 @@ def _assert_case(results, case_key):
         detail = case["detail"]
         # Build a focused failure message listing the non-equal arrays + ULP.
         msg = [f"BIT DIFFERENCE in {case_key}:"]
+        if detail.get("order_relaxed"):
+            msg.append("  (order-relaxed mesh comparison)")
         if "arrays" in detail:
             for name, info in detail["arrays"].items():
-                if not info["equal"]:
+                if not info.get("equal", True):
                     msg.append(
-                        f"  array {name}: equal=False dtype={info['dtype']} "
-                        f"shape_stock={info['shape_stock']} "
-                        f"shape_fvtk={info['shape_fvtk']} ulp={info['ulp']}"
+                        f"  array {name}: equal=False mode={info.get('mode', 'strict')} "
+                        f"dtype={info.get('dtype', '?')} "
+                        f"shape_stock={info.get('shape_stock', '?')} "
+                        f"shape_fvtk={info.get('shape_fvtk', '?')} "
+                        f"ulp={info.get('ulp')} reason={info.get('reason', '')}"
                     )
         else:
             msg.append(f"  {detail}")
diff --git a/tests/bitexact/test_smp_determinism.py b/tests/bitexact/test_smp_determinism.py
index f541bc85..0fe6c5df 100644
--- a/tests/bitexact/test_smp_determinism.py
+++ b/tests/bitexact/test_smp_determinism.py
@@ -26,7 +26,11 @@
 
 # Ops whose filters opt into fvtk default-on threading. Exercising any of these
 # at >1 thread must produce byte-identical output to the 1-thread run.
-THREADED_OPS = ["warp", "warpvector", "normals", "elevation"]
+# cutter_linear and contour_linear are the exception: they are ORDER-RELAXED
+# (threaded vtk3DLinearGridPlaneCutter / vtkContour3DLinearGrid). Their cell
+# emission order varies with thread count, so compare_all compares them
+# order-relaxed and the assertion checks MESH invariance, not the byte layout.
+THREADED_OPS = ["warp", "warpvector", "normals", "elevation", "cutter_linear", "contour_linear"]
 
 THREAD_COUNTS = [1, 4, 8]