From b283ad0ab3f9f68bae144d2b1b3fea722b9e2f34 Mon Sep 17 00:00:00 2001
From: Kaiqi Yan <kaiqiy@nvidia.com>
Date: Thu, 28 May 2026 06:11:33 +0000
Subject: [PATCH 1/6] sort and check O-signatures

Signed-off-by: Kaiqi Yan <kaiqiy@nvidia.com>
---
 libs/qec/lib/detector_error_model.cpp | 70 ++++++++++++++++++---------
 1 file changed, 48 insertions(+), 22 deletions(-)
diff --git a/libs/qec/lib/detector_error_model.cpp b/libs/qec/lib/detector_error_model.cpp
index 9575d953..b90d1cad 100644
--- a/libs/qec/lib/detector_error_model.cpp
+++ b/libs/qec/lib/detector_error_model.cpp
@@ -9,6 +9,7 @@
 #include "cudaq/qec/detector_error_model.h"
 #include "cudaq/qec/pcm_utils.h"
 #include "cudaq/runtime/logger/logger.h"
+#include <algorithm>
 
 namespace cudaq::qec {
 
@@ -65,13 +66,46 @@ void detector_error_model::canonicalize_for_rounds(
   auto row_indices = dense_to_sparse(detector_error_matrix);
   auto column_order =
       get_sorted_pcm_column_indices(row_indices, num_syndromes_per_round);
+  const std::size_t num_obs = this->num_observables();
+
+  auto observables_match = [&](std::uint32_t lhs, std::uint32_t rhs) {
+    for (std::size_t r = 0; r < num_obs; r++) {
+      if (this->observables_flips_matrix.at({r, lhs}) !=
+          this->observables_flips_matrix.at({r, rhs}))
+        return false;
+    }
+    return true;
+  };
+
+  auto observables_less = [&](std::uint32_t lhs, std::uint32_t rhs) {
+    for (std::size_t r = 0; r < num_obs; r++) {
+      auto lhs_obs = this->observables_flips_matrix.at({r, lhs});
+      auto rhs_obs = this->observables_flips_matrix.at({r, rhs});
+      if (lhs_obs != rhs_obs)
+        return lhs_obs < rhs_obs;
+    }
+    return lhs < rhs;
+  };
+
+  // Keep the PCM/topological ordering from get_sorted_pcm_column_indices, but
+  // sort equal-syndrome groups by observable signature so merge-compatible DEM
+  // columns are adjacent.
+  for (auto group_begin = column_order.begin();
+       group_begin != column_order.end();) {
+    auto group_end = group_begin + 1;
+    while (group_end != column_order.end() &&
+           row_indices[*group_end] == row_indices[*group_begin])
+      ++group_end;
+    std::sort(group_begin, group_end, observables_less);
+    group_begin = group_end;
+  }
+
   std::vector<std::uint32_t> final_column_order;
   // March through the columns in topological order, and combine the probability
   // weight vectors if the columns have the same row indices.
   std::vector<std::vector<std::uint32_t>> new_row_indices;
   std::vector<double> new_weights;
   std::vector<std::size_t> new_error_ids;
-  const std::size_t num_obs = this->num_observables();
   const auto num_cols = column_order.size();
   bool has_error_ids =
       error_ids.has_value() && error_ids->size() == error_rates.size();
@@ -101,10 +135,12 @@ void detector_error_model::canonicalize_for_rounds(
         new_error_ids.push_back(error_ids->at(column_index));
     } else {
       auto &prev_row_indices = new_row_indices.back();
-      if (prev_row_indices == curr_row_indices) {
-        // The current column has the same row indices as the previous column
-        // (i.e. has the same syndrome signature) so we update the error_rates
-        // and do NOT add the duplicate column.
+      auto previous_column = final_column_order.back();
+      if (prev_row_indices == curr_row_indices &&
+          observables_match(previous_column, column_index)) {
+        // The current column has the same syndrome and observable signatures
+        // as the previous column, so update the error rate and do NOT add a
+        // duplicate column.
         auto prev_weight = new_weights.back();
         auto prev_error_id = has_error_ids
                                  ? new_error_ids.back()
@@ -124,27 +160,17 @@ void detector_error_model::canonicalize_for_rounds(
         if (has_error_ids)
           new_error_ids.back() =
               std::min(prev_error_id, error_ids->at(column_index));
-        // Verify that the observables are the same for the duplicate column.
-        auto previous_column = column_order[c - 1];
-        bool match = true;
-        for (std::size_t r = 0; r < num_obs; r++) {
-          if (this->observables_flips_matrix.at({r, previous_column}) !=
-              this->observables_flips_matrix.at({r, column_index})) {
-            match = false;
-            break;
-          }
-        }
-        if (!match) {
+      } else {
+        // Either the syndrome differs, or the same syndrome has a different
+        // observable flip. In both cases this is a distinct error mechanism.
+        if (prev_row_indices == curr_row_indices) {
           cudaq::info(
               "detector_error_model::canonicalize_for_rounds: identical "
               "syndromes exist in detector_error_matrix but have different "
-              "observables in observables_flips_matrix (columns {} and {})",
-              previous_column, column_index);
+              "observables in observables_flips_matrix; keeping column {} as a "
+              "distinct error mechanism (previous column {})",
+              column_index, previous_column);
         }
-      } else {
-        // The current column has different row indices than the previous
-        // column. So we add the current column to the new PCM, and update the
-        // error_rates.
         new_row_indices.push_back(curr_row_indices);
         new_weights.push_back(error_rates[column_index]);
         final_column_order.push_back(column_index);

From d8a9b9608fe0d8de8ab17c6a157685c8f56e8cad Mon Sep 17 00:00:00 2001
From: Kaiqi Yan <kaiqiy@nvidia.com>
Date: Thu, 28 May 2026 06:11:51 +0000
Subject: [PATCH 2/6] update and add tests

Signed-off-by: Kaiqi Yan <kaiqiy@nvidia.com>
---
 libs/qec/python/tests/test_dem.py | 287 +++++++++++++++++++++++++-----
 1 file changed, 245 insertions(+), 42 deletions(-)

diff --git a/libs/qec/python/tests/test_dem.py b/libs/qec/python/tests/test_dem.py
index 2cfffb06..09c10838 100644
--- a/libs/qec/python/tests/test_dem.py
+++ b/libs/qec/python/tests/test_dem.py
@@ -58,6 +58,144 @@ def mat_to_str(mat):
     return s
 
 
+# Stim-oracle helpers. Keep stim imports inside tests so arm64 CI can skip.
+
+
+def _build_steane_z_memory_stim_circuit(stim_mod, p, n_rounds):
+    # Mirrors Steane prep0 and stabilizer schedules in steane_device.cpp.
+    # Qubits: data 0..6, ancx 7..9, ancz 10..12.
+    stab_supports = [
+        [0, 1, 2, 3],
+        [1, 2, 4, 5],
+        [2, 3, 5, 6],
+    ]
+    prep0_cx_pairs = [(0, 1), (4, 5), (6, 3), (6, 5), (4, 2), (0, 3), (4, 1),
+                      (3, 2)]
+
+    c = stim_mod.Circuit()
+    c.append("R", list(range(13)))
+    c.append("H", [0, 4, 6])
+    for ctrl, tgt in prep0_cx_pairs:
+        c.append("CX", [ctrl, tgt])
+        c.append("DEPOLARIZE2", [ctrl, tgt], p)
+
+    for r in range(n_rounds):
+        c.append("H", [7, 8, 9])
+        for xi, support in enumerate(stab_supports):
+            for di in support:
+                c.append("CX", [7 + xi, di])
+                c.append("DEPOLARIZE2", [7 + xi, di], p)
+        c.append("H", [7, 8, 9])
+
+        for zi, support in enumerate(stab_supports):
+            for di in support:
+                c.append("CX", [di, 10 + zi])
+                c.append("DEPOLARIZE2", [di, 10 + zi], p)
+
+        # cudaq-qec measures ancz first, then ancx.
+        c.append("M", [10, 11, 12, 7, 8, 9])
+        c.append("R", [7, 8, 9, 10, 11, 12])
+
+        # Z-DEM keeps ancz detectors only; later rounds compare to prior ancz.
+        if r == 0:
+            for zi in range(3):
+                c.append("DETECTOR", [stim_mod.target_rec(-6 + zi)])
+        else:
+            for zi in range(3):
+                c.append("DETECTOR", [
+                    stim_mod.target_rec(-6 + zi),
+                    stim_mod.target_rec(-12 + zi),
+                ])
+
+    c.append("M", [0, 1, 2, 3, 4, 5, 6])
+    # Z_L = Z_4 Z_5 Z_6.
+    c.append("OBSERVABLE_INCLUDE",
+             [stim_mod.target_rec(i) for i in (-3, -2, -1)], 0)
+    return c
+
+
+def _independent_merge(p, q):
+    # P(A xor B) for independent events.
+    return p + q - 2.0 * p * q
+
+
+def _stim_dem_to_multiset(stim_dem):
+    # Match cudaq-qec canonicalize_for_rounds by dropping no-syndrome errors.
+    bucket = {}
+    for inst in stim_dem.flattened():
+        if inst.type != "error":
+            continue
+        prob = inst.args_copy()[0]
+        det_set = []
+        obs_set = []
+        for tgt in inst.targets_copy():
+            if tgt.is_relative_detector_id():
+                det_set.append(tgt.val)
+            elif tgt.is_logical_observable_id():
+                obs_set.append(tgt.val)
+        if not det_set:
+            continue
+        key = (tuple(sorted(det_set)), tuple(sorted(obs_set)))
+        if key in bucket:
+            bucket[key] = _independent_merge(bucket[key], prob)
+        else:
+            bucket[key] = prob
+    return bucket
+
+
+def _cudaq_dem_to_multiset(dem):
+    # Same keying as _stim_dem_to_multiset: (detectors, observables) -> prob.
+    H = np.asarray(dem.detector_error_matrix)
+    O = np.asarray(dem.observables_flips_matrix)
+    rates = np.asarray(dem.error_rates)
+    bucket = {}
+    for c in range(H.shape[1]):
+        key = (tuple(np.flatnonzero(H[:, c]).tolist()),
+               tuple(np.flatnonzero(O[:, c]).tolist()))
+        prob = float(rates[c])
+        if key in bucket:
+            bucket[key] = _independent_merge(bucket[key], prob)
+        else:
+            bucket[key] = prob
+    return bucket
+
+
+def _stim_dem_to_arrays(stim_dem):
+    # Split decomposed Stim errors into graphlike H/O columns for PyMatching.
+    n_dets = stim_dem.num_detectors
+    n_obs = stim_dem.num_observables
+    h_cols, o_cols, rates = [], [], []
+    for inst in stim_dem.flattened():
+        if inst.type != "error":
+            continue
+        prob = inst.args_copy()[0]
+        components = [[]]
+        for tgt in inst.targets_copy():
+            if tgt.is_separator():
+                components.append([])
+            else:
+                components[-1].append(tgt)
+        for comp in components:
+            h_col = np.zeros(n_dets, dtype=np.uint8)
+            o_col = np.zeros(n_obs, dtype=np.uint8)
+            for tgt in comp:
+                if tgt.is_relative_detector_id():
+                    h_col[tgt.val] = 1
+                elif tgt.is_logical_observable_id():
+                    o_col[tgt.val] = 1
+            # Matching cannot use a purely-logical component with no syndrome.
+            if h_col.sum() == 0:
+                continue
+            h_cols.append(h_col)
+            o_cols.append(o_col)
+            rates.append(prob)
+    H = np.stack(h_cols, axis=1) if h_cols else np.zeros((n_dets, 0),
+                                                         dtype=np.uint8)
+    O = np.stack(o_cols, axis=1) if o_cols else np.zeros((n_obs, 0),
+                                                         dtype=np.uint8)
+    return H, O, np.asarray(rates, dtype=np.float64)
+
+
 # Use the fixture to set the target
 @pytest.fixture(scope="module", autouse=True)
 def set_target():
@@ -87,48 +225,52 @@ def test_dem_from_memory_circuit_requires_noise_model(dem_fn):
         dem_fn(code, statePrep, 1, None)
 
 
-def test_dem_from_memory_circuit():
+def test_z_dem_from_memory_circuit_against_stim_oracle():
+    """Compare cudaq-qec's Steane Z-DEM against Stim's independent DEM."""
+    stim_mod = pytest.importorskip(
+        "stim",
+        reason="stim not installed; skipping Stim oracle cross-check for Steane Z-DEM"
+    )
+
     code = qec.get_code('steane')
     p = 0.01
+    n_rounds = 2
     noise = cudaq.NoiseModel()
     noise.add_all_qubit_channel("x", cudaq.Depolarization2(p), 1)
-    statePrep = qec.operation.prep0
-    nRounds = 2
-
-    dem = qec.z_dem_from_memory_circuit(code, statePrep, nRounds, noise)
-    expected_detector_error_matrix = """
-1111...1..............
-.1.111..111...........
-..11.11..1.1111.......
-.......111.1.1.1111...
-..........1.11..1.111.
-..............1..11.11
-"""
-    # Uncomment the following line to get a string representation of the DEM
-    # that you can compare to expected_detector_error_matrix.
-    # print(mat_to_str(dem.detector_error_matrix), end='')
-    assert '\n' + mat_to_str(
-        dem.detector_error_matrix) == expected_detector_error_matrix
-
-    # Uncomment the following line to get a string representation of the error
-    # rates that you can compare to expected_error_rates.
-    # print(np.round(dem.error_rates, 4))
-
-    # The following error rates were captured from the above print statement and
-    # are considered "truth" data now.
-    expected_error_rates = [
-        0.0183, 0.0235, 0.0158, 0.0209, 0.0310, 0.0235, 0.0183, 0.0106, 0.0053,
-        0.0053, 0.0106, 0.0053, 0.0053, 0.0053, 0.0106, 0.0235, 0.0158, 0.0158,
-        0.0183, 0.0335, 0.0209, 0.0434
-    ]
-    assert np.allclose(dem.error_rates, expected_error_rates, atol=1e-4)
-
-    expected_observables_flips_matrix = '1....11.....1......111\n'
-    # Uncomment the following line to get a string representation of the
-    # observables flips matrix that you can compare to
-    # expected_observables_flips_matrix.
-    assert mat_to_str(
-        dem.observables_flips_matrix) == expected_observables_flips_matrix
+    cudaq_dem = qec.z_dem_from_memory_circuit(code, qec.operation.prep0,
+                                              n_rounds, noise)
+
+    stim_circuit = _build_steane_z_memory_stim_circuit(stim_mod, p, n_rounds)
+    stim_dem = stim_circuit.detector_error_model(decompose_errors=False)
+
+    # Basic shape must agree before comparing signatures.
+    assert cudaq_dem.detector_error_matrix.shape[0] == stim_dem.num_detectors, (
+        f"num_detectors mismatch: cudaq="
+        f"{cudaq_dem.detector_error_matrix.shape[0]}, "
+        f"stim={stim_dem.num_detectors}")
+    assert cudaq_dem.observables_flips_matrix.shape[
+        0] == stim_dem.num_observables, (
+            f"num_observables mismatch: cudaq="
+            f"{cudaq_dem.observables_flips_matrix.shape[0]}, "
+            f"stim={stim_dem.num_observables}")
+
+    cudaq_terms = _cudaq_dem_to_multiset(cudaq_dem)
+    stim_terms = _stim_dem_to_multiset(stim_dem)
+
+    # Catch regression of same-syndrome/different-observable merging.
+    cudaq_keys = set(cudaq_terms)
+    stim_keys = set(stim_terms)
+    assert cudaq_keys == stim_keys, (
+        f"DEM key sets differ. cudaq-only keys: "
+        f"{sorted(cudaq_keys - stim_keys)}; "
+        f"stim-only keys: {sorted(stim_keys - cudaq_keys)}")
+
+    # The small tolerance covers different grouping of Pauli outcomes.
+    for k in cudaq_keys:
+        assert np.isclose(cudaq_terms[k], stim_terms[k], atol=1e-4,
+                          rtol=1e-3), (
+                              f"probability mismatch at {k}: "
+                              f"cudaq={cudaq_terms[k]}, stim={stim_terms[k]}")
 
 
 def test_x_dem_from_memory_circuit():
@@ -296,8 +438,7 @@ def test_decoding_from_surface_code_dem_from_memory_circuit(
 
 
 def test_pymatching_decode_to_observable_surface_code_dem():
-    """Test PyMatching with O (observables) matrix: decoder returns observable
-    flips directly.cpp)."""
+    """Test PyMatching with O matrix on cudaq-qec's generated surface-code DEM."""
     cudaq.set_random_seed(13)
     code = qec.get_code('surface_code', distance=5)
     Lz = code.get_observables_z()
@@ -320,16 +461,19 @@ def test_pymatching_decode_to_observable_surface_code_dem():
 
     dem = qec.z_dem_from_memory_circuit(code, statePrep, nRounds, noise)
 
+    # Correct canonicalization preserves parallel matching edges; merge them
+    # inside PyMatching instead of dropping DEM columns upstream.
     decoder = qec.get_decoder(
         'pymatching',
         dem.detector_error_matrix,
         O=dem.observables_flips_matrix,
         error_rate_vec=np.array(dem.error_rates),
+        merge_strategy='independent',
     )
 
     dr = decoder.decode_batch(syndromes)
-    # With decode_to_observables=True, each row is observable flips
-    # (length num_observables), not error predictions.
+    # With O provided, PyMatching returns observable-flip predictions directly,
+    # one row per shot.
     obs_per_shot = np.asarray(dr.result, dtype=np.float64)
     data_predictions = np.round(obs_per_shot).astype(np.uint8).T
 
@@ -338,6 +482,65 @@ def test_pymatching_decode_to_observable_surface_code_dem():
     assert nLogicalErrorsWithDecoding < nLogicalErrorsWithoutDecoding
 
 
+def test_pymatching_decodes_stim_surface_code_dem():
+    """Decode a Stim surface-code DEM through cudaq-qec's PyMatching plugin."""
+    stim_mod = pytest.importorskip(
+        "stim",
+        reason="stim not installed; skipping Stim-based PyMatching decode test"
+    )
+
+    distance = 5
+    n_rounds = 5
+    p = 0.003
+    n_shots = 2000
+
+    circuit = stim_mod.Circuit.generated(
+        "surface_code:rotated_memory_z",
+        distance=distance,
+        rounds=n_rounds,
+        after_clifford_depolarization=p,
+        after_reset_flip_probability=p,
+        before_measure_flip_probability=p,
+        before_round_data_depolarization=p,
+    )
+    # Matching plugin expects graphlike columns (1 or 2 detectors).
+    stim_dem = circuit.detector_error_model(decompose_errors=True)
+
+    H, O, rates = _stim_dem_to_arrays(stim_dem)
+
+    # Sample syndromes and true observable flips from the same Stim circuit.
+    sampler = circuit.compile_detector_sampler(seed=13)
+    syndromes_bool, obs_bool = sampler.sample(shots=n_shots,
+                                              separate_observables=True)
+    syndromes = syndromes_bool.astype(np.uint8)
+    logical_measurements = obs_bool.astype(np.uint8).flatten()
+
+    # Surface-code DEMs have parallel edges; let PyMatching merge them.
+    try:
+        decoder = qec.get_decoder(
+            'pymatching',
+            H,
+            O=O,
+            error_rate_vec=rates,
+            merge_strategy='independent',
+        )
+    except Exception as e:
+        pytest.skip(f'pymatching decoder unavailable in this build: {e}')
+
+    dr = decoder.decode_batch(syndromes)
+    # With O provided, the decoder returns predicted observable flips.
+    obs_per_shot = np.asarray(dr.result, dtype=np.float64)
+    data_predictions = np.round(obs_per_shot).astype(np.uint8).flatten()
+
+    n_errors_without_decoding = int(np.sum(logical_measurements))
+    n_errors_with_decoding = int(
+        np.sum(data_predictions ^ logical_measurements))
+    assert n_errors_with_decoding < n_errors_without_decoding, (
+        f"PyMatching did not reduce logical errors: "
+        f"with_decoding={n_errors_with_decoding}, "
+        f"without_decoding={n_errors_without_decoding}")
+
+
 def test_pcm_extend_to_n_rounds():
     # This test independently compares the functionality of dem_from_memory_circuit
     # (of two different numbers of rounds) to pcm_extend_to_n_rounds.

From b4d4a725bd24d59c8f36f11e72d73f4c8ec46cf7 Mon Sep 17 00:00:00 2001
From: Kaiqi Yan <kaiqiy@nvidia.com>
Date: Thu, 28 May 2026 06:30:22 +0000
Subject: [PATCH 3/6] sort and check O-signatures

Signed-off-by: Kaiqi Yan <kaiqiy@nvidia.com>
---
 libs/qec/lib/detector_error_model.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libs/qec/lib/detector_error_model.cpp b/libs/qec/lib/detector_error_model.cpp
index b90d1cad..56333565 100644
--- a/libs/qec/lib/detector_error_model.cpp
+++ b/libs/qec/lib/detector_error_model.cpp
@@ -9,7 +9,6 @@
 #include "cudaq/qec/detector_error_model.h"
 #include "cudaq/qec/pcm_utils.h"
 #include "cudaq/runtime/logger/logger.h"
-#include <algorithm>
 
 namespace cudaq::qec {
 

From 0f7d62806e3d9551ed0f39f090e523a53bdce599 Mon Sep 17 00:00:00 2001
From: Kaiqi Yan <kaiqiy@nvidia.com>
Date: Thu, 28 May 2026 06:41:01 +0000
Subject: [PATCH 4/6] update and add tests

Signed-off-by: Kaiqi Yan <kaiqiy@nvidia.com>
---
 libs/qec/python/tests/test_dem.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/libs/qec/python/tests/test_dem.py b/libs/qec/python/tests/test_dem.py
index 09c10838..ef7faf1f 100644
--- a/libs/qec/python/tests/test_dem.py
+++ b/libs/qec/python/tests/test_dem.py
@@ -189,10 +189,10 @@ def _stim_dem_to_arrays(stim_dem):
             h_cols.append(h_col)
             o_cols.append(o_col)
             rates.append(prob)
-    H = np.stack(h_cols, axis=1) if h_cols else np.zeros((n_dets, 0),
-                                                         dtype=np.uint8)
-    O = np.stack(o_cols, axis=1) if o_cols else np.zeros((n_obs, 0),
-                                                         dtype=np.uint8)
+    H = np.stack(h_cols, axis=1) if h_cols else np.zeros(
+        (n_dets, 0), dtype=np.uint8)
+    O = np.stack(o_cols, axis=1) if o_cols else np.zeros(
+        (n_obs, 0), dtype=np.uint8)
     return H, O, np.asarray(rates, dtype=np.float64)
 
 
@@ -229,8 +229,8 @@ def test_z_dem_from_memory_circuit_against_stim_oracle():
     """Compare cudaq-qec's Steane Z-DEM against Stim's independent DEM."""
     stim_mod = pytest.importorskip(
         "stim",
-        reason="stim not installed; skipping Stim oracle cross-check for Steane Z-DEM"
-    )
+        reason=
+        "stim not installed; skipping Stim oracle cross-check for Steane Z-DEM")
 
     code = qec.get_code('steane')
     p = 0.01
@@ -267,10 +267,10 @@ def test_z_dem_from_memory_circuit_against_stim_oracle():
 
     # The small tolerance covers different grouping of Pauli outcomes.
     for k in cudaq_keys:
-        assert np.isclose(cudaq_terms[k], stim_terms[k], atol=1e-4,
-                          rtol=1e-3), (
-                              f"probability mismatch at {k}: "
-                              f"cudaq={cudaq_terms[k]}, stim={stim_terms[k]}")
+        assert np.isclose(
+            cudaq_terms[k], stim_terms[k], atol=1e-4,
+            rtol=1e-3), (f"probability mismatch at {k}: "
+                         f"cudaq={cudaq_terms[k]}, stim={stim_terms[k]}")
 
 
 def test_x_dem_from_memory_circuit():
@@ -486,8 +486,7 @@ def test_pymatching_decodes_stim_surface_code_dem():
     """Decode a Stim surface-code DEM through cudaq-qec's PyMatching plugin."""
     stim_mod = pytest.importorskip(
         "stim",
-        reason="stim not installed; skipping Stim-based PyMatching decode test"
-    )
+        reason="stim not installed; skipping Stim-based PyMatching decode test")
 
     distance = 5
     n_rounds = 5
@@ -533,8 +532,8 @@ def test_pymatching_decodes_stim_surface_code_dem():
     data_predictions = np.round(obs_per_shot).astype(np.uint8).flatten()
 
     n_errors_without_decoding = int(np.sum(logical_measurements))
-    n_errors_with_decoding = int(
-        np.sum(data_predictions ^ logical_measurements))
+    n_errors_with_decoding = int(np.sum(data_predictions ^
+                                        logical_measurements))
     assert n_errors_with_decoding < n_errors_without_decoding, (
         f"PyMatching did not reduce logical errors: "
         f"with_decoding={n_errors_with_decoding}, "

From 7bb56a8ecc834444c0f964c75996f6089f90c924 Mon Sep 17 00:00:00 2001
From: Kaiqi Yan <kaiqiy@nvidia.com>
Date: Fri, 29 May 2026 04:32:17 +0000
Subject: [PATCH 5/6] throw a warning when same-H diff-O

Signed-off-by: Kaiqi Yan <kaiqiy@nvidia.com>
---
 libs/qec/lib/detector_error_model.cpp | 25 +---------------------
 libs/qec/python/tests/test_dem.py     | 30 +++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/libs/qec/lib/detector_error_model.cpp b/libs/qec/lib/detector_error_model.cpp
index 56333565..9dd69e76 100644
--- a/libs/qec/lib/detector_error_model.cpp
+++ b/libs/qec/lib/detector_error_model.cpp
@@ -76,29 +76,6 @@ void detector_error_model::canonicalize_for_rounds(
     return true;
   };
 
-  auto observables_less = [&](std::uint32_t lhs, std::uint32_t rhs) {
-    for (std::size_t r = 0; r < num_obs; r++) {
-      auto lhs_obs = this->observables_flips_matrix.at({r, lhs});
-      auto rhs_obs = this->observables_flips_matrix.at({r, rhs});
-      if (lhs_obs != rhs_obs)
-        return lhs_obs < rhs_obs;
-    }
-    return lhs < rhs;
-  };
-
-  // Keep the PCM/topological ordering from get_sorted_pcm_column_indices, but
-  // sort equal-syndrome groups by observable signature so merge-compatible DEM
-  // columns are adjacent.
-  for (auto group_begin = column_order.begin();
-       group_begin != column_order.end();) {
-    auto group_end = group_begin + 1;
-    while (group_end != column_order.end() &&
-           row_indices[*group_end] == row_indices[*group_begin])
-      ++group_end;
-    std::sort(group_begin, group_end, observables_less);
-    group_begin = group_end;
-  }
-
   std::vector<std::uint32_t> final_column_order;
   // March through the columns in topological order, and combine the probability
   // weight vectors if the columns have the same row indices.
@@ -163,7 +140,7 @@ void detector_error_model::canonicalize_for_rounds(
         // Either the syndrome differs, or the same syndrome has a different
         // observable flip. In both cases this is a distinct error mechanism.
         if (prev_row_indices == curr_row_indices) {
-          cudaq::info(
+          cudaq::warn(
               "detector_error_model::canonicalize_for_rounds: identical "
               "syndromes exist in detector_error_matrix but have different "
               "observables in observables_flips_matrix; keeping column {} as a "
diff --git a/libs/qec/python/tests/test_dem.py b/libs/qec/python/tests/test_dem.py
index ef7faf1f..0d328f81 100644
--- a/libs/qec/python/tests/test_dem.py
+++ b/libs/qec/python/tests/test_dem.py
@@ -570,5 +570,35 @@ def test_pcm_extend_to_n_rounds():
     assert np.allclose(H15_new_error_rates, dem15.error_rates, atol=1e-6)
 
 
+def test_canonicalize_keeps_duplicate_syndromes_with_different_observables():
+    # Same syndrome but different observable flips are distinct mechanisms.
+    dem = qec.DetectorErrorModel()
+    dem.detector_error_matrix = np.array([[1, 1]], dtype=np.uint8)
+    dem.observables_flips_matrix = np.array([[0, 1]], dtype=np.uint8)
+    dem.error_rates = [0.1, 0.2]
+
+    dem.canonicalize_for_rounds(1)
+
+    assert dem.detector_error_matrix.shape[1] == 2
+    assert dem.observables_flips_matrix.shape[1] == 2
+    assert np.array_equal(dem.observables_flips_matrix,
+                          np.array([[0, 1]], dtype=np.uint8))
+
+
+def test_canonicalize_merges_duplicate_syndromes_with_same_observables():
+    # Same syndrome and same observable flip can be represented by one column.
+    dem = qec.DetectorErrorModel()
+    dem.detector_error_matrix = np.array([[1, 1]], dtype=np.uint8)
+    dem.observables_flips_matrix = np.array([[1, 1]], dtype=np.uint8)
+    dem.error_rates = [0.1, 0.2]
+
+    dem.canonicalize_for_rounds(1)
+
+    assert dem.detector_error_matrix.shape[1] == 1
+    assert dem.observables_flips_matrix.shape[1] == 1
+    assert np.array_equal(dem.observables_flips_matrix,
+                          np.array([[1]], dtype=np.uint8))
+
+
 if __name__ == "__main__":
     pytest.main()

From c522b940792f57bb18c83dae09bb0f713b83d525 Mon Sep 17 00:00:00 2001
From: Kaiqi Yan <kaiqiy@nvidia.com>
Date: Tue, 2 Jun 2026 11:22:20 +0000
Subject: [PATCH 6/6] update sliding window tests to cope with same-H diff-O

Signed-off-by: Kaiqi Yan <kaiqiy@nvidia.com>
---
 libs/qec/lib/detector_error_model.cpp        | 30 ++++++--
 libs/qec/python/tests/test_sliding_window.py | 77 ++++++++++++++++++--
 2 files changed, 93 insertions(+), 14 deletions(-)

diff --git a/libs/qec/lib/detector_error_model.cpp b/libs/qec/lib/detector_error_model.cpp
index 9dd69e76..db5d21fc 100644
--- a/libs/qec/lib/detector_error_model.cpp
+++ b/libs/qec/lib/detector_error_model.cpp
@@ -96,6 +96,12 @@ void detector_error_model::canonicalize_for_rounds(
         "or the detector_error_matrix  was computed incorrectly.");
   }
 
+  // Cap the number of "same syndrome, different observable" warnings emitted
+  // per invocation. Short-distance codes can have many such mechanisms, and
+  // logging every one of them would spam the console.
+  constexpr std::size_t max_same_syndrome_diff_obs_warnings = 10;
+  std::size_t num_same_syndrome_diff_obs = 0;
+
   for (std::size_t c = 0; c < num_cols; c++) {
     auto column_index = column_order[c];
     auto &curr_row_indices = row_indices[column_index];
@@ -140,12 +146,14 @@ void detector_error_model::canonicalize_for_rounds(
         // Either the syndrome differs, or the same syndrome has a different
         // observable flip. In both cases this is a distinct error mechanism.
         if (prev_row_indices == curr_row_indices) {
-          cudaq::warn(
-              "detector_error_model::canonicalize_for_rounds: identical "
-              "syndromes exist in detector_error_matrix but have different "
-              "observables in observables_flips_matrix; keeping column {} as a "
-              "distinct error mechanism (previous column {})",
-              column_index, previous_column);
+          if (num_same_syndrome_diff_obs < max_same_syndrome_diff_obs_warnings)
+            cudaq::warn(
+                "detector_error_model::canonicalize_for_rounds: identical "
+                "syndromes exist in detector_error_matrix but have different "
+                "observables in observables_flips_matrix; keeping column {} as "
+                "a distinct error mechanism (previous column {})",
+                column_index, previous_column);
+          num_same_syndrome_diff_obs++;
         }
         new_row_indices.push_back(curr_row_indices);
         new_weights.push_back(error_rates[column_index]);
@@ -156,6 +164,16 @@ void detector_error_model::canonicalize_for_rounds(
     }
   }
 
+  // Emit a single summary if we suppressed any per-column warnings above.
+  if (num_same_syndrome_diff_obs > max_same_syndrome_diff_obs_warnings)
+    cudaq::warn(
+        "detector_error_model::canonicalize_for_rounds: found {} columns with "
+        "identical syndromes but different observables; suppressed {} "
+        "additional warnings (only the first {} were shown).",
+        num_same_syndrome_diff_obs,
+        num_same_syndrome_diff_obs - max_same_syndrome_diff_obs_warnings,
+        max_same_syndrome_diff_obs_warnings);
+
   std::swap(this->error_rates, new_weights);
   if (has_error_ids)
     std::swap(*this->error_ids, new_error_ids);
diff --git a/libs/qec/python/tests/test_sliding_window.py b/libs/qec/python/tests/test_sliding_window.py
index ca00f24b..153a65e8 100644
--- a/libs/qec/python/tests/test_sliding_window.py
+++ b/libs/qec/python/tests/test_sliding_window.py
@@ -46,10 +46,20 @@ def test_sliding_window_1(decoder_name, batched, num_rounds, num_windows):
         col = np.random.randint(0, dem.detector_error_matrix.shape[1])
         syndromes[shot, :] = dem.detector_error_matrix[:, col].T
 
-    # First compare the results of the full decoder to the sliding window
-    # decoder using an inner decoder of the full window size. The results should
-    # be the same.
-    full_decoder = qec.get_decoder(decoder_name, dem.detector_error_matrix)
+    # Compare the full decoder against the sliding window decoder (configured as
+    # a single full-size window). For column-space decoders (single_error_lut)
+    # the corrections must be identical; for matching decoders (pymatching),
+    # same-H/different-O degeneracy makes the exact error column non-unique, so
+    # we instead require each correction to reproduce the input syndrome.
+    if decoder_name == "pymatching":
+        # canonicalize_for_rounds keeps same-syndrome/different-observable
+        # columns distinct; these are parallel edges in the matching graph that
+        # PyMatching's default 'disallow' strategy rejects, so merge them.
+        full_decoder = qec.get_decoder(decoder_name,
+                                       dem.detector_error_matrix,
+                                       merge_strategy="independent")
+    else:
+        full_decoder = qec.get_decoder(decoder_name, dem.detector_error_matrix)
     num_syndromes_per_round = dem.detector_error_matrix.shape[0] // num_rounds
 
     sw_as_full_decoder = qec.get_decoder(
@@ -67,18 +77,69 @@ def test_sliding_window_1(decoder_name, batched, num_rounds, num_windows):
             'merge_strategy': 'smallest_weight'
         })
 
+    # H maps an error (column space) to the syndrome it produces (mod 2).
+    H = np.asarray(dem.detector_error_matrix, dtype=np.int64)
+    # pymatching is a graph/matching decoder: for degenerate same-H/different-O
+    # groups it cannot return a unique column, so validate by syndrome
+    # reproduction rather than exact column equality.
+    check_syndrome_only = decoder_name == "pymatching"
+
     if batched:
         full_results = full_decoder.decode_batch(syndromes)
         sw_results = sw_as_full_decoder.decode_batch(syndromes)
-        num_mismatches = np.count_nonzero(
-            np.any(full_results.result != sw_results.result, axis=1))
-        assert num_mismatches == 0
+        if check_syndrome_only:
+            full_e = np.asarray(full_results.result, dtype=np.int64)
+            sw_e = np.asarray(sw_results.result, dtype=np.int64)
+            target = syndromes.astype(np.int64)
+            # ASSERT: every correction (full and windowed) explains the observed
+            # syndrome, i.e. (H @ e) % 2 == syndrome for all shots.
+            assert np.array_equal((full_e @ H.T) % 2, target)
+            assert np.array_equal((sw_e @ H.T) % 2, target)
+        else:
+            # ASSERT: column-space decoders produce identical corrections.
+            num_mismatches = np.count_nonzero(
+                np.any(full_results.result != sw_results.result, axis=1))
+            assert num_mismatches == 0
 
     else:
         num_mismatches = 0
         for syndrome in syndromes:
             r1 = full_decoder.decode(syndrome)
             r2 = sw_as_full_decoder.decode(syndrome)
-            if not np.array_equal(r1.result, r2.result):
+            if check_syndrome_only:
+                # A correction is valid if it reproduces the observed syndrome
+                # via H (mod 2); count a mismatch if either decoder fails to.
+                target = np.asarray(syndrome, dtype=np.int64)
+                e1 = np.asarray(r1.result, dtype=np.int64)
+                e2 = np.asarray(r2.result, dtype=np.int64)
+                if not (np.array_equal((H @ e1) % 2, target) and np.array_equal(
+                    (H @ e2) % 2, target)):
+                    num_mismatches += 1
+            elif not np.array_equal(r1.result, r2.result):
                 num_mismatches += 1
         assert num_mismatches == 0
+
+
+def test_pymatching_parallel_edges_use_observable_faults():
+    # Same detector syndrome with different observable flips represents
+    # distinct logical fault mechanisms. H-only PyMatching cannot distinguish
+    # them, so the observable-aware path must preserve O and merge parallel
+    # graph edges inside PyMatching rather than dropping DEM columns.
+    H = np.array([[1, 1]], dtype=np.uint8)
+    O = np.array([[1, 0], [0, 1]], dtype=np.uint8)
+    error_rates = np.array([0.1, 0.2], dtype=np.float64)
+
+    with pytest.raises(ValueError, match="Parallel edges not permitted"):
+        qec.get_decoder("pymatching", H)
+
+    decoder = qec.get_decoder("pymatching",
+                              H,
+                              O=O,
+                              error_rate_vec=error_rates,
+                              merge_strategy="independent")
+    result = decoder.decode_batch(np.array([[1]], dtype=np.uint8))
+
+    assert isinstance(result, qec.BatchDecoderResult)
+    assert result.result.shape[0] == 1
+    assert result.result.shape[1] > 0
+    assert result.converged.tolist() == [True]