NVIDIA · bmhowe23 · Jun 12, 2026 · Jun 12, 2026
diff --git a/libs/qec/include/cudaq/qec/detector_error_model.h b/libs/qec/include/cudaq/qec/detector_error_model.h
@@ -65,8 +65,21 @@ struct detector_error_model {
 
   /// Put the detector_error_matrix into canonical form, where the rows and
   /// columns are ordered in a way that is amenable to the round-based decoding
-  /// process.
-  void canonicalize_for_rounds(uint32_t num_syndromes_per_round);
+  /// process. Columns sharing the same detector AND observable signature are
+  /// merged, with their rates composed so the resulting model matches the
+  /// input model. By default, zero-syndrome columns that still flip an
+  /// observable (undetectable logical errors) are retained so the model's
+  /// observable-flip probability is preserved. Set @p
+  /// remove_zero_syndrome_errors to true to drop all columns with no detector
+  /// signature, which is appropriate when the canonicalized DEM is consumed
+  /// only for round-based decoding (where such columns carry no syndrome).
+  ///
+  /// @note Canonicalization does not preserve cross-column exclusivity
+  /// structure. Each output column is given a fresh unique error id and is
+  /// treated as independent of every other column; any `error_ids` correlation
+  /// present in the input model is discarded.
+  void canonicalize_for_rounds(uint32_t num_syndromes_per_round,
+                               bool remove_zero_syndrome_errors = false);
 };
 
 /// Parse Stim DEM text into detector/observable flip matrices and error rates.

diff --git a/libs/qec/lib/decoders/plugins/pymatching/pymatching.cpp b/libs/qec/lib/decoders/plugins/pymatching/pymatching.cpp
@@ -28,7 +28,16 @@ class pymatching : public decoder {
 
   // Input parameters
   std::vector<double> error_rate_vec;
-  pm::MERGE_STRATEGY merge_strategy_enum = pm::MERGE_STRATEGY::DISALLOW;
+  // Default to INDEPENDENT, matching upstream PyMatching's
+  // Matching.from_detector_error_model. A faithful DEM can legitimately contain
+  // multiple mechanisms that map to the same matching edge (e.g. two errors
+  // with the same single-detector syndrome but different observable flips); a
+  // matching graph cannot represent parallel edges, so they must be combined.
+  // INDEPENDENT combines their probabilities as independent error sources
+  // (keeping the first edge's observables), which is the standard matching
+  // approximation. Use merge_strategy="disallow" to instead reject parallel
+  // edges.
+  pm::MERGE_STRATEGY merge_strategy_enum = pm::MERGE_STRATEGY::INDEPENDENT;
 
   // Map of edge pairs to column indices. This does not seem particularly
   // efficient.

diff --git a/libs/qec/lib/detector_error_model.cpp b/libs/qec/lib/detector_error_model.cpp
@@ -13,6 +13,7 @@
 #include "stim.h"
 
 #include <exception>
+#include <map>
 #include <stdexcept>
 #include <string>
 #include <utility>
@@ -124,19 +125,13 @@ std::size_t detector_error_model::num_observables() const {
 }
 
 void detector_error_model::canonicalize_for_rounds(
-    uint32_t num_syndromes_per_round) {
+    uint32_t num_syndromes_per_round, bool remove_zero_syndrome_errors) {
   auto row_indices = dense_to_sparse(detector_error_matrix);
   auto column_order =
       get_sorted_pcm_column_indices(row_indices, num_syndromes_per_round);
-  std::vector<std::uint32_t> final_column_order;
-  // March through the columns in topological order, and combine the probability
-  // weight vectors if the columns have the same row indices.
-  std::vector<std::vector<std::uint32_t>> new_row_indices;
-  std::vector<double> new_weights;
-  std::vector<std::size_t> new_error_ids;
   const std::size_t num_obs = this->num_observables();
   const auto num_cols = column_order.size();
-  bool has_error_ids =
+  const bool has_error_ids =
       error_ids.has_value() && error_ids->size() == error_rates.size();
 
   if (row_indices.size() > error_rates.size()) {
@@ -149,72 +144,116 @@ void detector_error_model::canonicalize_for_rounds(
         "or the detector_error_matrix  was computed incorrectly.");
   }
 
+  // March through the columns in topological order and merge columns that share
+  // the SAME full signature: identical detector rows AND identical observable
+  // rows. Columns that differ in either are distinct error mechanisms and are
+  // kept separate (merging on detectors alone would relabel observable-flip
+  // probability mass). The merge key is therefore (detector rows, observable
+  // rows); because the sort above only orders by detector rows, columns with
+  // the same detectors but different observables can be interleaved, so we
+  // group by key explicitly rather than relying on adjacency.
+  using signature_t =
+      std::pair<std::vector<std::uint32_t>, std::vector<std::uint32_t>>;
+  std::map<signature_t, std::size_t> sig_to_out;
+  std::vector<std::uint32_t> final_column_order;
+  // For each retained output column, accumulate probability mass grouped by the
+  // exclusive-set it belongs to. Within one exclusive set (same error id) the
+  // alternatives are mutually exclusive, so their rates add. Across exclusive
+  // sets the mechanisms are independent, so they are combined with the XOR rule
+  // P(A xor B) = P(A) + P(B) - 2 P(A) P(B). When error ids are absent every
+  // column is treated as its own independent mechanism (keyed by its original
+  // column index), reproducing the all-XOR behavior.
+  std::vector<std::map<std::size_t, double>> out_exclusive;
+
+  // Track the first observable signature seen for each detector signature so we
+  // can flag columns that share a syndrome but flip a different observable.
+  // These are kept as distinct mechanisms (above), but they are worth
+  // surfacing: they often indicate an ambiguous/degenerate decoding situation.
+  // Cap the per-invocation warnings since short-distance codes can have many
+  // such mechanisms, and emit a single summary for the remainder.
+  constexpr std::size_t max_same_syndrome_diff_obs_warnings = 10;
+  std::size_t num_same_syndrome_diff_obs = 0;
+  std::map<std::vector<std::uint32_t>,
+           std::pair<std::vector<std::uint32_t>, std::uint32_t>>
+      first_obs_for_detector;
+
   for (std::size_t c = 0; c < num_cols; c++) {
-    auto column_index = column_order[c];
-    auto &curr_row_indices = row_indices[column_index];
-    // If the column has no non-zero elements, or a weight of 0, then we skip
-    // it.
-    if (curr_row_indices.size() == 0 || error_rates[column_index] == 0)
+    const auto column_index = column_order[c];
+    const auto &curr_row_indices = row_indices[column_index];
+    const double rate = error_rates[column_index];
+
+    // Build the observable-flip signature for this column.
+    std::vector<std::uint32_t> obs_indices;
+    for (std::size_t r = 0; r < num_obs; r++)
+      if (this->observables_flips_matrix.at({r, column_index}))
+        obs_indices.push_back(static_cast<std::uint32_t>(r));
+
+    // Skip columns that carry no information: zero probability, or no detector
+    // signature AND no observable flip. A column with no detectors but a
+    // nonzero observable flip is a genuine (undetectable) logical error and is
+    // retained by default so the model's observable-flip mass is preserved.
+    // Such a column has no syndrome for a round-based decoder to act on, so
+    // callers that only consume the detector matrix for decoding can drop all
+    // zero-syndrome columns via remove_zero_syndrome_errors.
+    const bool zero_syndrome = curr_row_indices.empty();
+    if (rate == 0.0 || (zero_syndrome && obs_indices.empty()) ||
+        (remove_zero_syndrome_errors && zero_syndrome))
       continue;
-    if (new_row_indices.empty()) {
-      new_row_indices.push_back(curr_row_indices);
-      new_weights.push_back(error_rates[column_index]);
+
+    signature_t sig{curr_row_indices, obs_indices};
+    auto [it, inserted] = sig_to_out.try_emplace(sig, out_exclusive.size());
+    if (inserted) {
+      out_exclusive.emplace_back();
       final_column_order.push_back(column_index);
-      if (has_error_ids)
-        new_error_ids.push_back(error_ids->at(column_index));
-    } else {
-      auto &prev_row_indices = new_row_indices.back();
-      if (prev_row_indices == curr_row_indices) {
-        // The current column has the same row indices as the previous column
-        // (i.e. has the same syndrome signature) so we update the error_rates
-        // and do NOT add the duplicate column.
-        auto prev_weight = new_weights.back();
-        auto prev_error_id = has_error_ids
-                                 ? new_error_ids.back()
-                                 : std::numeric_limits<std::size_t>::max();
-        auto curr_weight = error_rates[column_index];
-        bool same_error_id =
-            has_error_ids && prev_error_id == error_ids->at(column_index);
-        double scale_factor = same_error_id ? 0.0 : 1.0;
-        // The new weight is the probability that exactly ONE of the two errors
-        // occurs. This is given by the formula: P(A xor B) = P(A) + P(B) - 2 *
-        // P(A and B). If the errors originate from the same error mechanism,
-        // then P(A and B) = 0.
-        auto new_weight = prev_weight + curr_weight -
-                          scale_factor * 2.0 * prev_weight * curr_weight;
-        new_weights.back() = new_weight;
-        // Arbitrarily choose to keep the smaller error ID.
-        if (has_error_ids)
-          new_error_ids.back() =
-              std::min(prev_error_id, error_ids->at(column_index));
-        // Verify that the observables are the same for the duplicate column.
-        auto previous_column = column_order[c - 1];
-        bool match = true;
-        for (std::size_t r = 0; r < num_obs; r++) {
-          if (this->observables_flips_matrix.at({r, previous_column}) !=
-              this->observables_flips_matrix.at({r, column_index})) {
-            match = false;
-            break;
-          }
-        }
-        if (!match) {
-          cudaq::info(
+
+      // A new full signature. If this detector syndrome was already seen with a
+      // different observable signature, this is a "same syndrome, different
+      // observable" mechanism; flag it (capped).
+      auto [dit, first_seen] = first_obs_for_detector.try_emplace(
+          curr_row_indices, obs_indices, column_index);
+      if (!first_seen && dit->second.first != obs_indices) {
+        if (num_same_syndrome_diff_obs < max_same_syndrome_diff_obs_warnings)
+          cudaq::warn(
               "detector_error_model::canonicalize_for_rounds: identical "
               "syndromes exist in detector_error_matrix but have different "
-              "observables in observables_flips_matrix (columns {} and {})",
-              previous_column, column_index);
-        }
-      } else {
-        // The current column has different row indices than the previous
-        // column. So we add the current column to the new PCM, and update the
-        // error_rates.
-        new_row_indices.push_back(curr_row_indices);
-        new_weights.push_back(error_rates[column_index]);
-        final_column_order.push_back(column_index);
-        if (has_error_ids)
-          new_error_ids.push_back(error_ids->at(column_index));
+              "observables in observables_flips_matrix; keeping column {} as a "
+              "distinct error mechanism (previous column {})",
+              column_index, dit->second.second);
+        num_same_syndrome_diff_obs++;
       }
     }
+    const std::size_t exclusive_key =
+        has_error_ids ? error_ids->at(column_index) : column_index;
+    out_exclusive[it->second][exclusive_key] += rate;
+  }
+
+  // Emit a single summary if we suppressed any per-column warnings above.
+  if (num_same_syndrome_diff_obs > max_same_syndrome_diff_obs_warnings)
+    cudaq::warn(
+        "detector_error_model::canonicalize_for_rounds: found {} columns with "
+        "identical syndromes but different observables; suppressed {} "
+        "additional warnings (only the first {} were shown).",
+        num_same_syndrome_diff_obs,
+        num_same_syndrome_diff_obs - max_same_syndrome_diff_obs_warnings,
+        max_same_syndrome_diff_obs_warnings);
+
+  std::vector<double> new_weights;
+  std::vector<std::size_t> new_error_ids;
+  new_weights.reserve(out_exclusive.size());
+  for (std::size_t i = 0; i < out_exclusive.size(); i++) {
+    double weight = 0.0;
+    for (const auto &[id, p] : out_exclusive[i])
+      weight = weight + p - 2.0 * weight * p;
+    new_weights.push_back(weight);
+    // Assign each output column a fresh unique id. Canonicalization does not
+    // preserve any cross-column exclusivity structure: if a source mechanism's
+    // mutually-exclusive outcomes land in different signature columns, that
+    // relationship is no longer recoverable from the ids, so we do not pretend
+    // it is. Unique ids simply mark every canonicalized column as an
+    // independent mechanism, which is the relation the merged rates were
+    // composed under.
+    if (has_error_ids)
+      new_error_ids.push_back(i);
   }
 
   std::swap(this->error_rates, new_weights);

diff --git a/libs/qec/lib/experiments.cpp b/libs/qec/lib/experiments.cpp
@@ -458,7 +458,10 @@ cudaq::qec::detector_error_model dem_from_memory_circuit(
   // dem.detector_error_matrix.dump_bits();
   // printf("dem.observables_flips_matrix Before canonicalization:\n");
   // dem.observables_flips_matrix.dump_bits();
-  dem.canonicalize_for_rounds(numReturnSynPerRound);
+  // This DEM is returned for round-based decoding, so drop zero-syndrome
+  // columns (no detector signature) that a decoder cannot act on.
+  dem.canonicalize_for_rounds(numReturnSynPerRound,
+                              /*remove_zero_syndrome_errors=*/true);
   // printf("dem.detector_error_matrix After canonicalization:\n");
   // dem.detector_error_matrix.dump_bits();
   // printf("dem.observables_flips_matrix After canonicalization:\n");

diff --git a/libs/qec/python/bindings/py_decoder.cpp b/libs/qec/python/bindings/py_decoder.cpp
@@ -762,9 +762,24 @@ void bindDecoder(nb::module_ &mod) {
       .def("canonicalize_for_rounds",
            &detector_error_model::canonicalize_for_rounds,
            R"pbdoc(
-            Canonicalize the detector error model for a given number of rounds
+            Canonicalize the detector error model for a given number of rounds.
+
+            Columns sharing the same detector and observable signature are
+            merged, with rates composed to match the input model. By default,
+            zero-syndrome columns that still flip an observable (undetectable
+            logical errors) are retained so the model's observable-flip
+            probability is preserved. Set ``remove_zero_syndrome_errors=True``
+            to drop all columns with no detector signature, which is
+            appropriate when the canonicalized DEM is consumed only for
+            round-based decoding.
+
+            Canonicalization does not preserve cross-column exclusivity
+            structure: each output column is given a fresh unique error id and
+            treated as independent of every other column, so any ``error_ids``
+            correlation in the input model is discarded.
           )pbdoc",
-           nb::arg("num_syndromes_per_round"));
+           nb::arg("num_syndromes_per_round"),
+           nb::arg("remove_zero_syndrome_errors") = false);
 
   qecmod.def(
       "dem_from_stim_text", &dem_from_stim_text,

diff --git a/libs/qec/python/tests/test_dem.py b/libs/qec/python/tests/test_dem.py
@@ -96,13 +96,16 @@ def test_dem_from_memory_circuit():
     nRounds = 2
 
     dem = qec.z_dem_from_memory_circuit(code, statePrep, nRounds, noise)
+    # Mechanisms that share a detector syndrome but flip a different observable
+    # are kept as distinct columns (they are not merged), so the same detector
+    # column pattern can appear more than once with a distinct observable row.
     expected_detector_error_matrix = """
-1111...1..............
-.1.111..111...........
-..11.11..1.1111.......
-.......111.1.1.1111...
-..........1.11..1.111.
-..............1..11.11
+11111.....1.................
+..1.1111...111..............
+...11..111..1.1111..........
+..........111.1.1.1111......
+.............1.11..1.11111..
+.................1..11..1111
 """
     # Uncomment the following line to get a string representation of the DEM
     # that you can compare to expected_detector_error_matrix.
@@ -117,13 +120,14 @@ def test_dem_from_memory_circuit():
     # The following error rates were captured from the above print statement and
     # are considered "truth" data now.
     expected_error_rates = [
-        0.0183, 0.0235, 0.0158, 0.0209, 0.0310, 0.0235, 0.0183, 0.0106, 0.0053,
-        0.0053, 0.0106, 0.0053, 0.0053, 0.0053, 0.0106, 0.0235, 0.0158, 0.0158,
-        0.0183, 0.0335, 0.0209, 0.0434
+        0.0053, 0.0132, 0.0235, 0.0158, 0.0210, 0.0235, 0.0080, 0.0235, 0.0132,
+        0.0053, 0.0106, 0.0053, 0.0053, 0.0106, 0.0053, 0.0053, 0.0053, 0.0106,
+        0.0235, 0.0158, 0.0158, 0.0184, 0.0080, 0.0260, 0.0158, 0.0053, 0.0184,
+        0.0261
     ]
     assert np.allclose(dem.error_rates, expected_error_rates, atol=1e-4)
 
-    expected_observables_flips_matrix = '1....11.....1......111\n'
+    expected_observables_flips_matrix = '1.....111......1......1.1.1.\n'
     # Uncomment the following line to get a string representation of the
     # observables flips matrix that you can compare to
     # expected_observables_flips_matrix.

diff --git a/libs/qec/python/tests/test_sliding_window.py b/libs/qec/python/tests/test_sliding_window.py
@@ -48,8 +48,13 @@ def test_sliding_window_1(decoder_name, batched, num_rounds, num_windows):
 
     # First compare the results of the full decoder to the sliding window
     # decoder using an inner decoder of the full window size. The results should
-    # be the same.
-    full_decoder = qec.get_decoder(decoder_name, dem.detector_error_matrix)
+    # be the same. The full decoder must use the same merge_strategy as the
+    # sliding window's inner decoder (below): a faithful DEM can contain
+    # parallel matching edges, and the two decoders only agree if they combine
+    # those edges identically.
+    full_decoder = qec.get_decoder(decoder_name,
+                                   dem.detector_error_matrix,
+                                   merge_strategy='smallest_weight')
     num_syndromes_per_round = dem.detector_error_matrix.shape[0] // num_rounds
 
     sw_as_full_decoder = qec.get_decoder(

diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
@@ -596,7 +596,8 @@ void demo_circuit_host(const cudaq::qec::code &code, int distance,
     obs_matrix.dump_bits();
     dem.observables_flips_matrix = obs_matrix.dot(msm_obs) % 2;
     printf("numSyndromesPerRound: %ld\n", numSyndromesPerRound);
-    dem.canonicalize_for_rounds(numSyndromesPerRound);
+    dem.canonicalize_for_rounds(numSyndromesPerRound,
+                                /*remove_zero_syndrome_errors=*/true);
 
     printf("dem.detector_error_matrix:\n");
     dem.detector_error_matrix.dump_bits();

diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-2.cpp b/libs/qec/unittests/realtime/app_examples/surface_code-2.cpp
@@ -493,7 +493,8 @@ void demo_circuit_host(const cudaq::qec::code &code, int distance,
     obs_matrix.dump_bits();
     dem.observables_flips_matrix = obs_matrix.dot(msm_obs) % 2;
     printf("numSyndromesPerRound: %ld\n", numSyndromesPerRound);
-    dem.canonicalize_for_rounds(numSyndromesPerRound);
+    dem.canonicalize_for_rounds(numSyndromesPerRound,
+                                /*remove_zero_syndrome_errors=*/true);
 
     printf("dem.detector_error_matrix:\n");
     dem.detector_error_matrix.dump_bits();

diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-3.cpp b/libs/qec/unittests/realtime/app_examples/surface_code-3.cpp
@@ -667,8 +667,10 @@ void demo_circuit_host(const cudaq::qec::code &code, int distance,
     printf("numSyndromesPerRound_x: %ld\n", numSyndromesPerRound_x);
 
     // Canonicalize both DEMs with their respective syndrome counts
-    dem_z.canonicalize_for_rounds(numSyndromesPerRound_z);
-    dem_x.canonicalize_for_rounds(numSyndromesPerRound_x);
+    dem_z.canonicalize_for_rounds(numSyndromesPerRound_z,
+                                  /*remove_zero_syndrome_errors=*/true);
+    dem_x.canonicalize_for_rounds(numSyndromesPerRound_x,
+                                  /*remove_zero_syndrome_errors=*/true);
 
     printf("dem_z.detector_error_matrix:\n");
     dem_z.detector_error_matrix.dump_bits();

diff --git a/libs/qec/unittests/realtime/app_examples/surface_code_1.py b/libs/qec/unittests/realtime/app_examples/surface_code_1.py
@@ -550,7 +550,8 @@ def demo_circuit_host(code_obj: qec.code,
         print("obs_matrix:", obs_matrix)
         dem.observables_flips_matrix = (obs_matrix @ msm_obs) % 2
         print("numSyndromesPerRound:", numSyndromesPerRound)
-        dem.canonicalize_for_rounds(numSyndromesPerRound)
+        dem.canonicalize_for_rounds(numSyndromesPerRound,
+                                    remove_zero_syndrome_errors=True)
 
         print("dem.detector_error_matrix:")
         print(dem.detector_error_matrix)