From f1e2f8986a26c24da047c33fcac867b6c9fbf49f Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe@nvidia.com>
Date: Fri, 12 Jun 2026 22:54:07 +0000
Subject: [PATCH 1/2] Fix DEM canonicalization to preserve the modeled error
 process

detector_error_model::canonicalize_for_rounds had three defects that
silently changed the model it was meant to only reorder:

1. Columns were merged on detector signature alone, so mechanisms with
   the same syndrome but different observable flips were collapsed and
   their observable-flip probability mass was relabeled.
2. Zero-syndrome columns were dropped outright, discarding undetectable
   logical errors and underreporting the logical error rate.
3. Rate composition kept min(error_id) and dispatched later merges on
   that id, making the composed rate depend on input column order.

Rework the merge into a group-by-(detector, observable) pass: only
columns with an identical full signature merge; rates compose additively
within an exclusive set (same error id) and via the XOR rule across
independent sets, which is order-independent; and each output column
gets a fresh unique id (canonicalization does not preserve cross-column
exclusivity, and the docs now say so).

Zero-syndrome observable-flipping columns are retained by default to
preserve observable-flip mass, with a new remove_zero_syndrome_errors
option to drop them when the DEM is consumed only for round-based
decoding (where a detector-less column carries no syndrome).
dem_from_memory_circuit and the realtime examples opt into removal.

Columns that share a syndrome but flip a different observable are kept
as distinct mechanisms and surfaced via a warning (capped, with a
summary) since they often indicate an ambiguous decoding situation.

Adds regression tests for observable-aware merging, zero-syndrome
keep/drop, and order-independent composition.

Co-Authored-By: Kaiqi Yan <kaiqiy@nvidia.com>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Ben Howe <bhowe@nvidia.com>
---
 .../include/cudaq/qec/detector_error_model.h  |  17 +-
 libs/qec/lib/detector_error_model.cpp         | 173 +++++++++++-------
 libs/qec/lib/experiments.cpp                  |   5 +-
 libs/qec/python/bindings/py_decoder.cpp       |  19 +-
 .../realtime/app_examples/surface_code-1.cpp  |   3 +-
 .../realtime/app_examples/surface_code-2.cpp  |   3 +-
 .../realtime/app_examples/surface_code-3.cpp  |   6 +-
 .../realtime/app_examples/surface_code_1.py   |   3 +-
 libs/qec/unittests/test_qec.cpp               |  90 +++++++++
 9 files changed, 242 insertions(+), 77 deletions(-)

diff --git a/libs/qec/include/cudaq/qec/detector_error_model.h b/libs/qec/include/cudaq/qec/detector_error_model.h
index 6411f0f7..a8a38ccf 100644
--- a/libs/qec/include/cudaq/qec/detector_error_model.h
+++ b/libs/qec/include/cudaq/qec/detector_error_model.h
@@ -65,8 +65,21 @@ struct detector_error_model {
 
   /// Put the detector_error_matrix into canonical form, where the rows and
   /// columns are ordered in a way that is amenable to the round-based decoding
-  /// process.
-  void canonicalize_for_rounds(uint32_t num_syndromes_per_round);
+  /// process. Columns sharing the same detector AND observable signature are
+  /// merged, with their rates composed so the resulting model matches the
+  /// input model. By default, zero-syndrome columns that still flip an
+  /// observable (undetectable logical errors) are retained so the model's
+  /// observable-flip probability is preserved. Set @p
+  /// remove_zero_syndrome_errors to true to drop all columns with no detector
+  /// signature, which is appropriate when the canonicalized DEM is consumed
+  /// only for round-based decoding (where such columns carry no syndrome).
+  ///
+  /// @note Canonicalization does not preserve cross-column exclusivity
+  /// structure. Each output column is given a fresh unique error id and is
+  /// treated as independent of every other column; any `error_ids` correlation
+  /// present in the input model is discarded.
+  void canonicalize_for_rounds(uint32_t num_syndromes_per_round,
+                               bool remove_zero_syndrome_errors = false);
 };
 
 /// Parse Stim DEM text into detector/observable flip matrices and error rates.
diff --git a/libs/qec/lib/detector_error_model.cpp b/libs/qec/lib/detector_error_model.cpp
index cb9e1a1d..e891ce9b 100644
--- a/libs/qec/lib/detector_error_model.cpp
+++ b/libs/qec/lib/detector_error_model.cpp
@@ -13,6 +13,7 @@
 #include "stim.h"
 
 #include <exception>
+#include <map>
 #include <stdexcept>
 #include <string>
 #include <utility>
@@ -124,19 +125,13 @@ std::size_t detector_error_model::num_observables() const {
 }
 
 void detector_error_model::canonicalize_for_rounds(
-    uint32_t num_syndromes_per_round) {
+    uint32_t num_syndromes_per_round, bool remove_zero_syndrome_errors) {
   auto row_indices = dense_to_sparse(detector_error_matrix);
   auto column_order =
       get_sorted_pcm_column_indices(row_indices, num_syndromes_per_round);
-  std::vector<std::uint32_t> final_column_order;
-  // March through the columns in topological order, and combine the probability
-  // weight vectors if the columns have the same row indices.
-  std::vector<std::vector<std::uint32_t>> new_row_indices;
-  std::vector<double> new_weights;
-  std::vector<std::size_t> new_error_ids;
   const std::size_t num_obs = this->num_observables();
   const auto num_cols = column_order.size();
-  bool has_error_ids =
+  const bool has_error_ids =
       error_ids.has_value() && error_ids->size() == error_rates.size();
 
   if (row_indices.size() > error_rates.size()) {
@@ -149,72 +144,116 @@ void detector_error_model::canonicalize_for_rounds(
         "or the detector_error_matrix  was computed incorrectly.");
   }
 
+  // March through the columns in topological order and merge columns that share
+  // the SAME full signature: identical detector rows AND identical observable
+  // rows. Columns that differ in either are distinct error mechanisms and are
+  // kept separate (merging on detectors alone would relabel observable-flip
+  // probability mass). The merge key is therefore (detector rows, observable
+  // rows); because the sort above only orders by detector rows, columns with
+  // the same detectors but different observables can be interleaved, so we
+  // group by key explicitly rather than relying on adjacency.
+  using signature_t =
+      std::pair<std::vector<std::uint32_t>, std::vector<std::uint32_t>>;
+  std::map<signature_t, std::size_t> sig_to_out;
+  std::vector<std::uint32_t> final_column_order;
+  // For each retained output column, accumulate probability mass grouped by the
+  // exclusive-set it belongs to. Within one exclusive set (same error id) the
+  // alternatives are mutually exclusive, so their rates add. Across exclusive
+  // sets the mechanisms are independent, so they are combined with the XOR rule
+  // P(A xor B) = P(A) + P(B) - 2 P(A) P(B). When error ids are absent every
+  // column is treated as its own independent mechanism (keyed by its original
+  // column index), reproducing the all-XOR behavior.
+  std::vector<std::map<std::size_t, double>> out_exclusive;
+
+  // Track the first observable signature seen for each detector signature so we
+  // can flag columns that share a syndrome but flip a different observable.
+  // These are kept as distinct mechanisms (above), but they are worth
+  // surfacing: they often indicate an ambiguous/degenerate decoding situation.
+  // Cap the per-invocation warnings since short-distance codes can have many
+  // such mechanisms, and emit a single summary for the remainder.
+  constexpr std::size_t max_same_syndrome_diff_obs_warnings = 10;
+  std::size_t num_same_syndrome_diff_obs = 0;
+  std::map<std::vector<std::uint32_t>,
+           std::pair<std::vector<std::uint32_t>, std::uint32_t>>
+      first_obs_for_detector;
+
   for (std::size_t c = 0; c < num_cols; c++) {
-    auto column_index = column_order[c];
-    auto &curr_row_indices = row_indices[column_index];
-    // If the column has no non-zero elements, or a weight of 0, then we skip
-    // it.
-    if (curr_row_indices.size() == 0 || error_rates[column_index] == 0)
+    const auto column_index = column_order[c];
+    const auto &curr_row_indices = row_indices[column_index];
+    const double rate = error_rates[column_index];
+
+    // Build the observable-flip signature for this column.
+    std::vector<std::uint32_t> obs_indices;
+    for (std::size_t r = 0; r < num_obs; r++)
+      if (this->observables_flips_matrix.at({r, column_index}))
+        obs_indices.push_back(static_cast<std::uint32_t>(r));
+
+    // Skip columns that carry no information: zero probability, or no detector
+    // signature AND no observable flip. A column with no detectors but a
+    // nonzero observable flip is a genuine (undetectable) logical error and is
+    // retained by default so the model's observable-flip mass is preserved.
+    // Such a column has no syndrome for a round-based decoder to act on, so
+    // callers that only consume the detector matrix for decoding can drop all
+    // zero-syndrome columns via remove_zero_syndrome_errors.
+    const bool zero_syndrome = curr_row_indices.empty();
+    if (rate == 0.0 || (zero_syndrome && obs_indices.empty()) ||
+        (remove_zero_syndrome_errors && zero_syndrome))
       continue;
-    if (new_row_indices.empty()) {
-      new_row_indices.push_back(curr_row_indices);
-      new_weights.push_back(error_rates[column_index]);
+
+    signature_t sig{curr_row_indices, obs_indices};
+    auto [it, inserted] = sig_to_out.try_emplace(sig, out_exclusive.size());
+    if (inserted) {
+      out_exclusive.emplace_back();
       final_column_order.push_back(column_index);
-      if (has_error_ids)
-        new_error_ids.push_back(error_ids->at(column_index));
-    } else {
-      auto &prev_row_indices = new_row_indices.back();
-      if (prev_row_indices == curr_row_indices) {
-        // The current column has the same row indices as the previous column
-        // (i.e. has the same syndrome signature) so we update the error_rates
-        // and do NOT add the duplicate column.
-        auto prev_weight = new_weights.back();
-        auto prev_error_id = has_error_ids
-                                 ? new_error_ids.back()
-                                 : std::numeric_limits<std::size_t>::max();
-        auto curr_weight = error_rates[column_index];
-        bool same_error_id =
-            has_error_ids && prev_error_id == error_ids->at(column_index);
-        double scale_factor = same_error_id ? 0.0 : 1.0;
-        // The new weight is the probability that exactly ONE of the two errors
-        // occurs. This is given by the formula: P(A xor B) = P(A) + P(B) - 2 *
-        // P(A and B). If the errors originate from the same error mechanism,
-        // then P(A and B) = 0.
-        auto new_weight = prev_weight + curr_weight -
-                          scale_factor * 2.0 * prev_weight * curr_weight;
-        new_weights.back() = new_weight;
-        // Arbitrarily choose to keep the smaller error ID.
-        if (has_error_ids)
-          new_error_ids.back() =
-              std::min(prev_error_id, error_ids->at(column_index));
-        // Verify that the observables are the same for the duplicate column.
-        auto previous_column = column_order[c - 1];
-        bool match = true;
-        for (std::size_t r = 0; r < num_obs; r++) {
-          if (this->observables_flips_matrix.at({r, previous_column}) !=
-              this->observables_flips_matrix.at({r, column_index})) {
-            match = false;
-            break;
-          }
-        }
-        if (!match) {
-          cudaq::info(
+
+      // A new full signature. If this detector syndrome was already seen with a
+      // different observable signature, this is a "same syndrome, different
+      // observable" mechanism; flag it (capped).
+      auto [dit, first_seen] = first_obs_for_detector.try_emplace(
+          curr_row_indices, obs_indices, column_index);
+      if (!first_seen && dit->second.first != obs_indices) {
+        if (num_same_syndrome_diff_obs < max_same_syndrome_diff_obs_warnings)
+          cudaq::warn(
               "detector_error_model::canonicalize_for_rounds: identical "
               "syndromes exist in detector_error_matrix but have different "
-              "observables in observables_flips_matrix (columns {} and {})",
-              previous_column, column_index);
-        }
-      } else {
-        // The current column has different row indices than the previous
-        // column. So we add the current column to the new PCM, and update the
-        // error_rates.
-        new_row_indices.push_back(curr_row_indices);
-        new_weights.push_back(error_rates[column_index]);
-        final_column_order.push_back(column_index);
-        if (has_error_ids)
-          new_error_ids.push_back(error_ids->at(column_index));
+              "observables in observables_flips_matrix; keeping column {} as a "
+              "distinct error mechanism (previous column {})",
+              column_index, dit->second.second);
+        num_same_syndrome_diff_obs++;
       }
     }
+    const std::size_t exclusive_key =
+        has_error_ids ? error_ids->at(column_index) : column_index;
+    out_exclusive[it->second][exclusive_key] += rate;
+  }
+
+  // Emit a single summary if we suppressed any per-column warnings above.
+  if (num_same_syndrome_diff_obs > max_same_syndrome_diff_obs_warnings)
+    cudaq::warn(
+        "detector_error_model::canonicalize_for_rounds: found {} columns with "
+        "identical syndromes but different observables; suppressed {} "
+        "additional warnings (only the first {} were shown).",
+        num_same_syndrome_diff_obs,
+        num_same_syndrome_diff_obs - max_same_syndrome_diff_obs_warnings,
+        max_same_syndrome_diff_obs_warnings);
+
+  std::vector<double> new_weights;
+  std::vector<std::size_t> new_error_ids;
+  new_weights.reserve(out_exclusive.size());
+  for (std::size_t i = 0; i < out_exclusive.size(); i++) {
+    double weight = 0.0;
+    for (const auto &[id, p] : out_exclusive[i])
+      weight = weight + p - 2.0 * weight * p;
+    new_weights.push_back(weight);
+    // Assign each output column a fresh unique id. Canonicalization does not
+    // preserve any cross-column exclusivity structure: if a source mechanism's
+    // mutually-exclusive outcomes land in different signature columns, that
+    // relationship is no longer recoverable from the ids, so we do not pretend
+    // it is. Unique ids simply mark every canonicalized column as an
+    // independent mechanism, which is the relation the merged rates were
+    // composed under.
+    if (has_error_ids)
+      new_error_ids.push_back(i);
   }
 
   std::swap(this->error_rates, new_weights);
diff --git a/libs/qec/lib/experiments.cpp b/libs/qec/lib/experiments.cpp
index 6c899dde..1ba11ebe 100644
--- a/libs/qec/lib/experiments.cpp
+++ b/libs/qec/lib/experiments.cpp
@@ -458,7 +458,10 @@ cudaq::qec::detector_error_model dem_from_memory_circuit(
   // dem.detector_error_matrix.dump_bits();
   // printf("dem.observables_flips_matrix Before canonicalization:\n");
   // dem.observables_flips_matrix.dump_bits();
-  dem.canonicalize_for_rounds(numReturnSynPerRound);
+  // This DEM is returned for round-based decoding, so drop zero-syndrome
+  // columns (no detector signature) that a decoder cannot act on.
+  dem.canonicalize_for_rounds(numReturnSynPerRound,
+                              /*remove_zero_syndrome_errors=*/true);
   // printf("dem.detector_error_matrix After canonicalization:\n");
   // dem.detector_error_matrix.dump_bits();
   // printf("dem.observables_flips_matrix After canonicalization:\n");
diff --git a/libs/qec/python/bindings/py_decoder.cpp b/libs/qec/python/bindings/py_decoder.cpp
index 57078a28..bdc0f1fe 100644
--- a/libs/qec/python/bindings/py_decoder.cpp
+++ b/libs/qec/python/bindings/py_decoder.cpp
@@ -762,9 +762,24 @@ void bindDecoder(nb::module_ &mod) {
       .def("canonicalize_for_rounds",
            &detector_error_model::canonicalize_for_rounds,
            R"pbdoc(
-            Canonicalize the detector error model for a given number of rounds
+            Canonicalize the detector error model for a given number of rounds.
+
+            Columns sharing the same detector and observable signature are
+            merged, with rates composed to match the input model. By default,
+            zero-syndrome columns that still flip an observable (undetectable
+            logical errors) are retained so the model's observable-flip
+            probability is preserved. Set ``remove_zero_syndrome_errors=True``
+            to drop all columns with no detector signature, which is
+            appropriate when the canonicalized DEM is consumed only for
+            round-based decoding.
+
+            Canonicalization does not preserve cross-column exclusivity
+            structure: each output column is given a fresh unique error id and
+            treated as independent of every other column, so any ``error_ids``
+            correlation in the input model is discarded.
           )pbdoc",
-           nb::arg("num_syndromes_per_round"));
+           nb::arg("num_syndromes_per_round"),
+           nb::arg("remove_zero_syndrome_errors") = false);
 
   qecmod.def(
       "dem_from_stim_text", &dem_from_stim_text,
diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
index a01856e8..63fb154a 100644
--- a/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
+++ b/libs/qec/unittests/realtime/app_examples/surface_code-1.cpp
@@ -596,7 +596,8 @@ void demo_circuit_host(const cudaq::qec::code &code, int distance,
     obs_matrix.dump_bits();
     dem.observables_flips_matrix = obs_matrix.dot(msm_obs) % 2;
     printf("numSyndromesPerRound: %ld\n", numSyndromesPerRound);
-    dem.canonicalize_for_rounds(numSyndromesPerRound);
+    dem.canonicalize_for_rounds(numSyndromesPerRound,
+                                /*remove_zero_syndrome_errors=*/true);
 
     printf("dem.detector_error_matrix:\n");
     dem.detector_error_matrix.dump_bits();
diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-2.cpp b/libs/qec/unittests/realtime/app_examples/surface_code-2.cpp
index e5d1ea07..b61e174f 100644
--- a/libs/qec/unittests/realtime/app_examples/surface_code-2.cpp
+++ b/libs/qec/unittests/realtime/app_examples/surface_code-2.cpp
@@ -493,7 +493,8 @@ void demo_circuit_host(const cudaq::qec::code &code, int distance,
     obs_matrix.dump_bits();
     dem.observables_flips_matrix = obs_matrix.dot(msm_obs) % 2;
     printf("numSyndromesPerRound: %ld\n", numSyndromesPerRound);
-    dem.canonicalize_for_rounds(numSyndromesPerRound);
+    dem.canonicalize_for_rounds(numSyndromesPerRound,
+                                /*remove_zero_syndrome_errors=*/true);
 
     printf("dem.detector_error_matrix:\n");
     dem.detector_error_matrix.dump_bits();
diff --git a/libs/qec/unittests/realtime/app_examples/surface_code-3.cpp b/libs/qec/unittests/realtime/app_examples/surface_code-3.cpp
index bfba8195..655049e7 100644
--- a/libs/qec/unittests/realtime/app_examples/surface_code-3.cpp
+++ b/libs/qec/unittests/realtime/app_examples/surface_code-3.cpp
@@ -667,8 +667,10 @@ void demo_circuit_host(const cudaq::qec::code &code, int distance,
     printf("numSyndromesPerRound_x: %ld\n", numSyndromesPerRound_x);
 
     // Canonicalize both DEMs with their respective syndrome counts
-    dem_z.canonicalize_for_rounds(numSyndromesPerRound_z);
-    dem_x.canonicalize_for_rounds(numSyndromesPerRound_x);
+    dem_z.canonicalize_for_rounds(numSyndromesPerRound_z,
+                                  /*remove_zero_syndrome_errors=*/true);
+    dem_x.canonicalize_for_rounds(numSyndromesPerRound_x,
+                                  /*remove_zero_syndrome_errors=*/true);
 
     printf("dem_z.detector_error_matrix:\n");
     dem_z.detector_error_matrix.dump_bits();
diff --git a/libs/qec/unittests/realtime/app_examples/surface_code_1.py b/libs/qec/unittests/realtime/app_examples/surface_code_1.py
index 6119122c..401699f5 100644
--- a/libs/qec/unittests/realtime/app_examples/surface_code_1.py
+++ b/libs/qec/unittests/realtime/app_examples/surface_code_1.py
@@ -550,7 +550,8 @@ def demo_circuit_host(code_obj: qec.code,
         print("obs_matrix:", obs_matrix)
         dem.observables_flips_matrix = (obs_matrix @ msm_obs) % 2
         print("numSyndromesPerRound:", numSyndromesPerRound)
-        dem.canonicalize_for_rounds(numSyndromesPerRound)
+        dem.canonicalize_for_rounds(numSyndromesPerRound,
+                                    remove_zero_syndrome_errors=True)
 
         print("dem.detector_error_matrix:")
         print(dem.detector_error_matrix)
diff --git a/libs/qec/unittests/test_qec.cpp b/libs/qec/unittests/test_qec.cpp
index df1a5bf0..f9f0bd6e 100644
--- a/libs/qec/unittests/test_qec.cpp
+++ b/libs/qec/unittests/test_qec.cpp
@@ -1753,6 +1753,96 @@ TEST(DetectorErrorModelTest, CanonicalizeWithMismatchedErrorIds) {
   EXPECT_LT(dem.num_error_mechanisms(), 3); // Should have merged some columns
 }
 
+// Sum the probability mass of columns that flip observable 0.
+static double
+observable0_flip_mass(const cudaq::qec::detector_error_model &dem) {
+  double mass = 0.0;
+  for (std::size_t c = 0; c < dem.num_error_mechanisms(); c++)
+    if (dem.observables_flips_matrix.at({0, c}))
+      mass += dem.error_rates[c];
+  return mass;
+}
+
+// Columns with the same detector signature but DIFFERENT observable flips must
+// not be merged, otherwise observable-flip probability mass is relabeled.
+TEST(DetectorErrorModelTest, CanonicalizeObservableAwareMerge) {
+  cudaq::qec::detector_error_model dem;
+  // Two columns, identical single-detector syndrome.
+  dem.detector_error_matrix = cudaqx::tensor<uint8_t>({1, 2});
+  dem.detector_error_matrix.at({0, 0}) = 1;
+  dem.detector_error_matrix.at({0, 1}) = 1;
+  // Column 0 flips the observable, column 1 does not.
+  dem.observables_flips_matrix = cudaqx::tensor<uint8_t>({1, 2});
+  dem.observables_flips_matrix.at({0, 0}) = 1;
+  dem.observables_flips_matrix.at({0, 1}) = 0;
+  dem.error_rates = {0.2, 0.3};
+
+  dem.canonicalize_for_rounds(1);
+
+  // The two columns differ in observable, so they remain distinct.
+  EXPECT_EQ(dem.num_error_mechanisms(), 2);
+  // The observable-flip mass is preserved at 0.2 (neither merged to 0.38
+  // nor dropped to 0.0).
+  EXPECT_NEAR(observable0_flip_mass(dem), 0.2, 1e-12);
+}
+
+// A zero-syndrome column that flips an observable is an undetectable logical
+// error. It is kept by default and dropped only when
+// remove_zero_syndrome_errors is requested.
+TEST(DetectorErrorModelTest, CanonicalizeZeroSyndromeObservable) {
+  auto make = []() {
+    cudaq::qec::detector_error_model dem;
+    dem.detector_error_matrix = cudaqx::tensor<uint8_t>({1, 2});
+    dem.detector_error_matrix.at({0, 0}) = 1; // normal column
+    dem.detector_error_matrix.at({0, 1}) = 0; // no detector signature
+    dem.observables_flips_matrix = cudaqx::tensor<uint8_t>({1, 2});
+    dem.observables_flips_matrix.at({0, 0}) = 0;
+    dem.observables_flips_matrix.at({0, 1}) = 1; // flips the observable
+    dem.error_rates = {0.1, 0.01};
+    return dem;
+  };
+
+  // Default: the zero-syndrome observable-flipping column is retained.
+  auto dem_keep = make();
+  dem_keep.canonicalize_for_rounds(1);
+  EXPECT_EQ(dem_keep.num_error_mechanisms(), 2);
+  EXPECT_NEAR(observable0_flip_mass(dem_keep), 0.01, 1e-12);
+
+  // With removal requested: it is dropped (useless for round-based decoding).
+  auto dem_drop = make();
+  dem_drop.canonicalize_for_rounds(1, /*remove_zero_syndrome_errors=*/true);
+  EXPECT_EQ(dem_drop.num_error_mechanisms(), 1);
+  EXPECT_NEAR(observable0_flip_mass(dem_drop), 0.0, 1e-12);
+}
+
+// Rate composition must be independent of input column order. Two exclusive
+// alternatives of one mechanism (same id) plus an independent mechanism
+// (different id), all on the same syndrome, must compose to
+// P = (0.1 + 0.1)(1 - 0.2) + (1 - 0.1 - 0.1)(0.2) = 0.32 regardless of order.
+TEST(DetectorErrorModelTest, CanonicalizeOrderIndependentComposition) {
+  auto run = [](const std::vector<double> &rates,
+                const std::vector<std::size_t> &ids) {
+    cudaq::qec::detector_error_model dem;
+    const std::size_t n = rates.size();
+    dem.detector_error_matrix = cudaqx::tensor<uint8_t>({1, n});
+    for (std::size_t c = 0; c < n; c++)
+      dem.detector_error_matrix.at({0, c}) =
+          1; // shared single-detector syndrome
+    dem.observables_flips_matrix = cudaqx::tensor<uint8_t>({1, n});
+    dem.error_rates = rates;
+    dem.error_ids = ids;
+    dem.canonicalize_for_rounds(1);
+    EXPECT_EQ(dem.num_error_mechanisms(), 1u);
+    return dem.error_rates[0];
+  };
+
+  // A(0.1,id0), B(0.2,id1), C(0.1,id0) in two different column orders.
+  double abc = run({0.1, 0.2, 0.1}, {0, 1, 0});
+  double acb = run({0.1, 0.1, 0.2}, {0, 0, 1});
+  EXPECT_NEAR(abc, 0.32, 1e-12);
+  EXPECT_NEAR(acb, 0.32, 1e-12);
+}
+
 TEST(PluginLoaderTester, checkCleanupPluginsEdgeCases) {
   // Test edge cases for cleanup_plugins function to cover the else branch
   // The plugin loader is loaded in the constructor of load_decoder_plugins()

From 11a0b5e4e3cf6e7d7ad7a36ec55dc2b166c4919d Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe@nvidia.com>
Date: Fri, 12 Jun 2026 23:36:13 +0000
Subject: [PATCH 2/2] Handle parallel matching edges from faithful DEMs

Now that canonicalization keeps mechanisms which share a detector
syndrome but flip a different observable as distinct columns, a DEM can
contain multiple columns that map to the same matching edge. A matching
graph cannot represent parallel edges, so the pymatching decoder threw
under its DISALLOW default.

Default the pymatching plugin's merge_strategy to INDEPENDENT, matching
upstream PyMatching's Matching.from_detector_error_model: parallel edges
are combined as independent error sources (keeping the first edge's
observables), the standard matching approximation. Callers can still pass
merge_strategy="disallow" to reject parallel edges.

Update the tests affected by the now-faithful DEM:
- test_sliding_window: build the full decoder with the same merge_strategy
  as the sliding window's inner decoder, so the equivalence check compares
  like with like (it previously only agreed because no parallel edges
  existed).
- test_dem_from_memory_circuit: regenerate the Steane DEM truth data; the
  matrix grows from 22 to 28 columns as observable-distinct mechanisms are
  no longer merged (the 6 added columns match the 6 same-syndrome,
  different-observable warnings).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Ben Howe <bhowe@nvidia.com>
---
 .../plugins/pymatching/pymatching.cpp         | 11 ++++++++-
 libs/qec/python/tests/test_dem.py             | 24 +++++++++++--------
 libs/qec/python/tests/test_sliding_window.py  |  9 +++++--
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/libs/qec/lib/decoders/plugins/pymatching/pymatching.cpp b/libs/qec/lib/decoders/plugins/pymatching/pymatching.cpp
index 7d0421d7..7ff8105f 100644
--- a/libs/qec/lib/decoders/plugins/pymatching/pymatching.cpp
+++ b/libs/qec/lib/decoders/plugins/pymatching/pymatching.cpp
@@ -28,7 +28,16 @@ class pymatching : public decoder {
 
   // Input parameters
   std::vector<double> error_rate_vec;
-  pm::MERGE_STRATEGY merge_strategy_enum = pm::MERGE_STRATEGY::DISALLOW;
+  // Default to INDEPENDENT, matching upstream PyMatching's
+  // Matching.from_detector_error_model. A faithful DEM can legitimately contain
+  // multiple mechanisms that map to the same matching edge (e.g. two errors
+  // with the same single-detector syndrome but different observable flips); a
+  // matching graph cannot represent parallel edges, so they must be combined.
+  // INDEPENDENT combines their probabilities as independent error sources
+  // (keeping the first edge's observables), which is the standard matching
+  // approximation. Use merge_strategy="disallow" to instead reject parallel
+  // edges.
+  pm::MERGE_STRATEGY merge_strategy_enum = pm::MERGE_STRATEGY::INDEPENDENT;
 
   // Map of edge pairs to column indices. This does not seem particularly
   // efficient.
diff --git a/libs/qec/python/tests/test_dem.py b/libs/qec/python/tests/test_dem.py
index 2cfffb06..8696e573 100644
--- a/libs/qec/python/tests/test_dem.py
+++ b/libs/qec/python/tests/test_dem.py
@@ -96,13 +96,16 @@ def test_dem_from_memory_circuit():
     nRounds = 2
 
     dem = qec.z_dem_from_memory_circuit(code, statePrep, nRounds, noise)
+    # Mechanisms that share a detector syndrome but flip a different observable
+    # are kept as distinct columns (they are not merged), so the same detector
+    # column pattern can appear more than once with a distinct observable row.
     expected_detector_error_matrix = """
-1111...1..............
-.1.111..111...........
-..11.11..1.1111.......
-.......111.1.1.1111...
-..........1.11..1.111.
-..............1..11.11
+11111.....1.................
+..1.1111...111..............
+...11..111..1.1111..........
+..........111.1.1.1111......
+.............1.11..1.11111..
+.................1..11..1111
 """
     # Uncomment the following line to get a string representation of the DEM
     # that you can compare to expected_detector_error_matrix.
@@ -117,13 +120,14 @@ def test_dem_from_memory_circuit():
     # The following error rates were captured from the above print statement and
     # are considered "truth" data now.
     expected_error_rates = [
-        0.0183, 0.0235, 0.0158, 0.0209, 0.0310, 0.0235, 0.0183, 0.0106, 0.0053,
-        0.0053, 0.0106, 0.0053, 0.0053, 0.0053, 0.0106, 0.0235, 0.0158, 0.0158,
-        0.0183, 0.0335, 0.0209, 0.0434
+        0.0053, 0.0132, 0.0235, 0.0158, 0.0210, 0.0235, 0.0080, 0.0235, 0.0132,
+        0.0053, 0.0106, 0.0053, 0.0053, 0.0106, 0.0053, 0.0053, 0.0053, 0.0106,
+        0.0235, 0.0158, 0.0158, 0.0184, 0.0080, 0.0260, 0.0158, 0.0053, 0.0184,
+        0.0261
     ]
     assert np.allclose(dem.error_rates, expected_error_rates, atol=1e-4)
 
-    expected_observables_flips_matrix = '1....11.....1......111\n'
+    expected_observables_flips_matrix = '1.....111......1......1.1.1.\n'
     # Uncomment the following line to get a string representation of the
     # observables flips matrix that you can compare to
     # expected_observables_flips_matrix.
diff --git a/libs/qec/python/tests/test_sliding_window.py b/libs/qec/python/tests/test_sliding_window.py
index ca00f24b..996402f2 100644
--- a/libs/qec/python/tests/test_sliding_window.py
+++ b/libs/qec/python/tests/test_sliding_window.py
@@ -48,8 +48,13 @@ def test_sliding_window_1(decoder_name, batched, num_rounds, num_windows):
 
     # First compare the results of the full decoder to the sliding window
     # decoder using an inner decoder of the full window size. The results should
-    # be the same.
-    full_decoder = qec.get_decoder(decoder_name, dem.detector_error_matrix)
+    # be the same. The full decoder must use the same merge_strategy as the
+    # sliding window's inner decoder (below): a faithful DEM can contain
+    # parallel matching edges, and the two decoders only agree if they combine
+    # those edges identically.
+    full_decoder = qec.get_decoder(decoder_name,
+                                   dem.detector_error_matrix,
+                                   merge_strategy='smallest_weight')
     num_syndromes_per_round = dem.detector_error_matrix.shape[0] // num_rounds
 
     sw_as_full_decoder = qec.get_decoder(