From 62a630c2e144d0ae0526ef492e0ec79757f704f3 Mon Sep 17 00:00:00 2001 From: Arkadiy Kukarkin Date: Thu, 9 Apr 2026 20:42:38 +0200 Subject: [PATCH] fix non-deterministic piece CID for inline DAG regeneration UnmarshalToBlocks iterated data.Reals and data.Additional via 'for c, d := range', and Go randomizes map iteration order on every range loop. The same serialized directory blob therefore yielded a different block sequence on each call, producing a different CAR byte layout and a different piece CID even though the root CID was stable. Sort the CIDs lexicographically before iterating so the CAR layout is content-derived and reproducible. Add a regression test that asserts the same marshaled blob unmarshals to the same block order across repeated calls. Reported by users seeing baga6ea4... drift across consecutive 'dag generate' runs against the same inline preparation. --- pack/daggen/directory.go | 26 +++++++++++++++++---- pack/daggen/directory_test.go | 43 +++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/pack/daggen/directory.go b/pack/daggen/directory.go index 4b9c09899..744bd91a0 100644 --- a/pack/daggen/directory.go +++ b/pack/daggen/directory.go @@ -3,6 +3,7 @@ package daggen import ( "bytes" "context" + "sort" "github.com/cockroachdb/errors" "github.com/data-preservation-programs/singularity/model" @@ -356,18 +357,35 @@ func UnmarshalToBlocks(in []byte) ([]blocks.Block, error) { return nil, errors.WithStack(err) } + // Iterate Reals and Additional in CID-sorted order so the resulting + // CAR layout is deterministic across runs. Go map iteration is + // randomized; without a sort, the same DAG produces a different piece + // CID on every regeneration even though the root CID is stable. 
blks := make([]blocks.Block, 0, len(data.Reals)+len(data.Additional)) - for c, d := range data.Reals { - blk, _ := blocks.NewBlockWithCid(d, c) + for _, c := range sortedCids(data.Reals) { + blk, _ := blocks.NewBlockWithCid(data.Reals[c], c) blks = append(blks, blk) } - for c, d := range data.Additional { - blk, _ := blocks.NewBlockWithCid(d, c) + for _, c := range sortedCids(data.Additional) { + blk, _ := blocks.NewBlockWithCid(data.Additional[c], c) blks = append(blks, blk) } return blks, nil } +// sortedCids returns the keys of a cid→bytes map in lexicographic order +// (by CID bytes). Used to make CAR layouts deterministic. +func sortedCids(m map[cid.Cid][]byte) []cid.Cid { + keys := make([]cid.Cid, 0, len(m)) + for c := range m { + keys = append(keys, c) + } + sort.Slice(keys, func(i, j int) bool { + return bytes.Compare(keys[i].Bytes(), keys[j].Bytes()) < 0 + }) + return keys +} + // UnmarshalBinary deserializes binary data into the current DirectoryData object. // This method: // 1. Creates a new blockstore and DAG service. diff --git a/pack/daggen/directory_test.go b/pack/daggen/directory_test.go index 4613a13c0..a968da358 100644 --- a/pack/daggen/directory_test.go +++ b/pack/daggen/directory_test.go @@ -216,3 +216,46 @@ func TestResolveDirectoryTree(t *testing.T) { require.Equal(t, "name", node.Links()[0].Name) require.Equal(t, "test", node.Links()[1].Name) } + +// TestUnmarshalToBlocksDeterministic verifies that UnmarshalToBlocks returns +// blocks in a stable, content-derived order across calls. Without this, Go's +// randomized map iteration leaks into the CAR layout, producing a different +// piece CID on every regeneration even when the underlying DAG is identical. +func TestUnmarshalToBlocksDeterministic(t *testing.T) { + ctx := context.Background() + + // Build a directoryData with many real blocks. We populate the + // additional map directly because AddFile creates dummy nodes that get + // filtered out by UnmarshalToBlocks. 
+ dirData := NewDirectoryData() + const blockCount = 50 + for i := 0; i < blockCount; i++ { + c := cid.NewCidV1(cid.Raw, util.Hash([]byte(strconv.Itoa(i)))) + dirData.additional[c] = []byte("block-data-" + strconv.Itoa(i)) + } + + marshaled, err := dirData.MarshalBinary(ctx) + require.NoError(t, err) + + // Unmarshal multiple times and compare the block CID sequences. Each + // call must produce the same ordering or the resulting CAR file will + // be byte-different and the piece CID will drift. + first, err := UnmarshalToBlocks(marshaled) + require.NoError(t, err) + require.GreaterOrEqual(t, len(first), blockCount, "expected at least %d blocks", blockCount) + + firstCids := make([]string, len(first)) + for i, blk := range first { + firstCids[i] = blk.Cid().String() + } + + for run := 0; run < 20; run++ { + blks, err := UnmarshalToBlocks(marshaled) + require.NoError(t, err) + require.Len(t, blks, len(first)) + for i, blk := range blks { + require.Equal(t, firstCids[i], blk.Cid().String(), + "block order must be deterministic across UnmarshalToBlocks calls (run %d, position %d)", run, i) + } + } +}