From 396168b2badf9100b2424d7aecac9c353c5d4dd0 Mon Sep 17 00:00:00 2001 From: Simon Cropp Date: Tue, 30 Jun 2026 20:41:54 +1000 Subject: [PATCH 1/4] Normalize ZIP host-OS byte for cross-OS determinism ZipArchive stamps the build OS into each central-directory record (version-made-by high byte 0 on Windows, 3 on Unix), so output bytes differed by OS. ZipPlatformNormalizer rewrites that byte to 0 and clears external attributes after building into a buffer; Convert/ConvertAsync now buffer-then-normalize-then-copy. --- claude.md | 1 + .../DeterministicPackage_Convert.cs | 30 +++- .../ZipPlatformNormalizer.cs | 98 +++++++++++++ src/Tests/ZipPlatformNormalizerTests.cs | 129 ++++++++++++++++++ 4 files changed, 252 insertions(+), 6 deletions(-) create mode 100644 src/DeterministicIoPackaging/ZipPlatformNormalizer.cs create mode 100644 src/Tests/ZipPlatformNormalizerTests.cs diff --git a/claude.md b/claude.md index 7a82ded..922806c 100644 --- a/claude.md +++ b/claude.md @@ -70,6 +70,7 @@ The content patcher receives the relationship patcher via constructor injection. - ZIP entries use Deflate compression via `ZipArchive`. Binary output may differ between net48 and net10.0+ due to Deflate implementation differences, but XML content is identical - Entries are sorted by `FullName` using `StringComparer.Ordinal` - Binary snapshot tests use `UniqueForRuntime` to allow framework-specific verified files +- `ZipPlatformNormalizer` makes output **OS-independent**: `ZipArchive` stamps the host OS into each central-directory record (the "version made by" high byte is 0 on Windows, 3 on Unix; Unix can also leak file-mode bits into the external-attributes field). After the archive is built into a buffer, the normalizer rewrites the host byte to 0 (MS-DOS/FAT) and clears external attributes on every record, so identical bytes are produced on Windows/macOS/Linux. The only remaining cross-environment difference is the Deflate stream (cross-runtime, not cross-OS). Because `Convert`/`ConvertAsync` now build into a `MemoryStream` before this pass, the target stream need not be seekable. - `PngNormalizer` writes raw zlib stored blocks (CMF+FLG + DEFLATE stored blocks + Adler-32) instead of using `ZLibStream`, which produces different output on net48 vs net10.0 Example patcher structure: diff --git a/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs b/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs index 46d89f3..9299f4b 100644 --- a/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs +++ b/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs @@ -22,22 +22,40 @@ public static void Convert(Stream source, Stream target) { var patchers = CreatePatchers(); using var sourceArchive = ReadArchive(source); - using var targetArchive = CreateArchive(target); - foreach (var sourceEntry in sourceArchive.OrderedEntries()) + + // Build into a buffer first so the central directory can be normalized + // (see ZipPlatformNormalizer) before the bytes reach the caller's stream. + using var buffer = new MemoryStream(); + using (var targetArchive = CreateArchive(buffer)) { - DuplicateEntry(sourceEntry, targetArchive, patchers); + foreach (var sourceEntry in sourceArchive.OrderedEntries()) + { + DuplicateEntry(sourceEntry, targetArchive, patchers); + } } + + ZipPlatformNormalizer.Normalize(buffer); + buffer.Position = 0; + buffer.CopyTo(target); } public static async Task ConvertAsync(Stream source, Stream target, Cancel token = default) { var patchers = CreatePatchers(); using var sourceArchive = ReadArchive(source); - using var targetArchive = CreateArchive(target); - foreach (var sourceEntry in sourceArchive.OrderedEntries()) + + using var buffer = new MemoryStream(); + using (var targetArchive = CreateArchive(buffer)) { - await DuplicateEntryAsync(sourceEntry, targetArchive, patchers, token); + foreach (var sourceEntry in sourceArchive.OrderedEntries()) + { + await DuplicateEntryAsync(sourceEntry, targetArchive, patchers, token); + } } + + ZipPlatformNormalizer.Normalize(buffer); + buffer.Position = 0; + await buffer.CopyToAsync(target, token); } // ZIP local file header signature ("PK\x03\x04"). diff --git a/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs b/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs new file mode 100644 index 0000000..4567a3c --- /dev/null +++ b/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs @@ -0,0 +1,98 @@ +using System.Buffers.Binary; + +namespace DeterministicIoPackaging; + +// ZipArchive stamps the host operating system into every central-directory +// record: the high byte of the "version made by" field is 0 on Windows and 3 +// on Unix (per the .ZIP spec § 4.4.2), and Unix builds can additionally leak +// file-mode bits into the external-file-attributes field. Neither affects the +// archive's content, but both make the produced bytes depend on the OS that +// ran the conversion, defeating cross-platform determinism. This pass rewrites +// those fields to fixed values so the output is identical on every OS. +static class ZipPlatformNormalizer +{ + // Central-directory file header signature "PK\x01\x02". + static readonly byte[] centralDirectoryHeader = [0x50, 0x4B, 0x01, 0x02]; + + // End-of-central-directory record signature "PK\x05\x06". + static readonly byte[] endOfCentralDirectory = [0x50, 0x4B, 0x05, 0x06]; + + // Fixed size of a central-directory file header before the variable-length + // file name, extra field and comment. + const int centralHeaderSize = 46; + + // Minimum size of the end-of-central-directory record (no archive comment). + const int eocdSize = 22; + + public static void Normalize(MemoryStream archive) + { + var buffer = archive.GetBuffer(); + var length = (int) archive.Length; + + if (!TryFindEndOfCentralDirectory(buffer, length, out var eocd)) + { + return; + } + + var count = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(eocd + 10)); + var offset = (int) BinaryPrimitives.ReadUInt32LittleEndian(buffer.AsSpan(eocd + 16)); + + for (var record = 0; record < count; record++) + { + if (offset + centralHeaderSize > length || + !StartsWith(buffer, offset, centralDirectoryHeader)) + { + // Not the structure we expect (e.g. a ZIP64 archive). Leave it + // untouched rather than risk corrupting the output. + return; + } + + // "version made by" high byte (host OS): force to 0 (MS-DOS / FAT). + buffer[offset + 5] = 0; + // External file attributes (4 bytes): clear any Unix mode bits. + buffer.AsSpan(offset + 38, 4).Clear(); + + var nameLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 28)); + var extraLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 30)); + var commentLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 32)); + offset += centralHeaderSize + nameLength + extraLength + commentLength; + } + } + + // Scans backwards for the EOCD signature. ZipArchive writes no archive + // comment, so it is normally the final 22 bytes, but scanning keeps this + // robust to any trailing bytes. + static bool TryFindEndOfCentralDirectory(byte[] buffer, int length, out int position) + { + for (var i = length - eocdSize; i >= 0; i--) + { + if (StartsWith(buffer, i, endOfCentralDirectory)) + { + position = i; + return true; + } + } + + position = -1; + return false; + } + + static bool StartsWith(byte[] buffer, int offset, byte[] signature) + { + if (offset < 0 || + offset + signature.Length > buffer.Length) + { + return false; + } + + for (var i = 0; i < signature.Length; i++) + { + if (buffer[offset + i] != signature[i]) + { + return false; + } + } + + return true; + } +} diff --git a/src/Tests/ZipPlatformNormalizerTests.cs b/src/Tests/ZipPlatformNormalizerTests.cs new file mode 100644 index 0000000..6d5cfaf --- /dev/null +++ b/src/Tests/ZipPlatformNormalizerTests.cs @@ -0,0 +1,129 @@ +[TestFixture] +public class ZipPlatformNormalizerTests +{ + // Simulates an archive produced on Unix (host byte 3, Unix mode bits in the + // external attributes) and asserts the normalizer rewrites both to the + // Windows/FAT-neutral values. This would fail before the normalizer existed. + [Test] + public void RewritesUnixHostByteAndExternalAttributes() + { + var archive = BuildArchive(); + var buffer = archive.GetBuffer(); + var length = (int) archive.Length; + + foreach (var record in CentralDirectoryRecords(buffer, length)) + { + // host OS = Unix + buffer[record + 5] = 3; + // external attributes carrying Unix mode 0100644 in the high word + BinaryPrimitives.WriteUInt32LittleEndian(buffer.AsSpan(record + 38), 0x81A4_0000); + } + + ZipPlatformNormalizer.Normalize(archive); + + AssertNormalized(archive); + } + + // The end-to-end guarantee: whatever OS runs the conversion, the central + // directory comes out OS-independent. + [Test] + public void ConvertProducesOsIndependentCentralDirectory() + { + using var result = DeterministicPackage.Convert(BuildArchive()); + + AssertNormalized(result); + } + + // The low byte of "version made by" encodes the spec version (a function of + // the features used, not the OS) and must be left alone. + [Test] + public void PreservesSpecVersionLowByte() + { + var archive = BuildArchive(); + var buffer = archive.GetBuffer(); + var length = (int) archive.Length; + + var before = CentralDirectoryRecords(buffer, length) + .Select(_ => buffer[_ + 4]) + .ToList(); + + ZipPlatformNormalizer.Normalize(archive); + + var after = CentralDirectoryRecords(buffer, length) + .Select(_ => buffer[_ + 4]) + .ToList(); + + Assert.That(after, Is.EqualTo(before)); + } + + static void AssertNormalized(MemoryStream archive) + { + var buffer = archive.GetBuffer(); + var length = (int) archive.Length; + var records = CentralDirectoryRecords(buffer, length); + + Assert.That(records, Is.Not.Empty); + foreach (var record in records) + { + Assert.Multiple(() => + { + Assert.That(buffer[record + 5], Is.EqualTo(0), + "host-OS byte must be normalized to 0"); + Assert.That(BinaryPrimitives.ReadUInt32LittleEndian(buffer.AsSpan(record + 38)), Is.EqualTo(0u), + "external file attributes must be cleared"); + }); + } + } + + static MemoryStream BuildArchive() + { + var stream = new MemoryStream(); + using (var archive = new Archive(stream, ZipArchiveMode.Create, leaveOpen: true)) + { + foreach (var name in (string[]) ["alpha.txt", "beta.txt", "nested/gamma.txt"]) + { + var entry = archive.CreateEntry(name, CompressionLevel.Optimal); + using var entryStream = entry.Open(); + using var writer = new StreamWriter(entryStream, Encoding.UTF8); + writer.Write("payload"); + } + } + + stream.Position = 0; + return stream; + } + + // Walks the central directory via the EOCD record, returning the byte offset + // of each central-directory file header. + static List CentralDirectoryRecords(byte[] buffer, int length) + { + var eocd = -1; + for (var i = length - 22; i >= 0; i--) + { + if (buffer[i] == 0x50 && + buffer[i + 1] == 0x4B && + buffer[i + 2] == 0x05 && + buffer[i + 3] == 0x06) + { + eocd = i; + break; + } + } + + Assert.That(eocd, Is.GreaterThanOrEqualTo(0), "EOCD record not found"); + + var count = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(eocd + 10)); + var offset = (int) BinaryPrimitives.ReadUInt32LittleEndian(buffer.AsSpan(eocd + 16)); + var records = new List(); + for (var i = 0; i < count; i++) + { + records.Add(offset); + var nameLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 28)); + var extraLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 30)); + var commentLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 32)); + offset += 46 + nameLength + extraLength + commentLength; + } + + return records; + } +} From 9664e566f759dc10b7da6799da3dd3eaa4259b16 Mon Sep 17 00:00:00 2001 From: Simon Cropp Date: Tue, 30 Jun 2026 21:05:29 +1000 Subject: [PATCH 2/4] . --- claude.md | 2 +- .../DeterministicPackage_Convert.cs | 62 +++++++++++++------ .../ZipPlatformNormalizer.cs | 9 ++- src/Tests/ZipPlatformNormalizerTests.cs | 45 +++++++++++--- 4 files changed, 88 insertions(+), 30 deletions(-) diff --git a/claude.md b/claude.md index 922806c..dda34e9 100644 --- a/claude.md +++ b/claude.md @@ -70,7 +70,7 @@ The content patcher receives the relationship patcher via constructor injection. - ZIP entries use Deflate compression via `ZipArchive`. Binary output may differ between net48 and net10.0+ due to Deflate implementation differences, but XML content is identical - Entries are sorted by `FullName` using `StringComparer.Ordinal` - Binary snapshot tests use `UniqueForRuntime` to allow framework-specific verified files -- `ZipPlatformNormalizer` makes output **OS-independent**: `ZipArchive` stamps the host OS into each central-directory record (the "version made by" high byte is 0 on Windows, 3 on Unix; Unix can also leak file-mode bits into the external-attributes field). After the archive is built into a buffer, the normalizer rewrites the host byte to 0 (MS-DOS/FAT) and clears external attributes on every record, so identical bytes are produced on Windows/macOS/Linux. The only remaining cross-environment difference is the Deflate stream (cross-runtime, not cross-OS). Because `Convert`/`ConvertAsync` now build into a `MemoryStream` before this pass, the target stream need not be seekable. +- `ZipPlatformNormalizer` makes output **OS-independent**: `ZipArchive` stamps the host OS into each central-directory record (the "version made by" high byte is 0 on Windows, 3 on Unix; Unix can also leak file-mode bits into the external-attributes field). The normalizer rewrites the host byte to 0 (MS-DOS/FAT) and clears external attributes on every record, so identical bytes are produced on Windows/macOS/Linux. The only remaining cross-environment difference is the Deflate stream (cross-runtime, not cross-OS). It patches the central directory **in place** via `MemoryStream.GetBuffer()` — no re-zip. When the `Convert`/`ConvertAsync` target is a `MemoryStream` (the common case, incl. the single-arg overloads Verify uses) the archive is written straight into it and patched with no extra copy; only a non-seekable target (e.g. the entry stream in nested-zip recursion) falls back to buffer-then-copy. - `PngNormalizer` writes raw zlib stored blocks (CMF+FLG + DEFLATE stored blocks + Adler-32) instead of using `ZLibStream`, which produces different output on net48 vs net10.0 Example patcher structure: diff --git a/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs b/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs index 9299f4b..0ef0152 100644 --- a/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs +++ b/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs @@ -20,20 +20,22 @@ public static async Task ConvertAsync(Stream source) public static void Convert(Stream source, Stream target) { - var patchers = CreatePatchers(); - using var sourceArchive = ReadArchive(source); - - // Build into a buffer first so the central directory can be normalized - // (see ZipPlatformNormalizer) before the bytes reach the caller's stream. - using var buffer = new MemoryStream(); - using (var targetArchive = CreateArchive(buffer)) + // The central directory can only be normalized (see ZipPlatformNormalizer) + // after the whole archive is written, and that needs random access to the + // bytes. A MemoryStream whose buffer is reachable gives us that in place; + // for anything else (a non-seekable entry stream from nested-zip recursion, + // or a MemoryStream wrapping a caller-owned array) build into a buffer and + // copy the normalized result out. + if (target is MemoryStream memoryTarget && + memoryTarget.TryGetBuffer(out _)) { - foreach (var sourceEntry in sourceArchive.OrderedEntries()) - { - DuplicateEntry(sourceEntry, targetArchive, patchers); - } + WriteArchive(source, memoryTarget); + ZipPlatformNormalizer.Normalize(memoryTarget); + return; } + using var buffer = new MemoryStream(); + WriteArchive(source, buffer); ZipPlatformNormalizer.Normalize(buffer); buffer.Position = 0; buffer.CopyTo(target); @@ -41,23 +43,43 @@ public static void Convert(Stream source, Stream target) public static async Task ConvertAsync(Stream source, Stream target, Cancel token = default) { - var patchers = CreatePatchers(); - using var sourceArchive = ReadArchive(source); - - using var buffer = new MemoryStream(); - using (var targetArchive = CreateArchive(buffer)) + if (target is MemoryStream memoryTarget && + memoryTarget.TryGetBuffer(out _)) { - foreach (var sourceEntry in sourceArchive.OrderedEntries()) - { - await DuplicateEntryAsync(sourceEntry, targetArchive, patchers, token); - } + await WriteArchiveAsync(source, memoryTarget, token); + ZipPlatformNormalizer.Normalize(memoryTarget); + return; } + using var buffer = new MemoryStream(); + await WriteArchiveAsync(source, buffer, token); ZipPlatformNormalizer.Normalize(buffer); buffer.Position = 0; await buffer.CopyToAsync(target, token); } + static void WriteArchive(Stream source, Stream target) + { + var patchers = CreatePatchers(); + using var sourceArchive = ReadArchive(source); + using var targetArchive = CreateArchive(target); + foreach (var sourceEntry in sourceArchive.OrderedEntries()) + { + DuplicateEntry(sourceEntry, targetArchive, patchers); + } + } + + static async Task WriteArchiveAsync(Stream source, Stream target, Cancel token) + { + var patchers = CreatePatchers(); + using var sourceArchive = ReadArchive(source); + using var targetArchive = CreateArchive(target); + foreach (var sourceEntry in sourceArchive.OrderedEntries()) + { + await DuplicateEntryAsync(sourceEntry, targetArchive, patchers, token); + } + } + // ZIP local file header signature ("PK\x03\x04"). // Used to detect nested ZIP packages (e.g. xlsx/docx/pptx embedded inside // word/embeddings/, ppt/embeddings/, xl/embeddings/) so they can be diff --git a/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs b/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs index 4567a3c..afded03 100644 --- a/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs +++ b/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs @@ -26,7 +26,14 @@ static class ZipPlatformNormalizer public static void Normalize(MemoryStream archive) { - var buffer = archive.GetBuffer(); + if (!archive.TryGetBuffer(out var segment)) + { + // Buffer is not reachable (caller-owned array); nothing we can patch + // in place. Convert routes these through the buffered path instead. + return; + } + + var buffer = segment.Array!; var length = (int) archive.Length; if (!TryFindEndOfCentralDirectory(buffer, length, out var eocd)) diff --git a/src/Tests/ZipPlatformNormalizerTests.cs b/src/Tests/ZipPlatformNormalizerTests.cs index 6d5cfaf..d864138 100644 --- a/src/Tests/ZipPlatformNormalizerTests.cs +++ b/src/Tests/ZipPlatformNormalizerTests.cs @@ -21,7 +21,7 @@ public void RewritesUnixHostByteAndExternalAttributes() ZipPlatformNormalizer.Normalize(archive); - AssertNormalized(archive); + AssertNormalized(archive.ToArray()); } // The end-to-end guarantee: whatever OS runs the conversion, the central @@ -31,7 +31,22 @@ public void ConvertProducesOsIndependentCentralDirectory() { using var result = DeterministicPackage.Convert(BuildArchive()); - AssertNormalized(result); + AssertNormalized(result.ToArray()); + } + + // A non-seekable / non-MemoryStream target can't be patched in place, so + // Convert must route it through the buffered path and still emit normalized + // bytes. This is the same path nested-zip recursion uses. + [Test] + public void NonMemoryStreamTargetIsNormalizedViaFallback() + { + var inner = new MemoryStream(); + using (var target = new WriteOnlyStream(inner)) + { + DeterministicPackage.Convert(BuildArchive(), target); + } + + AssertNormalized(inner.ToArray()); } // The low byte of "version made by" encodes the spec version (a function of @@ -56,20 +71,18 @@ public void PreservesSpecVersionLowByte() Assert.That(after, Is.EqualTo(before)); } - static void AssertNormalized(MemoryStream archive) + static void AssertNormalized(byte[] archive) { - var buffer = archive.GetBuffer(); - var length = (int) archive.Length; - var records = CentralDirectoryRecords(buffer, length); + var records = CentralDirectoryRecords(archive, archive.Length); Assert.That(records, Is.Not.Empty); foreach (var record in records) { Assert.Multiple(() => { - Assert.That(buffer[record + 5], Is.EqualTo(0), + Assert.That(archive[record + 5], Is.EqualTo(0), "host-OS byte must be normalized to 0"); - Assert.That(BinaryPrimitives.ReadUInt32LittleEndian(buffer.AsSpan(record + 38)), Is.EqualTo(0u), + Assert.That(BinaryPrimitives.ReadUInt32LittleEndian(archive.AsSpan(record + 38)), Is.EqualTo(0u), "external file attributes must be cleared"); }); } @@ -126,4 +139,20 @@ static List CentralDirectoryRecords(byte[] buffer, int length) return records; } + + // Write-only, non-seekable stream that forwards to an inner stream, forcing + // Convert down its buffered fallback path (target is not a MemoryStream). + class WriteOnlyStream(Stream inner) : Stream + { + public override bool CanWrite => true; + public override bool CanRead => false; + public override bool CanSeek => false; + public override void Write(byte[] buffer, int offset, int count) => inner.Write(buffer, offset, count); + public override void Flush() => inner.Flush(); + public override long Length => throw new NotSupportedException(); + public override long Position { get => throw new NotSupportedException(); set => throw new NotSupportedException(); } + public override long Seek(long offset, SeekOrigin origin) => throw new NotSupportedException(); + public override void SetLength(long value) => throw new NotSupportedException(); + public override int Read(byte[] buffer, int offset, int count) => throw new NotSupportedException(); + } } From 64c7f7908e08f0fcb6f0ec3d797a67fc41620f6f Mon Sep 17 00:00:00 2001 From: Simon Cropp Date: Tue, 30 Jun 2026 21:27:58 +1000 Subject: [PATCH 3/4] . --- claude.md | 3 +- readme.md | 12 +-- src/Benchmarks/ConvertEndToEndBench.cs | 5 +- .../DeterministicPackage_Convert.cs | 95 ++++++------------- .../ZipPlatformNormalizer.cs | 9 +- src/Tests/Tests.cs | 11 +-- src/Tests/ZipPlatformNormalizerTests.cs | 31 ------ 7 files changed, 45 insertions(+), 121 deletions(-) diff --git a/claude.md b/claude.md index dda34e9..9c134d5 100644 --- a/claude.md +++ b/claude.md @@ -70,7 +70,8 @@ The content patcher receives the relationship patcher via constructor injection. - ZIP entries use Deflate compression via `ZipArchive`. Binary output may differ between net48 and net10.0+ due to Deflate implementation differences, but XML content is identical - Entries are sorted by `FullName` using `StringComparer.Ordinal` - Binary snapshot tests use `UniqueForRuntime` to allow framework-specific verified files -- `ZipPlatformNormalizer` makes output **OS-independent**: `ZipArchive` stamps the host OS into each central-directory record (the "version made by" high byte is 0 on Windows, 3 on Unix; Unix can also leak file-mode bits into the external-attributes field). The normalizer rewrites the host byte to 0 (MS-DOS/FAT) and clears external attributes on every record, so identical bytes are produced on Windows/macOS/Linux. The only remaining cross-environment difference is the Deflate stream (cross-runtime, not cross-OS). It patches the central directory **in place** via `MemoryStream.GetBuffer()` — no re-zip. When the `Convert`/`ConvertAsync` target is a `MemoryStream` (the common case, incl. the single-arg overloads Verify uses) the archive is written straight into it and patched with no extra copy; only a non-seekable target (e.g. the entry stream in nested-zip recursion) falls back to buffer-then-copy. +- `Convert(Stream source)` / `ConvertAsync(Stream source, Cancel)` are the only entry points and always return a fresh `MemoryStream`. Normalization is not a streaming operation (entries are reordered, every part is rewritten, the central directory is patched afterward, and `ZipArchive` read-mode needs a seekable source), so there is deliberately no `Convert(source, target)` overload — the result is always fully materialized in a buffer. Nested-zip recursion calls `Convert` then `CopyTo`s into the outer entry stream. +- `ZipPlatformNormalizer` makes output **OS-independent**: `ZipArchive` stamps the host OS into each central-directory record (the "version made by" high byte is 0 on Windows, 3 on Unix; Unix can also leak file-mode bits into the external-attributes field). The normalizer rewrites the host byte to 0 (MS-DOS/FAT) and clears external attributes on every record, so identical bytes are produced on Windows/macOS/Linux. The only remaining cross-environment difference is the Deflate stream (cross-runtime, not cross-OS). It patches the central directory **in place** via `MemoryStream.GetBuffer()` — no re-zip, no extra copy. - `PngNormalizer` writes raw zlib stored blocks (CMF+FLG + DEFLATE stored blocks + Adler-32) instead of using `ZLibStream`, which produces different output on net48 vs net10.0 Example patcher structure: diff --git a/readme.md b/readme.md index bebfe70..b566be8 100644 --- a/readme.md +++ b/readme.md @@ -58,13 +58,13 @@ See [Verify Naming docs](https://github.com/VerifyTests/Verify/blob/main/docs/na ### Convert - - + + ```cs using var sourceStream = File.OpenRead(packagePath); -await DeterministicPackage.ConvertAsync(sourceStream, targetStream); +var target = DeterministicPackage.Convert(sourceStream); ``` -snippet source | anchor +snippet source | anchor @@ -74,9 +74,9 @@ await DeterministicPackage.ConvertAsync(sourceStream, targetStream); ```cs using var sourceStream = File.OpenRead(packagePath); -await DeterministicPackage.ConvertAsync(sourceStream, targetStream); +var target = await DeterministicPackage.ConvertAsync(sourceStream); ``` -snippet source | anchor +snippet source | anchor diff --git a/src/Benchmarks/ConvertEndToEndBench.cs b/src/Benchmarks/ConvertEndToEndBench.cs index ee6ef3a..8762b59 100644 --- a/src/Benchmarks/ConvertEndToEndBench.cs +++ b/src/Benchmarks/ConvertEndToEndBench.cs @@ -11,10 +11,9 @@ public void Setup() => sourceBytes = SampleXml.BuildDocxZip(Paragraphs, drawings: 200, hyperlinks: 200); [Benchmark] - public void Convert() + public MemoryStream Convert() { using var source = new MemoryStream(sourceBytes, writable: false); - using var target = new MemoryStream(); - DeterministicPackage.Convert(source, target); + return DeterministicPackage.Convert(source); } } diff --git a/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs b/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs index 0ef0152..6174364 100644 --- a/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs +++ b/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs @@ -2,82 +2,45 @@ public static partial class DeterministicPackage { + // Normalizing a package is not a streaming operation: entries are reordered, + // every part is rewritten, and the central directory is patched after the fact + // (see ZipPlatformNormalizer) — all of which need the whole archive in a + // seekable buffer. The result is therefore always a fresh MemoryStream, built + // and patched in place with no extra copy. public static MemoryStream Convert(Stream source) { + var patchers = CreatePatchers(); var target = new MemoryStream(); - Convert(source, target); - target.Position = 0; - return target; - } - - public static async Task ConvertAsync(Stream source) - { - var target = new MemoryStream(); - await ConvertAsync(source, target); - target.Position = 0; - return target; - } - - public static void Convert(Stream source, Stream target) - { - // The central directory can only be normalized (see ZipPlatformNormalizer) - // after the whole archive is written, and that needs random access to the - // bytes. A MemoryStream whose buffer is reachable gives us that in place; - // for anything else (a non-seekable entry stream from nested-zip recursion, - // or a MemoryStream wrapping a caller-owned array) build into a buffer and - // copy the normalized result out. - if (target is MemoryStream memoryTarget && - memoryTarget.TryGetBuffer(out _)) - { - WriteArchive(source, memoryTarget); - ZipPlatformNormalizer.Normalize(memoryTarget); - return; - } - - using var buffer = new MemoryStream(); - WriteArchive(source, buffer); - ZipPlatformNormalizer.Normalize(buffer); - buffer.Position = 0; - buffer.CopyTo(target); - } - - public static async Task ConvertAsync(Stream source, Stream target, Cancel token = default) - { - if (target is MemoryStream memoryTarget && - memoryTarget.TryGetBuffer(out _)) + using (var sourceArchive = ReadArchive(source)) + using (var targetArchive = CreateArchive(target)) { - await WriteArchiveAsync(source, memoryTarget, token); - ZipPlatformNormalizer.Normalize(memoryTarget); - return; + foreach (var sourceEntry in sourceArchive.OrderedEntries()) + { + DuplicateEntry(sourceEntry, targetArchive, patchers); + } } - using var buffer = new MemoryStream(); - await WriteArchiveAsync(source, buffer, token); - ZipPlatformNormalizer.Normalize(buffer); - buffer.Position = 0; - await buffer.CopyToAsync(target, token); + ZipPlatformNormalizer.Normalize(target); + target.Position = 0; + return target; } - static void WriteArchive(Stream source, Stream target) + public static async Task ConvertAsync(Stream source, Cancel token = default) { var patchers = CreatePatchers(); - using var sourceArchive = ReadArchive(source); - using var targetArchive = CreateArchive(target); - foreach (var sourceEntry in sourceArchive.OrderedEntries()) + var target = new MemoryStream(); + using (var sourceArchive = ReadArchive(source)) + using (var targetArchive = CreateArchive(target)) { - DuplicateEntry(sourceEntry, targetArchive, patchers); + foreach (var sourceEntry in sourceArchive.OrderedEntries()) + { + await DuplicateEntryAsync(sourceEntry, targetArchive, patchers, token); + } } - } - static async Task WriteArchiveAsync(Stream source, Stream target, Cancel token) - { - var patchers = CreatePatchers(); - using var sourceArchive = ReadArchive(source); - using var targetArchive = CreateArchive(target); - foreach (var sourceEntry in sourceArchive.OrderedEntries()) - { - await DuplicateEntryAsync(sourceEntry, targetArchive, patchers, token); - } + ZipPlatformNormalizer.Normalize(target); + target.Position = 0; + return target; } // ZIP local file header signature ("PK\x03\x04"). @@ -124,7 +87,8 @@ static void CopyOrRecurseZip(Stream source, Stream target) buffer.Write(head, 0, read); source.CopyTo(buffer); buffer.Position = 0; - Convert(buffer, target); + using var normalized = Convert(buffer); + normalized.CopyTo(target); return; } @@ -147,7 +111,8 @@ static async Task CopyOrRecurseZipAsync(Stream source, Stream target, Cancel can await buffer.WriteAsync(head, 0, read, cancel); await source.CopyToAsync(buffer, cancel); buffer.Position = 0; - await ConvertAsync(buffer, target, cancel); + using var normalized = await ConvertAsync(buffer, cancel); + await normalized.CopyToAsync(target, cancel); return; } diff --git a/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs b/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs index afded03..4567a3c 100644 --- a/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs +++ b/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs @@ -26,14 +26,7 @@ static class ZipPlatformNormalizer public static void Normalize(MemoryStream archive) { - if (!archive.TryGetBuffer(out var segment)) - { - // Buffer is not reachable (caller-owned array); nothing we can patch - // in place. Convert routes these through the buffered path instead. - return; - } - - var buffer = segment.Array!; + var buffer = archive.GetBuffer(); var length = (int) archive.Length; if (!TryFindEndOfCentralDirectory(buffer, length, out var eocd)) diff --git a/src/Tests/Tests.cs b/src/Tests/Tests.cs index 576724f..0978122 100644 --- a/src/Tests/Tests.cs +++ b/src/Tests/Tests.cs @@ -265,31 +265,28 @@ static MemoryStream Convert(Extension extension) static MemoryStream Convert(string packagePath) { - var targetStream = new MemoryStream(); - #region Convert using var sourceStream = File.OpenRead(packagePath); - DeterministicPackage.Convert(sourceStream, targetStream); + var target = DeterministicPackage.Convert(sourceStream); #endregion - return targetStream; + return target; } static async Task ConvertAsync(Extension extension) { var packagePath = Path.Combine(directory, $"sample.{extension}"); - var targetStream = new MemoryStream(); #region ConvertAsync using var sourceStream = File.OpenRead(packagePath); - await DeterministicPackage.ConvertAsync(sourceStream, targetStream); + var target = await DeterministicPackage.ConvertAsync(sourceStream); #endregion - return targetStream; + return target; } } diff --git a/src/Tests/ZipPlatformNormalizerTests.cs b/src/Tests/ZipPlatformNormalizerTests.cs index d864138..cdf6334 100644 --- a/src/Tests/ZipPlatformNormalizerTests.cs +++ b/src/Tests/ZipPlatformNormalizerTests.cs @@ -34,21 +34,6 @@ public void ConvertProducesOsIndependentCentralDirectory() AssertNormalized(result.ToArray()); } - // A non-seekable / non-MemoryStream target can't be patched in place, so - // Convert must route it through the buffered path and still emit normalized - // bytes. This is the same path nested-zip recursion uses. - [Test] - public void NonMemoryStreamTargetIsNormalizedViaFallback() - { - var inner = new MemoryStream(); - using (var target = new WriteOnlyStream(inner)) - { - DeterministicPackage.Convert(BuildArchive(), target); - } - - AssertNormalized(inner.ToArray()); - } - // The low byte of "version made by" encodes the spec version (a function of // the features used, not the OS) and must be left alone. [Test] @@ -139,20 +124,4 @@ static List CentralDirectoryRecords(byte[] buffer, int length) return records; } - - // Write-only, non-seekable stream that forwards to an inner stream, forcing - // Convert down its buffered fallback path (target is not a MemoryStream). - class WriteOnlyStream(Stream inner) : Stream - { - public override bool CanWrite => true; - public override bool CanRead => false; - public override bool CanSeek => false; - public override void Write(byte[] buffer, int offset, int count) => inner.Write(buffer, offset, count); - public override void Flush() => inner.Flush(); - public override long Length => throw new NotSupportedException(); - public override long Position { get => throw new NotSupportedException(); set => throw new NotSupportedException(); } - public override long Seek(long offset, SeekOrigin origin) => throw new NotSupportedException(); - public override void SetLength(long value) => throw new NotSupportedException(); - public override int Read(byte[] buffer, int offset, int count) => throw new NotSupportedException(); - } } From a1301a12f2e64cf631a31b1968180f29ad48b4fb Mon Sep 17 00:00:00 2001 From: Simon Cropp Date: Tue, 30 Jun 2026 21:35:40 +1000 Subject: [PATCH 4/4] . --- src/DeterministicIoPackaging/ZipPlatformNormalizer.cs | 2 -- src/Directory.Build.props | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs b/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs index 4567a3c..636c08c 100644 --- a/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs +++ b/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs @@ -1,7 +1,5 @@ using System.Buffers.Binary; -namespace DeterministicIoPackaging; - // ZipArchive stamps the host operating system into every central-directory // record: the high byte of the "version made by" field is 0 on Windows and 3 // on Unix (per the .ZIP spec § 4.4.2), and Unix builds can additionally leak diff --git a/src/Directory.Build.props b/src/Directory.Build.props index 4c613dd..399cd3e 100644 --- a/src/Directory.Build.props +++ b/src/Directory.Build.props @@ -2,7 +2,7 @@ CS1591;CS0649;CA1416;NU1608;NU1109;NU1510 - 0.27.0 + 0.28.0 preview 1.0.0 Modify System.IO.Packaging (https://learn.microsoft.com/en-us/dotnet/api/system.io.packaging) files to ensure they are deterministic. Helpful for testing, build reproducibility, security verification, and ensuring package integrity across different build environments.