diff --git a/claude.md b/claude.md
index 7a82ded..9c134d5 100644
--- a/claude.md
+++ b/claude.md
@@ -70,6 +70,8 @@ The content patcher receives the relationship patcher via constructor injection.
- ZIP entries use Deflate compression via `ZipArchive`. Binary output may differ between net48 and net10.0+ due to Deflate implementation differences, but XML content is identical
- Entries are sorted by `FullName` using `StringComparer.Ordinal`
- Binary snapshot tests use `UniqueForRuntime` to allow framework-specific verified files
+- `Convert(Stream source)` / `ConvertAsync(Stream source, Cancel)` are the only entry points and always return a fresh `MemoryStream`. Normalization is not a streaming operation (entries are reordered, every part is rewritten, the central directory is patched afterward, and `ZipArchive` read-mode needs a seekable source), so there is deliberately no `Convert(source, target)` overload — the result is always fully materialized in a buffer. Nested-zip recursion calls `Convert` then `CopyTo`s into the outer entry stream.
+- `ZipPlatformNormalizer` makes output **OS-independent**: `ZipArchive` stamps the host OS into each central-directory record (the "version made by" high byte is 0 on Windows, 3 on Unix; Unix can also leak file-mode bits into the external-attributes field). The normalizer rewrites the host byte to 0 (MS-DOS/FAT) and clears external attributes on every record, so identical bytes are produced on Windows/macOS/Linux. The only remaining cross-environment difference is the Deflate stream (cross-runtime, not cross-OS). It patches the central directory **in place** via `MemoryStream.GetBuffer()` — no re-zip, no extra copy.
- `PngNormalizer` writes raw zlib stored blocks (CMF+FLG + DEFLATE stored blocks + Adler-32) instead of using `ZLibStream`, which produces different output on net48 vs net10.0
Example patcher structure:
diff --git a/readme.md b/readme.md
index bebfe70..b566be8 100644
--- a/readme.md
+++ b/readme.md
@@ -58,13 +58,13 @@ See [Verify Naming docs](https://github.com/VerifyTests/Verify/blob/main/docs/na
### Convert
-
-
+
+
```cs
using var sourceStream = File.OpenRead(packagePath);
-await DeterministicPackage.ConvertAsync(sourceStream, targetStream);
+var target = DeterministicPackage.Convert(sourceStream);
```
-snippet source | anchor
+snippet source | anchor
@@ -74,9 +74,9 @@ await DeterministicPackage.ConvertAsync(sourceStream, targetStream);
```cs
using var sourceStream = File.OpenRead(packagePath);
-await DeterministicPackage.ConvertAsync(sourceStream, targetStream);
+var target = await DeterministicPackage.ConvertAsync(sourceStream);
```
-snippet source | anchor
+snippet source | anchor
diff --git a/src/Benchmarks/ConvertEndToEndBench.cs b/src/Benchmarks/ConvertEndToEndBench.cs
index ee6ef3a..8762b59 100644
--- a/src/Benchmarks/ConvertEndToEndBench.cs
+++ b/src/Benchmarks/ConvertEndToEndBench.cs
@@ -11,10 +11,9 @@ public void Setup() =>
sourceBytes = SampleXml.BuildDocxZip(Paragraphs, drawings: 200, hyperlinks: 200);
[Benchmark]
- public void Convert()
+ public MemoryStream Convert()
{
using var source = new MemoryStream(sourceBytes, writable: false);
- using var target = new MemoryStream();
- DeterministicPackage.Convert(source, target);
+ return DeterministicPackage.Convert(source);
}
}
diff --git a/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs b/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs
index 46d89f3..6174364 100644
--- a/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs
+++ b/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs
@@ -2,42 +2,45 @@
public static partial class DeterministicPackage
{
+ // Normalizing a package is not a streaming operation: entries are reordered,
+ // every part is rewritten, and the central directory is patched after the fact
+ // (see ZipPlatformNormalizer) — all of which need the whole archive in a
+ // seekable buffer. The result is therefore always a fresh MemoryStream, built
+ // and patched in place with no extra copy.
public static MemoryStream Convert(Stream source)
{
+ var patchers = CreatePatchers();
var target = new MemoryStream();
- Convert(source, target);
- target.Position = 0;
- return target;
- }
+ using (var sourceArchive = ReadArchive(source))
+ using (var targetArchive = CreateArchive(target))
+ {
+ foreach (var sourceEntry in sourceArchive.OrderedEntries())
+ {
+ DuplicateEntry(sourceEntry, targetArchive, patchers);
+ }
+ }
- public static async Task ConvertAsync(Stream source)
- {
- var target = new MemoryStream();
- await ConvertAsync(source, target);
+ ZipPlatformNormalizer.Normalize(target);
target.Position = 0;
return target;
}
- public static void Convert(Stream source, Stream target)
+ public static async Task ConvertAsync(Stream source, Cancel token = default)
{
var patchers = CreatePatchers();
- using var sourceArchive = ReadArchive(source);
- using var targetArchive = CreateArchive(target);
- foreach (var sourceEntry in sourceArchive.OrderedEntries())
+ var target = new MemoryStream();
+ using (var sourceArchive = ReadArchive(source))
+ using (var targetArchive = CreateArchive(target))
{
- DuplicateEntry(sourceEntry, targetArchive, patchers);
+ foreach (var sourceEntry in sourceArchive.OrderedEntries())
+ {
+ await DuplicateEntryAsync(sourceEntry, targetArchive, patchers, token);
+ }
}
- }
- public static async Task ConvertAsync(Stream source, Stream target, Cancel token = default)
- {
- var patchers = CreatePatchers();
- using var sourceArchive = ReadArchive(source);
- using var targetArchive = CreateArchive(target);
- foreach (var sourceEntry in sourceArchive.OrderedEntries())
- {
- await DuplicateEntryAsync(sourceEntry, targetArchive, patchers, token);
- }
+ ZipPlatformNormalizer.Normalize(target);
+ target.Position = 0;
+ return target;
}
// ZIP local file header signature ("PK\x03\x04").
@@ -84,7 +87,8 @@ static void CopyOrRecurseZip(Stream source, Stream target)
buffer.Write(head, 0, read);
source.CopyTo(buffer);
buffer.Position = 0;
- Convert(buffer, target);
+ using var normalized = Convert(buffer);
+ normalized.CopyTo(target);
return;
}
@@ -107,7 +111,8 @@ static async Task CopyOrRecurseZipAsync(Stream source, Stream target, Cancel can
await buffer.WriteAsync(head, 0, read, cancel);
await source.CopyToAsync(buffer, cancel);
buffer.Position = 0;
- await ConvertAsync(buffer, target, cancel);
+ using var normalized = await ConvertAsync(buffer, cancel);
+ await normalized.CopyToAsync(target, cancel);
return;
}
diff --git a/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs b/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs
new file mode 100644
index 0000000..636c08c
--- /dev/null
+++ b/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs
@@ -0,0 +1,96 @@
+using System.Buffers.Binary;
+
+// ZipArchive stamps the host operating system into every central-directory
+// record: the high byte of the "version made by" field is 0 on Windows and 3
+// on Unix (per the .ZIP spec § 4.4.2), and Unix builds can additionally leak
+// file-mode bits into the external-file-attributes field. Neither affects the
+// archive's content, but both make the produced bytes depend on the OS that
+// ran the conversion, defeating cross-platform determinism. This pass rewrites
+// those fields to fixed values so the output is identical on every OS.
+static class ZipPlatformNormalizer
+{
+ // Central-directory file header signature "PK\x01\x02".
+ static readonly byte[] centralDirectoryHeader = [0x50, 0x4B, 0x01, 0x02];
+
+ // End-of-central-directory record signature "PK\x05\x06".
+ static readonly byte[] endOfCentralDirectory = [0x50, 0x4B, 0x05, 0x06];
+
+ // Fixed size of a central-directory file header before the variable-length
+ // file name, extra field and comment.
+ const int centralHeaderSize = 46;
+
+ // Minimum size of the end-of-central-directory record (no archive comment).
+ const int eocdSize = 22;
+
+ public static void Normalize(MemoryStream archive)
+ {
+ var buffer = archive.GetBuffer();
+ var length = (int) archive.Length;
+
+ if (!TryFindEndOfCentralDirectory(buffer, length, out var eocd))
+ {
+ return;
+ }
+
+ var count = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(eocd + 10));
+ var offset = (int) BinaryPrimitives.ReadUInt32LittleEndian(buffer.AsSpan(eocd + 16));
+
+ for (var record = 0; record < count; record++)
+ {
+ if (offset + centralHeaderSize > length ||
+ !StartsWith(buffer, offset, centralDirectoryHeader))
+ {
+ // Not the structure we expect (e.g. a ZIP64 archive). Leave it
+ // untouched rather than risk corrupting the output.
+ return;
+ }
+
+ // "version made by" high byte (host OS): force to 0 (MS-DOS / FAT).
+ buffer[offset + 5] = 0;
+ // External file attributes (4 bytes): clear any Unix mode bits.
+ buffer.AsSpan(offset + 38, 4).Clear();
+
+ var nameLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 28));
+ var extraLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 30));
+ var commentLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 32));
+ offset += centralHeaderSize + nameLength + extraLength + commentLength;
+ }
+ }
+
+ // Scans backwards for the EOCD signature. ZipArchive writes no archive
+ // comment, so it is normally the final 22 bytes, but scanning keeps this
+ // robust to any trailing bytes.
+ static bool TryFindEndOfCentralDirectory(byte[] buffer, int length, out int position)
+ {
+ for (var i = length - eocdSize; i >= 0; i--)
+ {
+ if (StartsWith(buffer, i, endOfCentralDirectory))
+ {
+ position = i;
+ return true;
+ }
+ }
+
+ position = -1;
+ return false;
+ }
+
+ static bool StartsWith(byte[] buffer, int offset, byte[] signature)
+ {
+ if (offset < 0 ||
+ offset + signature.Length > buffer.Length)
+ {
+ return false;
+ }
+
+ for (var i = 0; i < signature.Length; i++)
+ {
+ if (buffer[offset + i] != signature[i])
+ {
+ return false;
+ }
+ }
+
+ return true;
+ }
+}
diff --git a/src/Directory.Build.props b/src/Directory.Build.props
index 4c613dd..399cd3e 100644
--- a/src/Directory.Build.props
+++ b/src/Directory.Build.props
@@ -2,7 +2,7 @@
CS1591;CS0649;CA1416;NU1608;NU1109;NU1510
- 0.27.0
+ 0.28.0
preview
1.0.0
Modify System.IO.Packaging (https://learn.microsoft.com/en-us/dotnet/api/system.io.packaging) files to ensure they are deterministic. Helpful for testing, build reproducibility, security verification, and ensuring package integrity across different build environments.
diff --git a/src/Tests/Tests.cs b/src/Tests/Tests.cs
index 576724f..0978122 100644
--- a/src/Tests/Tests.cs
+++ b/src/Tests/Tests.cs
@@ -265,31 +265,28 @@ static MemoryStream Convert(Extension extension)
static MemoryStream Convert(string packagePath)
{
- var targetStream = new MemoryStream();
-
#region Convert
using var sourceStream = File.OpenRead(packagePath);
- DeterministicPackage.Convert(sourceStream, targetStream);
+ var target = DeterministicPackage.Convert(sourceStream);
#endregion
- return targetStream;
+ return target;
}
static async Task ConvertAsync(Extension extension)
{
var packagePath = Path.Combine(directory, $"sample.{extension}");
- var targetStream = new MemoryStream();
#region ConvertAsync
using var sourceStream = File.OpenRead(packagePath);
- await DeterministicPackage.ConvertAsync(sourceStream, targetStream);
+ var target = await DeterministicPackage.ConvertAsync(sourceStream);
#endregion
- return targetStream;
+ return target;
}
}
diff --git a/src/Tests/ZipPlatformNormalizerTests.cs b/src/Tests/ZipPlatformNormalizerTests.cs
new file mode 100644
index 0000000..cdf6334
--- /dev/null
+++ b/src/Tests/ZipPlatformNormalizerTests.cs
@@ -0,0 +1,127 @@
+[TestFixture]
+public class ZipPlatformNormalizerTests
+{
+ // Simulates an archive produced on Unix (host byte 3, Unix mode bits in the
+ // external attributes) and asserts the normalizer rewrites both to the
+ // Windows/FAT-neutral values. This would fail before the normalizer existed.
+ [Test]
+ public void RewritesUnixHostByteAndExternalAttributes()
+ {
+ var archive = BuildArchive();
+ var buffer = archive.GetBuffer();
+ var length = (int) archive.Length;
+
+ foreach (var record in CentralDirectoryRecords(buffer, length))
+ {
+ // host OS = Unix
+ buffer[record + 5] = 3;
+ // external attributes carrying Unix mode 0100644 in the high word
+ BinaryPrimitives.WriteUInt32LittleEndian(buffer.AsSpan(record + 38), 0x81A4_0000);
+ }
+
+ ZipPlatformNormalizer.Normalize(archive);
+
+ AssertNormalized(archive.ToArray());
+ }
+
+ // The end-to-end guarantee: whatever OS runs the conversion, the central
+ // directory comes out OS-independent.
+ [Test]
+ public void ConvertProducesOsIndependentCentralDirectory()
+ {
+ using var result = DeterministicPackage.Convert(BuildArchive());
+
+ AssertNormalized(result.ToArray());
+ }
+
+ // The low byte of "version made by" encodes the spec version (a function of
+ // the features used, not the OS) and must be left alone.
+ [Test]
+ public void PreservesSpecVersionLowByte()
+ {
+ var archive = BuildArchive();
+ var buffer = archive.GetBuffer();
+ var length = (int) archive.Length;
+
+ var before = CentralDirectoryRecords(buffer, length)
+ .Select(_ => buffer[_ + 4])
+ .ToList();
+
+ ZipPlatformNormalizer.Normalize(archive);
+
+ var after = CentralDirectoryRecords(buffer, length)
+ .Select(_ => buffer[_ + 4])
+ .ToList();
+
+ Assert.That(after, Is.EqualTo(before));
+ }
+
+ static void AssertNormalized(byte[] archive)
+ {
+ var records = CentralDirectoryRecords(archive, archive.Length);
+
+ Assert.That(records, Is.Not.Empty);
+ foreach (var record in records)
+ {
+ Assert.Multiple(() =>
+ {
+ Assert.That(archive[record + 5], Is.EqualTo(0),
+ "host-OS byte must be normalized to 0");
+ Assert.That(BinaryPrimitives.ReadUInt32LittleEndian(archive.AsSpan(record + 38)), Is.EqualTo(0u),
+ "external file attributes must be cleared");
+ });
+ }
+ }
+
+ static MemoryStream BuildArchive()
+ {
+ var stream = new MemoryStream();
+ using (var archive = new Archive(stream, ZipArchiveMode.Create, leaveOpen: true))
+ {
+ foreach (var name in (string[]) ["alpha.txt", "beta.txt", "nested/gamma.txt"])
+ {
+ var entry = archive.CreateEntry(name, CompressionLevel.Optimal);
+ using var entryStream = entry.Open();
+ using var writer = new StreamWriter(entryStream, Encoding.UTF8);
+ writer.Write("payload");
+ }
+ }
+
+ stream.Position = 0;
+ return stream;
+ }
+
+ // Walks the central directory via the EOCD record, returning the byte offset
+ // of each central-directory file header.
+ static List CentralDirectoryRecords(byte[] buffer, int length)
+ {
+ var eocd = -1;
+ for (var i = length - 22; i >= 0; i--)
+ {
+ if (buffer[i] == 0x50 &&
+ buffer[i + 1] == 0x4B &&
+ buffer[i + 2] == 0x05 &&
+ buffer[i + 3] == 0x06)
+ {
+ eocd = i;
+ break;
+ }
+ }
+
+ Assert.That(eocd, Is.GreaterThanOrEqualTo(0), "EOCD record not found");
+
+ var count = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(eocd + 10));
+ var offset = (int) BinaryPrimitives.ReadUInt32LittleEndian(buffer.AsSpan(eocd + 16));
+ var records = new List();
+ for (var i = 0; i < count; i++)
+ {
+ records.Add(offset);
+ var nameLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 28));
+ var extraLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 30));
+ var commentLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 32));
+ offset += 46 + nameLength + extraLength + commentLength;
+ }
+
+ return records;
+ }
+}