diff --git a/claude.md b/claude.md index 7a82ded..9c134d5 100644 --- a/claude.md +++ b/claude.md @@ -70,6 +70,8 @@ The content patcher receives the relationship patcher via constructor injection. - ZIP entries use Deflate compression via `ZipArchive`. Binary output may differ between net48 and net10.0+ due to Deflate implementation differences, but XML content is identical - Entries are sorted by `FullName` using `StringComparer.Ordinal` - Binary snapshot tests use `UniqueForRuntime` to allow framework-specific verified files +- `Convert(Stream source)` / `ConvertAsync(Stream source, Cancel)` are the only entry points and always return a fresh `MemoryStream`. Normalization is not a streaming operation (entries are reordered, every part is rewritten, the central directory is patched afterward, and `ZipArchive` read-mode needs a seekable source), so there is deliberately no `Convert(source, target)` overload — the result is always fully materialized in a buffer. Nested-zip recursion calls `Convert` then `CopyTo`s into the outer entry stream. +- `ZipPlatformNormalizer` makes output **OS-independent**: `ZipArchive` stamps the host OS into each central-directory record (the "version made by" high byte is 0 on Windows, 3 on Unix; Unix can also leak file-mode bits into the external-attributes field). The normalizer rewrites the host byte to 0 (MS-DOS/FAT) and clears external attributes on every record, so identical bytes are produced on Windows/macOS/Linux. The only remaining cross-environment difference is the Deflate stream (cross-runtime, not cross-OS). It patches the central directory **in place** via `MemoryStream.GetBuffer()` — no re-zip, no extra copy. 
- `PngNormalizer` writes raw zlib stored blocks (CMF+FLG + DEFLATE stored blocks + Adler-32) instead of using `ZLibStream`, which produces different output on net48 vs net10.0 Example patcher structure: diff --git a/readme.md b/readme.md index bebfe70..b566be8 100644 --- a/readme.md +++ b/readme.md @@ -58,13 +58,13 @@ See [Verify Naming docs](https://github.com/VerifyTests/Verify/blob/main/docs/na ### Convert - - + + ```cs using var sourceStream = File.OpenRead(packagePath); -await DeterministicPackage.ConvertAsync(sourceStream, targetStream); +var target = DeterministicPackage.Convert(sourceStream); ``` -snippet source | anchor +snippet source | anchor @@ -74,9 +74,9 @@ await DeterministicPackage.ConvertAsync(sourceStream, targetStream); ```cs using var sourceStream = File.OpenRead(packagePath); -await DeterministicPackage.ConvertAsync(sourceStream, targetStream); +var target = await DeterministicPackage.ConvertAsync(sourceStream); ``` -snippet source | anchor +snippet source | anchor diff --git a/src/Benchmarks/ConvertEndToEndBench.cs b/src/Benchmarks/ConvertEndToEndBench.cs index ee6ef3a..8762b59 100644 --- a/src/Benchmarks/ConvertEndToEndBench.cs +++ b/src/Benchmarks/ConvertEndToEndBench.cs @@ -11,10 +11,9 @@ public void Setup() => sourceBytes = SampleXml.BuildDocxZip(Paragraphs, drawings: 200, hyperlinks: 200); [Benchmark] - public void Convert() + public MemoryStream Convert() { using var source = new MemoryStream(sourceBytes, writable: false); - using var target = new MemoryStream(); - DeterministicPackage.Convert(source, target); + return DeterministicPackage.Convert(source); } } diff --git a/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs b/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs index 46d89f3..6174364 100644 --- a/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs +++ b/src/DeterministicIoPackaging/DeterministicPackage_Convert.cs @@ -2,42 +2,45 @@ public static partial class DeterministicPackage { + // 
Normalizing a package is not a streaming operation: entries are reordered, + // every part is rewritten, and the central directory is patched after the fact + // (see ZipPlatformNormalizer) — all of which need the whole archive in a + // seekable buffer. The result is therefore always a fresh MemoryStream, built + // and patched in place with no extra copy. public static MemoryStream Convert(Stream source) { + var patchers = CreatePatchers(); var target = new MemoryStream(); - Convert(source, target); - target.Position = 0; - return target; - } + using (var sourceArchive = ReadArchive(source)) + using (var targetArchive = CreateArchive(target)) + { + foreach (var sourceEntry in sourceArchive.OrderedEntries()) + { + DuplicateEntry(sourceEntry, targetArchive, patchers); + } + } - public static async Task ConvertAsync(Stream source) - { - var target = new MemoryStream(); - await ConvertAsync(source, target); + ZipPlatformNormalizer.Normalize(target); target.Position = 0; return target; } - public static void Convert(Stream source, Stream target) + public static async Task ConvertAsync(Stream source, Cancel token = default) { var patchers = CreatePatchers(); - using var sourceArchive = ReadArchive(source); - using var targetArchive = CreateArchive(target); - foreach (var sourceEntry in sourceArchive.OrderedEntries()) + var target = new MemoryStream(); + using (var sourceArchive = ReadArchive(source)) + using (var targetArchive = CreateArchive(target)) { - DuplicateEntry(sourceEntry, targetArchive, patchers); + foreach (var sourceEntry in sourceArchive.OrderedEntries()) + { + await DuplicateEntryAsync(sourceEntry, targetArchive, patchers, token); + } } - } - public static async Task ConvertAsync(Stream source, Stream target, Cancel token = default) - { - var patchers = CreatePatchers(); - using var sourceArchive = ReadArchive(source); - using var targetArchive = CreateArchive(target); - foreach (var sourceEntry in sourceArchive.OrderedEntries()) - { - await 
DuplicateEntryAsync(sourceEntry, targetArchive, patchers, token); - } + ZipPlatformNormalizer.Normalize(target); + target.Position = 0; + return target; } // ZIP local file header signature ("PK\x03\x04"). @@ -84,7 +87,8 @@ static void CopyOrRecurseZip(Stream source, Stream target) buffer.Write(head, 0, read); source.CopyTo(buffer); buffer.Position = 0; - Convert(buffer, target); + using var normalized = Convert(buffer); + normalized.CopyTo(target); return; } @@ -107,7 +111,8 @@ static async Task CopyOrRecurseZipAsync(Stream source, Stream target, Cancel can await buffer.WriteAsync(head, 0, read, cancel); await source.CopyToAsync(buffer, cancel); buffer.Position = 0; - await ConvertAsync(buffer, target, cancel); + using var normalized = await ConvertAsync(buffer, cancel); + await normalized.CopyToAsync(target, cancel); return; } diff --git a/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs b/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs new file mode 100644 index 0000000..636c08c --- /dev/null +++ b/src/DeterministicIoPackaging/ZipPlatformNormalizer.cs @@ -0,0 +1,96 @@ +using System.Buffers.Binary; + +// ZipArchive stamps the host operating system into every central-directory +// record: the high byte of the "version made by" field is 0 on Windows and 3 +// on Unix (per the .ZIP spec § 4.4.2), and Unix builds can additionally leak +// file-mode bits into the external-file-attributes field. Neither affects the +// archive's content, but both make the produced bytes depend on the OS that +// ran the conversion, defeating cross-platform determinism. This pass rewrites +// those fields to fixed values so the output is identical on every OS. +static class ZipPlatformNormalizer +{ + // Central-directory file header signature "PK\x01\x02". + static readonly byte[] centralDirectoryHeader = [0x50, 0x4B, 0x01, 0x02]; + + // End-of-central-directory record signature "PK\x05\x06". 
+ static readonly byte[] endOfCentralDirectory = [0x50, 0x4B, 0x05, 0x06]; + + // Fixed size of a central-directory file header before the variable-length + // file name, extra field and comment. + const int centralHeaderSize = 46; + + // Minimum size of the end-of-central-directory record (no archive comment). + const int eocdSize = 22; + + public static void Normalize(MemoryStream archive) + { + var buffer = archive.GetBuffer(); + var length = (int) archive.Length; + + if (!TryFindEndOfCentralDirectory(buffer, length, out var eocd)) + { + return; + } + + var count = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(eocd + 10)); + var offset = (int) BinaryPrimitives.ReadUInt32LittleEndian(buffer.AsSpan(eocd + 16)); + + for (var record = 0; record < count; record++) + { + if (offset + centralHeaderSize > length || + !StartsWith(buffer, offset, centralDirectoryHeader)) + { + // Not the structure we expect (e.g. a ZIP64 archive). Leave it + // untouched rather than risk corrupting the output. + return; + } + + // "version made by" high byte (host OS): force to 0 (MS-DOS / FAT). + buffer[offset + 5] = 0; + // External file attributes (4 bytes): clear any Unix mode bits. + buffer.AsSpan(offset + 38, 4).Clear(); + + var nameLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 28)); + var extraLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 30)); + var commentLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 32)); + offset += centralHeaderSize + nameLength + extraLength + commentLength; + } + } + + // Scans backwards for the EOCD signature. ZipArchive writes no archive + // comment, so it is normally the final 22 bytes, but scanning keeps this + // robust to any trailing bytes. 
+ static bool TryFindEndOfCentralDirectory(byte[] buffer, int length, out int position) + { + for (var i = length - eocdSize; i >= 0; i--) + { + if (StartsWith(buffer, i, endOfCentralDirectory)) + { + position = i; + return true; + } + } + + position = -1; + return false; + } + + static bool StartsWith(byte[] buffer, int offset, byte[] signature) + { + if (offset < 0 || + offset + signature.Length > buffer.Length) + { + return false; + } + + for (var i = 0; i < signature.Length; i++) + { + if (buffer[offset + i] != signature[i]) + { + return false; + } + } + + return true; + } +} diff --git a/src/Directory.Build.props b/src/Directory.Build.props index 4c613dd..399cd3e 100644 --- a/src/Directory.Build.props +++ b/src/Directory.Build.props @@ -2,7 +2,7 @@ CS1591;CS0649;CA1416;NU1608;NU1109;NU1510 - 0.27.0 + 0.28.0 preview 1.0.0 Modify System.IO.Packaging (https://learn.microsoft.com/en-us/dotnet/api/system.io.packaging) files to ensure they are deterministic. Helpful for testing, build reproducibility, security verification, and ensuring package integrity across different build environments. 
diff --git a/src/Tests/Tests.cs b/src/Tests/Tests.cs index 576724f..0978122 100644 --- a/src/Tests/Tests.cs +++ b/src/Tests/Tests.cs @@ -265,31 +265,28 @@ static MemoryStream Convert(Extension extension) static MemoryStream Convert(string packagePath) { - var targetStream = new MemoryStream(); - #region Convert using var sourceStream = File.OpenRead(packagePath); - DeterministicPackage.Convert(sourceStream, targetStream); + var target = DeterministicPackage.Convert(sourceStream); #endregion - return targetStream; + return target; } static async Task ConvertAsync(Extension extension) { var packagePath = Path.Combine(directory, $"sample.{extension}"); - var targetStream = new MemoryStream(); #region ConvertAsync using var sourceStream = File.OpenRead(packagePath); - await DeterministicPackage.ConvertAsync(sourceStream, targetStream); + var target = await DeterministicPackage.ConvertAsync(sourceStream); #endregion - return targetStream; + return target; } } diff --git a/src/Tests/ZipPlatformNormalizerTests.cs b/src/Tests/ZipPlatformNormalizerTests.cs new file mode 100644 index 0000000..cdf6334 --- /dev/null +++ b/src/Tests/ZipPlatformNormalizerTests.cs @@ -0,0 +1,127 @@ +[TestFixture] +public class ZipPlatformNormalizerTests +{ + // Simulates an archive produced on Unix (host byte 3, Unix mode bits in the + // external attributes) and asserts the normalizer rewrites both to the + // Windows/FAT-neutral values. This would fail before the normalizer existed. 
+ [Test] + public void RewritesUnixHostByteAndExternalAttributes() + { + var archive = BuildArchive(); + var buffer = archive.GetBuffer(); + var length = (int) archive.Length; + + foreach (var record in CentralDirectoryRecords(buffer, length)) + { + // host OS = Unix + buffer[record + 5] = 3; + // external attributes carrying Unix mode 0100644 in the high word + BinaryPrimitives.WriteUInt32LittleEndian(buffer.AsSpan(record + 38), 0x81A4_0000); + } + + ZipPlatformNormalizer.Normalize(archive); + + AssertNormalized(archive.ToArray()); + } + + // The end-to-end guarantee: whatever OS runs the conversion, the central + // directory comes out OS-independent. + [Test] + public void ConvertProducesOsIndependentCentralDirectory() + { + using var result = DeterministicPackage.Convert(BuildArchive()); + + AssertNormalized(result.ToArray()); + } + + // The low byte of "version made by" encodes the spec version (a function of + // the features used, not the OS) and must be left alone. + [Test] + public void PreservesSpecVersionLowByte() + { + var archive = BuildArchive(); + var buffer = archive.GetBuffer(); + var length = (int) archive.Length; + + var before = CentralDirectoryRecords(buffer, length) + .Select(_ => buffer[_ + 4]) + .ToList(); + + ZipPlatformNormalizer.Normalize(archive); + + var after = CentralDirectoryRecords(buffer, length) + .Select(_ => buffer[_ + 4]) + .ToList(); + + Assert.That(after, Is.EqualTo(before)); + } + + static void AssertNormalized(byte[] archive) + { + var records = CentralDirectoryRecords(archive, archive.Length); + + Assert.That(records, Is.Not.Empty); + foreach (var record in records) + { + Assert.Multiple(() => + { + Assert.That(archive[record + 5], Is.EqualTo(0), + "host-OS byte must be normalized to 0"); + Assert.That(BinaryPrimitives.ReadUInt32LittleEndian(archive.AsSpan(record + 38)), Is.EqualTo(0u), + "external file attributes must be cleared"); + }); + } + } + + static MemoryStream BuildArchive() + { + var stream = new MemoryStream(); 
+ using (var archive = new Archive(stream, ZipArchiveMode.Create, leaveOpen: true)) + { + foreach (var name in (string[]) ["alpha.txt", "beta.txt", "nested/gamma.txt"]) + { + var entry = archive.CreateEntry(name, CompressionLevel.Optimal); + using var entryStream = entry.Open(); + using var writer = new StreamWriter(entryStream, Encoding.UTF8); + writer.Write("payload"); + } + } + + stream.Position = 0; + return stream; + } + + // Walks the central directory via the EOCD record, returning the byte offset + // of each central-directory file header. + static List CentralDirectoryRecords(byte[] buffer, int length) + { + var eocd = -1; + for (var i = length - 22; i >= 0; i--) + { + if (buffer[i] == 0x50 && + buffer[i + 1] == 0x4B && + buffer[i + 2] == 0x05 && + buffer[i + 3] == 0x06) + { + eocd = i; + break; + } + } + + Assert.That(eocd, Is.GreaterThanOrEqualTo(0), "EOCD record not found"); + + var count = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(eocd + 10)); + var offset = (int) BinaryPrimitives.ReadUInt32LittleEndian(buffer.AsSpan(eocd + 16)); + var records = new List(); + for (var i = 0; i < count; i++) + { + records.Add(offset); + var nameLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 28)); + var extraLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 30)); + var commentLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(offset + 32)); + offset += 46 + nameLength + extraLength + commentLength; + } + + return records; + } +}