Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions claude.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ The content patcher receives the relationship patcher via constructor injection.
- ZIP entries use Deflate compression via `ZipArchive`. Binary output may differ between net48 and net10.0+ due to Deflate implementation differences, but XML content is identical
- Entries are sorted by `FullName` using `StringComparer.Ordinal`
- Binary snapshot tests use `UniqueForRuntime` to allow framework-specific verified files
- `Convert(Stream source)` / `ConvertAsync(Stream source, Cancel)` are the only entry points and always return a fresh `MemoryStream`. Normalization is not a streaming operation (entries are reordered, every part is rewritten, the central directory is patched afterward, and `ZipArchive` read-mode needs a seekable source), so there is deliberately no `Convert(source, target)` overload — the result is always fully materialized in a buffer. Nested-zip recursion calls `Convert` then `CopyTo`s into the outer entry stream.
- `ZipPlatformNormalizer` makes output **OS-independent**: `ZipArchive` stamps the host OS into each central-directory record (the "version made by" high byte is 0 on Windows, 3 on Unix; Unix can also leak file-mode bits into the external-attributes field). The normalizer rewrites the host byte to 0 (MS-DOS/FAT) and clears external attributes on every record, so identical bytes are produced on Windows/macOS/Linux. The only remaining cross-environment difference is the Deflate stream (cross-runtime, not cross-OS). It patches the central directory **in place** via `MemoryStream.GetBuffer()` — no re-zip, no extra copy.
- `PngNormalizer` writes raw zlib stored blocks (CMF+FLG + DEFLATE stored blocks + Adler-32) instead of using `ZLibStream`, which produces different output on net48 vs net10.0

Example patcher structure:
Expand Down
12 changes: 6 additions & 6 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,13 @@ See [Verify Naming docs](https://github.com/VerifyTests/Verify/blob/main/docs/na

### Convert

<!-- snippet: ConvertAsync -->
<a id='snippet-ConvertAsync'></a>
<!-- snippet: Convert -->
<a id='snippet-Convert'></a>
```cs
using var sourceStream = File.OpenRead(packagePath);
await DeterministicPackage.ConvertAsync(sourceStream, targetStream);
var target = DeterministicPackage.Convert(sourceStream);
```
<sup><a href='/src/Tests/Tests.cs#L286-L291' title='Snippet source file'>snippet source</a> | <a href='#snippet-ConvertAsync' title='Start of snippet'>anchor</a></sup>
<sup><a href='/src/Tests/Tests.cs#L268-L273' title='Snippet source file'>snippet source</a> | <a href='#snippet-Convert' title='Start of snippet'>anchor</a></sup>
<!-- endSnippet -->


Expand All @@ -74,9 +74,9 @@ await DeterministicPackage.ConvertAsync(sourceStream, targetStream);
<a id='snippet-ConvertAsync'></a>
```cs
using var sourceStream = File.OpenRead(packagePath);
await DeterministicPackage.ConvertAsync(sourceStream, targetStream);
var target = await DeterministicPackage.ConvertAsync(sourceStream);
```
<sup><a href='/src/Tests/Tests.cs#L286-L291' title='Snippet source file'>snippet source</a> | <a href='#snippet-ConvertAsync' title='Start of snippet'>anchor</a></sup>
<sup><a href='/src/Tests/Tests.cs#L283-L288' title='Snippet source file'>snippet source</a> | <a href='#snippet-ConvertAsync' title='Start of snippet'>anchor</a></sup>
<!-- endSnippet -->


Expand Down
5 changes: 2 additions & 3 deletions src/Benchmarks/ConvertEndToEndBench.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,9 @@ public void Setup() =>
sourceBytes = SampleXml.BuildDocxZip(Paragraphs, drawings: 200, hyperlinks: 200);

[Benchmark]
public void Convert()
public MemoryStream Convert()
{
using var source = new MemoryStream(sourceBytes, writable: false);
using var target = new MemoryStream();
DeterministicPackage.Convert(source, target);
return DeterministicPackage.Convert(source);
}
}
55 changes: 30 additions & 25 deletions src/DeterministicIoPackaging/DeterministicPackage_Convert.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,45 @@

public static partial class DeterministicPackage
{
// Normalizing a package is not a streaming operation: entries are reordered,
// every part is rewritten, and the central directory is patched after the fact
// (see ZipPlatformNormalizer) — all of which need the whole archive in a
// seekable buffer. The result is therefore always a fresh MemoryStream, built
// and patched in place with no extra copy.
public static MemoryStream Convert(Stream source)
{
var patchers = CreatePatchers();
var target = new MemoryStream();
Convert(source, target);
target.Position = 0;
return target;
}
using (var sourceArchive = ReadArchive(source))
using (var targetArchive = CreateArchive(target))
{
foreach (var sourceEntry in sourceArchive.OrderedEntries())
{
DuplicateEntry(sourceEntry, targetArchive, patchers);
}
}

public static async Task<MemoryStream> ConvertAsync(Stream source)
{
var target = new MemoryStream();
await ConvertAsync(source, target);
ZipPlatformNormalizer.Normalize(target);
target.Position = 0;
return target;
}

public static void Convert(Stream source, Stream target)
public static async Task<MemoryStream> ConvertAsync(Stream source, Cancel token = default)
{
var patchers = CreatePatchers();
using var sourceArchive = ReadArchive(source);
using var targetArchive = CreateArchive(target);
foreach (var sourceEntry in sourceArchive.OrderedEntries())
var target = new MemoryStream();
using (var sourceArchive = ReadArchive(source))
using (var targetArchive = CreateArchive(target))
{
DuplicateEntry(sourceEntry, targetArchive, patchers);
foreach (var sourceEntry in sourceArchive.OrderedEntries())
{
await DuplicateEntryAsync(sourceEntry, targetArchive, patchers, token);
}
}
}

public static async Task ConvertAsync(Stream source, Stream target, Cancel token = default)
{
var patchers = CreatePatchers();
using var sourceArchive = ReadArchive(source);
using var targetArchive = CreateArchive(target);
foreach (var sourceEntry in sourceArchive.OrderedEntries())
{
await DuplicateEntryAsync(sourceEntry, targetArchive, patchers, token);
}
ZipPlatformNormalizer.Normalize(target);
target.Position = 0;
return target;
}

// ZIP local file header signature ("PK\x03\x04").
Expand Down Expand Up @@ -84,7 +87,8 @@ static void CopyOrRecurseZip(Stream source, Stream target)
buffer.Write(head, 0, read);
source.CopyTo(buffer);
buffer.Position = 0;
Convert(buffer, target);
using var normalized = Convert(buffer);
normalized.CopyTo(target);
return;
}

Expand All @@ -107,7 +111,8 @@ static async Task CopyOrRecurseZipAsync(Stream source, Stream target, Cancel can
await buffer.WriteAsync(head, 0, read, cancel);
await source.CopyToAsync(buffer, cancel);
buffer.Position = 0;
await ConvertAsync(buffer, target, cancel);
using var normalized = await ConvertAsync(buffer, cancel);
await normalized.CopyToAsync(target, cancel);
return;
}

Expand Down
96 changes: 96 additions & 0 deletions src/DeterministicIoPackaging/ZipPlatformNormalizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
using System.Buffers.Binary;

// ZipArchive stamps the host operating system into every central-directory
// record: the high byte of the "version made by" field is 0 on Windows and 3
// on Unix (per the .ZIP spec § 4.4.2), and Unix builds can additionally leak
// file-mode bits into the external-file-attributes field. Neither affects the
// archive's content, but both make the produced bytes depend on the OS that
// ran the conversion, defeating cross-platform determinism. This pass rewrites
// those fields to fixed values so the output is identical on every OS.
//
// The pass is best-effort: whenever the bytes do not look like the plain
// (non-ZIP64) layout it expects, it stops instead of throwing.
static class ZipPlatformNormalizer
{
    // Central-directory file header signature "PK\x01\x02".
    static readonly byte[] centralDirectoryHeader = [0x50, 0x4B, 0x01, 0x02];

    // End-of-central-directory record signature "PK\x05\x06".
    static readonly byte[] endOfCentralDirectory = [0x50, 0x4B, 0x05, 0x06];

    // Fixed size of a central-directory file header before the variable-length
    // file name, extra field and comment.
    const int centralHeaderSize = 46;

    // Minimum size of the end-of-central-directory record (no archive comment).
    const int eocdSize = 22;

    // Patches the central directory of the zip held in `archive` in place,
    // writing straight into the array returned by MemoryStream.GetBuffer().
    // The stream's Position and Length are not changed.
    //
    // archive: a MemoryStream whose buffer is exposable (GetBuffer() throws
    //          otherwise) and whose content starts at index 0 of that buffer.
    public static void Normalize(MemoryStream archive)
    {
        var buffer = archive.GetBuffer();
        var length = (int) archive.Length;

        if (!TryFindEndOfCentralDirectory(buffer, length, out var eocd))
        {
            return;
        }

        // EOCD layout: +10 = total number of central-directory records,
        // +16 = offset of the first central-directory record.
        var count = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(eocd + 10));

        // Held as a 64-bit value: the field is an unsigned 32-bit number read
        // from the archive, so narrowing it to int before the bounds check
        // could wrap and let an out-of-range offset through.
        long offset = BinaryPrimitives.ReadUInt32LittleEndian(buffer.AsSpan(eocd + 16));

        for (var record = 0; record < count; record++)
        {
            // The cast below is only reached when the first test has already
            // proven that offset + 46 fits inside the stream.
            if (offset + centralHeaderSize > length ||
                !StartsWith(buffer, (int) offset, centralDirectoryHeader))
            {
                // Not the structure we expect (e.g. a ZIP64 archive, whose
                // offset field holds 0xFFFFFFFF). Stop here rather than risk
                // corrupting the output; records already patched stay patched.
                return;
            }

            var start = (int) offset;

            // "version made by" high byte (host OS): force to 0 (MS-DOS / FAT).
            buffer[start + 5] = 0;
            // External file attributes (4 bytes): clear any Unix mode bits.
            buffer.AsSpan(start + 38, 4).Clear();

            // The next record follows this one's variable-length tail.
            var nameLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(start + 28));
            var extraLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(start + 30));
            var commentLength = BinaryPrimitives.ReadUInt16LittleEndian(buffer.AsSpan(start + 32));
            offset += centralHeaderSize + nameLength + extraLength + commentLength;
        }
    }

    // Scans backwards for the EOCD signature. ZipArchive writes no archive
    // comment, so it is normally the final 22 bytes, but scanning keeps this
    // robust to any trailing bytes. Returns false (position = -1) when the
    // stream is shorter than an EOCD record or holds no signature.
    static bool TryFindEndOfCentralDirectory(byte[] buffer, int length, out int position)
    {
        for (var i = length - eocdSize; i >= 0; i--)
        {
            if (StartsWith(buffer, i, endOfCentralDirectory))
            {
                position = i;
                return true;
            }
        }

        position = -1;
        return false;
    }

    // True when `signature` occurs in `buffer` at `offset`. Any offset that
    // would read outside the array yields false.
    static bool StartsWith(byte[] buffer, int offset, byte[] signature)
    {
        // Written as a subtraction so a large offset cannot wrap the sum.
        if (offset < 0 ||
            offset > buffer.Length - signature.Length)
        {
            return false;
        }

        for (var i = 0; i < signature.Length; i++)
        {
            if (buffer[offset + i] != signature[i])
            {
                return false;
            }
        }

        return true;
    }
}
2 changes: 1 addition & 1 deletion src/Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<Project>
<PropertyGroup>
<NoWarn>CS1591;CS0649;CA1416;NU1608;NU1109;NU1510</NoWarn>
<Version>0.27.0</Version>
<Version>0.28.0</Version>
<LangVersion>preview</LangVersion>
<AssemblyVersion>1.0.0</AssemblyVersion>
<Description>Modify System.IO.Packaging (https://learn.microsoft.com/en-us/dotnet/api/system.io.packaging) files to ensure they are deterministic. Helpful for testing, build reproducibility, security verification, and ensuring package integrity across different build environments.</Description>
Expand Down
11 changes: 4 additions & 7 deletions src/Tests/Tests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -265,31 +265,28 @@ static MemoryStream Convert(Extension extension)

static MemoryStream Convert(string packagePath)
{
var targetStream = new MemoryStream();

#region Convert

using var sourceStream = File.OpenRead(packagePath);
DeterministicPackage.Convert(sourceStream, targetStream);
var target = DeterministicPackage.Convert(sourceStream);

#endregion

return targetStream;
return target;
}


static async Task<MemoryStream> ConvertAsync(Extension extension)
{
var packagePath = Path.Combine(directory, $"sample.{extension}");
var targetStream = new MemoryStream();

#region ConvertAsync

using var sourceStream = File.OpenRead(packagePath);
await DeterministicPackage.ConvertAsync(sourceStream, targetStream);
var target = await DeterministicPackage.ConvertAsync(sourceStream);

#endregion

return targetStream;
return target;
}
}
Loading
Loading