Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions src/Benchmarks/PngNormalizerBench.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Benchmark for PngNormalizer.Normalize over a synthetic PNG whose
// uncompressed IDAT payload size is controlled by ImageBytes.
[MemoryDiagnoser]
public class PngNormalizerBench
{
    // Input PNG for the current ImageBytes value; assigned in Setup.
    byte[] pngBytes = null!;

    // Size, in bytes, of the uncompressed payload requested from SampleXml.BuildPng.
    [Params(64 * 1024, 1024 * 1024)]
    public int ImageBytes { get; set; }

    // Builds the input once so the PNG construction is not part of the measurement.
    [GlobalSetup]
    public void Setup()
    {
        pngBytes = SampleXml.BuildPng(ImageBytes);
    }

    // Normalizes the prepared PNG from a read-only in-memory stream into a
    // fresh in-memory target.
    [Benchmark]
    public void Normalize()
    {
        using (var source = new MemoryStream(pngBytes, writable: false))
        using (var target = new MemoryStream())
        {
            PngNormalizer.Normalize(source, target);
        }
    }
}
48 changes: 48 additions & 0 deletions src/Benchmarks/SampleXml.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using System.Buffers.Binary;

static class SampleXml
{
static XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
Expand Down Expand Up @@ -193,4 +195,50 @@ static void WriteEntry(Archive zip, string name, XDocument xml)
using var writer = new StreamWriter(stream, new UTF8Encoding(false));
xml.Save(writer, SaveOptions.DisableFormatting);
}

// Produces a minimal PNG: the 8-byte signature followed by IHDR, IDAT and
// IEND chunks. The IDAT data is a zlib stream wrapping `rawBytes` bytes of a
// fixed byte pattern, which is the payload PngNormalizer decompresses and
// rewrites as stored zlib blocks. All chunk CRCs are written as zero: the
// normalizer reads chunk lengths and re-derives the IDAT CRC itself, so the
// input CRC values do not matter for benchmarking it.
public static byte[] BuildPng(int rawBytes)
{
    // Deterministic filler so every run benchmarks identical input.
    var payload = new byte[rawBytes];
    var index = 0;
    while (index < payload.Length)
    {
        payload[index] = (byte) (index * 31 + 7);
        index++;
    }

    using var deflated = new MemoryStream();
    using (var zlib = new ZLibStream(deflated, CompressionLevel.Optimal, leaveOpen: true))
    {
        zlib.Write(payload);
    }

    // The zlib stream is disposed above, so its output is complete before it
    // is captured here.
    var idatData = deflated.ToArray();

    using var output = new MemoryStream();
    ReadOnlySpan<byte> magic = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
    output.Write(magic);
    WriteChunk(output, "IHDR"u8, new byte[13]);
    WriteChunk(output, "IDAT"u8, idatData);
    WriteChunk(output, "IEND"u8, []);
    return output.ToArray();
}

// Appends one PNG chunk to `png`: a 4-byte big-endian data length, the chunk
// type bytes, the data itself, and then four zero bytes where the CRC would go.
static void WriteChunk(Stream png, ReadOnlySpan<byte> type, ReadOnlySpan<byte> data)
{
    Span<byte> lengthField = stackalloc byte[sizeof(int)];
    BinaryPrimitives.WriteInt32BigEndian(lengthField, data.Length);

    png.Write(lengthField);
    png.Write(type);
    png.Write(data);

    // CRC placeholder (PngNormalizer does not validate input chunk CRCs).
    ReadOnlySpan<byte> zeroCrc = [0, 0, 0, 0];
    png.Write(zeroCrc);
}
}
4 changes: 2 additions & 2 deletions src/DeterministicIoPackaging/DeterministicPackage.cs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ static void DuplicateEntry(Entry sourceEntry, Archive targetArchive, PatcherSet
return;
}

CopyOrRecurseZip(sourceStream, targetStream);
CopyOrRecurseZip(sourceStream, targetStream, sourceEntry.Length);
}

static async Task DuplicateEntryAsync(Entry sourceEntry, Archive targetArchive, PatcherSet currentPatchers, Cancel cancel)
Expand Down Expand Up @@ -115,7 +115,7 @@ static async Task DuplicateEntryAsync(Entry sourceEntry, Archive targetArchive,
return;
}

await CopyOrRecurseZipAsync(sourceStream, targetStream, cancel);
await CopyOrRecurseZipAsync(sourceStream, targetStream, sourceEntry.Length, cancel);
}

static bool IsSpreadsheetXml(Entry entry) =>
Expand Down
25 changes: 21 additions & 4 deletions src/DeterministicIoPackaging/DeterministicPackage_Convert.cs
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,17 @@ static bool LooksLikeZip(byte[] head, int length)
// packages flow through with whatever non-deterministic deflate/timestamps
// their producer emitted, defeating the deterministic guarantee for the
// outer package.
static void CopyOrRecurseZip(Stream source, Stream target)
static void CopyOrRecurseZip(Stream source, Stream target, long sourceLength)
{
var head = new byte[4];
var read = ReadUpTo(source, head, 4);

if (LooksLikeZip(head, read))
{
using var buffer = new MemoryStream();
// The whole entry (head + remainder) is buffered for the recursive
// Convert. Its uncompressed size is known from the central directory,
// so presize the buffer to avoid MemoryStream's grow-and-copy churn.
using var buffer = new MemoryStream(InitialCapacity(sourceLength));
buffer.Write(head, 0, read);
source.CopyTo(buffer);
buffer.Position = 0;
Expand All @@ -100,14 +103,16 @@ static void CopyOrRecurseZip(Stream source, Stream target)
source.CopyTo(target);
}

static async Task CopyOrRecurseZipAsync(Stream source, Stream target, Cancel cancel)
static async Task CopyOrRecurseZipAsync(Stream source, Stream target, long sourceLength, Cancel cancel)
{
var head = new byte[4];
var read = await ReadUpToAsync(source, head, 4, cancel);

if (LooksLikeZip(head, read))
{
using var buffer = new MemoryStream();
// See CopyOrRecurseZip: presize the recursion buffer to the entry's
// known uncompressed size to avoid grow-and-copy reallocations.
using var buffer = new MemoryStream(InitialCapacity(sourceLength));
await buffer.WriteAsync(head, 0, read, cancel);
await source.CopyToAsync(buffer, cancel);
buffer.Position = 0;
Expand All @@ -124,6 +129,18 @@ static async Task CopyOrRecurseZipAsync(Stream source, Stream target, Cancel can
await source.CopyToAsync(target, cancel);
}

// Clamp a ZipArchiveEntry.Length to a valid MemoryStream initial capacity.
// Returns 0 (the parameterless-constructor default) for unknown (zero or
// negative) lengths and for lengths too large to back with a single byte[].
//
// The upper bound is the runtime's maximum array length, not int.MaxValue:
// MemoryStream(int capacity) allocates its backing array eagerly, so a value
// above that limit would make the constructor throw instead of falling back
// to the default growth behaviour.
//
// NOTE(review): the length originates from archive metadata, so a producer
// can declare more than the entry really holds; confirm whether a lower
// presize cap is wanted for untrusted input.
static int InitialCapacity(long sourceLength)
{
    // Same value as Array.MaxLength, spelled out so no newer API is required.
    const int maxArrayLength = 0x7FFFFFC7;

    if (sourceLength is > 0 and <= maxArrayLength)
    {
        return (int) sourceLength;
    }

    return 0;
}

static int ReadUpTo(Stream source, byte[] buffer, int count) =>
source.ReadAtLeast(buffer.AsSpan(0, count), count, throwOnEndOfStream: false);

Expand Down
26 changes: 7 additions & 19 deletions src/DeterministicIoPackaging/Patching/DocumentPatcher.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,15 @@ public void PatchXml(XDocument xml, string entryName)
{
var root = xml.Root!;

WordRevisionMarkers.Strip(xml);

// Collect id attributes in one Descendants() walk rather than two.
// Preserves the original ordering: all wp:docPr ids first, then
// pic:cNvPr ids, numbering continuing from where the docPrs left off.
// Storing the XAttribute directly avoids a redundant Attribute("id")
// lookup during renumbering.
// Strip revision markers and collect the drawing-id attributes in a
// single tree traversal. word/document.xml is the largest content part,
// so fusing these two passes avoids walking it twice. Ordering is
// preserved: all wp:docPr ids first, then pic:cNvPr ids, numbering
// continuing from where the docPrs left off. Storing the XAttribute
// directly avoids a redundant Attribute("id") lookup during renumbering.
var docPrIds = new List<XAttribute>();
var picIds = new List<XAttribute>();
foreach (var element in root.Descendants())
{
var name = element.Name;
if (name == wpDocPr)
{
docPrIds.Add(element.Attribute("id")!);
}
else if (name == picCNvPr)
{
picIds.Add(element.Attribute("id")!);
}
}
WordRevisionMarkers.StripAndCollectDrawingIds(root, wpDocPr, picCNvPr, docPrIds, picIds);

var index = 1;
foreach (var attr in docPrIds)
Expand Down
64 changes: 51 additions & 13 deletions src/DeterministicIoPackaging/Patching/WordRevisionMarkers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,24 +33,62 @@ public static void Strip(XDocument xml)
return;
}

// Walk the attribute linked-list manually rather than via LINQ +
// ToList(). Capturing NextAttribute before Remove() lets us mutate
// safely without per-element allocations — common case is zero
// matching attributes, so this method used to allocate a Where
// iterator and a List<XAttribute> for every node in the document.
foreach (var element in root.DescendantsAndSelf())
{
var attr = element.FirstAttribute;
while (attr != null)
StripAttributes(element);
}
}

// Strips revision markers and, in the same traversal, collects the
// wp:docPr / pic:cNvPr id attributes DocumentPatcher renumbers.
// word/document.xml is the largest part in a .docx, so folding the strip
// and the id collection into a single Descendants() walk avoids traversing
// the whole tree twice. The collection order is identical to a standalone
// root.Descendants() pass: all docPr ids then all pic ids in document order.
public static void StripAndCollectDrawingIds(
XElement root,
XName docPrName,
XName cNvPrName,
List<XAttribute> docPrIds,
List<XAttribute> picIds)
{
// Process the root's own attributes first, mirroring Strip's use of
// DescendantsAndSelf. The root of word/document.xml is w:document —
// never a drawing element — so it needs stripping, not id collection.
StripAttributes(root);

foreach (var element in root.Descendants())
{
StripAttributes(element);

var name = element.Name;
if (name == docPrName)
{
docPrIds.Add(element.Attribute("id")!);
}
else if (name == cNvPrName)
{
var next = attr.NextAttribute;
if (attributesToRemove.Contains(attr.Name))
{
attr.Remove();
}
picIds.Add(element.Attribute("id")!);
}
}
}

attr = next;
// Removes from `element` every attribute whose name is in attributesToRemove.
// The attribute chain is followed by hand (FirstAttribute / NextAttribute)
// instead of going through LINQ + ToList(): the successor is read before any
// Remove() call so removal is safe mid-walk, and nothing is allocated per
// element — the common case is zero matching attributes, so a Where iterator
// plus a List<XAttribute> per node would be pure waste.
static void StripAttributes(XElement element)
{
    for (var current = element.FirstAttribute; current != null;)
    {
        // Capture the successor first; it is read before the removal below.
        var following = current.NextAttribute;

        if (attributesToRemove.Contains(current.Name))
        {
            current.Remove();
        }

        current = following;
    }
}
}
109 changes: 64 additions & 45 deletions src/DeterministicIoPackaging/PngNormalizer.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
using System.Buffers.Binary;

namespace DeterministicIoPackaging;

static class PngNormalizer
{
static readonly byte[] pngSignature = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
Expand Down Expand Up @@ -98,56 +96,77 @@ static void WriteNormalizedIdat(Stream target, byte[] zlibBytes, int zlibLength)
var raw = decompressedStream.GetBuffer();
var rawLength = (int) decompressedStream.Length;

// Write raw zlib stored format to avoid framework DEFLATE differences.
// Format: CMF(0x78) FLG(0x01) + DEFLATE stored blocks + Adler-32
using var compressOutput = new MemoryStream();
// CMF: deflate, 32K window
compressOutput.WriteByte(0x78);
// FLG: no dict, check bits make CMF*256+FLG divisible by 31
compressOutput.WriteByte(0x01);
// Emit the IDAT chunk straight to target instead of staging the whole
// normalized zlib stream in a second MemoryStream. The chunk length is
// computed analytically and a single Crc32 accumulates the same bytes as
// they are written, so this avoids buffering a second full copy of the
// image and two extra passes over it (GetBuffer for the write + the CRC).
//
// Raw zlib stored format (avoids framework DEFLATE differences):
// CMF(0x78) FLG(0x01) + DEFLATE stored blocks + Adler-32.
// Each stored block is a 5-byte header (BFINAL/BTYPE + LEN + NLEN) plus
// its payload; a zero-length image still emits one empty final block.
var blockCount = rawLength == 0 ? 1 : (rawLength + 65534) / 65535;
var idatLength = 2 + blockCount * 5 + rawLength + 4;

// Write DEFLATE stored blocks (max 65535 bytes each)
var offset = 0;
while (offset < rawLength)
{
var blockSize = Math.Min(rawLength - offset, 65535);
var isFinal = offset + blockSize >= rawLength;
compressOutput.WriteByte(isFinal ? (byte) 1 : (byte) 0); // BFINAL + BTYPE=00
compressOutput.WriteByte((byte) (blockSize & 0xFF));
compressOutput.WriteByte((byte) (blockSize >> 8));
compressOutput.WriteByte((byte) (~blockSize & 0xFF));
compressOutput.WriteByte((byte) ((~blockSize >> 8) & 0xFF));
compressOutput.Write(raw, offset, blockSize);
offset += blockSize;
}
var crc = new Crc32();

// Chunk length (big-endian) + "IDAT". The CRC covers the type + data, not the length.
Span<byte> lengthAndType = stackalloc byte[8];
BinaryPrimitives.WriteInt32BigEndian(lengthAndType, idatLength);
idatType.AsSpan().CopyTo(lengthAndType.Slice(4));
target.Write(lengthAndType);
crc.Append(lengthAndType.Slice(4, 4));

// Reusable scratch for the 2-byte zlib header and the 5-byte block headers.
Span<byte> block = stackalloc byte[5];
// CMF: deflate, 32K window. FLG: no dict, CMF*256+FLG divisible by 31.
block[0] = 0x78;
block[1] = 0x01;
target.Write(block.Slice(0, 2));
crc.Append(block.Slice(0, 2));

if (rawLength == 0)
{
// Empty data: single final stored block with length 0
compressOutput.WriteByte(1);
compressOutput.Write([0, 0, 0xFF, 0xFF], 0, 4);
// Empty data: single final stored block with length 0.
block[0] = 1;
block[1] = 0;
block[2] = 0;
block[3] = 0xFF;
block[4] = 0xFF;
target.Write(block);
crc.Append(block);
}
else
{
// DEFLATE stored blocks (max 65535 bytes each).
var offset = 0;
while (offset < rawLength)
{
var blockSize = Math.Min(rawLength - offset, 65535);
var isFinal = offset + blockSize >= rawLength;
block[0] = isFinal ? (byte) 1 : (byte) 0; // BFINAL + BTYPE=00
block[1] = (byte) (blockSize & 0xFF);
block[2] = (byte) (blockSize >> 8);
block[3] = (byte) (~blockSize & 0xFF);
block[4] = (byte) ((~blockSize >> 8) & 0xFF);
target.Write(block);
crc.Append(block);

target.Write(raw, offset, blockSize);
crc.Append(raw.AsSpan(offset, blockSize));
offset += blockSize;
}
}

// Adler-32 checksum
var adler = Adler32(raw, rawLength);
compressOutput.WriteByte((byte) (adler >> 24));
compressOutput.WriteByte((byte) (adler >> 16));
compressOutput.WriteByte((byte) (adler >> 8));
compressOutput.WriteByte((byte) adler);

var length = (int) compressOutput.Length;
var header = new byte[4];
BinaryPrimitives.WriteInt32BigEndian(header, length);
target.Write(header);
target.Write(idatType);
target.Write(compressOutput.GetBuffer(), 0, length);
// Adler-32 checksum, then the chunk's CRC-32 (over type + data).
Span<byte> trailer = stackalloc byte[4];
BinaryPrimitives.WriteUInt32BigEndian(trailer, Adler32(raw, rawLength));
target.Write(trailer);
crc.Append(trailer);

var crc = new Crc32();
crc.Append(idatType);
crc.Append(compressOutput.GetBuffer().AsSpan(0, length));
var crcBytes = new byte[4];
BinaryPrimitives.WriteUInt32BigEndian(crcBytes, crc.GetCurrentHashAsUInt32());
target.Write(crcBytes);
BinaryPrimitives.WriteUInt32BigEndian(trailer, crc.GetCurrentHashAsUInt32());
target.Write(trailer);
}

// Adler-32 checksum as defined in RFC 1950.
Expand Down
Loading