From 176d07b9bd00bc83d310f0034452a1448414c55f Mon Sep 17 00:00:00 2001 From: 0WD0 Date: Fri, 22 May 2026 14:53:21 +0800 Subject: [PATCH] fix(gix-pack)!: accept non-canonical pack entry size headers C Git accepts overlong pack entry size encodings, and real servers can send them when reusing existing pack data. Accept these headers while recording the actual header length consumed from the pack. Keeping the actual header length avoids recomputing a canonical length from the decoded size, which would break pack offset reconstruction and ofs-delta base offset calculations for non-canonical entries. BREAKING CHANGE: gix_pack::data::Entry now has a public encoded_header_size field. --- gix-pack/src/data/entry/decode.rs | 73 +++++++++++++++--------------- gix-pack/src/data/entry/mod.rs | 9 ++-- gix-pack/src/data/mod.rs | 10 ++++ gix-pack/tests/pack/data/fuzzed.rs | 42 +++++++++++++++++ 4 files changed, 95 insertions(+), 39 deletions(-) diff --git a/gix-pack/src/data/entry/decode.rs b/gix-pack/src/data/entry/decode.rs index b78ab3b3b7f..728eebdb0df 100644 --- a/gix-pack/src/data/entry/decode.rs +++ b/gix-pack/src/data/entry/decode.rs @@ -58,6 +58,7 @@ impl data::Entry { header: object, decompressed_size: size, data_offset: pack_offset + consumed as u64, + encoded_header_size: consumed.try_into().expect("pack entry headers fit into u16"), }) } @@ -96,6 +97,7 @@ impl data::Entry { header: object, decompressed_size: size, data_offset: pack_offset + consumed as u64, + encoded_header_size: consumed.try_into().expect("pack entry headers fit into u16"), }) } } @@ -121,12 +123,6 @@ fn streaming_parse_header_info(read: &mut dyn io::Read) -> Result<(u8, u64, usiz .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "pack entry header overflowed"))?; shift += 7; } - if i != encoded_pack_entry_header_size(size) { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "pack entry header uses a non-canonical size encoding", - )); - } Ok((type_id, size, i)) } @@ -149,11 +145,6 @@ fn parse_header_info(data: &[u8]) -> Result<(u8, u64, usize), Error> { size = size.checked_add(component).ok_or(Error::Overflow)?; shift += 7; } - if i != encoded_pack_entry_header_size(size) { - return Err(Error::Corrupt { - message: "pack entry header uses a non-canonical size encoding", - }); - } Ok((type_id, size, i)) } @@ -178,35 +169,45 @@ fn parse_leb64(data: &[u8]) -> Result<(u64, usize), Error> { Ok((value, i)) } -/// Return the canonical byte length of a pack-entry size header for `size`. -/// -/// We use this to reject overlong size encodings during parsing. -/// That matters for our delta resolution implementation, which later reconstructs an entry's -/// pack offset from `data_offset - header_size()`. If we accepted non-canonical encodings here, -/// `header_size()` would compute the canonical length while `data_offset` would reflect the -/// actually consumed bytes, breaking that invariant and allowing malformed delta entries to point -/// back to themselves or otherwise walk the wrong base objects. -fn encoded_pack_entry_header_size(mut size: u64) -> usize { - let mut bytes = 1; - size >>= 4; - while size != 0 { - bytes += 1; - size >>= 7; - } - bytes -} - #[cfg(test)] mod tests { use super::*; #[test] - fn rejects_non_canonical_pack_entry_header_encoding() { - assert!(matches!( - data::Entry::from_bytes(&[0xed, 0x00], 0, gix_hash::Kind::Sha1.len_in_bytes()), - Err(Error::Corrupt { - message: "pack entry header uses a non-canonical size encoding" - }) - )); + fn accepts_non_canonical_pack_entry_header_encoding() { + let pack_offset = 42; + let entry = data::Entry::from_bytes(&[0xb3, 0x00], pack_offset, gix_hash::Kind::Sha1.len_in_bytes()) + .expect("non-canonical size encodings are accepted by git"); + + assert_eq!(entry.header, data::entry::Header::Blob); + assert_eq!(entry.decompressed_size, 3); + assert_eq!(entry.header_size(), 2); + assert_eq!(entry.pack_offset(), pack_offset); + assert_eq!(entry.data_offset, pack_offset + 2); + } + + #[test] + fn non_canonical_pack_entry_header_keeps_ofs_delta_base_offsets_correct() { + let pack_offset = 100; + let base_distance = 5; + let entry = data::Entry::from_bytes( + &[0xe4, 0x00, base_distance], + pack_offset, + gix_hash::Kind::Sha1.len_in_bytes(), + ) + .expect("non-canonical ofs-delta size encodings are accepted by git"); + + assert_eq!( + entry.header, + data::entry::Header::OfsDelta { + base_distance: base_distance.into() + } + ); + assert_eq!(entry.header_size(), 3); + assert_eq!(entry.pack_offset(), pack_offset); + assert_eq!( + entry.checked_base_pack_offset(base_distance.into()), + Some(pack_offset - u64::from(base_distance)) + ); } } diff --git a/gix-pack/src/data/entry/mod.rs b/gix-pack/src/data/entry/mod.rs index 651cc06195b..88cd81d5d4a 100644 --- a/gix-pack/src/data/entry/mod.rs +++ b/gix-pack/src/data/entry/mod.rs @@ -33,8 +33,7 @@ impl Entry { /// Compute the pack offset to the base entry of the object represented by this entry, or /// return `None` if the distance would underflow or is invalid. pub fn checked_base_pack_offset(&self, distance: u64) -> Option { - let pack_offset = self.data_offset - self.header_size() as u64; - Header::verified_base_pack_offset(pack_offset, distance) + Header::verified_base_pack_offset(self.pack_offset(), distance) } /// Compute the pack offset to the base entry of the object represented by this entry. @@ -52,7 +51,11 @@ impl Entry { } /// The amount of bytes used to describe this entry in the pack. The header starts at [`Self::pack_offset()`] pub fn header_size(&self) -> usize { - self.header.size(self.decompressed_size) + if self.encoded_header_size == 0 { + self.header.size(self.decompressed_size) + } else { + self.encoded_header_size.into() + } } } diff --git a/gix-pack/src/data/mod.rs b/gix-pack/src/data/mod.rs index ca51c5fe474..a2f7ad9b422 100644 --- a/gix-pack/src/data/mod.rs +++ b/gix-pack/src/data/mod.rs @@ -20,6 +20,16 @@ pub struct Entry { pub decompressed_size: u64, /// absolute offset to compressed object data in the pack, just behind the entry's header pub data_offset: Offset, + /// The amount of bytes used to encode the entry header in the pack. + /// + /// Git accepts non-canonical size encodings in pack entry headers, so this records the actual + /// byte length instead of recomputing the canonical length from the decoded size. + /// + /// A value of `0` means the actual length is unknown, causing [`Entry::header_size()`] to + /// recompute the canonical length from the decoded size. This is useful for deserializing data + /// produced by older versions. + #[cfg_attr(feature = "serde", serde(default))] + pub encoded_header_size: u16, } mod file; diff --git a/gix-pack/tests/pack/data/fuzzed.rs b/gix-pack/tests/pack/data/fuzzed.rs index bb6bd3bd9e9..499ffe064cc 100644 --- a/gix-pack/tests/pack/data/fuzzed.rs +++ b/gix-pack/tests/pack/data/fuzzed.rs @@ -55,6 +55,48 @@ fn oversized_pack_entry_header_is_reported_without_panicking() { ); } +#[test] +fn non_canonical_pack_entry_header_is_accepted() { + fn deflate(bytes: &[u8]) -> Vec { + let mut out = gix_features::zlib::stream::deflate::Write::new(Vec::new()); + out.write_all(bytes).expect("writing to deflater succeeds"); + out.flush().expect("flushing deflater succeeds"); + out.into_inner() + } + + let mut bytes = data::header::encode(data::Version::V2, 1).to_vec(); + bytes.extend_from_slice(&[0xb3, 0x00]); + bytes.extend_from_slice(&deflate(b"20\n")); + bytes.extend_from_slice(&[0; 20]); + + let file = data::File::from_data( + bytes.as_slice(), + PathBuf::from("non-canonical-header.pack"), + gix_hash::Kind::Sha1, + ) + .expect("pack header is syntactically valid"); + let entry = file + .entry(12) + .expect("git-compatible non-canonical entry header is accepted"); + assert_eq!(entry.header_size(), 2, "the actual header size is retained"); + assert_eq!(entry.pack_offset(), 12, "the entry start remains recoverable"); + + let mut out = Vec::new(); + let outcome = file + .decode_entry( + entry, + &mut out, + &mut Default::default(), + &|_, _| None, + &mut gix_pack::cache::Never, + ) + .expect("non-canonical entry decodes like a canonical git object"); + + assert_eq!(out, b"20\n"); + assert_eq!(outcome.kind, gix_object::Kind::Blob); + assert_eq!(outcome.object_size, 3); +} + /// Reproducer for the large-allocation fuzz case: attacker-controlled object sizes must not cause /// `decode_entry()` to attempt multi-gigabyte allocations. #[test]