diff --git a/gix-pack/src/data/entry/decode.rs b/gix-pack/src/data/entry/decode.rs index b78ab3b3b7f..728eebdb0df 100644 --- a/gix-pack/src/data/entry/decode.rs +++ b/gix-pack/src/data/entry/decode.rs @@ -58,6 +58,7 @@ impl data::Entry { header: object, decompressed_size: size, data_offset: pack_offset + consumed as u64, + encoded_header_size: consumed.try_into().expect("pack entry headers fit into u16"), }) } @@ -96,6 +97,7 @@ impl data::Entry { header: object, decompressed_size: size, data_offset: pack_offset + consumed as u64, + encoded_header_size: consumed.try_into().expect("pack entry headers fit into u16"), }) } } @@ -121,12 +123,6 @@ fn streaming_parse_header_info(read: &mut dyn io::Read) -> Result<(u8, u64, usiz .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "pack entry header overflowed"))?; shift += 7; } - if i != encoded_pack_entry_header_size(size) { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "pack entry header uses a non-canonical size encoding", - )); - } Ok((type_id, size, i)) } @@ -149,11 +145,6 @@ fn parse_header_info(data: &[u8]) -> Result<(u8, u64, usize), Error> { size = size.checked_add(component).ok_or(Error::Overflow)?; shift += 7; } - if i != encoded_pack_entry_header_size(size) { - return Err(Error::Corrupt { - message: "pack entry header uses a non-canonical size encoding", - }); - } Ok((type_id, size, i)) } @@ -178,35 +169,45 @@ fn parse_leb64(data: &[u8]) -> Result<(u64, usize), Error> { Ok((value, i)) } -/// Return the canonical byte length of a pack-entry size header for `size`. -/// -/// We use this to reject overlong size encodings during parsing. -/// That matters for our delta resolution implementation, which later reconstructs an entry's -/// pack offset from `data_offset - header_size()`. If we accepted non-canonical encodings here, -/// `header_size()` would compute the canonical length while `data_offset` would reflect the -/// actually consumed bytes, breaking that invariant and allowing malformed delta entries to point -/// back to themselves or otherwise walk the wrong base objects. -fn encoded_pack_entry_header_size(mut size: u64) -> usize { - let mut bytes = 1; - size >>= 4; - while size != 0 { - bytes += 1; - size >>= 7; - } - bytes -} - #[cfg(test)] mod tests { use super::*; #[test] - fn rejects_non_canonical_pack_entry_header_encoding() { - assert!(matches!( - data::Entry::from_bytes(&[0xed, 0x00], 0, gix_hash::Kind::Sha1.len_in_bytes()), - Err(Error::Corrupt { - message: "pack entry header uses a non-canonical size encoding" - }) - )); + fn accepts_non_canonical_pack_entry_header_encoding() { + let pack_offset = 42; + let entry = data::Entry::from_bytes(&[0xb3, 0x00], pack_offset, gix_hash::Kind::Sha1.len_in_bytes()) + .expect("non-canonical size encodings are accepted by git"); + + assert_eq!(entry.header, data::entry::Header::Blob); + assert_eq!(entry.decompressed_size, 3); + assert_eq!(entry.header_size(), 2); + assert_eq!(entry.pack_offset(), pack_offset); + assert_eq!(entry.data_offset, pack_offset + 2); + } + + #[test] + fn non_canonical_pack_entry_header_keeps_ofs_delta_base_offsets_correct() { + let pack_offset = 100; + let base_distance = 5; + let entry = data::Entry::from_bytes( + &[0xe4, 0x00, base_distance], + pack_offset, + gix_hash::Kind::Sha1.len_in_bytes(), + ) + .expect("non-canonical ofs-delta size encodings are accepted by git"); + + assert_eq!( + entry.header, + data::entry::Header::OfsDelta { + base_distance: base_distance.into() + } + ); + assert_eq!(entry.header_size(), 3); + assert_eq!(entry.pack_offset(), pack_offset); + assert_eq!( + entry.checked_base_pack_offset(base_distance.into()), + Some(pack_offset - u64::from(base_distance)) + ); } } diff --git a/gix-pack/src/data/entry/mod.rs b/gix-pack/src/data/entry/mod.rs index 651cc06195b..88cd81d5d4a 100644 --- a/gix-pack/src/data/entry/mod.rs +++ b/gix-pack/src/data/entry/mod.rs @@ -33,8 +33,7 @@ impl Entry { /// Compute the pack offset to the base entry of the object represented by this entry, or /// return `None` if the distance would underflow or is invalid. pub fn checked_base_pack_offset(&self, distance: u64) -> Option { - let pack_offset = self.data_offset - self.header_size() as u64; - Header::verified_base_pack_offset(pack_offset, distance) + Header::verified_base_pack_offset(self.pack_offset(), distance) } /// Compute the pack offset to the base entry of the object represented by this entry. @@ -52,7 +51,11 @@ impl Entry { } /// The amount of bytes used to describe this entry in the pack. The header starts at [`Self::pack_offset()`] pub fn header_size(&self) -> usize { - self.header.size(self.decompressed_size) + if self.encoded_header_size == 0 { + self.header.size(self.decompressed_size) + } else { + self.encoded_header_size.into() + } } } diff --git a/gix-pack/src/data/mod.rs b/gix-pack/src/data/mod.rs index ca51c5fe474..a2f7ad9b422 100644 --- a/gix-pack/src/data/mod.rs +++ b/gix-pack/src/data/mod.rs @@ -20,6 +20,16 @@ pub struct Entry { pub decompressed_size: u64, /// absolute offset to compressed object data in the pack, just behind the entry's header pub data_offset: Offset, + /// The amount of bytes used to encode the entry header in the pack. + /// + /// Git accepts non-canonical size encodings in pack entry headers, so this records the actual + /// byte length instead of recomputing the canonical length from the decoded size. + /// + /// A value of `0` means the actual length is unknown, causing [`Entry::header_size()`] to + /// recompute the canonical length from the decoded size. This is useful for deserializing data + /// produced by older versions. + #[cfg_attr(feature = "serde", serde(default))] + pub encoded_header_size: u16, } mod file; diff --git a/gix-pack/tests/pack/data/fuzzed.rs b/gix-pack/tests/pack/data/fuzzed.rs index bb6bd3bd9e9..499ffe064cc 100644 --- a/gix-pack/tests/pack/data/fuzzed.rs +++ b/gix-pack/tests/pack/data/fuzzed.rs @@ -55,6 +55,48 @@ fn oversized_pack_entry_header_is_reported_without_panicking() { ); } +#[test] +fn non_canonical_pack_entry_header_is_accepted() { + fn deflate(bytes: &[u8]) -> Vec { + let mut out = gix_features::zlib::stream::deflate::Write::new(Vec::new()); + out.write_all(bytes).expect("writing to deflater succeeds"); + out.flush().expect("flushing deflater succeeds"); + out.into_inner() + } + + let mut bytes = data::header::encode(data::Version::V2, 1).to_vec(); + bytes.extend_from_slice(&[0xb3, 0x00]); + bytes.extend_from_slice(&deflate(b"20\n")); + bytes.extend_from_slice(&[0; 20]); + + let file = data::File::from_data( + bytes.as_slice(), + PathBuf::from("non-canonical-header.pack"), + gix_hash::Kind::Sha1, + ) + .expect("pack header is syntactically valid"); + let entry = file + .entry(12) + .expect("git-compatible non-canonical entry header is accepted"); + assert_eq!(entry.header_size(), 2, "the actual header size is retained"); + assert_eq!(entry.pack_offset(), 12, "the entry start remains recoverable"); + + let mut out = Vec::new(); + let outcome = file + .decode_entry( + entry, + &mut out, + &mut Default::default(), + &|_, _| None, + &mut gix_pack::cache::Never, + ) + .expect("non-canonical entry decodes like a canonical git object"); + + assert_eq!(out, b"20\n"); + assert_eq!(outcome.kind, gix_object::Kind::Blob); + assert_eq!(outcome.object_size, 3); +} + /// Reproducer for the large-allocation fuzz case: attacker-controlled object sizes must not cause /// `decode_entry()` to attempt multi-gigabyte allocations. #[test]