From 5225ec858dd0416670cfb2512fe8f045abe34e85 Mon Sep 17 00:00:00 2001 From: David Hotham Date: Sat, 4 Apr 2026 14:19:19 +0100 Subject: [PATCH] Return install record mismatches from install() install() now returns a list of (source_record, installed_record) tuples for files whose hash or size differs from the wheel's RECORD. Callers that don't care can ignore the return value (previously None). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/installer/_core.py | 50 ++++++- src/installer/destinations.py | 2 + tests/test_core.py | 274 +++++++++++++++++++++++++++++----- 3 files changed, 290 insertions(+), 36 deletions(-) diff --git a/src/installer/_core.py b/src/installer/_core.py index e23f06e..a71eec9 100644 --- a/src/installer/_core.py +++ b/src/installer/_core.py @@ -15,6 +15,10 @@ __all__ = ["install"] +# Hash algorithms considered secure enough for RECORD verification (PEP 427). +_COMPLIANT_HASH_ALGORITHMS = frozenset({"sha256", "sha384", "sha512"}) + + def _process_WHEEL_file(source: WheelSource) -> Scheme: # noqa: N802 """Process the WHEEL file, from ``source``. @@ -67,7 +71,7 @@ def install( source: WheelSource, destination: WheelDestination, additional_metadata: dict[str, bytes], -) -> None: +) -> list[tuple[RecordEntry, RecordEntry]]: """Install wheel described by ``source`` into ``destination``. :param source: wheel to install. @@ -75,12 +79,22 @@ def install( :param additional_metadata: additional metadata files to generate, usually generated by the caller. + :returns: A list of record mismatches between the wheel's ``RECORD`` + and the actually installed files. Each entry is a + ``(source_record, installed_record)`` pair. An empty list means + all files matched. Files in the ``scripts`` scheme are excluded + because their content may be modified by shebang rewriting. """ root_scheme = _process_WHEEL_file(source) # RECORD handling record_file_path = posixpath.join(source.dist_info_dir, "RECORD") written_records = [] + mismatches: list[tuple[RecordEntry, RecordEntry]] = [] + + # Remember the destination's default hash algorithm so we can use it as a + # fallback and restore it when we are done. + _orig_hash_algorithm = destination.hash_algorithm # Write the entry_points based scripts. if "entry_points.txt" in source.dist_info_filenames: @@ -121,12 +135,42 @@ def install( source=source, root_scheme=root_scheme, ) + # Prefer the source's hash algorithm, if compliant. This enables mismatch + # detection even when the wheel uses an algorithm other than the destination's + # default. + if ( + source_record.hash_ is not None + and source_record.hash_.name in _COMPLIANT_HASH_ALGORITHMS + ): + destination.hash_algorithm = source_record.hash_.name + else: + destination.hash_algorithm = _orig_hash_algorithm + record = destination.write_file( scheme=scheme, path=destination_path, stream=stream, is_executable=is_executable, ) + + # Compare the installed record against the wheel's RECORD. + # Scripts-scheme files are excluded because shebang rewriting + # may legitimately change content. + if scheme != "scripts": + hash_mismatch = ( + source_record.hash_ is not None + and record.hash_ is not None + and source_record.hash_.name == record.hash_.name + and source_record.hash_.value != record.hash_.value + ) + size_mismatch = ( + source_record.size is not None + and record.size is not None + and record.size != source_record.size + ) + if hash_mismatch or size_mismatch: + mismatches.append((source_record, record)) + written_records.append((scheme, record)) # Write all the installation-specific metadata @@ -148,3 +192,7 @@ def install( record_file_path=record_file_path, records=written_records, ) + + destination.hash_algorithm = _orig_hash_algorithm + + return mismatches diff --git a/src/installer/destinations.py b/src/installer/destinations.py index 71a8f9f..608ef57 100644 --- a/src/installer/destinations.py +++ b/src/installer/destinations.py @@ -34,6 +34,8 @@ class WheelDestination: (re)writing. """ + hash_algorithm: str = "sha256" + def write_script( self, name: str, module: str, attr: str, section: "ScriptSection" ) -> RecordEntry: diff --git a/tests/test_core.py b/tests/test_core.py index 121b9d3..0b9b1d2 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,3 +1,4 @@ +import base64 import hashlib import textwrap from io import BytesIO @@ -7,7 +8,7 @@ from installer import install from installer.exceptions import InvalidWheelSource -from installer.records import RecordEntry +from installer.records import Hash, RecordEntry from installer.sources import WheelSource @@ -15,7 +16,14 @@ # Helpers # -------------------------------------------------------------------------------------- def hash_and_size(data): - return hashlib.sha256(data).hexdigest(), len(data) + digest = hashlib.sha256(data).digest() + hash_ = base64.urlsafe_b64encode(digest).decode("ascii").rstrip("=") + return hash_, len(data) + + +def _re(path): + """Build expected RecordEntry matching on path only (hash/size lenient).""" + return RecordEntry(path, Hash("sha256", mock.ANY), mock.ANY) @pytest.fixture @@ -25,13 +33,16 @@ def mock_destination(): # A hacky approach to making sure we got the right objects going in. def custom_write_file(scheme, path, stream, is_executable): assert isinstance(stream, BytesIO) - return (path, scheme, 0) + data = stream.read() + hash_value, size = hash_and_size(data) + return RecordEntry(path, Hash("sha256", hash_value), size) def custom_write_script(name, module, attr, section): return (name, module, attr, section) retval.write_file.side_effect = custom_write_file retval.write_script.side_effect = custom_write_script + retval.hash_algorithm = "sha256" return retval @@ -214,21 +225,21 @@ def main(): records=[ ("scripts", ("fancy", "fancy", "main", "console")), ("scripts", ("fancy-gui", "fancy", "main", "gui")), - ("purelib", ("fancy/__init__.py", "purelib", 0)), - ("purelib", ("fancy/__main__.py", "purelib", 0)), - ("purelib", ("fancy-1.0.0.dist-info/METADATA", "purelib", 0)), - ("purelib", ("fancy-1.0.0.dist-info/WHEEL", "purelib", 0)), + ("purelib", _re("fancy/__init__.py")), + ("purelib", _re("fancy/__main__.py")), + ("purelib", _re("fancy-1.0.0.dist-info/METADATA")), + ("purelib", _re("fancy-1.0.0.dist-info/WHEEL")), ( "purelib", - ("fancy-1.0.0.dist-info/entry_points.txt", "purelib", 0), + _re("fancy-1.0.0.dist-info/entry_points.txt"), ), ( "purelib", - ("fancy-1.0.0.dist-info/top_level.txt", "purelib", 0), + _re("fancy-1.0.0.dist-info/top_level.txt"), ), ( "purelib", - ("fancy-1.0.0.dist-info/fun_file.txt", "purelib", 0), + _re("fancy-1.0.0.dist-info/fun_file.txt"), ), ( "purelib", @@ -330,17 +341,17 @@ def main(): scheme="purelib", record_file_path="fancy-1.0.0.dist-info/RECORD", records=[ - ("purelib", ("fancy/__init__.py", "purelib", 0)), - ("purelib", ("fancy/__main__.py", "purelib", 0)), - ("purelib", ("fancy-1.0.0.dist-info/METADATA", "purelib", 0)), - ("purelib", ("fancy-1.0.0.dist-info/WHEEL", "purelib", 0)), + ("purelib", _re("fancy/__init__.py")), + ("purelib", _re("fancy/__main__.py")), + ("purelib", _re("fancy-1.0.0.dist-info/METADATA")), + ("purelib", _re("fancy-1.0.0.dist-info/WHEEL")), ( "purelib", - ("fancy-1.0.0.dist-info/top_level.txt", "purelib", 0), + _re("fancy-1.0.0.dist-info/top_level.txt"), ), ( "purelib", - ("fancy-1.0.0.dist-info/fun_file.txt", "purelib", 0), + _re("fancy-1.0.0.dist-info/fun_file.txt"), ), ( "purelib", @@ -469,21 +480,21 @@ def main(): records=[ ("scripts", ("fancy", "fancy", "main", "console")), ("scripts", ("fancy-gui", "fancy", "main", "gui")), - ("platlib", ("fancy/__init__.py", "platlib", 0)), - ("platlib", ("fancy/__main__.py", "platlib", 0)), - ("platlib", ("fancy-1.0.0.dist-info/METADATA", "platlib", 0)), - ("platlib", ("fancy-1.0.0.dist-info/WHEEL", "platlib", 0)), + ("platlib", _re("fancy/__init__.py")), + ("platlib", _re("fancy/__main__.py")), + ("platlib", _re("fancy-1.0.0.dist-info/METADATA")), + ("platlib", _re("fancy-1.0.0.dist-info/WHEEL")), ( "platlib", - ("fancy-1.0.0.dist-info/entry_points.txt", "platlib", 0), + _re("fancy-1.0.0.dist-info/entry_points.txt"), ), ( "platlib", - ("fancy-1.0.0.dist-info/top_level.txt", "platlib", 0), + _re("fancy-1.0.0.dist-info/top_level.txt"), ), ( "platlib", - ("fancy-1.0.0.dist-info/fun_file.txt", "platlib", 0), + _re("fancy-1.0.0.dist-info/fun_file.txt"), ), ( "platlib", @@ -756,21 +767,21 @@ def test_handles_data_properly(self, mock_destination): records=[ ("scripts", ("fancy", "fancy", "main", "console")), ("scripts", ("fancy-gui", "fancy", "main", "gui")), - ("data", ("fancy/data.py", "data", 0)), - ("headers", ("fancy/headers.py", "headers", 0)), - ("platlib", ("fancy/platlib.py", "platlib", 0)), - ("purelib", ("fancy/purelib.py", "purelib", 0)), - ("scripts", ("fancy/scripts.py", "scripts", 0)), - ("purelib", ("fancy/__init__.py", "purelib", 0)), - ("purelib", ("fancy-1.0.0.dist-info/METADATA", "purelib", 0)), - ("purelib", ("fancy-1.0.0.dist-info/WHEEL", "purelib", 0)), + ("data", _re("fancy/data.py")), + ("headers", _re("fancy/headers.py")), + ("platlib", _re("fancy/platlib.py")), + ("purelib", _re("fancy/purelib.py")), + ("scripts", _re("fancy/scripts.py")), + ("purelib", _re("fancy/__init__.py")), + ("purelib", _re("fancy-1.0.0.dist-info/METADATA")), + ("purelib", _re("fancy-1.0.0.dist-info/WHEEL")), ( "purelib", - ("fancy-1.0.0.dist-info/entry_points.txt", "purelib", 0), + _re("fancy-1.0.0.dist-info/entry_points.txt"), ), ( "purelib", - ("fancy-1.0.0.dist-info/top_level.txt", "purelib", 0), + _re("fancy-1.0.0.dist-info/top_level.txt"), ), ( "purelib", @@ -872,9 +883,12 @@ def test_ensure_non_executable_for_additional_metadata(self, mock_destination): """, }, ) - all_contents = list(source.get_contents()) + all_contents = [ + (record, stream.read(), is_exec) + for record, stream, is_exec in source.get_contents() + ] source.get_contents = lambda: ( - (*contents, True) for (*contents, _) in all_contents + (record, BytesIO(data), True) for record, data, _ in all_contents ) install( source=source, @@ -987,3 +1001,193 @@ def test_skips_pycache_and_warns(self, mock_destination): assert sub_good_path in record_paths assert top_pycache_path not in record_paths assert sub_pycache_path not in record_paths + + def test_returns_empty_mismatches_when_hashes_match(self, mock_destination): + """When the wheel's RECORD matches installed content, return [].""" + source = FakeWheelSource( + distribution="fancy", + version="1.0.0", + regular_files={ + "fancy/__init__.py": b"""\ + def main(): + print("I'm a fancy package") + """, + }, + dist_info_files={ + "top_level.txt": b"""\ + fancy + """, + "WHEEL": b"""\ + Wheel-Version: 1.0 + Generator: magic (1.0.0) + Root-Is-Purelib: true + Tag: py3-none-any + """, + "METADATA": b"""\ + Metadata-Version: 2.1 + Name: fancy + Version: 1.0.0 + """, + }, + ) + + mismatches = install( + source=source, + destination=mock_destination, + additional_metadata={}, + ) + + assert mismatches == [] + + def test_returns_mismatches_for_corrupted_content(self, mock_destination): + """When installed content differs from the wheel's RECORD, report it.""" + source = FakeWheelSource( + distribution="fancy", + version="1.0.0", + regular_files={ + "fancy/__init__.py": b"""\ + def main(): + print("I'm a fancy package") + """, + }, + dist_info_files={ + "top_level.txt": b"""\ + fancy + """, + "WHEEL": b"""\ + Wheel-Version: 1.0 + Generator: magic (1.0.0) + Root-Is-Purelib: true + Tag: py3-none-any + """, + "METADATA": b"""\ + Metadata-Version: 2.1 + Name: fancy + Version: 1.0.0 + """, + }, + ) + + # Corrupt get_contents: inject different data than what RECORD expects. + original_get_contents = source.get_contents + + def corrupted_get_contents(): + for record, stream, is_exec in original_get_contents(): + if record[0] == "fancy/__init__.py": + yield record, BytesIO(b"CORRUPTED"), is_exec + else: + yield record, stream, is_exec + + source.get_contents = corrupted_get_contents + + mismatches = install( + source=source, + destination=mock_destination, + additional_metadata={}, + ) + + assert len(mismatches) == 1 + source_rec, written_rec = mismatches[0] + assert source_rec.path == "fancy/__init__.py" + assert source_rec.hash_ != written_rec.hash_ + + def test_no_mismatch_when_source_has_no_hash(self, mock_destination): + """Files without a hash in the wheel's RECORD cannot be compared.""" + source = FakeWheelSource( + distribution="fancy", + version="1.0.0", + regular_files={ + "fancy/__init__.py": b"""\ + def main(): + print("I'm a fancy package") + """, + }, + dist_info_files={ + "top_level.txt": b"""\ + fancy + """, + "WHEEL": b"""\ + Wheel-Version: 1.0 + Generator: magic (1.0.0) + Root-Is-Purelib: true + Tag: py3-none-any + """, + "METADATA": b"""\ + Metadata-Version: 2.1 + Name: fancy + Version: 1.0.0 + """, + }, + ) + + # Strip hash info from all records. + original_get_contents = source.get_contents + + def no_hash_get_contents(): + for record, stream, is_exec in original_get_contents(): + yield (record[0], "", ""), stream, is_exec + + source.get_contents = no_hash_get_contents + + mismatches = install( + source=source, + destination=mock_destination, + additional_metadata={}, + ) + + assert mismatches == [] + + def test_skips_mismatch_check_for_non_compliant_algorithm(self, mock_destination): + """A non-compliant hash algorithm (e.g. md5) is ignored for comparison.""" + source = FakeWheelSource( + distribution="fancy", + version="1.0.0", + regular_files={ + "fancy/__init__.py": b"""\ + def main(): + print("I'm a fancy package") + """, + }, + dist_info_files={ + "top_level.txt": b"""\ + fancy + """, + "WHEEL": b"""\ + Wheel-Version: 1.0 + Generator: magic (1.0.0) + Root-Is-Purelib: true + Tag: py3-none-any + """, + "METADATA": b"""\ + Metadata-Version: 2.1 + Name: fancy + Version: 1.0.0 + """, + }, + ) + + # Replace the sha256 record with a *wrong* md5 hash. + original_get_contents = source.get_contents + + def wrong_md5_get_contents(): + for record, stream, is_exec in original_get_contents(): + if record[0] == "fancy/__init__.py": + data = stream.read() + yield ( + (record[0], "md5=AAAAAAAAAAAAAAAAAAAAAA", str(len(data))), + BytesIO(data), + is_exec, + ) + else: + yield record, stream, is_exec + + source.get_contents = wrong_md5_get_contents + + mismatches = install( + source=source, + destination=mock_destination, + additional_metadata={}, + ) + + # md5 is non-compliant, so no comparison is performed. + assert mismatches == []