From 20c34a4867c2756e3da4aeceea3559f2f45125b2 Mon Sep 17 00:00:00 2001 From: Haricharanpanjwani Date: Sat, 18 Apr 2026 15:06:36 -0700 Subject: [PATCH] ci: improve artifact verification flow --- scripts/apache_release.py | 58 +- scripts/verify_apache_artifacts.py | 1137 ++++++++++++++++++------- tests/test_verify_apache_artifacts.py | 254 ++++++ 3 files changed, 1112 insertions(+), 337 deletions(-) create mode 100644 tests/test_verify_apache_artifacts.py diff --git a/scripts/apache_release.py b/scripts/apache_release.py index fe6dfb86c..27b463820 100644 --- a/scripts/apache_release.py +++ b/scripts/apache_release.py @@ -38,6 +38,7 @@ import shutil import subprocess import sys +import tempfile from typing import NoReturn, Optional # --- Configuration --- @@ -422,6 +423,9 @@ def _build_sdist_from_git(version: str, output_dir: str = "dist") -> str: env = os.environ.copy() env["FLIT_USE_VCS"] = "0" + source_epoch = _source_date_epoch(version, output_dir) + if source_epoch is not None: + env["SOURCE_DATE_EPOCH"] = str(source_epoch) _run_command( ["flit", "build", "--format", "sdist"], description="Running flit build --format sdist...", @@ -451,6 +455,14 @@ def _build_sdist_from_git(version: str, output_dir: str = "dist") -> str: return apache_sdist +def _source_date_epoch(version: str, output_dir: str = "dist") -> Optional[int]: + """Use the source archive timestamp when available so local rebuilds are comparable.""" + source_archive = os.path.join(output_dir, f"apache-burr-{version}-incubating-src.tar.gz") + if os.path.exists(source_archive): + return int(os.path.getmtime(source_archive)) + return None + + # ============================================================================ # Step 3: Build Wheel (SIMPLIFIED!) 
# ============================================================================ @@ -503,14 +515,15 @@ def _build_ui_artifacts() -> None: _fail(f"UI build directory is empty: {ui_build_dir}") -def _prepare_wheel_contents() -> tuple[bool, bool, Optional[str]]: - """Handle burr/examples symlink: replace with real files for wheel.""" +def _prepare_wheel_contents() -> tuple[bool, bool, Optional[str], list[tuple[str, str]]]: + """Prepare wheel contents and temporarily remove files excluded from the sdist.""" burr_examples_dir = "burr/examples" source_examples_dir = "examples" + removed_files: list[tuple[str, str]] = [] if not os.path.exists(source_examples_dir): print(f" ⚠️ {source_examples_dir} not found") - return (False, False, None) + return (False, False, None, removed_files) # Check if burr/examples is a symlink (should be in dev repo) was_symlink = False @@ -547,11 +560,25 @@ def _prepare_wheel_contents() -> tuple[bool, bool, Optional[str]]: shutil.copytree(src_path, dest_path, dirs_exist_ok=True) print(f" ✓ Copied {example_dir}") - return (True, was_symlink, symlink_target) - - -def _cleanup_wheel_contents(was_symlink: bool, symlink_target: Optional[str]) -> None: - """Restore burr/examples symlink after wheel build.""" + # Keep wheel contents aligned with the sdist so rebuild verification compares like-for-like artifacts. 
+ excluded_wheel_files = [ + "burr/tracking/server/s3/deployment/terraform/.gitignore", + ] + for path in excluded_wheel_files: + if os.path.exists(path): + backup_dir = tempfile.mkdtemp(prefix="apache-release-wheel-") + backup_path = os.path.join(backup_dir, os.path.basename(path)) + os.replace(path, backup_path) + removed_files.append((path, backup_path)) + print(f" ✓ Temporarily excluded {path}") + + return (True, was_symlink, symlink_target, removed_files) + + +def _cleanup_wheel_contents( + was_symlink: bool, symlink_target: Optional[str], removed_files: list[tuple[str, str]] +) -> None: + """Restore temporary wheel-build changes after the wheel build finishes.""" burr_examples_dir = "burr/examples" if os.path.exists(burr_examples_dir): @@ -562,6 +589,14 @@ def _cleanup_wheel_contents(was_symlink: bool, symlink_target: Optional[str]) -> os.symlink(symlink_target, burr_examples_dir) print(" ✓ Symlink restored") + for original_path, backup_path in removed_files: + if os.path.exists(backup_path): + os.replace(backup_path, original_path) + print(f" Restored {original_path}") + backup_dir = os.path.dirname(backup_path) + if os.path.isdir(backup_dir): + shutil.rmtree(backup_dir) + def _build_wheel_from_current_dir(version: str, output_dir: str = "dist") -> str: """Build wheel from current directory (matches what voters do). 
@@ -575,13 +610,16 @@ def _build_wheel_from_current_dir(version: str, output_dir: str = "dist") -> str _build_ui_artifacts() _print_step(2, 3, "Preparing wheel contents") - copied, was_symlink, symlink_target = _prepare_wheel_contents() + copied, was_symlink, symlink_target, removed_files = _prepare_wheel_contents() _print_step(3, 3, "Building wheel with flit") try: env = os.environ.copy() env["FLIT_USE_VCS"] = "0" + source_epoch = _source_date_epoch(version, output_dir) + if source_epoch is not None: + env["SOURCE_DATE_EPOCH"] = str(source_epoch) _run_command( ["flit", "build", "--format", "wheel"], @@ -606,7 +644,7 @@ def _build_wheel_from_current_dir(version: str, output_dir: str = "dist") -> str finally: # Always restore symlinks if copied: - _cleanup_wheel_contents(was_symlink, symlink_target) + _cleanup_wheel_contents(was_symlink, symlink_target, removed_files) def _verify_wheel(wheel_path: str) -> bool: diff --git a/scripts/verify_apache_artifacts.py b/scripts/verify_apache_artifacts.py index 54350deb5..eb32b0238 100755 --- a/scripts/verify_apache_artifacts.py +++ b/scripts/verify_apache_artifacts.py @@ -20,65 +20,216 @@ Apache Artifacts Verification Script Comprehensive verification tool for Apache release artifacts. -Checks signatures, checksums, licenses, and archive integrity. +Checks signatures, checksums, archive integrity, license metadata, +reproducible rebuilds, and Apache RAT results. Usage: - # List contents of an artifact - python scripts/verify_apache_artifacts.py list-contents dist/apache-burr-0.41.0-incubating-src.tar.gz - - # Verify signatures and checksums + python scripts/verify_apache_artifacts.py --help + Show the available verification commands and flags. 
python scripts/verify_apache_artifacts.py signatures - - # Verify licenses with Apache RAT - python scripts/verify_apache_artifacts.py licenses --rat-jar path/to/apache-rat.jar - - # Verify everything - python scripts/verify_apache_artifacts.py all --rat-jar path/to/apache-rat.jar - - # Specify custom artifacts directory - python scripts/verify_apache_artifacts.py signatures --artifacts-dir /path/to/dist + Verify detached GPG signatures, SHA512 checksums, and basic archive readability. + python scripts/verify_apache_artifacts.py artifacts + Verify required LICENSE/NOTICE/DISCLAIMER files in release artifacts. + python scripts/verify_apache_artifacts.py licenses --rat-jar /path/to/apache-rat.jar + Run Apache RAT and validate license-report results for extracted tarball contents. + python scripts/verify_apache_artifacts.py reproducible + Rebuild from the release source artifact and compare rebuilt outputs to release artifacts. + python scripts/verify_apache_artifacts.py all --rat-jar /path/to/apache-rat.jar --vote-email + Run the full verification flow and optionally render a vote email draft from the results. 
""" +from __future__ import annotations + import argparse import glob import hashlib +import importlib.util import os +import re import shutil import subprocess import sys import tarfile import tempfile +import textwrap import xml.etree.ElementTree as ET import zipfile +from dataclasses import dataclass, field +from pathlib import Path, PurePosixPath # Configuration PROJECT_SHORT_NAME = "burr" +REQUIRED_TEXT_FILES = ("LICENSE", "NOTICE", "DISCLAIMER") +WHEEL_LICENSE_FILES = ("LICENSE-wheel",) +WHEEL_REQUIRED_TEXT_FILES = ("NOTICE", "DISCLAIMER") + WHEEL_LICENSE_FILES +PASS = "PASS" +FAIL = "FAIL" +SKIP = "SKIP" + + +@dataclass +class CheckResult: + name: str + status: str + details: str = "" + + +@dataclass +class VerificationSummary: + results: list[CheckResult] = field(default_factory=list) + + def record(self, name: str, status: str, details: str = "") -> None: + self.results.append(CheckResult(name=name, status=status, details=details)) + + def pass_(self, name: str, details: str = "") -> None: + self.record(name, PASS, details) + + def fail(self, name: str, details: str = "") -> None: + self.record(name, FAIL, details) + + def skip(self, name: str, details: str = "") -> None: + self.record(name, SKIP, details) + + @property + def ok(self) -> bool: + return all(result.status != FAIL for result in self.results) + + def render(self) -> str: + lines = ["Results:"] + if not self.results: + lines.append(" (no checks executed)") + return "\n".join(lines) + + width = max(len(result.name) for result in self.results) + for result in self.results: + symbol = {"PASS": "✅", "FAIL": "❌", "SKIP": "⊘"}[result.status] + line = f" {result.name:<{width}} {symbol} {result.status}" + if result.details: + line += f" {result.details}" + lines.append(line) + return "\n".join(lines) def _fail(message: str) -> None: - """Print error message and exit.""" print(f"\n❌ {message}") sys.exit(1) def _print_section(title: str) -> None: - """Print formatted section header.""" print("\n" + "=" * 
80) print(f" {title}") print("=" * 80 + "\n") -# ============================================================================ -# Signature and Checksum Verification -# ============================================================================ +def _sha512_for_file(path: str) -> str: + sha512_hash = hashlib.sha512() + with open(path, "rb") as handle: + while chunk := handle.read(65536): + sha512_hash.update(chunk) + return sha512_hash.hexdigest() + + +def _read_expected_text_files() -> dict[str, bytes]: + project_root = Path(__file__).resolve().parent.parent + expected: dict[str, bytes] = {} + for filename in (*REQUIRED_TEXT_FILES, *WHEEL_LICENSE_FILES): + expected[filename] = (project_root / filename).read_bytes() + return expected + + +def _artifact_files(artifacts_dir: str) -> list[str]: + all_files = [ + name + for name in os.listdir(artifacts_dir) + if os.path.isfile(os.path.join(artifacts_dir, name)) + ] + return sorted( + name + for name in all_files + if not name.endswith((".asc", ".sha512")) + and not name.startswith("rat-report-") + ) + + +def _top_level_prefix(paths: list[str]) -> str | None: + parts = [PurePosixPath(path).parts for path in paths if path] + if not parts: + return None + prefix = parts[0][0] + if all(item and item[0] == prefix for item in parts): + return prefix + return None + + +def _normalize_archive_member_names(names: list[str]) -> dict[str, str]: + prefix = _top_level_prefix(names) + normalized = {} + for name in names: + pure_name = PurePosixPath(name) + relative_parts = pure_name.parts[1:] if prefix and pure_name.parts[:1] == (prefix,) else pure_name.parts + normalized_name = str(PurePosixPath(*relative_parts)) if relative_parts else "" + normalized[normalized_name] = name + return normalized + + +def _tar_file_bytes(artifact_path: str) -> dict[str, bytes]: + with tarfile.open(artifact_path, "r:gz") as tar: + file_members = [member for member in tar.getmembers() if member.isfile()] + mapping = 
_normalize_archive_member_names([member.name for member in file_members]) + contents: dict[str, bytes] = {} + for member in file_members: + normalized_name = next( + normalized + for normalized, original in mapping.items() + if original == member.name + ) + extracted = tar.extractfile(member) + if extracted is None: + continue + contents[normalized_name] = extracted.read() + return contents + + +def _wheel_file_bytes(artifact_path: str) -> dict[str, bytes]: + with zipfile.ZipFile(artifact_path, "r") as wheel: + return {name: wheel.read(name) for name in wheel.namelist() if not name.endswith("/")} + +def _find_files_by_basename(file_bytes: dict[str, bytes], basename: str) -> list[str]: + matches = [] + for path in file_bytes: + if PurePosixPath(path).name == basename: + matches.append(path) + return sorted(matches) -def _verify_artifact_signature(artifact_path: str, signature_path: str) -> bool: - """Verify GPG signature of artifact.""" + +def _verify_artifact_exists(artifact_path: str, summary: VerificationSummary, min_size: int = 1000) -> bool: + name = f"Artifact exists: {os.path.basename(artifact_path)}" + if not os.path.exists(artifact_path): + print(f" ✗ Artifact not found: {os.path.basename(artifact_path)}") + summary.fail(name, "missing file") + return False + + file_size = os.path.getsize(artifact_path) + if file_size < min_size: + print( + f" ✗ Artifact is suspiciously small ({file_size} bytes): {os.path.basename(artifact_path)}" + ) + summary.fail(name, f"size {file_size} bytes") + return False + + print(f" ✓ Artifact exists: {os.path.basename(artifact_path)} ({file_size:,} bytes)") + summary.pass_(name, f"{file_size:,} bytes") + return True + + +def _verify_artifact_signature(artifact_path: str, signature_path: str, summary: VerificationSummary) -> bool: + check_name = f"GPG signature: {os.path.basename(artifact_path)}" print(f" Verifying GPG signature: {os.path.basename(signature_path)}") if not os.path.exists(signature_path): print(" ✗ Signature file 
not found") + summary.fail(check_name, "missing .asc") return False try: @@ -87,131 +238,200 @@ def _verify_artifact_signature(artifact_path: str, signature_path: str) -> bool: capture_output=True, check=False, ) - if result.returncode == 0: - print(" ✓ GPG signature is valid") - return True - else: - print(" ✗ GPG signature verification failed") - if result.stderr: - print(f" Error: {result.stderr.decode()}") - return False - except subprocess.CalledProcessError: - print(" ✗ Error running GPG") + except OSError as exc: + print(f" ✗ Error running GPG: {exc}") + summary.fail(check_name, f"gpg unavailable: {exc}") return False + if result.returncode == 0: + print(" ✓ GPG signature is valid") + summary.pass_(check_name) + return True + + print(" ✗ GPG signature verification failed") + if result.stderr: + print(f" Error: {result.stderr.decode()}") + summary.fail(check_name, result.stderr.decode().strip() or "verification failed") + return False + -def _verify_artifact_checksum(artifact_path: str, checksum_path: str) -> bool: - """Verify SHA512 checksum of artifact.""" +def _verify_artifact_checksum( + artifact_path: str, + checksum_path: str, + summary: VerificationSummary, +) -> bool: + check_name = f"SHA512 checksum: {os.path.basename(artifact_path)}" print(f" Verifying SHA512 checksum: {os.path.basename(checksum_path)}") if not os.path.exists(checksum_path): print(" ✗ Checksum file not found") + summary.fail(check_name, "missing .sha512") return False - # Read expected checksum - with open(checksum_path, "r", encoding="utf-8") as f: - expected_checksum = f.read().strip().split()[0] - - # Calculate actual checksum - sha512_hash = hashlib.sha512() - with open(artifact_path, "rb") as f: - while chunk := f.read(65536): - sha512_hash.update(chunk) - - actual_checksum = sha512_hash.hexdigest() + with open(checksum_path, "r", encoding="utf-8") as handle: + expected_checksum = handle.read().strip().split()[0] + actual_checksum = _sha512_for_file(artifact_path) if 
actual_checksum == expected_checksum: print(" ✓ SHA512 checksum is valid") + summary.pass_(check_name) return True - else: - print(" ✗ SHA512 checksum mismatch!") - print(f" Expected: {expected_checksum}") - print(f" Actual: {actual_checksum}") - return False + + print(" ✗ SHA512 checksum mismatch!") + print(f" Expected: {expected_checksum}") + print(f" Actual: {actual_checksum}") + summary.fail(check_name, "checksum mismatch") + return False -def _verify_tar_gz_readable(artifact_path: str) -> bool: - """Verify tar.gz archive can be read and contains files.""" +def _verify_tar_gz_readable(artifact_path: str, summary: VerificationSummary) -> bool: + check_name = f"Readable tar.gz: {os.path.basename(artifact_path)}" print(f" Checking archive readability: {os.path.basename(artifact_path)}") try: with tarfile.open(artifact_path, "r:gz") as tar: members = tar.getmembers() - - if len(members) == 0: + if not members: print(" ✗ Archive is empty (no files)") + summary.fail(check_name, "archive is empty") return False - print(f" ✓ Archive is readable and contains {len(members)} files") + summary.pass_(check_name, f"{len(members)} members") return True - except tarfile.TarError as e: - print(f" ✗ Archive is corrupted or unreadable: {e}") + except tarfile.TarError as exc: + print(f" ✗ Archive is corrupted or unreadable: {exc}") + summary.fail(check_name, str(exc)) return False - except Exception as e: - print(f" ✗ Error reading archive: {e}") + except Exception as exc: # pragma: no cover - defensive + print(f" ✗ Error reading archive: {exc}") + summary.fail(check_name, str(exc)) return False -def _verify_wheel_readable(wheel_path: str) -> bool: - """Verify wheel can be read and contains expected structure.""" +def _verify_wheel_readable(wheel_path: str, summary: VerificationSummary) -> bool: + check_name = f"Readable wheel: {os.path.basename(wheel_path)}" print(f" Checking wheel readability: {os.path.basename(wheel_path)}") try: - with zipfile.ZipFile(wheel_path, "r") as whl: 
- file_list = whl.namelist() - - if len(file_list) == 0: + with zipfile.ZipFile(wheel_path, "r") as wheel: + file_list = wheel.namelist() + if not file_list: print(" ✗ Wheel is empty (no files)") + summary.fail(check_name, "wheel is empty") return False - # Check for metadata - metadata_files = [f for f in file_list if "METADATA" in f or "WHEEL" in f] + metadata_files = [name for name in file_list if "METADATA" in name or "WHEEL" in name] if not metadata_files: print(" ✗ Wheel missing required metadata files") + summary.fail(check_name, "missing METADATA/WHEEL") return False print(f" ✓ Wheel is readable and contains {len(file_list)} files") + summary.pass_(check_name, f"{len(file_list)} members") return True - except zipfile.BadZipFile: + except zipfile.BadZipFile as exc: print(" ✗ Wheel is corrupted or not a valid ZIP file") + summary.fail(check_name, str(exc)) return False - except Exception as e: - print(f" ✗ Error reading wheel: {e}") + except Exception as exc: # pragma: no cover - defensive + print(f" ✗ Error reading wheel: {exc}") + summary.fail(check_name, str(exc)) return False -def _verify_artifact_exists(artifact_path: str, min_size: int = 1000) -> bool: - """Verify artifact exists and has reasonable size.""" - if not os.path.exists(artifact_path): - print(f" ✗ Artifact not found: {os.path.basename(artifact_path)}") - return False +def _verify_required_text_files( + artifact_name: str, + file_bytes: dict[str, bytes], + required_files: tuple[str, ...], + expected_files: dict[str, bytes], + summary: VerificationSummary, +) -> bool: + all_valid = True + for required_name in required_files: + check_name = f"{artifact_name} contains {required_name}" + matches = _find_files_by_basename(file_bytes, required_name) + if not matches: + print(f" ✗ Missing {required_name}") + summary.fail(check_name, "missing") + all_valid = False + continue - file_size = os.path.getsize(artifact_path) - if file_size < min_size: - print( - f" ✗ Artifact is suspiciously small 
({file_size} bytes): {os.path.basename(artifact_path)}" - ) + matched_path = matches[0] + if file_bytes[matched_path] != expected_files[required_name]: + print(f" ✗ {required_name} content mismatch ({matched_path})") + summary.fail(check_name, f"content mismatch at {matched_path}") + all_valid = False + continue + + print(f" ✓ {required_name} present and matches repository copy ({matched_path})") + summary.pass_(check_name, matched_path) + return all_valid + + +def verify_artifact_contents(artifacts_dir: str, summary: VerificationSummary | None = None) -> bool: + _print_section("Verifying Artifact Metadata Files") + + if summary is None: + summary = VerificationSummary() + + if not os.path.exists(artifacts_dir): + _fail(f"Artifacts directory not found: {artifacts_dir}") + + artifacts = _artifact_files(artifacts_dir) + if not artifacts: + print(f"⚠️ No artifacts found in {artifacts_dir}") + summary.fail("Artifact metadata verification", "no artifacts found") return False - print(f" ✓ Artifact exists: {os.path.basename(artifact_path)} ({file_size:,} bytes)") - return True + expected_files = _read_expected_text_files() + all_valid = True + for artifact_name in artifacts: + artifact_path = os.path.join(artifacts_dir, artifact_name) + print(f"Inspecting metadata files: {artifact_name}") + print("-" * 80) + + if artifact_name.endswith(".tar.gz"): + file_bytes = _tar_file_bytes(artifact_path) + if not _verify_required_text_files( + artifact_name, + file_bytes, + REQUIRED_TEXT_FILES, + expected_files, + summary, + ): + all_valid = False + elif artifact_name.endswith(".whl"): + file_bytes = _wheel_file_bytes(artifact_path) + if not _verify_required_text_files( + artifact_name, + file_bytes, + WHEEL_REQUIRED_TEXT_FILES, + expected_files, + summary, + ): + all_valid = False + else: + print(f" ⚠️ Skipping unsupported artifact type: {artifact_name}") + summary.skip(f"Artifact metadata: {artifact_name}", "unsupported type") + print() + + return all_valid -def 
verify_signatures(artifacts_dir: str) -> bool: - """Verify all signatures and checksums in artifacts directory.""" + +def verify_signatures(artifacts_dir: str, summary: VerificationSummary | None = None) -> bool: _print_section("Verifying Signatures and Checksums") + if summary is None: + summary = VerificationSummary() + if not os.path.exists(artifacts_dir): _fail(f"Artifacts directory not found: {artifacts_dir}") - # Find all artifacts (exclude .asc and .sha512 files) - all_files = [ - f for f in os.listdir(artifacts_dir) if os.path.isfile(os.path.join(artifacts_dir, f)) - ] - artifacts = [f for f in all_files if not f.endswith((".asc", ".sha512"))] - + artifacts = _artifact_files(artifacts_dir) if not artifacts: print(f"⚠️ No artifacts found in {artifacts_dir}") + summary.fail("Signature verification", "no artifacts found") return False print(f"Found {len(artifacts)} artifact(s) to verify:\n") @@ -219,31 +439,24 @@ def verify_signatures(artifacts_dir: str) -> bool: all_valid = True for artifact_name in artifacts: artifact_path = os.path.join(artifacts_dir, artifact_name) - print(f"Verifying: {artifact_name}") print("-" * 80) - # Check existence and size - if not _verify_artifact_exists(artifact_path): + if not _verify_artifact_exists(artifact_path, summary): all_valid = False continue - # Verify signature - signature_path = f"{artifact_path}.asc" - if not _verify_artifact_signature(artifact_path, signature_path): + if not _verify_artifact_signature(artifact_path, f"{artifact_path}.asc", summary): all_valid = False - # Verify checksum - checksum_path = f"{artifact_path}.sha512" - if not _verify_artifact_checksum(artifact_path, checksum_path): + if not _verify_artifact_checksum(artifact_path, f"{artifact_path}.sha512", summary): all_valid = False - # Verify archive/wheel structure if artifact_name.endswith(".tar.gz"): - if not _verify_tar_gz_readable(artifact_path): + if not _verify_tar_gz_readable(artifact_path, summary): all_valid = False elif 
artifact_name.endswith(".whl"): - if not _verify_wheel_readable(artifact_path): + if not _verify_wheel_readable(artifact_path, summary): all_valid = False print() @@ -251,29 +464,89 @@ def verify_signatures(artifacts_dir: str) -> bool: return all_valid -# ============================================================================ -# License Verification (Apache RAT) -# ============================================================================ +def _safe_extract_tar(tar_handle: tarfile.TarFile, extract_dir: str) -> None: + try: + tar_handle.extractall(extract_dir, filter="data") + except TypeError: + tar_handle.extractall(extract_dir) + + +def _build_rat_command( + rat_jar_path: str, + extract_dir: str, + rat_excludes: str | None, + output_style: str | None = None, +) -> list[str]: + command = ["java", "-jar", rat_jar_path] + if output_style: + command.extend(["--output-style", output_style]) + if rat_excludes: + command.extend(["--input-exclude-file", rat_excludes]) + command.extend(["--", extract_dir]) + return command + + +def _rat_scan_target(extract_dir: str) -> tuple[str, str]: + extracted_root = Path(extract_dir) + entries = [entry for entry in extracted_root.iterdir()] + if len(entries) == 1 and entries[0].is_dir(): + return str(extracted_root), entries[0].name + return str(extracted_root), "." 
+
+
+def _load_rat_xml_root(rat_report_xml: str) -> ET.Element:
+    raw_xml = Path(rat_report_xml).read_text(encoding="utf-8")
+    start_tag = "<rat-report"
+    end_tag = "</rat-report>"
+    xml_start = raw_xml.find(start_tag)
+    xml_end = raw_xml.rfind(end_tag)
+    if xml_start == -1 or xml_end == -1:
+        raise ValueError("RAT XML report is missing the <rat-report> tag")
+
+    xml_content = raw_xml[xml_start : xml_end + len(end_tag)].strip()
+    if not xml_content:
+        raise ValueError("RAT XML report is empty")
+
+    return ET.fromstring(xml_content)
+
+
+def _rat_license_state(resource: ET.Element) -> tuple[str, str]:
+    approval = resource.find("license-approval")
+    family = resource.find("license-family")
+    if approval is not None or family is not None:
+        license_approval = approval.get("name", "true") if approval is not None else "true"
+        license_family = family.get("name", "") if family is not None else ""
+        return license_approval, license_family
+
+    license_elem = resource.find("license")
+    if license_elem is not None:
+        license_approval = license_elem.get("approval", "true")
+        license_family = license_elem.get("family", "") or license_elem.get("name", "")
+        return license_approval, license_family
+
+    return "true", ""
 
 
 def _check_licenses_with_rat(
     artifact_path: str,
     rat_jar_path: str,
     report_name: str,
+    summary: VerificationSummary,
     report_only: bool = False,
 ) -> bool:
-    """Run Apache RAT license checker on artifact."""
+    check_name = f"Apache RAT: {os.path.basename(artifact_path)}"
     print(f"\nRunning Apache RAT on: {os.path.basename(artifact_path)}")
     print("-" * 80)
 
-    # Create reports directory
     report_dir = "dist"
     os.makedirs(report_dir, exist_ok=True)
     rat_report_xml = os.path.join(report_dir, f"rat-report-{report_name}.xml")
     rat_report_txt = os.path.join(report_dir, f"rat-report-{report_name}.txt")
 
-    # Extract archive to temp directory
     with tempfile.TemporaryDirectory() as temp_dir:
         extract_dir = os.path.join(temp_dir, "extracted")
         os.makedirs(extract_dir)
@@ -281,100 +554,74 @@ def _check_licenses_with_rat(
         print("   Extracting archive...")
         try:
             with tarfile.open(artifact_path, "r:gz") as tar:
-                # Use data filter for Python 3.12+ to avoid deprecation warning
-                
tar.extractall(extract_dir, filter="data") + _safe_extract_tar(tar, extract_dir) print(" ✓ Extracted to temp directory") - except Exception as e: - print(f" ✗ Error extracting archive: {e}") + except Exception as exc: + print(f" ✗ Error extracting archive: {exc}") + summary.fail(check_name, f"extract failed: {exc}") return False - # Locate .rat-excludes file rat_excludes = ".rat-excludes" if not os.path.exists(rat_excludes): print(f" ⚠️ Warning: {rat_excludes} not found, running without excludes") rat_excludes = None + else: + rat_excludes = os.path.abspath(rat_excludes) + + rat_cwd, rat_target = _rat_scan_target(extract_dir) - # Run RAT with XML output print(" Running Apache RAT (XML format for parsing)...") - rat_cmd_xml = [ - "java", - "-jar", + rat_cmd_xml = _build_rat_command( rat_jar_path, - "-x", # XML output - "-d", - extract_dir, - ] - if rat_excludes: - rat_cmd_xml.extend(["-E", rat_excludes]) + rat_target, + rat_excludes, + output_style="xml", + ) try: with open(rat_report_xml, "w", encoding="utf-8") as report_file: result = subprocess.run( rat_cmd_xml, + cwd=rat_cwd, stdout=report_file, stderr=subprocess.PIPE, text=True, check=False, ) - if result.returncode != 0: print(f" ⚠️ RAT exited with code {result.returncode}") - print(f" ✓ RAT XML report: {rat_report_xml}") - except Exception as e: - print(f" ✗ Error running RAT (XML): {e}") + except Exception as exc: + print(f" ✗ Error running RAT (XML): {exc}") + summary.fail(check_name, f"RAT execution failed: {exc}") return False - # Run RAT with plain text output print(" Running Apache RAT (text format for review)...") - rat_cmd_txt = [ - "java", - "-jar", - rat_jar_path, - "-d", - extract_dir, - ] - if rat_excludes: - rat_cmd_txt.extend(["-E", rat_excludes]) + rat_cmd_txt = _build_rat_command(rat_jar_path, rat_target, rat_excludes) try: with open(rat_report_txt, "w", encoding="utf-8") as report_file: subprocess.run( rat_cmd_txt, + cwd=rat_cwd, stdout=report_file, stderr=subprocess.PIPE, text=True, 
check=False, ) print(f" ✓ RAT text report: {rat_report_txt}") - except Exception as e: - print(f" ⚠️ Warning: Could not generate text report: {e}") + except Exception as exc: + print(f" ⚠️ Warning: Could not generate text report: {exc}") - # Parse XML report print(" Parsing RAT report...") try: - tree = ET.parse(rat_report_xml) - root = tree.getroot() - - # Find license issues + root = _load_rat_xml_root(rat_report_xml) unapproved_licenses = [] unknown_licenses = [] for resource in root.findall(".//resource"): name = resource.get("name", "unknown") - - # Get license approval and family from child elements - license_approval_elem = resource.find("license-approval") - license_family_elem = resource.find("license-family") - - license_approval = ( - license_approval_elem.get("name", "true") - if license_approval_elem is not None - else "true" - ) - license_family = ( - license_family_elem.get("name", "") if license_family_elem is not None else "" - ) + license_approval, license_family = _rat_license_state(resource) if license_approval == "false" or license_family == "Unknown license": if license_family == "Unknown license" or not license_family: @@ -382,59 +629,54 @@ def _check_licenses_with_rat( else: unapproved_licenses.append(name) - # Report findings - total_files = len(root.findall(".//resource")) issues_count = len(unapproved_licenses) + len(unknown_licenses) - + total_files = len(root.findall(".//resource")) print(f" ✓ Scanned {total_files} files") print(f" ✓ Found {issues_count} files with license issues") - if issues_count > 0: - print("\n ⚠️ License Issues Found:") - - if unknown_licenses: - print(f"\n Unknown/Missing Licenses ({len(unknown_licenses)} files):") - for file in unknown_licenses[:10]: - print(f" - {file}") - if len(unknown_licenses) > 10: - print(f" ... 
and {len(unknown_licenses) - 10} more") - - if unapproved_licenses: - print(f"\n Unapproved Licenses ({len(unapproved_licenses)} files):") - for file in unapproved_licenses[:10]: - print(f" - {file}") - if len(unapproved_licenses) > 10: - print(f" ... and {len(unapproved_licenses) - 10} more") - - print("\n 📄 Reports saved:") - print(f" - {rat_report_xml} (structured)") - print(f" - {rat_report_txt} (human-readable)") - - if report_only: - print("\n ℹ️ Report-only mode: continuing despite license issues") - return True - else: - print("\n ❌ License check failed!") - return False - else: + if issues_count == 0: print(" ✅ All files have approved licenses") - print("\n 📄 Reports saved:") - print(f" - {rat_report_xml} (structured)") - print(f" - {rat_report_txt} (human-readable)") + summary.pass_(check_name, "no RAT issues") + return True + + if unknown_licenses: + print(f"\n Unknown/Missing Licenses ({len(unknown_licenses)} files):") + for file_name in unknown_licenses[:10]: + print(f" - {file_name}") + if unapproved_licenses: + print(f"\n Unapproved Licenses ({len(unapproved_licenses)} files):") + for file_name in unapproved_licenses[:10]: + print(f" - {file_name}") + + if report_only: + print("\n ℹ️ Report-only mode: continuing despite license issues") + summary.pass_(check_name, f"report-only with {issues_count} issue(s)") return True - except Exception as e: - print(f" ✗ Error parsing RAT report: {e}") + print("\n ❌ License check failed!") + summary.fail(check_name, f"{issues_count} RAT issue(s)") + return False + except Exception as exc: + print(f" ✗ Error parsing RAT report: {exc}") if report_only: print(" ℹ️ Report-only mode: continuing despite parse error") + summary.pass_(check_name, "report-only despite parse error") return True + summary.fail(check_name, f"report parse failed: {exc}") return False -def verify_licenses(artifacts_dir: str, rat_jar_path: str, report_only: bool = False) -> bool: - """Verify licenses in all tar.gz artifacts using Apache RAT.""" 
def verify_licenses(
    artifacts_dir: str,
    rat_jar_path: str,
    summary: VerificationSummary | None = None,
    report_only: bool = False,
) -> bool:
    """Run Apache RAT over every tar.gz artifact and record results in *summary*.

    Returns True only when every archive passes the RAT license check (or
    *report_only* is set, in which case issues are reported but tolerated).
    """
    _print_section("Verifying Licenses with Apache RAT")

    if summary is None:
        summary = VerificationSummary()

    if not os.path.exists(artifacts_dir):
        _fail(f"Artifacts directory not found: {artifacts_dir}")

    if not os.path.exists(rat_jar_path):
        _fail(
            f"Apache RAT JAR not found: {rat_jar_path}\nDownload from: https://creadur.apache.org/rat/download_rat.cgi"
        )

    # RAT is a Java tool; fail fast with a clear message when java is absent.
    if shutil.which("java") is None:
        _fail("Java not found. Required for Apache RAT.")

    tar_artifacts = [name for name in _artifact_files(artifacts_dir) if name.endswith(".tar.gz")]
    if not tar_artifacts:
        print(f"⚠️ No tar.gz artifacts found in {artifacts_dir}")
        summary.fail("Apache RAT", "no tar.gz artifacts found")
        return False

    print(f"Found {len(tar_artifacts)} tar.gz artifact(s) to check:\n")

    all_valid = True
    for artifact_name in tar_artifacts:
        artifact_path = os.path.join(artifacts_dir, artifact_name)
        # Derive a filesystem-safe report stem from the artifact name.
        report_name = artifact_name.replace(".tar.gz", "").replace(".", "-")
        if not _check_licenses_with_rat(
            artifact_path,
            rat_jar_path,
            report_name,
            summary,
            report_only,
        ):
            all_valid = False

    return all_valid


def _release_artifact_map(artifacts_dir: str) -> dict[str, list[str]]:
    """Bucket release artifacts by role: source tarball, sdist tarball, wheels."""
    artifacts = _artifact_files(artifacts_dir)
    return {
        "source": [name for name in artifacts if name.endswith("-src.tar.gz")],
        "sdist": [name for name in artifacts if name.endswith("-sdist.tar.gz")],
        "wheel": [name for name in artifacts if name.endswith(".whl")],
    }


def _extract_project_root(source_artifact: str, destination: str) -> Path:
    """Extract *source_artifact* into *destination* and return the project root.

    If the archive contains a single top-level directory (the usual layout),
    that directory is the root; otherwise the extraction directory itself is.
    """
    with tarfile.open(source_artifact, "r:gz") as tar:
        _safe_extract_tar(tar, destination)

    entries = list(Path(destination).iterdir())
    if len(entries) == 1 and entries[0].is_dir():
        return entries[0]
    return Path(destination)


def _load_apache_release_module(project_root: Path):
    """Import ``scripts/apache_release.py`` from an extracted source tree."""
    module_path = project_root / "scripts" / "apache_release.py"
    spec = importlib.util.spec_from_file_location("apache_release_for_verify", module_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Unable to load release helper from {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def _build_reproducible_wheel(
    project_root: Path, version: str, output_dir: str, source_epoch: int
) -> tuple[bool, str]:
    """Rebuild the wheel in a subprocess pinned to SOURCE_DATE_EPOCH.

    Runs the extracted tree's own ``apache_release._build_wheel_from_current_dir``
    so the rebuild uses exactly the release's build logic.
    Returns (succeeded, combined stdout/stderr).
    """
    command = [
        sys.executable,
        "-c",
        (
            "import importlib.util, pathlib, sys; "
            "project_root = pathlib.Path(sys.argv[1]); "
            "version = sys.argv[2]; "
            "output_dir = sys.argv[3]; "
            "module_path = project_root / 'scripts' / 'apache_release.py'; "
            "spec = importlib.util.spec_from_file_location('apache_release_for_verify', module_path); "
            "module = importlib.util.module_from_spec(spec); "
            "spec.loader.exec_module(module); "
            "module._build_wheel_from_current_dir(version, output_dir)"
        ),
        str(project_root),
        version,
        output_dir,
    ]
    env = os.environ.copy()
    # Pin the build timestamp so the rebuilt wheel is byte-comparable.
    env["SOURCE_DATE_EPOCH"] = str(source_epoch)
    # Ensure the subprocess sees the same interpreter's tooling first on PATH.
    env["PATH"] = f"{Path(sys.executable).parent}{os.pathsep}{env.get('PATH', '')}"
    result = subprocess.run(
        command,
        cwd=project_root,
        capture_output=True,
        text=True,
        check=False,
        env=env,
    )
    output = "\n".join(item for item in [result.stdout, result.stderr] if item)
    return result.returncode == 0, output


def _build_reproducible_artifacts(source_artifact: str, output_dir: str) -> tuple[bool, str]:
    """Rebuild sdist + wheel from *source_artifact* into *output_dir*.

    The source archive's mtime is used as SOURCE_DATE_EPOCH so the rebuild is
    comparable with the released artifacts. Returns (succeeded, build output).
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        project_root = _extract_project_root(source_artifact, temp_dir)
        source_epoch = int(os.path.getmtime(source_artifact))
        version_match = re.search(r"(\d+\.\d+\.\d+)", os.path.basename(source_artifact))
        if version_match is None:
            return False, f"unable to determine version from {os.path.basename(source_artifact)}"
        version = version_match.group(1)

        # Start from a clean dist/ so only freshly built artifacts are compared.
        dist_dir = os.path.join(project_root, "dist")
        if os.path.exists(dist_dir):
            shutil.rmtree(dist_dir)

        env = os.environ.copy()
        env["FLIT_USE_VCS"] = "0"
        env["SOURCE_DATE_EPOCH"] = str(source_epoch)
        env["PATH"] = f"{Path(sys.executable).parent}{os.pathsep}{env.get('PATH', '')}"
        sdist_result = subprocess.run(
            ["flit", "build", "--format", "sdist"],
            cwd=project_root,
            capture_output=True,
            text=True,
            check=False,
            env=env,
        )
        if sdist_result.returncode != 0:
            return False, sdist_result.stderr or sdist_result.stdout

        wheel_ok, wheel_output = _build_reproducible_wheel(
            project_root, version, output_dir, source_epoch
        )
        if not wheel_ok:
            return False, wheel_output

        os.makedirs(output_dir, exist_ok=True)
        for built_artifact in Path(dist_dir).glob("*"):
            shutil.copy2(built_artifact, Path(output_dir) / built_artifact.name)

        combined_output = "\n".join(
            item for item in [sdist_result.stdout, sdist_result.stderr, wheel_output] if item
        )
        return True, combined_output


def _compare_rebuilt_artifacts(
    artifacts_dir: str,
    rebuilt_dir: str,
    release_artifacts: dict[str, list[str]],
    summary: VerificationSummary,
) -> bool:
    """Compare SHA-512 checksums of rebuilt artifacts against the release ones."""
    all_valid = True

    if release_artifacts["sdist"]:
        release_sdist = os.path.join(artifacts_dir, release_artifacts["sdist"][0])
        rebuilt_sdists = sorted(glob.glob(os.path.join(rebuilt_dir, "*.tar.gz")))
        if not rebuilt_sdists:
            summary.fail("Rebuilt sdist checksum", "no rebuilt sdist produced")
            all_valid = False
        else:
            rebuilt_sdist = rebuilt_sdists[0]
            if _sha512_for_file(release_sdist) == _sha512_for_file(rebuilt_sdist):
                summary.pass_("Rebuilt sdist checksum", os.path.basename(rebuilt_sdist))
            else:
                summary.fail("Rebuilt sdist checksum", "rebuilt sdist differs from release")
                all_valid = False
    else:
        summary.skip("Rebuilt sdist checksum", "no release sdist found")

    if release_artifacts["wheel"]:
        release_wheels = [os.path.join(artifacts_dir, name) for name in release_artifacts["wheel"]]
        rebuilt_wheels = sorted(glob.glob(os.path.join(rebuilt_dir, "*.whl")))
        if len(rebuilt_wheels) != len(release_wheels):
            summary.fail(
                "Rebuilt wheel checksum",
                f"expected {len(release_wheels)} wheel(s), found {len(rebuilt_wheels)} rebuilt wheel(s)",
            )
            return False

        # Match rebuilt wheels to release wheels by filename.
        for release_wheel in release_wheels:
            release_name = os.path.basename(release_wheel)
            matching_wheels = [
                rebuilt for rebuilt in rebuilt_wheels if os.path.basename(rebuilt) == release_name
            ]
            if not matching_wheels:
                summary.fail(
                    f"Rebuilt wheel checksum: {release_name}", "matching rebuilt wheel not found"
                )
                all_valid = False
                continue
            rebuilt_wheel = matching_wheels[0]
            if _sha512_for_file(release_wheel) == _sha512_for_file(rebuilt_wheel):
                summary.pass_(f"Rebuilt wheel checksum: {release_name}")
            else:
                summary.fail(
                    f"Rebuilt wheel checksum: {release_name}", "rebuilt wheel differs from release"
                )
                all_valid = False
    else:
        summary.skip("Rebuilt wheel checksum", "no release wheel found")

    return all_valid


def verify_reproducible_build(
    artifacts_dir: str,
    summary: VerificationSummary | None = None,
) -> bool:
    """Rebuild from the release source tarball and compare against the release artifacts."""
    _print_section("Verifying Reproducible Build")

    if summary is None:
        summary = VerificationSummary()

    if not os.path.exists(artifacts_dir):
        _fail(f"Artifacts directory not found: {artifacts_dir}")

    release_artifacts = _release_artifact_map(artifacts_dir)
    # Prefer the -src tarball; fall back to the sdist when it is absent.
    source_candidates = release_artifacts["source"] or release_artifacts["sdist"]
    if not source_candidates:
        summary.fail("Rebuild source artifact", "no source or sdist tarball found")
        return False

    if shutil.which("flit") is None:
        summary.fail("Reproducible rebuild", "flit is required to rebuild release artifacts")
        return False

    source_artifact = os.path.join(artifacts_dir, source_candidates[0])
    summary.pass_("Rebuild source artifact", os.path.basename(source_artifact))

    with tempfile.TemporaryDirectory() as rebuilt_dir:
        print(f"Rebuilding from: {os.path.basename(source_artifact)}")
        ok, output = _build_reproducible_artifacts(source_artifact, rebuilt_dir)
        if not ok:
            summary.fail("Reproducible rebuild", output.strip() or "build failed")
            return False

        summary.pass_("Reproducible rebuild", "build completed")
        return _compare_rebuilt_artifacts(artifacts_dir, rebuilt_dir, release_artifacts, summary)


def _extract_version_from_artifacts(artifacts_dir: str) -> str:
    """Best-effort x.y.z version extracted from any artifact filename."""
    version_pattern = re.compile(r"(\d+\.\d+\.\d+)")
    for artifact_name in _artifact_files(artifacts_dir):
        match = version_pattern.search(artifact_name)
        if match:
            return match.group(1)
    return "UNKNOWN_VERSION"


def render_vote_email(artifacts_dir: str, summary: VerificationSummary) -> str:
    """Render a +1/-1 vote email draft from the collected verification results.

    Fix: the previous implementation interpolated ``os.linesep.join(result_lines)``
    into a ``textwrap.dedent`` f-string. The interpolated lines start at column 0,
    which makes ``dedent`` a no-op and leaves the template lines indented, and
    ``os.linesep`` injects ``\\r\\n`` on Windows. Building the email from an
    explicit line list joined with ``"\\n"`` yields consistent output everywhere.
    """
    version = _extract_version_from_artifacts(artifacts_dir)
    pass_count = sum(result.status == PASS for result in summary.results)
    fail_count = sum(result.status == FAIL for result in summary.results)
    skip_count = sum(result.status == SKIP for result in summary.results)
    vote = "+1" if summary.ok else "-1"

    result_lines = [
        f"- [{result.status}] {result.name}" + (f": {result.details}" if result.details else "")
        for result in summary.results
    ]

    lines = [
        f"Subject: [{vote}] Release Apache Burr (incubating) {version}",
        "",
        f"I verified the Apache Burr (incubating) {version} release artifacts.",
        "",
        "Verification summary:",
        *result_lines,
        "",
        "Totals:",
        f"- PASS: {pass_count}",
        f"- FAIL: {fail_count}",
        f"- SKIP: {skip_count}",
        "",
        "Vote:",
        f"{vote} approve the release based on the checks above.",
    ]
    return "\n".join(lines).strip()


def _maybe_output_vote_email(args: argparse.Namespace, summary: VerificationSummary) -> None:
    """Print (and optionally save) the vote email draft when --vote-email was given."""
    if not getattr(args, "vote_email", False):
        return

    email_text = render_vote_email(args.artifacts_dir, summary)
    _print_section("Vote Email Draft")
    print(email_text)

    output_path = getattr(args, "vote_email_output", None)
    if output_path:
        Path(output_path).write_text(email_text + "\n", encoding="utf-8")
        print(f"\nSaved vote email draft to: {output_path}")


def _list_tar_gz_contents(artifact_path: str) -> None:
    """Print a per-member listing of a .tar.gz archive."""
    print(f"\nContents of: {os.path.basename(artifact_path)}")
    print("=" * 80)

    try:
        with tarfile.open(artifact_path, "r:gz") as tar:
            members = tar.getmembers()
            print(f"Total files: {len(members)}\n")
            files = [member for member in members if member.isfile()]
            dirs = [member for member in members if member.isdir()]
            symlinks = [member for member in members if member.issym() or member.islnk()]
            print(f"Files: {len(files)}, Directories: {len(dirs)}, Symlinks: {len(symlinks)}\n")
            print("Files:\n")

            for member in members:
                # NOTE(review): the prefix/size formatting below sits in an elided
                # diff-context region and was reconstructed — confirm against git history.
                prefix = "  "
                size = f"{member.size:>12,}" if member.isfile() else " " * 12
                if member.issym() or member.islnk():
                    print(f"{prefix}{member.name} -> {member.linkname}")
                    continue
                print(f"{prefix}{member.name:<70} {size}")
    except Exception as exc:
        print(f"Error reading archive: {exc}")


def _list_wheel_contents(wheel_path: str) -> None:
    """Print the top-level structure and a per-file listing of a wheel."""
    print(f"\nContents of: {os.path.basename(wheel_path)}")
    print("=" * 80)

    try:
        with zipfile.ZipFile(wheel_path, "r") as wheel:
            file_list = wheel.namelist()
            print(f"Total files: {len(file_list)}\n")

            # Count entries per top-level directory for a quick overview.
            top_level_dirs: dict[str, int] = {}
            for file_name in file_list:
                top_dir = file_name.split("/")[0]
                top_level_dirs[top_dir] = top_level_dirs.get(top_dir, 0) + 1

            print("Top-level structure:")
            for dir_name, count in sorted(top_level_dirs.items()):
                print(f"  {dir_name:<50} ({count} files)")

            print("\nFiles:\n")
            for filename in sorted(file_list):
                info = wheel.getinfo(filename)
                size = f"{info.file_size:>12,}" if not filename.endswith("/") else " "
                print(f"  {filename:<70} {size}")
    except Exception as exc:
        print(f"Error reading wheel: {exc}")


def list_contents(artifact_path: str) -> None:
    """List the contents of a single .tar.gz or .whl artifact."""
    _print_section("Listing Artifact Contents")

    if not os.path.exists(artifact_path):
        _fail(f"Artifact not found: {artifact_path}")

    # NOTE(review): this dispatch sits in an elided diff-context region and was
    # reconstructed from the visible helpers and error message — confirm against history.
    if artifact_path.endswith(".tar.gz"):
        _list_tar_gz_contents(artifact_path)
    elif artifact_path.endswith(".whl"):
        _list_wheel_contents(artifact_path)
    else:
        _fail(f"Unsupported file type: {artifact_path}\nSupported: .tar.gz, .whl")


def cmd_signatures(args: argparse.Namespace) -> bool:
    """Subcommand: verify GPG signatures and SHA512 checksums."""
    summary = VerificationSummary()
    verify_signatures(args.artifacts_dir, summary)
    _print_section("Verification Summary")
    print(summary.render())
    return summary.ok


def cmd_artifacts(args: argparse.Namespace) -> bool:
    """Subcommand: verify required LICENSE/NOTICE/DISCLAIMER metadata files."""
    summary = VerificationSummary()
    verify_artifact_contents(args.artifacts_dir, summary)
    _print_section("Verification Summary")
    print(summary.render())
    return summary.ok


def cmd_licenses(args: argparse.Namespace) -> bool:
    """Subcommand: verify licenses with Apache RAT."""
    if not args.rat_jar:
        _fail("--rat-jar is required for license verification")

    summary = VerificationSummary()
    verify_licenses(args.artifacts_dir, args.rat_jar, summary, args.report_only)
    _print_section("Verification Summary")
    print(summary.render())
    return summary.ok


def cmd_reproducible(args: argparse.Namespace) -> bool:
    """Subcommand: verify the release can be rebuilt reproducibly."""
    summary = VerificationSummary()
    verify_reproducible_build(args.artifacts_dir, summary)
    _print_section("Verification Summary")
    print(summary.render())
    _maybe_output_vote_email(args, summary)
    return summary.ok


def cmd_all(args: argparse.Namespace) -> bool:
    """Subcommand: run every verification step and print one combined summary."""
    _print_section("Complete Apache Artifacts Verification")
    summary = VerificationSummary()

    print("\n[1/4] Verifying signatures and checksums...")
    verify_signatures(args.artifacts_dir, summary)

    print("\n[2/4] Verifying required metadata files...")
    verify_artifact_contents(args.artifacts_dir, summary)

    print("\n[3/4] Verifying reproducible rebuild...")
    verify_reproducible_build(args.artifacts_dir, summary)

    if args.rat_jar:
        print("\n[4/4] Verifying licenses with Apache RAT...")
        verify_licenses(args.artifacts_dir, args.rat_jar, summary, args.report_only)
    else:
        # RAT is optional for this subcommand; record the skip explicitly.
        summary.skip("Apache RAT", "no --rat-jar provided")

    _print_section("Verification Summary")
    print(summary.render())
    _maybe_output_vote_email(args, summary)
    return summary.ok


def cmd_list_contents(args: argparse.Namespace) -> None:
    """Subcommand: list the contents of a single artifact."""
    list_contents(args.artifact)


def cmd_twine_check(args: argparse.Namespace) -> bool:
    """Subcommand: validate wheel metadata with ``twine check``."""
    _print_section("Verifying Wheel Metadata with Twine")
    summary = VerificationSummary()

    wheel_pattern = f"{args.artifacts_dir}/apache_burr-*.whl"
    wheel_files = glob.glob(wheel_pattern)

    if not wheel_files:
        print(f"❌ No wheel found matching: {wheel_pattern}")
        summary.fail("Twine metadata check", "no wheel found")
        _print_section("Verification Summary")
        print(summary.render())
        return False

    for wheel_path in wheel_files:
        try:
            # NOTE(review): the twine invocation sits in an elided diff-context
            # region and was reconstructed — confirm flags against git history.
            subprocess.run(
                [sys.executable, "-m", "twine", "check", wheel_path],
                check=True,
                capture_output=True,
                text=True,
            )
            print(f"  ✓ {os.path.basename(wheel_path)} metadata is valid")
            summary.pass_(f"Twine metadata: {os.path.basename(wheel_path)}")
        except subprocess.CalledProcessError as exc:
            print(f"  ✗ Twine check failed: {exc.stderr}")
            summary.fail(f"Twine metadata: {os.path.basename(wheel_path)}", exc.stderr.strip())

    _print_section("Verification Summary")
    print(summary.render())
    return summary.ok
def _add_vote_email_args(parser: argparse.ArgumentParser) -> None:
    """Attach the shared --vote-email / --vote-email-output options to *parser*."""
    parser.add_argument(
        "--vote-email",
        action="store_true",
        help="Render a vote email draft using the collected verification results",
    )
    parser.add_argument(
        "--vote-email-output",
        help="Optional path to write the generated vote email draft",
    )


def main() -> None:
    """CLI entry point: build the argument parser, dispatch, and exit with status."""
    parser = argparse.ArgumentParser(
        description="Apache Artifacts Verification Tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python scripts/verify_apache_artifacts.py list-contents dist/apache-burr-0.41.0-incubating-src.tar.gz
  python scripts/verify_apache_artifacts.py signatures
  python scripts/verify_apache_artifacts.py artifacts
  python scripts/verify_apache_artifacts.py reproducible
  python scripts/verify_apache_artifacts.py licenses --rat-jar /path/to/apache-rat.jar
  python scripts/verify_apache_artifacts.py all --rat-jar /path/to/apache-rat.jar --vote-email
""",
    )

    subparsers = parser.add_subparsers(dest="command", required=True)

    list_parser = subparsers.add_parser("list-contents", help="List contents of a specific artifact")
    list_parser.add_argument("artifact", help="Path to artifact file (.tar.gz or .whl)")

    sig_parser = subparsers.add_parser("signatures", help="Verify GPG signatures and SHA512 checksums")
    sig_parser.add_argument("--artifacts-dir", default="dist", help="Directory containing artifacts (default: dist)")

    artifacts_parser = subparsers.add_parser(
        "artifacts",
        help="Verify required LICENSE/NOTICE/DISCLAIMER metadata in artifacts",
    )
    artifacts_parser.add_argument(
        "--artifacts-dir", default="dist", help="Directory containing artifacts (default: dist)"
    )

    lic_parser = subparsers.add_parser("licenses", help="Verify licenses with Apache RAT")
    lic_parser.add_argument("--artifacts-dir", default="dist", help="Directory containing artifacts (default: dist)")
    lic_parser.add_argument("--rat-jar", required=True, help="Path to Apache RAT JAR file")
    lic_parser.add_argument("--report-only", action="store_true", help="Generate report but don't fail on issues")

    reproducible_parser = subparsers.add_parser(
        "reproducible",
        help="Rebuild from release source and compare rebuilt artifacts against release artifacts",
    )
    reproducible_parser.add_argument(
        "--artifacts-dir", default="dist", help="Directory containing artifacts (default: dist)"
    )
    _add_vote_email_args(reproducible_parser)

    all_parser = subparsers.add_parser(
        "all",
        help="Verify signatures, metadata files, reproducibility, and optionally Apache RAT results",
    )
    all_parser.add_argument("--artifacts-dir", default="dist", help="Directory containing artifacts (default: dist)")
    all_parser.add_argument("--rat-jar", help="Path to Apache RAT JAR file (optional)")
    all_parser.add_argument("--report-only", action="store_true", help="Generate report but don't fail on RAT issues")
    _add_vote_email_args(all_parser)

    twine_parser = subparsers.add_parser("twine-check", help="Verify wheel metadata with twine")
    twine_parser.add_argument("--artifacts-dir", default="dist", help="Directory containing artifacts (default: dist)")

    args = parser.parse_args()

    success = False
    try:
        if args.command == "list-contents":
            cmd_list_contents(args)
            sys.exit(0)
        if args.command == "signatures":
            success = cmd_signatures(args)
        elif args.command == "artifacts":
            success = cmd_artifacts(args)
        elif args.command == "licenses":
            success = cmd_licenses(args)
        elif args.command == "reproducible":
            success = cmd_reproducible(args)
        elif args.command == "all":
            success = cmd_all(args)
        elif args.command == "twine-check":
            # NOTE(review): this dispatch line sits in an elided diff-context
            # region and was inferred from the surrounding branches — confirm.
            success = cmd_twine_check(args)
    except KeyboardInterrupt:
        print("\n\n⚠️ Interrupted by user")
        sys.exit(130)
    except Exception as exc:
        print(f"\n❌ Unexpected error: {exc}")
        import traceback

        traceback.print_exc()
        sys.exit(1)

    if success:
        print("\n✅ Verification completed successfully!")
        sys.exit(0)

    print("\n❌ Verification failed.")
    sys.exit(1)


if __name__ == "__main__":
    main()
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import importlib.util
import io
import sys
import tarfile
import tempfile
import zipfile
from pathlib import Path


def _load_verify_module():
    """Import scripts/verify_apache_artifacts.py directly from the repo checkout."""
    module_path = Path(__file__).resolve().parent.parent / "scripts" / "verify_apache_artifacts.py"
    spec = importlib.util.spec_from_file_location("verify_apache_artifacts", module_path)
    module = importlib.util.module_from_spec(spec)
    assert spec.loader is not None
    sys.modules[spec.name] = module
    spec.loader.exec_module(module)
    return module


verify = _load_verify_module()


def _reference_text(filename: str) -> bytes:
    """Read a reference metadata file (LICENSE/NOTICE/...) from the repo root."""
    return (Path(__file__).resolve().parent.parent / filename).read_bytes()


def _write_tar_gz(path: Path, root: str, files: dict[str, bytes]) -> None:
    """Create a gzipped tarball at *path* with *files* nested under *root*/.

    Fix: the previous implementation wrote each payload through the path of a
    still-open NamedTemporaryFile, which fails on Windows; members are now added
    straight from memory via tarfile.addfile.
    """
    with tarfile.open(path, "w:gz") as tar:
        for relative_name, content in files.items():
            info = tarfile.TarInfo(name=f"{root}/{relative_name}")
            info.size = len(content)
            tar.addfile(info, io.BytesIO(content))


def _write_wheel(path: Path, files: dict[str, bytes]) -> None:
    """Create a minimal wheel (zip) at *path* containing *files*."""
    with zipfile.ZipFile(path, "w") as wheel:
        for name, content in files.items():
            wheel.writestr(name, content)


def test_verify_artifact_contents_passes_for_tarball_and_wheel():
    with tempfile.TemporaryDirectory() as temp_dir:
        artifacts_dir = Path(temp_dir) / "dist"
        artifacts_dir.mkdir()

        tar_path = artifacts_dir / "apache-burr-0.41.0-incubating-src.tar.gz"
        wheel_path = artifacts_dir / "apache_burr-0.41.0-py3-none-any.whl"

        _write_tar_gz(
            tar_path,
            "apache-burr-0.41.0-incubating-src",
            {
                "LICENSE": _reference_text("LICENSE"),
                "NOTICE": _reference_text("NOTICE"),
                "DISCLAIMER": _reference_text("DISCLAIMER"),
                "README.md": b"example",
            },
        )
        _write_wheel(
            wheel_path,
            {
                "apache_burr/__init__.py": b"__version__ = '0.41.0'\n",
                "apache_burr-0.41.0.dist-info/METADATA": b"Metadata-Version: 2.1\n",
                "apache_burr-0.41.0.dist-info/WHEEL": b"Wheel-Version: 1.0\n",
                "apache_burr-0.41.0.dist-info/licenses/NOTICE": _reference_text("NOTICE"),
                "apache_burr-0.41.0.dist-info/licenses/DISCLAIMER": _reference_text("DISCLAIMER"),
                "apache_burr-0.41.0.dist-info/licenses/LICENSE-wheel": _reference_text("LICENSE-wheel"),
            },
        )

        summary = verify.VerificationSummary()
        assert verify.verify_artifact_contents(str(artifacts_dir), summary) is True
        assert summary.ok is True


def test_verify_artifact_contents_fails_when_wheel_license_file_is_missing():
    with tempfile.TemporaryDirectory() as temp_dir:
        artifacts_dir = Path(temp_dir) / "dist"
        artifacts_dir.mkdir()

        wheel_path = artifacts_dir / "apache_burr-0.41.0-py3-none-any.whl"
        _write_wheel(
            wheel_path,
            {
                "apache_burr/__init__.py": b"__version__ = '0.41.0'\n",
                "apache_burr-0.41.0.dist-info/METADATA": b"Metadata-Version: 2.1\n",
                "apache_burr-0.41.0.dist-info/WHEEL": b"Wheel-Version: 1.0\n",
                "apache_burr-0.41.0.dist-info/licenses/NOTICE": _reference_text("NOTICE"),
                "apache_burr-0.41.0.dist-info/licenses/DISCLAIMER": _reference_text("DISCLAIMER"),
            },
        )

        summary = verify.VerificationSummary()
        assert verify.verify_artifact_contents(str(artifacts_dir), summary) is False
        assert any(
            result.name.endswith("contains LICENSE-wheel") and result.status == verify.FAIL
            for result in summary.results
        )


def test_verify_reproducible_build_compares_rebuilt_outputs(monkeypatch):
    with tempfile.TemporaryDirectory() as temp_dir:
        artifacts_dir = Path(temp_dir) / "dist"
        artifacts_dir.mkdir()

        source_tar = artifacts_dir / "apache-burr-0.41.0-incubating-src.tar.gz"
        release_sdist = artifacts_dir / "apache-burr-0.41.0-incubating-sdist.tar.gz"
        release_wheel = artifacts_dir / "apache_burr-0.41.0-py3-none-any.whl"

        _write_tar_gz(source_tar, "apache-burr-0.41.0-incubating-src", {"README.md": b"source"})
        _write_tar_gz(release_sdist, "apache_burr-0.41.0", {"README.md": b"rebuilt"})
        _write_wheel(
            release_wheel,
            {
                "apache_burr-0.41.0.dist-info/METADATA": b"Metadata-Version: 2.1\n",
                "apache_burr-0.41.0.dist-info/WHEEL": b"Wheel-Version: 1.0\n",
            },
        )

        # Stub the expensive rebuild: emit byte-identical copies of the release
        # artifacts so the checksum comparison exercises the "match" path.
        def _fake_build(source_artifact: str, output_dir: str):
            assert Path(source_artifact) == source_tar
            rebuilt_sdist = Path(output_dir) / "apache_burr-0.41.0.tar.gz"
            rebuilt_wheel = Path(output_dir) / release_wheel.name
            rebuilt_sdist.write_bytes(release_sdist.read_bytes())
            rebuilt_wheel.write_bytes(release_wheel.read_bytes())
            return True, "ok"

        monkeypatch.setattr(verify, "_build_reproducible_artifacts", _fake_build)

        summary = verify.VerificationSummary()
        assert verify.verify_reproducible_build(str(artifacts_dir), summary) is True
        assert any(
            result.name == "Rebuilt sdist checksum" and result.status == verify.PASS
            for result in summary.results
        )
        assert any(
            result.name == f"Rebuilt wheel checksum: {release_wheel.name}" and result.status == verify.PASS
            for result in summary.results
        )


def test_render_vote_email_includes_status_counts():
    with tempfile.TemporaryDirectory() as temp_dir:
        artifacts_dir = Path(temp_dir) / "dist"
        artifacts_dir.mkdir()
        (artifacts_dir / "apache-burr-0.41.0-incubating-src.tar.gz").write_bytes(b"artifact")

        summary = verify.VerificationSummary()
        summary.pass_("Signatures")
        summary.fail("Apache RAT", "2 issue(s)")
        summary.skip("Reproducible rebuild", "build tool unavailable")

        email = verify.render_vote_email(str(artifacts_dir), summary)

        assert "Subject: [-1] Release Apache Burr (incubating) 0.41.0" in email
        assert "- PASS: 1" in email
        assert "- FAIL: 1" in email
        assert "- SKIP: 1" in email
        assert "- [FAIL] Apache RAT: 2 issue(s)" in email


# NOTE(review): the XML fixtures below were stripped of their markup in the
# reviewed copy; they are reconstructed from the assertions and the known
# Apache RAT report shapes — confirm against a real RAT report.
def test_load_rat_xml_root_skips_log_preamble():
    with tempfile.TemporaryDirectory() as temp_dir:
        report_path = Path(temp_dir) / "rat.xml"
        report_path.write_text(
            "INFO: Apache Creadur RAT 0.18\n"
            "WARN: deprecated flag\n"
            '<rat-report timestamp="2024-01-01T00:00:00Z"></rat-report>\n',
            encoding="utf-8",
        )

        root = verify._load_rat_xml_root(str(report_path))

        assert root.tag == "rat-report"


def test_load_rat_xml_root_ignores_trailing_summary_lines():
    with tempfile.TemporaryDirectory() as temp_dir:
        report_path = Path(temp_dir) / "rat.xml"
        report_path.write_text(
            'INFO: Apache Creadur RAT 0.18 (Apache Software Foundation)\n'
            '<rat-report timestamp="2024-01-01T00:00:00Z">\n'
            '  <resource name="LICENSE">\n'
            '    <license approval="true" family="AL   " name="Apache License Version 2.0"/>\n'
            "  </resource>\n"
            "</rat-report>\n"
            "INFO: RAT summary:\n"
            "INFO: Approved: 0\n",
            encoding="utf-8",
        )

        root = verify._load_rat_xml_root(str(report_path))

        assert root.tag == "rat-report"


def test_rat_license_state_supports_old_and_new_xml_shapes():
    old_resource = verify.ET.fromstring(
        """
        <resource name="burr/some_file.py">
          <license-approval name="false"/>
          <license-family name="Unknown license"/>
        </resource>
        """
    )
    new_resource = verify.ET.fromstring(
        """
        <resource name="burr/some_file.py">
          <license approval="false" family="?????" name="Unknown license"/>
        </resource>
        """
    )

    assert verify._rat_license_state(old_resource) == ("false", "Unknown license")
    assert verify._rat_license_state(new_resource) == ("false", "Unknown license")


def test_rat_scan_target_prefers_single_extracted_project_dir():
    with tempfile.TemporaryDirectory() as temp_dir:
        extract_dir = Path(temp_dir) / "extracted"
        extract_dir.mkdir()
        (extract_dir / "apache-burr-0.41.0-incubating-src").mkdir()

        rat_cwd, rat_target = verify._rat_scan_target(str(extract_dir))

        assert rat_cwd == str(extract_dir)
        assert rat_target == "apache-burr-0.41.0-incubating-src"


def test_artifact_files_ignores_rat_reports():
    with tempfile.TemporaryDirectory() as temp_dir:
        artifacts_dir = Path(temp_dir)
        (artifacts_dir / "apache_burr-0.41.0-py3-none-any.whl").write_bytes(b"wheel")
        (artifacts_dir / "rat-report-sample.xml").write_text("report", encoding="utf-8")
        (artifacts_dir / "rat-report-sample.txt").write_text("report", encoding="utf-8")

        artifact_files = verify._artifact_files(str(artifacts_dir))

        assert artifact_files == ["apache_burr-0.41.0-py3-none-any.whl"]