From 242470a99920f190e4b7d6266e0f316882e2a221 Mon Sep 17 00:00:00 2001 From: Gabriela de Santana Carrara Date: Mon, 13 Apr 2026 11:43:47 +0200 Subject: [PATCH 1/7] digitization(utils): Automate CERNBox file downloads * Implement fetch_boite_files to replace manual fetching * ref cern-sis/digitization#15 --- refactory/storage_connection.py | 126 ++++++++++++++++++++------------ refactory/utils.py | 72 ++++++++++++++++++ 2 files changed, 150 insertions(+), 48 deletions(-) create mode 100644 refactory/utils.py diff --git a/refactory/storage_connection.py b/refactory/storage_connection.py index bdfdad9..361a5e1 100644 --- a/refactory/storage_connection.py +++ b/refactory/storage_connection.py @@ -11,7 +11,7 @@ def list_folders(self, base_path: str) -> list[str]: pass @abstractmethod - def list_pdfs(self, folder_path: str) -> list[str]: + def list_files(self, folder_path: str, extension: str = None) -> list[str]: pass @abstractmethod @@ -23,7 +23,9 @@ def upload_file(self, local_file_path: str, remote_file_path: str) -> None: pass @abstractmethod - def list_excel(self, folder_path: str) -> list[str]: + def generate_presigned_url( + self, file_key: str, content_type: str = None, expiration: int = 31556952 + ) -> str: pass class S3Provider(StorageProvider): @@ -31,8 +33,8 @@ def __init__(self, bucket: str, endpoint_url: str = 'https://s3.cern.ch'): self.bucket = bucket self.s3 = boto3.client( "s3", - aws_access_key_id=os.environ["ACCESS_KEY"], - aws_secret_access_key=os.environ["SECRET_KEY"], + # aws_access_key_id=os.environ["ACCESS_KEY"], + # aws_secret_access_key=os.environ["SECRET_KEY"], endpoint_url=endpoint_url, ) @@ -44,12 +46,16 @@ def list_folders(self, base_path: str) -> list[str]: folders.append(prefix['Prefix']) return folders - def list_pdfs(self, folder_path: str) -> list[str]: - response = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=folder_path) - return [ - obj['Key'] for obj in response.get('Contents', []) - if obj['Key'].lower().endswith('.pdf') - ] + def list_files(self, folder_path: str, extension: str = None) -> list[str]: + paginator = self.s3.get_paginator('list_objects_v2') + files = [] + for page in paginator.paginate(Bucket=self.bucket, Prefix=folder_path): + for obj in page.get('Contents', []): + key = obj['Key'] + if not key.endswith('/'): + if extension is None or key.lower().endswith(extension.lower()): + files.append(key) + return files def download_to_temp(self, file_path: str, temp_file_path: str) -> None: self.s3.download_file(self.bucket, file_path, temp_file_path) @@ -57,70 +63,94 @@ def download_to_temp(self, file_path: str, temp_file_path: str) -> None: def upload_file(self, local_file_path: str, remote_file_path: str) -> None: self.s3.upload_file(local_file_path, self.bucket, remote_file_path) - def list_excel(self, folder_path: str) -> list[str]: - response = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=folder_path) - return [ - obj['Key'] for obj in response.get('Contents', []) - if obj['Key'].lower().endswith('.xlsx') - ] + def generate_presigned_url( + self, file_key: str, content_type: str = None, expiration: int = 31556952 + ) -> str: + params = {"Bucket": self.bucket, "Key": file_key} + if content_type: + params["ResponseContentType"] = content_type + + return self.s3.generate_presigned_url( + ClientMethod="get_object", + Params=params, + ExpiresIn=expiration, + ) + class CernboxProvider(StorageProvider): - def __init__(self, public_link_hash: str): - self.public_link_hash = public_link_hash + def __init__(self, public_link_hash: str = None): self.account = os.getenv("CERNBOX_ACCOUNT") self.password = os.getenv("CERNBOX_PASSWORD") - self.public_base_url = f"https://cernbox.cern.ch/remote.php/dav/public-files/{public_link_hash}" + self.is_public = bool(public_link_hash) + self.public_link_hash = public_link_hash - if self.account: - self.auth_base_url = f"https://api.cernbox.cern.ch/remote.php/dav/files/{self.account}" + if self.is_public: + # Public + self.base_url = f"https://cernbox.cern.ch/remote.php/dav/public-files/{public_link_hash}" + self.auth = None else: - self.auth_base_url = None + # Protected + if not self.account or not self.password: + raise ValueError( + "Error: CERN credentials required for protected shares.." + ) + self.base_url = ( + f"https://api.cernbox.cern.ch/remote.php/dav/files/{self.account}" + ) + self.auth = (self.account, self.password) def _propfind(self, path: str, depth: str = "1") -> list[str]: - url = f"{self.public_base_url}/{path}".rstrip('/') + '/' - headers = {'Depth': depth} - response = requests.request('PROPFIND', url, headers=headers) + url = f"{self.base_url}/{path}/" if path else f"{self.base_url}/" + headers = {"Depth": depth} + + response = requests.request("PROPFIND", url, headers=headers, auth=self.auth) response.raise_for_status() root = ET.fromstring(response.content) - namespaces = {'d': 'DAV:'} + namespaces = {"d": "DAV:"} paths = [] - for response_tag in root.findall('d:response', namespaces)[1:]: - href = response_tag.find('d:href', namespaces).text - relative_path = href.split(f"/public-files/{self.public_link_hash}/")[-1] - paths.append(relative_path) + for response_tag in root.findall("d:response", namespaces)[1:]: + href = response_tag.find("d:href", namespaces).text + filename = href.rstrip("/").split("/")[-1] + + paths.append(filename) + return paths def list_folders(self, base_path: str) -> list[str]: - all_items = self._propfind(base_path) - return [p for p in all_items if p.endswith('/') or "BOITE_" in p] + pass - def list_pdfs(self, folder_path: str) -> list[str]: + def list_files(self, folder_path: str, extension: str = None) -> list[str]: all_items = self._propfind(folder_path) - return [p for p in all_items if p.lower().endswith('.pdf')] + if extension: + return [p for p in all_items if p.lower().endswith(extension.lower())] + return all_items def download_to_temp(self, file_path: str, temp_file_path: str) -> None: - url = f"{self.public_base_url}/{file_path}" - response = requests.get(url, stream=True) + url = f"{self.base_url}/{file_path}" + response = requests.get(url, stream=True, auth=self.auth) response.raise_for_status() - with open(temp_file_path, 'wb') as f: + + with open(temp_file_path, "wb") as f: for chunk in response.iter_content(chunk_size=8192): f.write(chunk) - def list_excel(self, folder_path: str)-> list[str]: - all_items = self._propfind(folder_path) - return [p for p in all_items if p.lower().endswith('.xlsx')] - def upload_file(self, local_file_path: str, remote_file_path: str) -> None: - if not self.account or not self.password: - raise ValueError("CERN account and password are required for uploading.") - - clean_remote_path = remote_file_path.lstrip('/') - url = f"{self.auth_base_url}/{clean_remote_path}" + if self.is_public: + raise NotImplementedError( + "Error: CERN credentials required for updates." + ) - with open(local_file_path, 'rb') as f: - response = requests.put(url, data=f, auth=(self.account, self.password)) + clean_remote_path = remote_file_path.strip("/") + url = f"{self.base_url}/{clean_remote_path}" + with open(local_file_path, "rb") as f: + response = requests.put(url, data=f, auth=self.auth) response.raise_for_status() + + def generate_presigned_url( + self, file_key: str, content_type: str = None, expiration: int = None + ) -> str: + return f"{self.base_url}/{file_key}" diff --git a/refactory/utils.py b/refactory/utils.py new file mode 100644 index 0000000..9bf46f6 --- /dev/null +++ b/refactory/utils.py @@ -0,0 +1,72 @@ +import os +import tempfile +from urllib.parse import urlparse +from storage_connection import CernboxProvider + + +def parse_cernbox_url(url: str) -> dict: + parsed = urlparse(url) + path = parsed.path + + if "/s/" in path: + hash_code = path.split("/s/")[-1].split("/")[0] + return {"public_link_hash": hash_code, "eos_path": None} + + elif "public-files" in path: + hash_code = path.split("public-files/")[-1].split("/")[0] + return {"public_link_hash": hash_code, "eos_path": None} + + elif "eos" in path: + + eos_path = "eos" + path.split("eos")[-1] + return {"public_link_hash": None, "eos_path": eos_path} + + else: + raise ValueError(f"Invalid CERNBox URL format: {url}") + + +def fetch_boite_files(url: str, output_dir: str = None) -> str: + """ + Downloads all .xlsx files from a CERNBox URL. + """ + print(f"Fetch URL: {url}") + + parsed_data = parse_cernbox_url(url) + + provider = CernboxProvider(public_link_hash=parsed_data["public_link_hash"]) + + if output_dir is None: + output_dir = tempfile.mkdtemp(prefix="boite_data_") + elif not os.path.exists(output_dir): + os.makedirs(output_dir) + print(f"Created directory: {output_dir}") + + target_path = parsed_data["eos_path"] if parsed_data["eos_path"] else "" + + try: + xlsx_files = provider.list_files(target_path, extension=".xlsx") + except Exception as e: + print(f"Failed to access CERNBox. Error: {e}") + return output_dir + + if not xlsx_files: + print("No .xlsx files found.") + return output_dir + + print(f"Found {len(xlsx_files)} files. Starting download...") + + for filename in xlsx_files: + local_path = os.path.join(output_dir, filename) + + remote_file_path = ( + f"{target_path.rstrip('/')}/{filename}" if target_path else filename + ) + + try: + print(f"Downloading: {filename}...") + provider.download_to_temp(remote_file_path, local_path) + except Exception as e: + print(f"Failed to download {filename}: {e}") + + print(f"Download complete. Files are located in: {output_dir}") + return output_dir From 7084edadae134111750ed422c7db941b90279060 Mon Sep 17 00:00:00 2001 From: Gabriela de Santana Carrara Date: Tue, 14 Apr 2026 18:34:03 +0200 Subject: [PATCH 2/7] digitization(boite_matcher): Implement Boite-to-S3 file matcher * Match Boite .xlsx records to S3 paths case-insensitively * Support flat and subfolder layouts for PDF and PDF_LATEX * Log unmatched files and output data for XML generation * ref cern-sis/digitization#24 --- refactory/__init__.py | 0 refactory/{ => check_files}/main.py | 20 +- .../{validate_pdf.py => check_files/utils.py} | 8 +- refactory/cli.py | 77 ++++++- refactory/file_import/__init__.py | 0 refactory/file_import/boite_matcher.py | 206 ++++++++++++++++++ refactory/{ => file_import}/utils.py | 9 +- 7 files changed, 295 insertions(+), 25 deletions(-) create mode 100644 refactory/__init__.py rename refactory/{ => check_files}/main.py (89%) rename refactory/{validate_pdf.py => check_files/utils.py} (86%) create mode 100644 refactory/file_import/__init__.py create mode 100644 refactory/file_import/boite_matcher.py rename refactory/{ => file_import}/utils.py (93%) diff --git a/refactory/__init__.py b/refactory/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/refactory/main.py b/refactory/check_files/main.py similarity index 89% rename from refactory/main.py rename to refactory/check_files/main.py index 665486d..0e67155 100644 --- a/refactory/main.py +++ b/refactory/check_files/main.py @@ -5,22 +5,22 @@ import json from typing import Union from storage_connection import StorageProvider, S3Provider, CernboxProvider -from validate_pdf import is_pdf_valid +from .utils import validate_pdf def run_validation_pipeline( provider: StorageProvider, base_path: str, log_file: str, - inventory_source: Union[str, list[int]], + data_source: Union[str, list[int]], upload_reports: bool = False, ): """Navigates directories, validates files, and logs files status.""" target_box_numbers = set() - if isinstance(inventory_source, str): - inventory_provider = CernboxProvider(inventory_source) - excel_files = inventory_provider.list_excel("") + if isinstance(data_source, str): + inventory_provider = CernboxProvider(data_source) + excel_files = inventory_provider.list_files("", '.xlsx') for file_path in excel_files: filename = file_path.split(".")[0] @@ -29,8 +29,8 @@ def run_validation_pipeline( if match: target_box_numbers.add(int(match.group(1))) - elif isinstance(inventory_source, list): - target_box_numbers = set(inventory_source) + elif isinstance(data_source, list): + target_box_numbers = set(data_source) print(f"Excel files: {len(target_box_numbers)} boxes to check.") @@ -57,7 +57,7 @@ def run_validation_pipeline( continue print(f"Processing target Box: {match.group(1) + (match.group(2) or '')}") - pdf_files = provider.list_pdfs(folder) + pdf_files = provider.list_files(folder, 'PDF') if not pdf_files: print(f"⚠️ EMPTY FOLDER: {folder}") @@ -69,7 +69,7 @@ def run_validation_pipeline( with tempfile.NamedTemporaryFile(delete=True) as tmp: provider.download_to_temp(pdf_path, tmp.name) - if is_pdf_valid(tmp.name): + if validate_pdf(tmp.name): valid_files.append(pdf_path) print(f" ✅ {pdf_path}") else: @@ -136,6 +136,6 @@ def run_validation_pipeline( provider=s3_provider, # cernbox_provider base_path="cern-archives/raw/PDF/", # "teste/", log_file="s3_pdf_issues.log", - inventory_source=sys.argv[1], # public_link_hash + data_source=sys.argv[1], # public_link_hash upload_reports=int(sys.argv[2]) ) diff --git a/refactory/validate_pdf.py b/refactory/check_files/utils.py similarity index 86% rename from refactory/validate_pdf.py rename to refactory/check_files/utils.py index 0b0ee18..39639cb 100644 --- a/refactory/validate_pdf.py +++ b/refactory/check_files/utils.py @@ -2,11 +2,11 @@ from pypdf import PdfReader from pypdf.errors import PdfReadError -def is_pdf_valid(file_path: str) -> bool: +def validate_pdf(file_path: str) -> bool: """Checks if a local PDF is structurally valid and readable.""" try: file_size = os.path.getsize(file_path) - if file_size < 100: + if file_size < 100: return False with open(file_path, "rb") as f: @@ -23,7 +23,7 @@ def is_pdf_valid(file_path: str) -> bool: if len(reader.pages) == 0: return False - _ = reader.pages[0] + _ = reader.pages[0] return True @@ -31,4 +31,4 @@ def is_pdf_valid(file_path: str) -> bool: raise RuntimeError(f"System error when accessing file {file_path}: {e}") from e except (PdfReadError, Exception): - return False \ No newline at end of file + return False diff --git a/refactory/cli.py b/refactory/cli.py index d5feb4b..17b47cd 100644 --- a/refactory/cli.py +++ b/refactory/cli.py @@ -1,8 +1,11 @@ import click import ast -from .main import run_validation_pipeline +from .check_files.main import run_validation_pipeline from storage_connection import S3Provider +# Import the matcher we built earlier (adjust the path if your folder structure is different) +from .file_import.boite_matcher import BoiteS3Matcher + def parse_inventory(value): """ @@ -25,6 +28,7 @@ def parse_inventory(value): pass return value + @click.group() def digitization_v2(): pass @@ -32,10 +36,10 @@ def digitization_v2(): @digitization_v2.command("validate-files-integrity") @click.option( - "-s", - "--inventory-source", + "-d", + "--data-source", required=True, - help="Target inventory. Supports a CERNBOX hash, range 1..10, or list [1,2].", + help="Boite Files. Supports a CERNBOX hash, range 1..10, or list [1,2].", ) @click.option( "-u", @@ -50,13 +54,13 @@ def digitization_v2(): show_default=True, help="S3 Bucket name.", ) -def validate_files_integrity(inventory_source, upload_reports, bucket): +def validate_files_integrity(data_source, upload_reports, bucket): """ Validates files integrity and inventory alignment. This command checks for corrupted files and missing boxes. """ - inventory_input = parse_inventory(inventory_source) + inventory_input = parse_inventory(data_source) provider = S3Provider(bucket=bucket) try: @@ -64,7 +68,7 @@ def validate_files_integrity(inventory_source, upload_reports, bucket): provider=provider, base_path="cern-archives/raw/PDF/", log_file="s3_pdf_issues.log", - inventory_source=inventory_input, + data_source=inventory_input, upload_reports=upload_reports, ) click.echo("Process finished. Check the generated logs for details.") @@ -72,5 +76,64 @@ def validate_files_integrity(inventory_source, upload_reports, bucket): click.secho(f"Error: {e}", fg="red", err=True) + +@digitization_v2.command("file-match") +@click.option( + "-d", + "--data-source", + required=True, + help="Target data source. Supports a local directory path or a CERNBOX URL.", +) +@click.option( + "-o", + "--output-path", + default="./match_results", + show_default=True, + help="Directory to save the generated JSON files (records and mismatches).", +) +@click.option( + "-f", + "--file-types", + default="PDF,PDF_LATEX", + show_default=True, + help="Comma-separated list of file types to match (e.g., 'PDF,PDF_LATEX,TIFF').", +) +@click.option( + "-b", + "--bucket", + default="digitization-dev", + show_default=True, + help="S3 Bucket name.", +) +def file_match(data_source, output_path, file_types, bucket): + """ + Matches Boite Excel records against S3 files and generates JSON payloads. + Generates a success JSON per Boite and a unified mismatch log. + """ + provider = S3Provider(bucket=bucket) + + parsed_file_types = [t.strip() for t in file_types.split(",")] + + click.echo("Starting match process...") + click.echo(f"Source: {data_source}") + click.echo(f"File types: {', '.join(parsed_file_types)}") + + try: + matcher = BoiteS3Matcher( + provider=provider, + data_source=data_source, + output_path=output_path, + file_types=parsed_file_types, + ) + + matcher.execute() + + click.secho( + f"Match completed successfully. Output saved to: {output_path}", fg="green" + ) + except Exception as e: + click.secho(f"Error during matching: {e}", fg="red", err=True) + + if __name__ == "__main__": digitization_v2() diff --git a/refactory/file_import/__init__.py b/refactory/file_import/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/refactory/file_import/boite_matcher.py b/refactory/file_import/boite_matcher.py new file mode 100644 index 0000000..aaf254b --- /dev/null +++ b/refactory/file_import/boite_matcher.py @@ -0,0 +1,206 @@ +import json +import os +import re +import pandas as pd +from pathlib import Path +from urllib.parse import urlparse + +from .utils import fetch_boite_files, transform_box_file_name +from ..storage_connection import StorageProvider + + +class BoiteS3Matcher: + """Matches Boite Excel records with S3 files and logs discrepancies.""" + + def __init__( + self, + provider: StorageProvider, + data_source: str, + output_path: str, + file_types: list[str] | None = None, + ): + """Initializes the matcher with storage, data output, data path, and target file types.""" + self.provider = provider + self.output_path = Path(output_path) + self.output_path.mkdir(parents=True, exist_ok=True) + self.file_types = file_types or ["PDF", "PDF_LATEX"] + self.data_path = self._prepare_data_path(data_source) + + def _is_url(self, value: str) -> bool: + return urlparse(value).scheme in {"http", "https"} + + def _prepare_data_path(self, data_source: str) -> Path: + """Returns the local path or delegates the download if a URL is provided.""" + if self._is_url(data_source): + return Path(fetch_boite_files(data_source)) + return Path(data_source) + + def _get_base_filename(self, filename: str) -> str: + """Strips file extensions and returns a clean, lowercase base name for exact matching.""" + lower_name = filename.lower() + if lower_name.endswith("_latex.pdf"): + return lower_name[:-10] + if lower_name.endswith(".pdf"): + return lower_name[:-4] + if lower_name.endswith((".tiff", ".tif")): + return lower_name.rsplit(".", 1)[0] + return lower_name + + def _normalize_for_comparison(self, name: str) -> str: + """Removes all non-alphanumeric characters for fuzzy matching and review suggestions.""" + return re.sub(r"[^a-z0-9]", "", name.lower()) + + def _load_s3_cache_for_boite( + self, box_file: str + ) -> tuple[dict[str, dict[str, str]], dict[str, set[str]]]: + """Pre-loads and filters S3 keys for match""" + cache: dict[str, dict[str, str]] = {} + available_keys: dict[str, set[str]] = {} + + folder_pattern = re.compile(r"(?i:BOITE)[\-_]O0(\d+)(?:[\-_]\w+)?") + match = folder_pattern.search(box_file) + + if not match: + print('No Boile file found.') + return {ft: {} for ft in self.file_types}, { + ft: set() for ft in self.file_types + } + + target_number = match.group(1) + + for filetype in self.file_types: + prefix = f"cern-archives/raw/{filetype}/BOITE_O0{target_number}" + all_raw_keys = self.provider.list_files(prefix) + + valid_keys: list[str] = [] + for key in all_raw_keys: + folder_name = key.split("/")[-2] + s3_match = folder_pattern.fullmatch(folder_name) + + if s3_match and s3_match.group(1) == target_number: + valid_keys.append(key) + + cache[filetype] = { + self._get_base_filename(k.split("/")[-1]): k for k in valid_keys + } + available_keys[filetype] = set(valid_keys) + + return cache, available_keys + + def process_boite( + self, box_file: str + ) -> tuple[list[dict], dict]: + """Processes a single Boite file in-memory and returns the mapped records alongside mismatch data.""" + print(f"📦 Processing {box_file}...") + df = pd.read_excel(self.data_path / box_file, header=None) + boite_name_s3 = transform_box_file_name(box_file) + + s3_cache, s3_available_keys = self._load_s3_cache_for_boite(box_file) + + records_data: list[dict] = [] + missing_in_s3: list[dict] = [] + used_s3_keys: dict[str, set[str]] = {ftype: set() for ftype in self.file_types} + + for _, row in df.iterrows(): + record_id, record_name = str(row[0]).strip(), str(row[1]).strip() + search_name = self._get_base_filename(record_name) + + record_data: dict = {"record_id": record_id} + missing_types: list[str] = [] + + for ftype in self.file_types: + url_key = f"{ftype.lower()}_url" + matched_key = s3_cache[ftype].get(search_name) + + if matched_key: + content_type = ( + "application/pdf" if ftype in ["PDF", "PDF_LATEX"] else None + ) + record_data[url_key] = self.provider.generate_presigned_url( + matched_key, content_type + ) + used_s3_keys[ftype].add(matched_key) + else: + record_data[url_key] = None + missing_types.append(ftype) + + if missing_types: + missing_in_s3.append( + { + "record_id": record_id, + "record_name": record_name, + "missing_types": missing_types, + } + ) + records_data.append(record_data) + + missing_in_boite = [ + {"s3_key": key, "filetype": ftype} + for ftype in self.file_types + for key in (s3_available_keys[ftype] - used_s3_keys[ftype]) + ] + + near_matches = [] + for missing_rec in missing_in_s3: + boite_norm = self._normalize_for_comparison(missing_rec["record_name"]) + + for ftype in missing_rec["missing_types"]: + unused_s3 = s3_available_keys[ftype] - used_s3_keys[ftype] + + for s3_key in unused_s3: + s3_base = self._get_base_filename(s3_key.split("/")[-1]) + s3_norm = self._normalize_for_comparison(s3_base) + + if boite_norm == s3_norm: + near_matches.append( + { + "boite_record": missing_rec["record_name"], + "suggested_s3_key": s3_key, + "filetype": ftype, + } + ) + + mismatch_data = { + "boite_file": box_file, + "s3_folder_name": boite_name_s3, + "mismatches": { + "in_boite_missing_in_s3": missing_in_s3, + "in_s3_missing_in_boite": missing_in_boite, + "potential_matches": near_matches, + }, + } + + return records_data, mismatch_data + + def _export_records(self, box_file: str, records: list) -> None: + """Saves Boite records to JSON.""" + base_name = box_file.rsplit(".", 1)[0] + with open( + self.output_path / f"{base_name}_records.json", "w", encoding="utf-8" + ) as f: + json.dump(records, f, indent=4, ensure_ascii=False) + + def _export_unified_log(self, all_mismatches: list) -> None: + """Saves consolidated mismatch log.""" + with open( + self.output_path / "all_boites_mismatches.json", "w", encoding="utf-8" + ) as f: + json.dump( + {"total": len(all_mismatches), "details": all_mismatches}, + f, + indent=4, + ensure_ascii=False, + ) + + def execute(self) -> dict[str, list[dict]]: + """Export logs in Json and return records data in memory""" + results_map, all_mismatches = {}, [] + for box_file in os.listdir(self.data_path): + if box_file.lower().endswith(".xlsx") and not box_file.startswith("~"): + records, mismatches = self.process_boite(box_file) + results_map[box_file] = records + all_mismatches.append(mismatches) + self._export_records(box_file, records) + + self._export_unified_log(all_mismatches) + return results_map diff --git a/refactory/utils.py b/refactory/file_import/utils.py similarity index 93% rename from refactory/utils.py rename to refactory/file_import/utils.py index 9bf46f6..cad996e 100644 --- a/refactory/utils.py +++ b/refactory/file_import/utils.py @@ -1,7 +1,7 @@ import os import tempfile from urllib.parse import urlparse -from storage_connection import CernboxProvider +from .storage_connection import CernboxProvider def parse_cernbox_url(url: str) -> dict: @@ -17,7 +17,6 @@ def parse_cernbox_url(url: str) -> dict: return {"public_link_hash": hash_code, "eos_path": None} elif "eos" in path: - eos_path = "eos" + path.split("eos")[-1] return {"public_link_hash": None, "eos_path": eos_path} @@ -67,6 +66,8 @@ def fetch_boite_files(url: str, output_dir: str = None) -> str: provider.download_to_temp(remote_file_path, local_path) except Exception as e: print(f"Failed to download {filename}: {e}") - - print(f"Download complete. Files are located in: {output_dir}") return output_dir + + +def transform_box_file_name(box_file): + return box_file.split(".")[0].upper().replace("-", "_") From 8bd86023b6d6374e56ad49d917b1e556d55450d7 Mon Sep 17 00:00:00 2001 From: Gabriela de Santana Carrara Date: Wed, 15 Apr 2026 09:47:30 +0200 Subject: [PATCH 3/7] update readme --- refactory/README.md | 130 +++++++++++++++++++++++++++----------------- 1 file changed, 81 insertions(+), 49 deletions(-) diff --git a/refactory/README.md b/refactory/README.md index 43aae66..9aab013 100644 --- a/refactory/README.md +++ b/refactory/README.md @@ -1,15 +1,82 @@ # refactory -This directory contains scripts and helpers for validating PDF files in an S3 bucket using an inventory of Excel files hosted on CERNBox. +This directory contains tools for validating PDF files and matching Boite Excel inventory records against S3 files. ## Structure -- `main.py` - main script that validates PDFs using the CERNBox inventory. +- `cli.py` - click CLI exposing the main workflows: + - `validate-files-integrity` + - `file-match` - `storage_connection.py` - storage provider abstraction: - `S3Provider` for S3. - `CernboxProvider` for public CERNBox access. -- `validate_pdf.py` - validates PDFs locally with `is_pdf_valid(file_path)`. -- `test_connections.py` - testing/connection experiment script. +- `check_files/main.py` - validation pipeline used by `validate-files-integrity`. +- `file_import/refactory_matcher.py` - Boite-to-S3 matcher implementation used by `file-match`. +- `file_import/boite_matcher.py` - additional matcher implementation and helpers. + +## CLI usage + +Run the refactory CLI from the repository root: + +```bash +poetry run digitization_v2 --help +``` + +The available commands are: + +- `validate-files-integrity` — validate PDF integrity and inventory alignment. +- `file-match` — match Boite Excel records against S3 files and generate JSON outputs. + +## 1. Validate files integrity + +Use this command to check the Boite inventory against the PDF validation pipeline. + +```bash +poetry run digitization_v2 validate-files-integrity \ + -d "[122,123]" \ + -u \ + -b digitization-dev +``` + +Options: + +- `-d, --data-source` — Boite inventory source. Supports a CERNBox hash, range (`1..10`), or list (`[1,2]`). +- `-u, --upload-reports` — upload validation reports back to storage. +- `-b, --bucket` — S3 bucket name (default: `digitization-dev`). + +This command runs the validation pipeline and generates logs such as `s3_pdf_issues.log`. + +## 2. Boite-to-S3 file matching + +Use this command to match Boite Excel filenames with S3 objects and write structured JSON output. + +```bash +poetry run digitization_v2 file-match \ + -d "https://cernbox.cern.ch/s/{hash}" \ + -o ./match_results \ + -f PDF,PDF_LATEX \ + -b digitization-dev +``` + +Options: + +- `-d, --data-source` — local directory or CERNBox URL containing `.xlsx` Boite files. +- `-o, --output-path` — output directory for JSON results (default: `./match_results`). +- `-f, --file-types` — comma-separated list of file types to match (default: `PDF,PDF_LATEX`). +- `-b, --bucket` — S3 bucket name (default: `digitization-dev`). + +### Matcher behavior + +The `file-match` flow: + +- downloads `.xlsx` Boite files from CERNBox if a URL is provided. +- reads each Boite file and extracts the record ID and filename columns. +- searches S3 under `raw///`. +- matches filenames case-insensitively. +- supports both flat and subfolder layouts: + - flat: `raw/PDF_LATEX/BOITE_O0125/ISR-LEP-RF-GG-ps.pdf` + - nested: `raw/PDF/BOITE_O0125/LEP-RF-SH-ps/LEP-RF-SH-ps.pdf` +- writes unified mismatch logs in JSON format for missing Boite rows and extra S3 files. ## Dependencies @@ -27,8 +94,6 @@ poetry install - `requests` - `pypdf` -> If the project is managed with Poetry, `requirements.txt` is not required. - ## AWS Authentication `S3Provider` uses `boto3`. Configure credentials using environment variables or the default AWS config files: @@ -51,55 +116,22 @@ export SECRET_KEY="YOUR_SECRET_KEY" > `S3Provider` also supports the default endpoint `https://s3.cern.ch`, configured in `storage_connection.py`. -## Usage with Poetry - -Run the refactored CLI via Poetry: - -```bash -poetry run digitization_v2 --help -``` - -The current command for PDF validation is `validade-files-integrity`. - -### Example - -```bash -poetry run digitization_v2 check-integrity -s "[122,123]" -u -``` - -Parameters: - -- `-i, --inventory-source`: Inventory source. Supports CERNBOX Hash, range (`1..10`), or list (`[1,2]`). -- `-u, --upload-reports`: Flag to upload validation reports back to the storage provider. -- `-b, --bucket`: S3 bucket name (default: `digitization-dev`). - -### Example without upload - -```bash -poetry run digitization_v2 check-integrity -s "[122,123]" -``` - -## Expected output - -The CLI generates the same validation reports as the core pipeline: +## CERNBox Authentication -- a text log file such as `s3_pdf_issues.log` -- a structured JSON report with valid, corrupted, and missing file details +`CernboxProvider` reads optional credentials from environment variables: -If `-u` is provided, the reports will be uploaded back to the configured storage provider. +- `CERNBOX_USER` +- `CERNBOX_PASSWORD` -## Additional notes - -- `CernboxProvider` reads optional credentials from environment variables: - - `CERNBOX_USER` - - `CERNBOX_PASSWORD` - -### Example environment variables for Cernbox +### Example environment variables for CERNBox ```bash export CERNBOX_USER="your_username" export CERNBOX_PASSWORD="your_password" ``` -- You may still pass `account` and `password` directly to `CernboxProvider` if preferred. -- Use `test_connections.py` to verify connections before running the main pipeline. +## Notes + +- `file_import/refactory_matcher.py` is the primary matcher used by `file-match`. +- `test_connections.py` can be used to verify storage connectivity before running either workflow. +- Use `poetry run digitization_v2 --help` to verify command names and options at runtime. From c8a788cc9739709fdd59f2b7cfec4c70147a3122 Mon Sep 17 00:00:00 2001 From: Gabriela de Santana Carrara Date: Thu, 16 Apr 2026 15:01:53 +0200 Subject: [PATCH 4/7] digitization(file-import):Update URL generator and address PR comments * Add support for optional expiration parameters in S3 URLs * Refactor code based on GitHub review feedback * ref cern-sis/digitization#24, cern-sis/digitization#15, cern-sis/digitization#25 --- refactory/check_files/main.py | 6 +- refactory/cli.py | 35 +++++++++--- refactory/file_import/boite_matcher.py | 30 ++++++---- refactory/file_import/utils.py | 2 +- refactory/storage_connection.py | 76 +++++++++++++++++++------- refactory/test_connections.py | 2 +- 6 files changed, 108 insertions(+), 43 deletions(-) diff --git a/refactory/check_files/main.py b/refactory/check_files/main.py index 0e67155..29fb331 100644 --- a/refactory/check_files/main.py +++ b/refactory/check_files/main.py @@ -4,7 +4,7 @@ import sys import json from typing import Union -from storage_connection import StorageProvider, S3Provider, CernboxProvider +from refactory.storage_connection import StorageProvider, S3Provider, CernboxProvider from .utils import validate_pdf @@ -19,8 +19,8 @@ def run_validation_pipeline( """Navigates directories, validates files, and logs files status.""" target_box_numbers = set() if isinstance(data_source, str): - inventory_provider = CernboxProvider(data_source) - excel_files = inventory_provider.list_files("", '.xlsx') + data_source_provider = CernboxProvider(data_source) + excel_files = data_source_provider.list_files("", '.xlsx') for file_path in excel_files: filename = file_path.split(".")[0] diff --git a/refactory/cli.py b/refactory/cli.py index 17b47cd..35b7a91 100644 --- a/refactory/cli.py +++ b/refactory/cli.py @@ -1,9 +1,8 @@ import click import ast from .check_files.main import run_validation_pipeline -from storage_connection import S3Provider +from refactory.storage_connection import S3Provider -# Import the matcher we built earlier (adjust the path if your folder structure is different) from .file_import.boite_matcher import BoiteS3Matcher @@ -54,7 +53,14 @@ def digitization_v2(): show_default=True, help="S3 Bucket name.", ) -def validate_files_integrity(data_source, upload_reports, bucket): +@click.option( + "-p", + "--base-path", + default="cern-archives/raw/PDF/", + show_default=True, + help="Base S3 path to validate.", +) +def validate_files_integrity(data_source, base_path, bucket, upload_reports): """ Validates files integrity and inventory alignment. This command checks for corrupted files and missing boxes. @@ -66,7 +72,7 @@ def validate_files_integrity(data_source, upload_reports, bucket): try: run_validation_pipeline( provider=provider, - base_path="cern-archives/raw/PDF/", + base_path=base_path, log_file="s3_pdf_issues.log", data_source=inventory_input, upload_reports=upload_reports, @@ -76,7 +82,6 @@ def validate_files_integrity(data_source, upload_reports, bucket): click.secho(f"Error: {e}", fg="red", err=True) - @digitization_v2.command("file-match") @click.option( "-d", @@ -84,6 +89,13 @@ def validate_files_integrity(data_source, upload_reports, bucket): required=True, help="Target data source. Supports a local directory path or a CERNBOX URL.", ) +@click.option( + "-p", + "--base-path", + default="cern-archives/raw/", + show_default=True, + help="Base S3 path to validate.", +) @click.option( "-o", "--output-path", @@ -105,12 +117,20 @@ def validate_files_integrity(data_source, upload_reports, bucket): show_default=True, help="S3 Bucket name.", ) -def file_match(data_source, output_path, file_types, bucket): + +def file_match(data_source, base_path, output_path, file_types, bucket): """ Matches Boite Excel records against S3 files and generates JSON payloads. Generates a success JSON per Boite and a unified mismatch log. """ - provider = S3Provider(bucket=bucket) + + CUSTOM_EXPIRATION = { + # Example: uncomment the line below to test it + # "PDF": 10, + "PDF_LATEX": 45 + } + + provider = S3Provider(bucket=bucket, custom_expiration=CUSTOM_EXPIRATION) parsed_file_types = [t.strip() for t in file_types.split(",")] @@ -121,6 +141,7 @@ def file_match(data_source, output_path, file_types, bucket): try: matcher = BoiteS3Matcher( provider=provider, + base_path=base_path, data_source=data_source, output_path=output_path, file_types=parsed_file_types, diff --git a/refactory/file_import/boite_matcher.py b/refactory/file_import/boite_matcher.py index aaf254b..da824db 100644 --- a/refactory/file_import/boite_matcher.py +++ b/refactory/file_import/boite_matcher.py @@ -15,12 +15,14 @@ class BoiteS3Matcher: def __init__( self, provider: StorageProvider, + base_path: str, data_source: str, output_path: str, file_types: list[str] | None = None, ): """Initializes the matcher with storage, data output, data path, and target file types.""" self.provider = provider + self.base_path = Path(base_path) self.output_path = Path(output_path) self.output_path.mkdir(parents=True, exist_ok=True) self.file_types = file_types or ["PDF", "PDF_LATEX"] @@ -40,9 +42,7 @@ def _get_base_filename(self, filename: str) -> str: lower_name = filename.lower() if lower_name.endswith("_latex.pdf"): return lower_name[:-10] - if lower_name.endswith(".pdf"): - return lower_name[:-4] - if lower_name.endswith((".tiff", ".tif")): + if lower_name.endswith((".pdf",".tiff", ".tif")): return lower_name.rsplit(".", 1)[0] return lower_name @@ -69,13 +69,16 @@ def _load_s3_cache_for_boite( target_number = match.group(1) for filetype in self.file_types: - prefix = f"cern-archives/raw/{filetype}/BOITE_O0{target_number}" + prefix = f"{self.base_path}{filetype}/BOITE_O0{target_number}" all_raw_keys = self.provider.list_files(prefix) valid_keys: list[str] = [] + for key in all_raw_keys: - folder_name = key.split("/")[-2] - s3_match = folder_pattern.fullmatch(folder_name) + if key.endswith("/"): + continue + + s3_match = folder_pattern.search(key) if s3_match and s3_match.group(1) == target_number: valid_keys.append(key) @@ -117,7 +120,7 @@ def process_boite( "application/pdf" if ftype in ["PDF", "PDF_LATEX"] else None ) record_data[url_key] = self.provider.generate_presigned_url( - matched_key, content_type + matched_key, ftype, content_type ) used_s3_keys[ftype].add(matched_key) else: @@ -148,21 +151,28 @@ def process_boite( unused_s3 = s3_available_keys[ftype] - used_s3_keys[ftype] for s3_key in unused_s3: - s3_base = self._get_base_filename(s3_key.split("/")[-1]) + parts = s3_key.split("/") + s3_base = self._get_base_filename(parts[-1]) s3_norm = self._normalize_for_comparison(s3_base) - if boite_norm == s3_norm: + folder_norm = "" + + if ftype == "PDF" and len(parts) > 1: + folder_norm = self._normalize_for_comparison(parts[-2]) + + if boite_norm == s3_norm or (ftype == "PDF" and boite_norm == folder_norm): near_matches.append( { "boite_record": missing_rec["record_name"], "suggested_s3_key": s3_key, - "filetype": ftype, + "filetype": ftype } ) mismatch_data = { "boite_file": box_file, "s3_folder_name": boite_name_s3, + "total_in_boite_missing_in_s3": len(missing_in_boite), "mismatches": { "in_boite_missing_in_s3": missing_in_s3, "in_s3_missing_in_boite": missing_in_boite, diff --git a/refactory/file_import/utils.py b/refactory/file_import/utils.py index cad996e..34fda2e 100644 --- a/refactory/file_import/utils.py +++ b/refactory/file_import/utils.py @@ -1,7 +1,7 @@ import os import tempfile from urllib.parse import urlparse -from .storage_connection import CernboxProvider +from ..storage_connection import CernboxProvider def parse_cernbox_url(url: str) -> dict: diff --git a/refactory/storage_connection.py b/refactory/storage_connection.py index 361a5e1..363305b 100644 --- a/refactory/storage_connection.py +++ b/refactory/storage_connection.py @@ -4,8 +4,10 @@ from abc import ABC, abstractmethod import os + class StorageProvider(ABC): """Base interface for storage providers.""" + @abstractmethod def list_folders(self, base_path: str) -> list[str]: pass @@ -28,31 +30,62 @@ def generate_presigned_url( ) -> str: pass + class S3Provider(StorageProvider): - def __init__(self, bucket: str, endpoint_url: str = 'https://s3.cern.ch'): + def __init__( + self, + bucket: str, + endpoint_url: str = "https://s3.cern.ch", + custom_expiration: dict = None, + ): self.bucket = bucket - self.s3 = boto3.client( - "s3", - # aws_access_key_id=os.environ["ACCESS_KEY"], - # aws_secret_access_key=os.environ["SECRET_KEY"], - endpoint_url=endpoint_url, - ) + # self.s3 = boto3.client( + # "s3", + # aws_access_key_id=os.environ["ACCESS_KEY"], + # aws_secret_access_key=os.environ["SECRET_KEY"], + # endpoint_url=endpoint_url, + # ) + if os.environ["ACCESS_KEY"] and os.environ["SECRET_KEY"]: + print("Logging into s3 using credentials provided in enviroment variables") + self.s3 = boto3.client( + "s3", + aws_access_key_id=os.environ["ACCESS_KEY"], + aws_secret_access_key=os.environ["SECRET_KEY"], + endpoint_url=endpoint_url, + ) + else: + print("Using default s3 login without credentials") + self.s3 = boto3.client( + "s3", + endpoint_url=endpoint_url, + ) + self.expiration_config = { + "PDF": 365, + "PDF_LATEX": 90, + "TIFF": 30, + "DEFAULT": 365, + } + print("custom_expiration", custom_expiration, self.expiration_config) + if custom_expiration: + self.expiration_config.update(custom_expiration) def list_folders(self, base_path: str) -> list[str]: - paginator = self.s3.get_paginator('list_objects_v2') + paginator = self.s3.get_paginator("list_objects_v2") folders = [] - for page in paginator.paginate(Bucket=self.bucket, Prefix=base_path, Delimiter='/'): - for prefix in page.get('CommonPrefixes', []): - folders.append(prefix['Prefix']) + for page in paginator.paginate( + Bucket=self.bucket, Prefix=base_path, Delimiter="/" + ): + for prefix in page.get("CommonPrefixes", []): + folders.append(prefix["Prefix"]) return folders def list_files(self, folder_path: str, extension: str = None) -> list[str]: - paginator = self.s3.get_paginator('list_objects_v2') + paginator = self.s3.get_paginator("list_objects_v2") files = [] for page in paginator.paginate(Bucket=self.bucket, Prefix=folder_path): - for obj in page.get('Contents', []): - key = obj['Key'] - if not key.endswith('/'): + for obj in page.get("Contents", []): + key = obj["Key"] + if not key.endswith("/"): if extension is None or key.lower().endswith(extension.lower()): files.append(key) return files @@ -64,8 +97,12 @@ def upload_file(self, local_file_path: str, remote_file_path: str) -> None: self.s3.upload_file(local_file_path, self.bucket, remote_file_path) def generate_presigned_url( - self, file_key: str, content_type: str = None, expiration: int = 31556952 + self, file_key: str, file_type: str, content_type: str = None ) -> str: + expiration = 86400 * ( + self.expiration_config.get(file_type, self.expiration_config["DEFAULT"]) + ) + params = {"Bucket": self.bucket, "Key": file_key} if content_type: params["ResponseContentType"] = content_type @@ -76,6 +113,7 @@ def generate_presigned_url( ExpiresIn=expiration, ) + class CernboxProvider(StorageProvider): def __init__(self, public_link_hash: str = None): self.account = os.getenv("CERNBOX_ACCOUNT") @@ -85,11 +123,9 @@ def __init__(self, public_link_hash: str = None): self.public_link_hash = public_link_hash if self.is_public: - # Public self.base_url = f"https://cernbox.cern.ch/remote.php/dav/public-files/{public_link_hash}" self.auth = None else: - # Protected if not self.account or not self.password: raise ValueError( "Error: CERN credentials required for protected shares.." @@ -139,9 +175,7 @@ def download_to_temp(self, file_path: str, temp_file_path: str) -> None: def upload_file(self, local_file_path: str, remote_file_path: str) -> None: if self.is_public: - raise NotImplementedError( - "Error: CERN credentials required for updates." - ) + raise NotImplementedError("Error: CERN credentials required for updates.") clean_remote_path = remote_file_path.strip("/") url = f"{self.base_url}/{clean_remote_path}" diff --git a/refactory/test_connections.py b/refactory/test_connections.py index 242f139..65467ce 100644 --- a/refactory/test_connections.py +++ b/refactory/test_connections.py @@ -1,6 +1,6 @@ import tempfile import os -from storage_connection import S3Provider, CernboxProvider +from .storage_connection import S3Provider, CernboxProvider def test_s3(): print("--- Testing AWS S3 connection ---") From 2263617da4b81542a3b189b405eb6a14073f2a0a Mon Sep 17 00:00:00 2001 From: Gabriela de Santana Carrara Date: Thu, 16 Apr 2026 15:08:50 +0200 Subject: [PATCH 5/7] remove typing union --- refactory/check_files/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/refactory/check_files/main.py b/refactory/check_files/main.py index 29fb331..a908b40 100644 --- a/refactory/check_files/main.py +++ b/refactory/check_files/main.py @@ -3,7 +3,6 @@ import os import sys import json -from typing import Union from refactory.storage_connection import StorageProvider, S3Provider, CernboxProvider from .utils import validate_pdf @@ -12,7 +11,7 @@ def run_validation_pipeline( provider: StorageProvider, base_path: str, log_file: str, - data_source: Union[str, list[int]], + data_source: str | list[int], upload_reports: bool = False, ): From 0ec41becb72c8c5f080bce7dc93ba76994d9cd3a Mon Sep 17 00:00:00 2001 From: Gabriela de Santana Carrara Date: Thu, 16 Apr 2026 16:17:48 +0200 Subject: [PATCH 6/7] fix prefix path and remove comments --- refactory/cli.py | 2 +- refactory/file_import/boite_matcher.py | 2 +- refactory/storage_connection.py | 9 ++------- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/refactory/cli.py b/refactory/cli.py index 35b7a91..4f62237 100644 --- a/refactory/cli.py +++ b/refactory/cli.py @@ -127,7 +127,7 @@ def file_match(data_source, base_path, output_path, file_types, bucket): CUSTOM_EXPIRATION = { # Example: uncomment the line below to test it # "PDF": 10, - "PDF_LATEX": 45 + # "PDF_LATEX": 45 } provider = S3Provider(bucket=bucket, custom_expiration=CUSTOM_EXPIRATION) diff --git a/refactory/file_import/boite_matcher.py b/refactory/file_import/boite_matcher.py index da824db..b9954d2 100644 --- a/refactory/file_import/boite_matcher.py +++ b/refactory/file_import/boite_matcher.py @@ -69,7 +69,7 @@ def _load_s3_cache_for_boite( target_number = match.group(1) for filetype in self.file_types: - prefix = f"{self.base_path}{filetype}/BOITE_O0{target_number}" + prefix = f"{self.base_path}/{filetype}/BOITE_O0{target_number}" all_raw_keys = self.provider.list_files(prefix) valid_keys: list[str] = [] diff --git a/refactory/storage_connection.py b/refactory/storage_connection.py index 363305b..ebf1c8c 100644 --- a/refactory/storage_connection.py +++ b/refactory/storage_connection.py @@ -39,12 +39,7 @@ def __init__( custom_expiration: dict = None, ): self.bucket = bucket - # self.s3 = boto3.client( - # "s3", - # aws_access_key_id=os.environ["ACCESS_KEY"], - # aws_secret_access_key=os.environ["SECRET_KEY"], - # endpoint_url=endpoint_url, - # ) + if os.environ["ACCESS_KEY"] and os.environ["SECRET_KEY"]: print("Logging into s3 using credentials provided in enviroment variables") self.s3 = boto3.client( @@ -65,7 +60,7 @@ def __init__( "TIFF": 30, "DEFAULT": 365, } - print("custom_expiration", custom_expiration, self.expiration_config) + if custom_expiration: self.expiration_config.update(custom_expiration) From 4c1a78a9d3363fa95d3b8b4d090fea9f2f473a3e Mon Sep 17 00:00:00 2001 From: Gabriela de Santana Carrara Date: Tue, 21 Apr 2026 16:44:56 +0200 Subject: [PATCH 7/7] digitization(storage_connection): Raise NotImplementedError for list_folders in CernboxProvider --- refactory/storage_connection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/refactory/storage_connection.py b/refactory/storage_connection.py index ebf1c8c..074b6f9 100644 --- a/refactory/storage_connection.py +++ b/refactory/storage_connection.py @@ -151,7 +151,7 @@ def _propfind(self, path: str, depth: str = "1") -> list[str]: return paths def list_folders(self, base_path: str) -> list[str]: - pass + raise NotImplementedError("This method is not available for this storage type.") def list_files(self, folder_path: str, extension: str = None) -> list[str]: all_items = self._propfind(folder_path)