From 242470a99920f190e4b7d6266e0f316882e2a221 Mon Sep 17 00:00:00 2001
From: Gabriela de Santana Carrara <gabriela.de.santana.carrara@cern.ch>
Date: Mon, 13 Apr 2026 11:43:47 +0200
Subject: [PATCH 1/7] digitization(utils): Automate CERNBox file downloads

* Implement fetch_boite_files to replace manual fetching
* ref cern-sis/digitization#15
---
 refactory/storage_connection.py | 126 ++++++++++++++++++++------------
 refactory/utils.py              |  72 ++++++++++++++++++
 2 files changed, 150 insertions(+), 48 deletions(-)
 create mode 100644 refactory/utils.py

diff --git a/refactory/storage_connection.py b/refactory/storage_connection.py
index bdfdad9..361a5e1 100644
--- a/refactory/storage_connection.py
+++ b/refactory/storage_connection.py
@@ -11,7 +11,7 @@ def list_folders(self, base_path: str) -> list[str]:
         pass
 
     @abstractmethod
-    def list_pdfs(self, folder_path: str) -> list[str]:
+    def list_files(self, folder_path: str, extension: str = None) -> list[str]:
         pass
 
     @abstractmethod
@@ -23,7 +23,9 @@ def upload_file(self, local_file_path: str, remote_file_path: str) -> None:
         pass
 
     @abstractmethod
-    def list_excel(self, folder_path: str) -> list[str]:
+    def generate_presigned_url(
+        self, file_key: str, content_type: str = None, expiration: int = 31556952
+    ) -> str:
         pass
 
 class S3Provider(StorageProvider):
@@ -31,8 +33,8 @@ def __init__(self, bucket: str, endpoint_url: str = 'https://s3.cern.ch'):
         self.bucket = bucket
         self.s3 = boto3.client(
             "s3",
-            aws_access_key_id=os.environ["ACCESS_KEY"],
-            aws_secret_access_key=os.environ["SECRET_KEY"],
+            # aws_access_key_id=os.environ["ACCESS_KEY"],
+            # aws_secret_access_key=os.environ["SECRET_KEY"],
             endpoint_url=endpoint_url,
         )
 
@@ -44,12 +46,16 @@ def list_folders(self, base_path: str) -> list[str]:
                 folders.append(prefix['Prefix'])
         return folders
 
-    def list_pdfs(self, folder_path: str) -> list[str]:
-        response = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=folder_path)
-        return [
-            obj['Key'] for obj in response.get('Contents', [])
-            if obj['Key'].lower().endswith('.pdf')
-        ]
+    def list_files(self, folder_path: str, extension: str = None) -> list[str]:
+        paginator = self.s3.get_paginator('list_objects_v2')
+        files = []
+        for page in paginator.paginate(Bucket=self.bucket, Prefix=folder_path):
+            for obj in page.get('Contents', []):
+                key = obj['Key']
+                if not key.endswith('/'):
+                    if extension is None or key.lower().endswith(extension.lower()):
+                        files.append(key)
+        return files
 
     def download_to_temp(self, file_path: str, temp_file_path: str) -> None:
         self.s3.download_file(self.bucket, file_path, temp_file_path)
@@ -57,70 +63,94 @@ def download_to_temp(self, file_path: str, temp_file_path: str) -> None:
     def upload_file(self, local_file_path: str, remote_file_path: str) -> None:
         self.s3.upload_file(local_file_path, self.bucket, remote_file_path)
 
-    def list_excel(self, folder_path: str) -> list[str]:
-            response = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=folder_path)
-            return [
-                obj['Key'] for obj in response.get('Contents', [])
-                if obj['Key'].lower().endswith('.xlsx')
-            ]
+    def generate_presigned_url(
+        self, file_key: str, content_type: str = None, expiration: int = 31556952
+    ) -> str:
+        params = {"Bucket": self.bucket, "Key": file_key}
+        if content_type:
+            params["ResponseContentType"] = content_type
+
+        return self.s3.generate_presigned_url(
+            ClientMethod="get_object",
+            Params=params,
+            ExpiresIn=expiration,
+        )
+
 class CernboxProvider(StorageProvider):
-    def __init__(self, public_link_hash: str):
-        self.public_link_hash = public_link_hash
+    def __init__(self, public_link_hash: str = None):
         self.account = os.getenv("CERNBOX_ACCOUNT")
         self.password = os.getenv("CERNBOX_PASSWORD")
 
-        self.public_base_url = f"https://cernbox.cern.ch/remote.php/dav/public-files/{public_link_hash}"
+        self.is_public = bool(public_link_hash)
+        self.public_link_hash = public_link_hash
 
-        if self.account:
-            self.auth_base_url = f"https://api.cernbox.cern.ch/remote.php/dav/files/{self.account}"
+        if self.is_public:
+            # Public
+            self.base_url = f"https://cernbox.cern.ch/remote.php/dav/public-files/{public_link_hash}"
+            self.auth = None
         else:
-            self.auth_base_url = None
+            # Protected
+            if not self.account or not self.password:
+                raise ValueError(
+                    "Error: CERN credentials required for protected shares.."
+                )
+            self.base_url = (
+                f"https://api.cernbox.cern.ch/remote.php/dav/files/{self.account}"
+            )
+            self.auth = (self.account, self.password)
 
     def _propfind(self, path: str, depth: str = "1") -> list[str]:
-        url = f"{self.public_base_url}/{path}".rstrip('/') + '/'
-        headers = {'Depth': depth}
 
-        response = requests.request('PROPFIND', url, headers=headers)
+        url = f"{self.base_url}/{path}/" if path else f"{self.base_url}/"
+        headers = {"Depth": depth}
+
+        response = requests.request("PROPFIND", url, headers=headers, auth=self.auth)
         response.raise_for_status()
 
         root = ET.fromstring(response.content)
-        namespaces = {'d': 'DAV:'}
+        namespaces = {"d": "DAV:"}
 
         paths = []
-        for response_tag in root.findall('d:response', namespaces)[1:]:
-            href = response_tag.find('d:href', namespaces).text
-            relative_path = href.split(f"/public-files/{self.public_link_hash}/")[-1]
-            paths.append(relative_path)
+        for response_tag in root.findall("d:response", namespaces)[1:]:
+            href = response_tag.find("d:href", namespaces).text
+            filename = href.rstrip("/").split("/")[-1]
+
+            paths.append(filename)
+
         return paths
 
     def list_folders(self, base_path: str) -> list[str]:
-        all_items = self._propfind(base_path)
-        return [p for p in all_items if p.endswith('/') or "BOITE_" in p]
+        pass
 
-    def list_pdfs(self, folder_path: str) -> list[str]:
+    def list_files(self, folder_path: str, extension: str = None) -> list[str]:
         all_items = self._propfind(folder_path)
-        return [p for p in all_items if p.lower().endswith('.pdf')]
+        if extension:
+            return [p for p in all_items if p.lower().endswith(extension.lower())]
+        return all_items
 
     def download_to_temp(self, file_path: str, temp_file_path: str) -> None:
-        url = f"{self.public_base_url}/{file_path}"
-        response = requests.get(url, stream=True)
+        url = f"{self.base_url}/{file_path}"
+        response = requests.get(url, stream=True, auth=self.auth)
         response.raise_for_status()
-        with open(temp_file_path, 'wb') as f:
+
+        with open(temp_file_path, "wb") as f:
             for chunk in response.iter_content(chunk_size=8192):
                 f.write(chunk)
 
-    def list_excel(self, folder_path: str)-> list[str]:
-        all_items = self._propfind(folder_path)
-        return [p for p in all_items if p.lower().endswith('.xlsx')]
-
     def upload_file(self, local_file_path: str, remote_file_path: str) -> None:
-        if not self.account or not self.password:
-            raise ValueError("CERN account and password are required for uploading.")
-
-        clean_remote_path = remote_file_path.lstrip('/')
-        url = f"{self.auth_base_url}/{clean_remote_path}"
+        if self.is_public:
+            raise NotImplementedError(
+                "Error: CERN credentials required for updates."
+            )
 
-        with open(local_file_path, 'rb') as f:
-            response = requests.put(url, data=f, auth=(self.account, self.password))
+        clean_remote_path = remote_file_path.strip("/")
+        url = f"{self.base_url}/{clean_remote_path}"
 
+        with open(local_file_path, "rb") as f:
+            response = requests.put(url, data=f, auth=self.auth)
         response.raise_for_status()
+
+    def generate_presigned_url(
+        self, file_key: str, content_type: str = None, expiration: int = None
+    ) -> str:
+        return f"{self.base_url}/{file_key}"
diff --git a/refactory/utils.py b/refactory/utils.py
new file mode 100644
index 0000000..9bf46f6
--- /dev/null
+++ b/refactory/utils.py
@@ -0,0 +1,72 @@
+import os
+import tempfile
+from urllib.parse import urlparse
+from storage_connection import CernboxProvider
+
+
+def parse_cernbox_url(url: str) -> dict:
+    parsed = urlparse(url)
+    path = parsed.path
+
+    if "/s/" in path:
+        hash_code = path.split("/s/")[-1].split("/")[0]
+        return {"public_link_hash": hash_code, "eos_path": None}
+
+    elif "public-files" in path:
+        hash_code = path.split("public-files/")[-1].split("/")[0]
+        return {"public_link_hash": hash_code, "eos_path": None}
+
+    elif "eos" in path:
+
+        eos_path = "eos" + path.split("eos")[-1]
+        return {"public_link_hash": None, "eos_path": eos_path}
+
+    else:
+        raise ValueError(f"Invalid CERNBox URL format: {url}")
+
+
+def fetch_boite_files(url: str, output_dir: str = None) -> str:
+    """
+    Downloads all .xlsx files from a CERNBox URL.
+    """
+    print(f"Fetch URL: {url}")
+
+    parsed_data = parse_cernbox_url(url)
+
+    provider = CernboxProvider(public_link_hash=parsed_data["public_link_hash"])
+
+    if output_dir is None:
+        output_dir = tempfile.mkdtemp(prefix="boite_data_")
+    elif not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+        print(f"Created directory: {output_dir}")
+
+    target_path = parsed_data["eos_path"] if parsed_data["eos_path"] else ""
+
+    try:
+        xlsx_files = provider.list_files(target_path, extension=".xlsx")
+    except Exception as e:
+        print(f"Failed to access CERNBox. Error: {e}")
+        return output_dir
+
+    if not xlsx_files:
+        print("No .xlsx files found.")
+        return output_dir
+
+    print(f"Found {len(xlsx_files)} files. Starting download...")
+
+    for filename in xlsx_files:
+        local_path = os.path.join(output_dir, filename)
+
+        remote_file_path = (
+            f"{target_path.rstrip('/')}/{filename}" if target_path else filename
+        )
+
+        try:
+            print(f"Downloading: {filename}...")
+            provider.download_to_temp(remote_file_path, local_path)
+        except Exception as e:
+            print(f"Failed to download {filename}: {e}")
+
+    print(f"Download complete. Files are located in: {output_dir}")
+    return output_dir

From 7084edadae134111750ed422c7db941b90279060 Mon Sep 17 00:00:00 2001
From: Gabriela de Santana Carrara <gabriela.de.santana.carrara@cern.ch>
Date: Tue, 14 Apr 2026 18:34:03 +0200
Subject: [PATCH 2/7] digitization(boite_matcher): Implement Boite-to-S3 file
 matcher

* Match Boite .xlsx records to S3 paths case-insensitively
* Support flat and subfolder layouts for PDF and PDF_LATEX
* Log unmatched files and output data for XML generation
* ref cern-sis/digitization#24
---
 refactory/__init__.py                         |   0
 refactory/{ => check_files}/main.py           |  20 +-
 .../{validate_pdf.py => check_files/utils.py} |   8 +-
 refactory/cli.py                              |  77 ++++++-
 refactory/file_import/__init__.py             |   0
 refactory/file_import/boite_matcher.py        | 206 ++++++++++++++++++
 refactory/{ => file_import}/utils.py          |   9 +-
 7 files changed, 295 insertions(+), 25 deletions(-)
 create mode 100644 refactory/__init__.py
 rename refactory/{ => check_files}/main.py (89%)
 rename refactory/{validate_pdf.py => check_files/utils.py} (86%)
 create mode 100644 refactory/file_import/__init__.py
 create mode 100644 refactory/file_import/boite_matcher.py
 rename refactory/{ => file_import}/utils.py (93%)

diff --git a/refactory/__init__.py b/refactory/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/refactory/main.py b/refactory/check_files/main.py
similarity index 89%
rename from refactory/main.py
rename to refactory/check_files/main.py
index 665486d..0e67155 100644
--- a/refactory/main.py
+++ b/refactory/check_files/main.py
@@ -5,22 +5,22 @@
 import json
 from typing import Union
 from storage_connection import StorageProvider, S3Provider, CernboxProvider
-from validate_pdf import is_pdf_valid
+from .utils import validate_pdf
 
 
 def run_validation_pipeline(
     provider: StorageProvider,
     base_path: str,
     log_file: str,
-    inventory_source: Union[str, list[int]],
+    data_source: Union[str, list[int]],
     upload_reports: bool = False,
 
 ):
     """Navigates directories, validates files, and logs files status."""
     target_box_numbers = set()
-    if isinstance(inventory_source, str):
-        inventory_provider = CernboxProvider(inventory_source)
-        excel_files = inventory_provider.list_excel("")
+    if isinstance(data_source, str):
+        inventory_provider = CernboxProvider(data_source)
+        excel_files = inventory_provider.list_files("", '.xlsx')
 
         for file_path in excel_files:
             filename = file_path.split(".")[0]
@@ -29,8 +29,8 @@ def run_validation_pipeline(
 
             if match:
                 target_box_numbers.add(int(match.group(1)))
-    elif isinstance(inventory_source, list):
-        target_box_numbers = set(inventory_source)
+    elif isinstance(data_source, list):
+        target_box_numbers = set(data_source)
 
     print(f"Excel files: {len(target_box_numbers)} boxes to check.")
 
@@ -57,7 +57,7 @@ def run_validation_pipeline(
             continue
         print(f"Processing target Box: {match.group(1) + (match.group(2) or '')}")
 
-        pdf_files = provider.list_pdfs(folder)
+        pdf_files = provider.list_files(folder, 'PDF')
 
         if not pdf_files:
             print(f"⚠️ EMPTY FOLDER: {folder}")
@@ -69,7 +69,7 @@ def run_validation_pipeline(
             with tempfile.NamedTemporaryFile(delete=True) as tmp:
                 provider.download_to_temp(pdf_path, tmp.name)
 
-                if is_pdf_valid(tmp.name):
+                if validate_pdf(tmp.name):
                     valid_files.append(pdf_path)
                     print(f"  ✅ {pdf_path}")
                 else:
@@ -136,6 +136,6 @@ def run_validation_pipeline(
         provider=s3_provider,  # cernbox_provider
         base_path="cern-archives/raw/PDF/",  # "teste/",
         log_file="s3_pdf_issues.log",
-        inventory_source=sys.argv[1],  # public_link_hash
+        data_source=sys.argv[1],  # public_link_hash
         upload_reports=int(sys.argv[2])
     )
diff --git a/refactory/validate_pdf.py b/refactory/check_files/utils.py
similarity index 86%
rename from refactory/validate_pdf.py
rename to refactory/check_files/utils.py
index 0b0ee18..39639cb 100644
--- a/refactory/validate_pdf.py
+++ b/refactory/check_files/utils.py
@@ -2,11 +2,11 @@
 from pypdf import PdfReader
 from pypdf.errors import PdfReadError
 
-def is_pdf_valid(file_path: str) -> bool:
+def validate_pdf(file_path: str) -> bool:
     """Checks if a local PDF is structurally valid and readable."""
     try:
         file_size = os.path.getsize(file_path)
-        if file_size < 100:  
+        if file_size < 100:
             return False
 
         with open(file_path, "rb") as f:
@@ -23,7 +23,7 @@ def is_pdf_valid(file_path: str) -> bool:
         if len(reader.pages) == 0:
             return False
 
-        _ = reader.pages[0] 
+        _ = reader.pages[0]
 
         return True
 
@@ -31,4 +31,4 @@ def is_pdf_valid(file_path: str) -> bool:
         raise RuntimeError(f"System error when accessing file {file_path}: {e}") from e
 
     except (PdfReadError, Exception):
-        return False
\ No newline at end of file
+        return False
diff --git a/refactory/cli.py b/refactory/cli.py
index d5feb4b..17b47cd 100644
--- a/refactory/cli.py
+++ b/refactory/cli.py
@@ -1,8 +1,11 @@
 import click
 import ast
-from .main import run_validation_pipeline
+from .check_files.main import run_validation_pipeline
 from storage_connection import S3Provider
 
+# Import the matcher we built earlier (adjust the path if your folder structure is different)
+from .file_import.boite_matcher import BoiteS3Matcher
+
 
 def parse_inventory(value):
     """
@@ -25,6 +28,7 @@ def parse_inventory(value):
             pass
     return value
 
+
 @click.group()
 def digitization_v2():
     pass
@@ -32,10 +36,10 @@ def digitization_v2():
 
 @digitization_v2.command("validate-files-integrity")
 @click.option(
-    "-s",
-    "--inventory-source",
+    "-d",
+    "--data-source",
     required=True,
-    help="Target inventory. Supports a CERNBOX hash, range 1..10, or list [1,2].",
+    help="Boite Files. Supports a CERNBOX hash, range 1..10, or list [1,2].",
 )
 @click.option(
     "-u",
@@ -50,13 +54,13 @@ def digitization_v2():
     show_default=True,
     help="S3 Bucket name.",
 )
-def validate_files_integrity(inventory_source, upload_reports, bucket):
+def validate_files_integrity(data_source, upload_reports, bucket):
     """
     Validates files integrity and inventory alignment.
     This command checks for corrupted files and missing boxes.
     """
 
-    inventory_input = parse_inventory(inventory_source)
+    inventory_input = parse_inventory(data_source)
     provider = S3Provider(bucket=bucket)
 
     try:
@@ -64,7 +68,7 @@ def validate_files_integrity(inventory_source, upload_reports, bucket):
             provider=provider,
             base_path="cern-archives/raw/PDF/",
             log_file="s3_pdf_issues.log",
-            inventory_source=inventory_input,
+            data_source=inventory_input,
             upload_reports=upload_reports,
         )
         click.echo("Process finished. Check the generated logs for details.")
@@ -72,5 +76,64 @@ def validate_files_integrity(inventory_source, upload_reports, bucket):
         click.secho(f"Error: {e}", fg="red", err=True)
 
 
+
+@digitization_v2.command("file-match")
+@click.option(
+    "-d",
+    "--data-source",
+    required=True,
+    help="Target data source. Supports a local directory path or a CERNBOX URL.",
+)
+@click.option(
+    "-o",
+    "--output-path",
+    default="./match_results",
+    show_default=True,
+    help="Directory to save the generated JSON files (records and mismatches).",
+)
+@click.option(
+    "-f",
+    "--file-types",
+    default="PDF,PDF_LATEX",
+    show_default=True,
+    help="Comma-separated list of file types to match (e.g., 'PDF,PDF_LATEX,TIFF').",
+)
+@click.option(
+    "-b",
+    "--bucket",
+    default="digitization-dev",
+    show_default=True,
+    help="S3 Bucket name.",
+)
+def file_match(data_source, output_path, file_types, bucket):
+    """
+    Matches Boite Excel records against S3 files and generates JSON payloads.
+    Generates a success JSON per Boite and a unified mismatch log.
+    """
+    provider = S3Provider(bucket=bucket)
+
+    parsed_file_types = [t.strip() for t in file_types.split(",")]
+
+    click.echo("Starting match process...")
+    click.echo(f"Source: {data_source}")
+    click.echo(f"File types: {', '.join(parsed_file_types)}")
+
+    try:
+        matcher = BoiteS3Matcher(
+            provider=provider,
+            data_source=data_source,
+            output_path=output_path,
+            file_types=parsed_file_types,
+        )
+
+        matcher.execute()
+
+        click.secho(
+            f"Match completed successfully. Output saved to: {output_path}", fg="green"
+        )
+    except Exception as e:
+        click.secho(f"Error during matching: {e}", fg="red", err=True)
+
+
 if __name__ == "__main__":
     digitization_v2()
diff --git a/refactory/file_import/__init__.py b/refactory/file_import/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/refactory/file_import/boite_matcher.py b/refactory/file_import/boite_matcher.py
new file mode 100644
index 0000000..aaf254b
--- /dev/null
+++ b/refactory/file_import/boite_matcher.py
@@ -0,0 +1,206 @@
+import json
+import os
+import re
+import pandas as pd
+from pathlib import Path
+from urllib.parse import urlparse
+
+from .utils import fetch_boite_files, transform_box_file_name
+from ..storage_connection import StorageProvider
+
+
+class BoiteS3Matcher:
+    """Matches Boite Excel records with S3 files and logs discrepancies."""
+
+    def __init__(
+        self,
+        provider: StorageProvider,
+        data_source: str,
+        output_path: str,
+        file_types: list[str] | None = None,
+    ):
+        """Initializes the matcher with storage, data output, data path, and target file types."""
+        self.provider = provider
+        self.output_path = Path(output_path)
+        self.output_path.mkdir(parents=True, exist_ok=True)
+        self.file_types = file_types or ["PDF", "PDF_LATEX"]
+        self.data_path = self._prepare_data_path(data_source)
+
+    def _is_url(self, value: str) -> bool:
+        return urlparse(value).scheme in {"http", "https"}
+
+    def _prepare_data_path(self, data_source: str) -> Path:
+        """Returns the local path or delegates the download if a URL is provided."""
+        if self._is_url(data_source):
+            return Path(fetch_boite_files(data_source))
+        return Path(data_source)
+
+    def _get_base_filename(self, filename: str) -> str:
+        """Strips file extensions and returns a clean, lowercase base name for exact matching."""
+        lower_name = filename.lower()
+        if lower_name.endswith("_latex.pdf"):
+            return lower_name[:-10]
+        if lower_name.endswith(".pdf"):
+            return lower_name[:-4]
+        if lower_name.endswith((".tiff", ".tif")):
+            return lower_name.rsplit(".", 1)[0]
+        return lower_name
+
+    def _normalize_for_comparison(self, name: str) -> str:
+        """Removes all non-alphanumeric characters for fuzzy matching and review suggestions."""
+        return re.sub(r"[^a-z0-9]", "", name.lower())
+
+    def _load_s3_cache_for_boite(
+        self, box_file: str
+    ) -> tuple[dict[str, dict[str, str]], dict[str, set[str]]]:
+        """Pre-loads and filters S3 keys for match"""
+        cache: dict[str, dict[str, str]] = {}
+        available_keys: dict[str, set[str]] = {}
+
+        folder_pattern = re.compile(r"(?i:BOITE)[\-_]O0(\d+)(?:[\-_]\w+)?")
+        match = folder_pattern.search(box_file)
+
+        if not match:
+            print('No Boile file found.')
+            return {ft: {} for ft in self.file_types}, {
+                ft: set() for ft in self.file_types
+            }
+
+        target_number = match.group(1)
+
+        for filetype in self.file_types:
+            prefix = f"cern-archives/raw/{filetype}/BOITE_O0{target_number}"
+            all_raw_keys = self.provider.list_files(prefix)
+
+            valid_keys: list[str] = []
+            for key in all_raw_keys:
+                folder_name = key.split("/")[-2]
+                s3_match = folder_pattern.fullmatch(folder_name)
+
+                if s3_match and s3_match.group(1) == target_number:
+                    valid_keys.append(key)
+
+            cache[filetype] = {
+                self._get_base_filename(k.split("/")[-1]): k for k in valid_keys
+            }
+            available_keys[filetype] = set(valid_keys)
+
+        return cache, available_keys
+
+    def process_boite(
+        self, box_file: str
+    ) -> tuple[list[dict], dict]:
+        """Processes a single Boite file in-memory and returns the mapped records alongside mismatch data."""
+        print(f"📦 Processing {box_file}...")
+        df = pd.read_excel(self.data_path / box_file, header=None)
+        boite_name_s3 = transform_box_file_name(box_file)
+
+        s3_cache, s3_available_keys = self._load_s3_cache_for_boite(box_file)
+
+        records_data: list[dict] = []
+        missing_in_s3: list[dict] = []
+        used_s3_keys: dict[str, set[str]] = {ftype: set() for ftype in self.file_types}
+
+        for _, row in df.iterrows():
+            record_id, record_name = str(row[0]).strip(), str(row[1]).strip()
+            search_name = self._get_base_filename(record_name)
+
+            record_data: dict = {"record_id": record_id}
+            missing_types: list[str] = []
+
+            for ftype in self.file_types:
+                url_key = f"{ftype.lower()}_url"
+                matched_key = s3_cache[ftype].get(search_name)
+
+                if matched_key:
+                    content_type = (
+                        "application/pdf" if ftype in ["PDF", "PDF_LATEX"] else None
+                    )
+                    record_data[url_key] = self.provider.generate_presigned_url(
+                        matched_key, content_type
+                    )
+                    used_s3_keys[ftype].add(matched_key)
+                else:
+                    record_data[url_key] = None
+                    missing_types.append(ftype)
+
+            if missing_types:
+                missing_in_s3.append(
+                    {
+                        "record_id": record_id,
+                        "record_name": record_name,
+                        "missing_types": missing_types,
+                    }
+                )
+            records_data.append(record_data)
+
+        missing_in_boite = [
+            {"s3_key": key, "filetype": ftype}
+            for ftype in self.file_types
+            for key in (s3_available_keys[ftype] - used_s3_keys[ftype])
+        ]
+
+        near_matches = []
+        for missing_rec in missing_in_s3:
+            boite_norm = self._normalize_for_comparison(missing_rec["record_name"])
+
+            for ftype in missing_rec["missing_types"]:
+                unused_s3 = s3_available_keys[ftype] - used_s3_keys[ftype]
+
+                for s3_key in unused_s3:
+                    s3_base = self._get_base_filename(s3_key.split("/")[-1])
+                    s3_norm = self._normalize_for_comparison(s3_base)
+
+                    if boite_norm == s3_norm:
+                        near_matches.append(
+                            {
+                                "boite_record": missing_rec["record_name"],
+                                "suggested_s3_key": s3_key,
+                                "filetype": ftype,
+                            }
+                        )
+
+        mismatch_data = {
+            "boite_file": box_file,
+            "s3_folder_name": boite_name_s3,
+            "mismatches": {
+                "in_boite_missing_in_s3": missing_in_s3,
+                "in_s3_missing_in_boite": missing_in_boite,
+                "potential_matches": near_matches,
+            },
+        }
+
+        return records_data, mismatch_data
+
+    def _export_records(self, box_file: str, records: list) -> None:
+        """Saves Boite records to JSON."""
+        base_name = box_file.rsplit(".", 1)[0]
+        with open(
+            self.output_path / f"{base_name}_records.json", "w", encoding="utf-8"
+        ) as f:
+            json.dump(records, f, indent=4, ensure_ascii=False)
+
+    def _export_unified_log(self, all_mismatches: list) -> None:
+        """Saves consolidated mismatch log."""
+        with open(
+            self.output_path / "all_boites_mismatches.json", "w", encoding="utf-8"
+        ) as f:
+            json.dump(
+                {"total": len(all_mismatches), "details": all_mismatches},
+                f,
+                indent=4,
+                ensure_ascii=False,
+            )
+
+    def execute(self) -> dict[str, list[dict]]:
+        """Export logs in Json and return records data in memory"""
+        results_map, all_mismatches = {}, []
+        for box_file in os.listdir(self.data_path):
+            if box_file.lower().endswith(".xlsx") and not box_file.startswith("~"):
+                records, mismatches = self.process_boite(box_file)
+                results_map[box_file] = records
+                all_mismatches.append(mismatches)
+                self._export_records(box_file, records)
+
+        self._export_unified_log(all_mismatches)
+        return results_map
diff --git a/refactory/utils.py b/refactory/file_import/utils.py
similarity index 93%
rename from refactory/utils.py
rename to refactory/file_import/utils.py
index 9bf46f6..cad996e 100644
--- a/refactory/utils.py
+++ b/refactory/file_import/utils.py
@@ -1,7 +1,7 @@
 import os
 import tempfile
 from urllib.parse import urlparse
-from storage_connection import CernboxProvider
+from .storage_connection import CernboxProvider
 
 
 def parse_cernbox_url(url: str) -> dict:
@@ -17,7 +17,6 @@ def parse_cernbox_url(url: str) -> dict:
         return {"public_link_hash": hash_code, "eos_path": None}
 
     elif "eos" in path:
-
         eos_path = "eos" + path.split("eos")[-1]
         return {"public_link_hash": None, "eos_path": eos_path}
 
@@ -67,6 +66,8 @@ def fetch_boite_files(url: str, output_dir: str = None) -> str:
             provider.download_to_temp(remote_file_path, local_path)
         except Exception as e:
             print(f"Failed to download {filename}: {e}")
-
-    print(f"Download complete. Files are located in: {output_dir}")
     return output_dir
+
+
+def transform_box_file_name(box_file):
+    return box_file.split(".")[0].upper().replace("-", "_")

From 8bd86023b6d6374e56ad49d917b1e556d55450d7 Mon Sep 17 00:00:00 2001
From: Gabriela de Santana Carrara <gabriela.de.santana.carrara@cern.ch>
Date: Wed, 15 Apr 2026 09:47:30 +0200
Subject: [PATCH 3/7] update readme

---
 refactory/README.md | 130 +++++++++++++++++++++++++++-----------------
 1 file changed, 81 insertions(+), 49 deletions(-)

diff --git a/refactory/README.md b/refactory/README.md
index 43aae66..9aab013 100644
--- a/refactory/README.md
+++ b/refactory/README.md
@@ -1,15 +1,82 @@
 # refactory
 
-This directory contains scripts and helpers for validating PDF files in an S3 bucket using an inventory of Excel files hosted on CERNBox.
+This directory contains tools for validating PDF files and matching Boite Excel inventory records against S3 files.
 
 ## Structure
 
-- `main.py` - main script that validates PDFs using the CERNBox inventory.
+- `cli.py` - click CLI exposing the main workflows:
+  - `validate-files-integrity`
+  - `file-match`
 - `storage_connection.py` - storage provider abstraction:
   - `S3Provider` for S3.
   - `CernboxProvider` for public CERNBox access.
-- `validate_pdf.py` - validates PDFs locally with `is_pdf_valid(file_path)`.
-- `test_connections.py` - testing/connection experiment script.
+- `check_files/main.py` - validation pipeline used by `validate-files-integrity`.
+- `file_import/refactory_matcher.py` - Boite-to-S3 matcher implementation used by `file-match`.
+- `file_import/boite_matcher.py` - additional matcher implementation and helpers.
+
+## CLI usage
+
+Run the refactory CLI from the repository root:
+
+```bash
+poetry run digitization_v2 --help
+```
+
+The available commands are:
+
+- `validate-files-integrity` — validate PDF integrity and inventory alignment.
+- `file-match` — match Boite Excel records against S3 files and generate JSON outputs.
+
+## 1. Validate files integrity
+
+Use this command to check the Boite inventory against the PDF validation pipeline.
+
+```bash
+poetry run digitization_v2 validate-files-integrity \
+  -d "[122,123]" \
+  -u \
+  -b digitization-dev
+```
+
+Options:
+
+- `-d, --data-source` — Boite inventory source. Supports a CERNBox hash, range (`1..10`), or list (`[1,2]`).
+- `-u, --upload-reports` — upload validation reports back to storage.
+- `-b, --bucket` — S3 bucket name (default: `digitization-dev`).
+
+This command runs the validation pipeline and generates logs such as `s3_pdf_issues.log`.
+
+## 2. Boite-to-S3 file matching
+
+Use this command to match Boite Excel filenames with S3 objects and write structured JSON output.
+
+```bash
+poetry run digitization_v2 file-match \
+  -d "https://cernbox.cern.ch/s/{hash}" \
+  -o ./match_results \
+  -f PDF,PDF_LATEX \
+  -b digitization-dev
+```
+
+Options:
+
+- `-d, --data-source` — local directory or CERNBox URL containing `.xlsx` Boite files.
+- `-o, --output-path` — output directory for JSON results (default: `./match_results`).
+- `-f, --file-types` — comma-separated list of file types to match (default: `PDF,PDF_LATEX`).
+- `-b, --bucket` — S3 bucket name (default: `digitization-dev`).
+
+### Matcher behavior
+
+The `file-match` flow:
+
+- downloads `.xlsx` Boite files from CERNBox if a URL is provided.
+- reads each Boite file and extracts the record ID and filename columns.
+- searches S3 under `raw/<TYPE>/<BOITE>/`.
+- matches filenames case-insensitively.
+- supports both flat and subfolder layouts:
+  - flat: `raw/PDF_LATEX/BOITE_O0125/ISR-LEP-RF-GG-ps.pdf`
+  - nested: `raw/PDF/BOITE_O0125/LEP-RF-SH-ps/LEP-RF-SH-ps.pdf`
+- writes unified mismatch logs in JSON format for missing Boite rows and extra S3 files.
 
 ## Dependencies
 
@@ -27,8 +94,6 @@ poetry install
 - `requests`
 - `pypdf`
 
-> If the project is managed with Poetry, `requirements.txt` is not required.
-
 ## AWS Authentication
 
 `S3Provider` uses `boto3`. Configure credentials using environment variables or the default AWS config files:
@@ -51,55 +116,22 @@ export SECRET_KEY="YOUR_SECRET_KEY"
 
 > `S3Provider` also supports the default endpoint `https://s3.cern.ch`, configured in `storage_connection.py`.
 
-## Usage with Poetry
-
-Run the refactored CLI via Poetry:
-
-```bash
-poetry run digitization_v2 --help
-```
-
-The current command for PDF validation is `validade-files-integrity`.
-
-### Example
-
-```bash
-poetry run digitization_v2 check-integrity -s "[122,123]" -u
-```
-
-Parameters:
-
-- `-i, --inventory-source`: Inventory source. Supports CERNBOX Hash, range (`1..10`), or list (`[1,2]`).
-- `-u, --upload-reports`: Flag to upload validation reports back to the storage provider.
-- `-b, --bucket`: S3 bucket name (default: `digitization-dev`).
-
-### Example without upload
-
-```bash
-poetry run digitization_v2 check-integrity -s "[122,123]"
-```
-
-## Expected output
-
-The CLI generates the same validation reports as the core pipeline:
+## CERNBox Authentication
 
-- a text log file such as `s3_pdf_issues.log`
-- a structured JSON report with valid, corrupted, and missing file details
+`CernboxProvider` reads optional credentials from environment variables:
 
-If `-u` is provided, the reports will be uploaded back to the configured storage provider.
+- `CERNBOX_USER`
+- `CERNBOX_PASSWORD`
 
-## Additional notes
-
-- `CernboxProvider` reads optional credentials from environment variables:
-  - `CERNBOX_USER`
-  - `CERNBOX_PASSWORD`
-
-### Example environment variables for Cernbox
+### Example environment variables for CERNBox
 
 ```bash
 export CERNBOX_USER="your_username"
 export CERNBOX_PASSWORD="your_password"
 ```
 
-- You may still pass `account` and `password` directly to `CernboxProvider` if preferred.
-- Use `test_connections.py` to verify connections before running the main pipeline.
+## Notes
+
+- `file_import/refactory_matcher.py` is the primary matcher used by `file-match`.
+- `test_connections.py` can be used to verify storage connectivity before running either workflow.
+- Use `poetry run digitization_v2 --help` to verify command names and options at runtime.

From c8a788cc9739709fdd59f2b7cfec4c70147a3122 Mon Sep 17 00:00:00 2001
From: Gabriela de Santana Carrara <gabriela.de.santana.carrara@cern.ch>
Date: Thu, 16 Apr 2026 15:01:53 +0200
Subject: [PATCH 4/7] digitization(file-import):Update URL generator and
 address PR comments

* Add support for optional expiration parameters in S3 URLs
* Refactor code based on GitHub review feedback
* ref cern-sis/digitization#24, cern-sis/digitization#15, cern-sis/digitization#25
---
 refactory/check_files/main.py          |  6 +-
 refactory/cli.py                       | 35 +++++++++---
 refactory/file_import/boite_matcher.py | 30 ++++++----
 refactory/file_import/utils.py         |  2 +-
 refactory/storage_connection.py        | 76 +++++++++++++++++++-------
 refactory/test_connections.py          |  2 +-
 6 files changed, 108 insertions(+), 43 deletions(-)

diff --git a/refactory/check_files/main.py b/refactory/check_files/main.py
index 0e67155..29fb331 100644
--- a/refactory/check_files/main.py
+++ b/refactory/check_files/main.py
@@ -4,7 +4,7 @@
 import sys
 import json
 from typing import Union
-from storage_connection import StorageProvider, S3Provider, CernboxProvider
+from refactory.storage_connection import StorageProvider, S3Provider, CernboxProvider
 from .utils import validate_pdf
 
 
@@ -19,8 +19,8 @@ def run_validation_pipeline(
     """Navigates directories, validates files, and logs files status."""
     target_box_numbers = set()
     if isinstance(data_source, str):
-        inventory_provider = CernboxProvider(data_source)
-        excel_files = inventory_provider.list_files("", '.xlsx')
+        data_source_provider = CernboxProvider(data_source)
+        excel_files = data_source_provider.list_files("", '.xlsx')
 
         for file_path in excel_files:
             filename = file_path.split(".")[0]
diff --git a/refactory/cli.py b/refactory/cli.py
index 17b47cd..35b7a91 100644
--- a/refactory/cli.py
+++ b/refactory/cli.py
@@ -1,9 +1,8 @@
 import click
 import ast
 from .check_files.main import run_validation_pipeline
-from storage_connection import S3Provider
+from refactory.storage_connection import S3Provider
 
-# Import the matcher we built earlier (adjust the path if your folder structure is different)
 from .file_import.boite_matcher import BoiteS3Matcher
 
 
@@ -54,7 +53,14 @@ def digitization_v2():
     show_default=True,
     help="S3 Bucket name.",
 )
-def validate_files_integrity(data_source, upload_reports, bucket):
+@click.option(
+    "-p",
+    "--base-path",
+    default="cern-archives/raw/PDF/",
+    show_default=True,
+    help="Base S3 path to validate.",
+)
+def validate_files_integrity(data_source, base_path, bucket, upload_reports):
     """
     Validates files integrity and inventory alignment.
     This command checks for corrupted files and missing boxes.
@@ -66,7 +72,7 @@ def validate_files_integrity(data_source, upload_reports, bucket):
     try:
         run_validation_pipeline(
             provider=provider,
-            base_path="cern-archives/raw/PDF/",
+            base_path=base_path,
             log_file="s3_pdf_issues.log",
             data_source=inventory_input,
             upload_reports=upload_reports,
@@ -76,7 +82,6 @@ def validate_files_integrity(data_source, upload_reports, bucket):
         click.secho(f"Error: {e}", fg="red", err=True)
 
 
-
 @digitization_v2.command("file-match")
 @click.option(
     "-d",
@@ -84,6 +89,13 @@ def validate_files_integrity(data_source, upload_reports, bucket):
     required=True,
     help="Target data source. Supports a local directory path or a CERNBOX URL.",
 )
+@click.option(
+    "-p",
+    "--base-path",
+    default="cern-archives/raw/",
+    show_default=True,
+    help="Base S3 path to validate.",
+)
 @click.option(
     "-o",
     "--output-path",
@@ -105,12 +117,20 @@ def validate_files_integrity(data_source, upload_reports, bucket):
     show_default=True,
     help="S3 Bucket name.",
 )
-def file_match(data_source, output_path, file_types, bucket):
+
+def file_match(data_source, base_path, output_path, file_types, bucket):
     """
     Matches Boite Excel records against S3 files and generates JSON payloads.
     Generates a success JSON per Boite and a unified mismatch log.
     """
-    provider = S3Provider(bucket=bucket)
+
+    CUSTOM_EXPIRATION = {
+        # Example: uncomment the line below to test it
+        # "PDF": 10,
+        "PDF_LATEX": 45
+    }
+
+    provider = S3Provider(bucket=bucket, custom_expiration=CUSTOM_EXPIRATION)
 
     parsed_file_types = [t.strip() for t in file_types.split(",")]
 
@@ -121,6 +141,7 @@ def file_match(data_source, output_path, file_types, bucket):
     try:
         matcher = BoiteS3Matcher(
             provider=provider,
+            base_path=base_path,
             data_source=data_source,
             output_path=output_path,
             file_types=parsed_file_types,
diff --git a/refactory/file_import/boite_matcher.py b/refactory/file_import/boite_matcher.py
index aaf254b..da824db 100644
--- a/refactory/file_import/boite_matcher.py
+++ b/refactory/file_import/boite_matcher.py
@@ -15,12 +15,14 @@ class BoiteS3Matcher:
     def __init__(
         self,
         provider: StorageProvider,
+        base_path: str,
         data_source: str,
         output_path: str,
         file_types: list[str] | None = None,
     ):
         """Initializes the matcher with storage, data output, data path, and target file types."""
         self.provider = provider
+        self.base_path = Path(base_path)
         self.output_path = Path(output_path)
         self.output_path.mkdir(parents=True, exist_ok=True)
         self.file_types = file_types or ["PDF", "PDF_LATEX"]
@@ -40,9 +42,7 @@ def _get_base_filename(self, filename: str) -> str:
         lower_name = filename.lower()
         if lower_name.endswith("_latex.pdf"):
             return lower_name[:-10]
-        if lower_name.endswith(".pdf"):
-            return lower_name[:-4]
-        if lower_name.endswith((".tiff", ".tif")):
+        if lower_name.endswith((".pdf",".tiff", ".tif")):
             return lower_name.rsplit(".", 1)[0]
         return lower_name
 
@@ -69,13 +69,16 @@ def _load_s3_cache_for_boite(
         target_number = match.group(1)
 
         for filetype in self.file_types:
-            prefix = f"cern-archives/raw/{filetype}/BOITE_O0{target_number}"
+            prefix = f"{self.base_path}{filetype}/BOITE_O0{target_number}"
             all_raw_keys = self.provider.list_files(prefix)
 
             valid_keys: list[str] = []
+
             for key in all_raw_keys:
-                folder_name = key.split("/")[-2]
-                s3_match = folder_pattern.fullmatch(folder_name)
+                if key.endswith("/"):
+                    continue
+
+                s3_match = folder_pattern.search(key)
 
                 if s3_match and s3_match.group(1) == target_number:
                     valid_keys.append(key)
@@ -117,7 +120,7 @@ def process_boite(
                         "application/pdf" if ftype in ["PDF", "PDF_LATEX"] else None
                     )
                     record_data[url_key] = self.provider.generate_presigned_url(
-                        matched_key, content_type
+                        matched_key, ftype, content_type
                     )
                     used_s3_keys[ftype].add(matched_key)
                 else:
@@ -148,21 +151,28 @@ def process_boite(
                 unused_s3 = s3_available_keys[ftype] - used_s3_keys[ftype]
 
                 for s3_key in unused_s3:
-                    s3_base = self._get_base_filename(s3_key.split("/")[-1])
+                    parts = s3_key.split("/")
+                    s3_base = self._get_base_filename(parts[-1])
                     s3_norm = self._normalize_for_comparison(s3_base)
 
-                    if boite_norm == s3_norm:
+                    folder_norm = ""
+
+                    if ftype == "PDF" and len(parts) > 1:
+                        folder_norm = self._normalize_for_comparison(parts[-2])
+
+                    if boite_norm == s3_norm or (ftype == "PDF" and boite_norm == folder_norm):
                         near_matches.append(
                             {
                                 "boite_record": missing_rec["record_name"],
                                 "suggested_s3_key": s3_key,
-                                "filetype": ftype,
+                                "filetype": ftype
                             }
                         )
 
         mismatch_data = {
             "boite_file": box_file,
             "s3_folder_name": boite_name_s3,
+            "total_in_boite_missing_in_s3": len(missing_in_boite),
             "mismatches": {
                 "in_boite_missing_in_s3": missing_in_s3,
                 "in_s3_missing_in_boite": missing_in_boite,
diff --git a/refactory/file_import/utils.py b/refactory/file_import/utils.py
index cad996e..34fda2e 100644
--- a/refactory/file_import/utils.py
+++ b/refactory/file_import/utils.py
@@ -1,7 +1,7 @@
 import os
 import tempfile
 from urllib.parse import urlparse
-from .storage_connection import CernboxProvider
+from ..storage_connection import CernboxProvider
 
 
 def parse_cernbox_url(url: str) -> dict:
diff --git a/refactory/storage_connection.py b/refactory/storage_connection.py
index 361a5e1..363305b 100644
--- a/refactory/storage_connection.py
+++ b/refactory/storage_connection.py
@@ -4,8 +4,10 @@
 from abc import ABC, abstractmethod
 import os
 
+
 class StorageProvider(ABC):
     """Base interface for storage providers."""
+
     @abstractmethod
     def list_folders(self, base_path: str) -> list[str]:
         pass
@@ -28,31 +30,62 @@ def generate_presigned_url(
     ) -> str:
         pass
 
+
 class S3Provider(StorageProvider):
-    def __init__(self, bucket: str, endpoint_url: str = 'https://s3.cern.ch'):
+    def __init__(
+        self,
+        bucket: str,
+        endpoint_url: str = "https://s3.cern.ch",
+        custom_expiration: dict = None,
+    ):
         self.bucket = bucket
-        self.s3 = boto3.client(
-            "s3",
-            # aws_access_key_id=os.environ["ACCESS_KEY"],
-            # aws_secret_access_key=os.environ["SECRET_KEY"],
-            endpoint_url=endpoint_url,
-        )
+        # self.s3 = boto3.client(
+        #     "s3",
+        #     aws_access_key_id=os.environ["ACCESS_KEY"],
+        #     aws_secret_access_key=os.environ["SECRET_KEY"],
+        #     endpoint_url=endpoint_url,
+        # )
+        if os.environ["ACCESS_KEY"] and os.environ["SECRET_KEY"]:
+                print("Logging into s3 using credentials provided in enviroment variables")
+                self.s3 = boto3.client(
+                    "s3",
+                    aws_access_key_id=os.environ["ACCESS_KEY"],
+                    aws_secret_access_key=os.environ["SECRET_KEY"],
+                    endpoint_url=endpoint_url,
+                )
+        else:
+                print("Using default s3 login without credentials")
+                self.s3 = boto3.client(
+                    "s3",
+                    endpoint_url=endpoint_url,
+                )
+        self.expiration_config = {
+            "PDF": 365,
+            "PDF_LATEX": 90,
+            "TIFF": 30,
+            "DEFAULT": 365,
+        }
+        print("custom_expiration", custom_expiration, self.expiration_config)
+        if custom_expiration:
+            self.expiration_config.update(custom_expiration)
 
     def list_folders(self, base_path: str) -> list[str]:
-        paginator = self.s3.get_paginator('list_objects_v2')
+        paginator = self.s3.get_paginator("list_objects_v2")
         folders = []
-        for page in paginator.paginate(Bucket=self.bucket, Prefix=base_path, Delimiter='/'):
-            for prefix in page.get('CommonPrefixes', []):
-                folders.append(prefix['Prefix'])
+        for page in paginator.paginate(
+            Bucket=self.bucket, Prefix=base_path, Delimiter="/"
+        ):
+            for prefix in page.get("CommonPrefixes", []):
+                folders.append(prefix["Prefix"])
         return folders
 
     def list_files(self, folder_path: str, extension: str = None) -> list[str]:
-        paginator = self.s3.get_paginator('list_objects_v2')
+        paginator = self.s3.get_paginator("list_objects_v2")
         files = []
         for page in paginator.paginate(Bucket=self.bucket, Prefix=folder_path):
-            for obj in page.get('Contents', []):
-                key = obj['Key']
-                if not key.endswith('/'):
+            for obj in page.get("Contents", []):
+                key = obj["Key"]
+                if not key.endswith("/"):
                     if extension is None or key.lower().endswith(extension.lower()):
                         files.append(key)
         return files
@@ -64,8 +97,12 @@ def upload_file(self, local_file_path: str, remote_file_path: str) -> None:
         self.s3.upload_file(local_file_path, self.bucket, remote_file_path)
 
     def generate_presigned_url(
-        self, file_key: str, content_type: str = None, expiration: int = 31556952
+        self, file_key: str, file_type: str, content_type: str = None
     ) -> str:
+        expiration = 86400 * (
+            self.expiration_config.get(file_type, self.expiration_config["DEFAULT"])
+        )
+
         params = {"Bucket": self.bucket, "Key": file_key}
         if content_type:
             params["ResponseContentType"] = content_type
@@ -76,6 +113,7 @@ def generate_presigned_url(
             ExpiresIn=expiration,
         )
 
+
 class CernboxProvider(StorageProvider):
     def __init__(self, public_link_hash: str = None):
         self.account = os.getenv("CERNBOX_ACCOUNT")
@@ -85,11 +123,9 @@ def __init__(self, public_link_hash: str = None):
         self.public_link_hash = public_link_hash
 
         if self.is_public:
-            # Public
             self.base_url = f"https://cernbox.cern.ch/remote.php/dav/public-files/{public_link_hash}"
             self.auth = None
         else:
-            # Protected
             if not self.account or not self.password:
                 raise ValueError(
                     "Error: CERN credentials required for protected shares.."
@@ -139,9 +175,7 @@ def download_to_temp(self, file_path: str, temp_file_path: str) -> None:
 
     def upload_file(self, local_file_path: str, remote_file_path: str) -> None:
         if self.is_public:
-            raise NotImplementedError(
-                "Error: CERN credentials required for updates."
-            )
+            raise NotImplementedError("Error: CERN credentials required for updates.")
 
         clean_remote_path = remote_file_path.strip("/")
         url = f"{self.base_url}/{clean_remote_path}"
diff --git a/refactory/test_connections.py b/refactory/test_connections.py
index 242f139..65467ce 100644
--- a/refactory/test_connections.py
+++ b/refactory/test_connections.py
@@ -1,6 +1,6 @@
 import tempfile
 import os
-from storage_connection import S3Provider, CernboxProvider
+from .storage_connection import S3Provider, CernboxProvider
 
 def test_s3():
     print("--- Testing AWS S3 connection ---")

From 2263617da4b81542a3b189b405eb6a14073f2a0a Mon Sep 17 00:00:00 2001
From: Gabriela de Santana Carrara <gabriela.de.santana.carrara@cern.ch>
Date: Thu, 16 Apr 2026 15:08:50 +0200
Subject: [PATCH 5/7] remove typing union

---
 refactory/check_files/main.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/refactory/check_files/main.py b/refactory/check_files/main.py
index 29fb331..a908b40 100644
--- a/refactory/check_files/main.py
+++ b/refactory/check_files/main.py
@@ -3,7 +3,6 @@
 import os
 import sys
 import json
-from typing import Union
 from refactory.storage_connection import StorageProvider, S3Provider, CernboxProvider
 from .utils import validate_pdf
 
@@ -12,7 +11,7 @@ def run_validation_pipeline(
     provider: StorageProvider,
     base_path: str,
     log_file: str,
-    data_source: Union[str, list[int]],
+    data_source: str | list[int],
     upload_reports: bool = False,
 
 ):

From 0ec41becb72c8c5f080bce7dc93ba76994d9cd3a Mon Sep 17 00:00:00 2001
From: Gabriela de Santana Carrara <gabriela.de.santana.carrara@cern.ch>
Date: Thu, 16 Apr 2026 16:17:48 +0200
Subject: [PATCH 6/7] fix prefix path and remove comments

---
 refactory/cli.py                       | 2 +-
 refactory/file_import/boite_matcher.py | 2 +-
 refactory/storage_connection.py        | 9 ++-------
 3 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/refactory/cli.py b/refactory/cli.py
index 35b7a91..4f62237 100644
--- a/refactory/cli.py
+++ b/refactory/cli.py
@@ -127,7 +127,7 @@ def file_match(data_source, base_path, output_path, file_types, bucket):
     CUSTOM_EXPIRATION = {
         # Example: uncomment the line below to test it
         # "PDF": 10,
-        "PDF_LATEX": 45
+        # "PDF_LATEX": 45
     }
 
     provider = S3Provider(bucket=bucket, custom_expiration=CUSTOM_EXPIRATION)
diff --git a/refactory/file_import/boite_matcher.py b/refactory/file_import/boite_matcher.py
index da824db..b9954d2 100644
--- a/refactory/file_import/boite_matcher.py
+++ b/refactory/file_import/boite_matcher.py
@@ -69,7 +69,7 @@ def _load_s3_cache_for_boite(
         target_number = match.group(1)
 
         for filetype in self.file_types:
-            prefix = f"{self.base_path}{filetype}/BOITE_O0{target_number}"
+            prefix = f"{self.base_path}/{filetype}/BOITE_O0{target_number}"
             all_raw_keys = self.provider.list_files(prefix)
 
             valid_keys: list[str] = []
diff --git a/refactory/storage_connection.py b/refactory/storage_connection.py
index 363305b..ebf1c8c 100644
--- a/refactory/storage_connection.py
+++ b/refactory/storage_connection.py
@@ -39,12 +39,7 @@ def __init__(
         custom_expiration: dict = None,
     ):
         self.bucket = bucket
-        # self.s3 = boto3.client(
-        #     "s3",
-        #     aws_access_key_id=os.environ["ACCESS_KEY"],
-        #     aws_secret_access_key=os.environ["SECRET_KEY"],
-        #     endpoint_url=endpoint_url,
-        # )
+
         if os.environ["ACCESS_KEY"] and os.environ["SECRET_KEY"]:
                 print("Logging into s3 using credentials provided in enviroment variables")
                 self.s3 = boto3.client(
@@ -65,7 +60,7 @@ def __init__(
             "TIFF": 30,
             "DEFAULT": 365,
         }
-        print("custom_expiration", custom_expiration, self.expiration_config)
+
         if custom_expiration:
             self.expiration_config.update(custom_expiration)
 

From 4c1a78a9d3363fa95d3b8b4d090fea9f2f473a3e Mon Sep 17 00:00:00 2001
From: Gabriela de Santana Carrara <gabriela.de.santana.carrara@cern.ch>
Date: Tue, 21 Apr 2026 16:44:56 +0200
Subject: [PATCH 7/7] digitization(storage_connection): Raise
 NotImplementedError for list_folders in CernboxProvider

---
 refactory/storage_connection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/refactory/storage_connection.py b/refactory/storage_connection.py
index ebf1c8c..074b6f9 100644
--- a/refactory/storage_connection.py
+++ b/refactory/storage_connection.py
@@ -151,7 +151,7 @@ def _propfind(self, path: str, depth: str = "1") -> list[str]:
         return paths
 
     def list_folders(self, base_path: str) -> list[str]:
-        pass
+        raise NotImplementedError("This method is not available for this storage type.")
 
     def list_files(self, folder_path: str, extension: str = None) -> list[str]:
         all_items = self._propfind(folder_path)