cern-sis · PascalEgn · Apr 15, 2026 · Mar 27, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/.gitignore b/.gitignore
@@ -17,4 +17,4 @@ import_xml_files/
 # Virtual environments
 .venv
 
-
+s3_pdf_issues.json
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,15 +9,17 @@ python = "^3.12"
 pysftp = "^0.2.9"
 click = "^8.0.3"
 boto3 = "^1.38.9"
+requests = "^2.31.0"
+python-dotenv = "^1.0.0"
 pandas = "^2.2.3"
 openpyxl = "^3.1.5"
 tqdm = "^4.67.1"
-pypdf2 = "^3.0.1"
-dotenv = "^0.9.9"
+pypdf = "^6.9.2"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
 digitization = "digitization.cli:digitization"
+digitization_v2 = "refactory.cli:digitization_v2"
diff --git a/refactory/README.md b/refactory/README.md
@@ -0,0 +1,105 @@
+# refactory
+
+This directory contains scripts and helpers for validating PDF files in an S3 bucket using an inventory of Excel files hosted on CERNBox.
+
+## Structure
+
+- `main.py` - main script that validates PDFs using the CERNBox inventory.
+- `storage_connection.py` - storage provider abstraction:
+  - `S3Provider` for S3.
+  - `CernboxProvider` for public CERNBox access.
+- `validate_pdf.py` - validates PDFs locally with `is_pdf_valid(file_path)`.
+- `test_connections.py` - testing/connection experiment script.
+
+## Dependencies
+
+This project uses Poetry to manage dependencies. The required libraries are listed in `pyproject.toml`.
+
+### Install dependencies with Poetry
+
+```bash
+poetry install
+```
+
+### Main dependencies
+
+- `boto3`
+- `requests`
+- `pypdf`
+
+> If the project is managed with Poetry, `requirements.txt` is not required.
+
+## AWS Authentication
+
+`S3Provider` uses `boto3`. Configure credentials using environment variables or the default AWS config files:
+
+- `AWS_ACCESS_KEY_ID`
+- `AWS_SECRET_ACCESS_KEY`
+
+### Example environment variables
+
+```bash
+export ACCESS_KEY="YOUR_ACCESS_KEY"
+export SECRET_KEY="YOUR_SECRET_KEY"
+```
+
+### Supported alternatives
+
+- `~/.aws/credentials`
+- `~/.aws/config`
+- IAM role attached to an instance/container
+
+> `S3Provider` also supports the default endpoint `https://s3.cern.ch`, configured in `storage_connection.py`.
+
+## Usage with Poetry
+
+Run the refactored CLI via Poetry:
+
+```bash
+poetry run digitization_v2 --help
+```
+
+The current command for PDF validation is `validate-files-integrity`.
+
+### Example
+
+```bash
+poetry run digitization_v2 validate-files-integrity -s "[122,123]" -u
+```
+
+Parameters:
+
+- `-s, --inventory-source`: Inventory source (required). Supports a CERNBox hash, a range (`1..10`), or a list (`[1,2]`).
+- `-u, --upload-reports`: Flag to upload validation reports back to the storage provider.
+- `-b, --bucket`: S3 bucket name (default: `digitization-dev`).
+
+### Example without upload
+
+```bash
+poetry run digitization_v2 validate-files-integrity -s "[122,123]"
+```
+
+## Expected output
+
+The CLI generates the same validation reports as the core pipeline:
+
+- a text log file such as `s3_pdf_issues.log`
+- a structured JSON report with valid, corrupted, and missing file details
+
+If `-u` is provided, the reports will be uploaded back to the configured storage provider.
+
+## Additional notes
+
+- `CernboxProvider` reads optional credentials from environment variables:
+  - `CERNBOX_USER`
+  - `CERNBOX_PASSWORD`
+
+### Example environment variables for Cernbox
+
+```bash
+export CERNBOX_USER="your_username"
+export CERNBOX_PASSWORD="your_password"
+```
+
+- You may still pass `account` and `password` directly to `CernboxProvider` if preferred.
+- Use `test_connections.py` to verify connections before running the main pipeline.
diff --git a/refactory/cli.py b/refactory/cli.py
@@ -0,0 +1,76 @@
+import click
+import ast
+from .main import run_validation_pipeline
+from storage_connection import S3Provider
+
+
+def parse_inventory(value: str) -> list[int] | str:
+    """Convert the raw ``--inventory-source`` option into pipeline input.
+
+    Accepted forms (surrounding whitespace is ignored):
+
+    * a single box number, e.g. ``"122"`` -> ``[122]``
+    * a list literal of box numbers, e.g. ``"[1, 2, 3]"`` -> ``[1, 2, 3]``
+    * an inclusive range, e.g. ``"1..3"`` -> ``[1, 2, 3]``
+    * anything else is returned unchanged (minus surrounding whitespace) as
+      a string, which the caller treats as a CERNBox hash.
+
+    Raises:
+        click.BadParameter: If a bracketed value is not a valid list literal
+            or contains anything other than integers.
+    """
+    text = value.strip()
+
+    # ``isdecimal`` (unlike ``isdigit``) only accepts characters that
+    # ``int()`` can parse, so input such as a superscript "²" cannot crash.
+    if text.isdecimal():
+        return [int(text)]
+
+    if text.startswith("[") and text.endswith("]"):
+        try:
+            boxes = ast.literal_eval(text)
+        except (ValueError, SyntaxError, TypeError) as exc:
+            raise click.BadParameter(
+                "Invalid list format. Use '[1, 2, 3]'"
+            ) from exc
+        # Box numbers are later compared against ints parsed from folder
+        # names, so reject strings, floats, nested containers and bools
+        # (``bool`` is a subclass of ``int``) up front.
+        if not isinstance(boxes, list) or not all(
+            isinstance(box, int) and not isinstance(box, bool) for box in boxes
+        ):
+            raise click.BadParameter("Invalid list format. Use '[1, 2, 3]'")
+        return boxes
+
+    if ".." in text:
+        try:
+            start, end = map(int, text.split(".."))
+        except ValueError:
+            # Not a numeric range: fall through and treat the value as a hash.
+            pass
+        else:
+            return list(range(start, end + 1))
+
+    return text
+
+# Root click command group for the refactored CLI. Subcommands attach to it
+# through the ``@digitization_v2.command(...)`` decorator, and pyproject.toml
+# exposes it as the ``digitization_v2`` console script.
+@click.group()
+def digitization_v2():
+    # The group has no behaviour of its own; it only dispatches to subcommands.
+    pass
+
+
+@digitization_v2.command("validate-files-integrity")
+@click.option(
+    "-s",
+    "--inventory-source",
+    required=True,
+    help="Target inventory. Supports a CERNBOX hash, range 1..10, or list [1,2].",
+)
+@click.option(
+    "-u",
+    "--upload-reports",
+    is_flag=True,
+    help="Upload validation reports back to the storage provider.",
+)
+@click.option(
+    "-b",
+    "--bucket",
+    default="digitization-dev",
+    show_default=True,
+    help="S3 Bucket name.",
+)
+def validate_files_integrity(inventory_source, upload_reports, bucket):
+    """
+    Validates files integrity and inventory alignment.
+    This command checks for corrupted files and missing boxes.
+    """
+    # NOTE: the docstring above is rendered verbatim by ``--help``.
+
+    # Parsed outside the ``try`` so that a malformed value surfaces as a click
+    # usage error instead of being caught by the generic handler below.
+    inventory_input = parse_inventory(inventory_source)
+
+    try:
+        # Built inside the ``try`` so that provider set-up failures are
+        # reported the same way as pipeline failures.
+        provider = S3Provider(bucket=bucket)
+        run_validation_pipeline(
+            provider=provider,
+            base_path="cern-archives/raw/PDF/",
+            log_file="s3_pdf_issues.log",
+            inventory_source=inventory_input,
+            upload_reports=upload_reports,
+        )
+    except Exception as e:
+        click.secho(f"Error: {e}", fg="red", err=True)
+        # Exit non-zero so shells, CI jobs and schedulers can detect the
+        # failure; previously the command reported the error but exited 0.
+        raise SystemExit(1) from e
+
+    click.echo("Process finished. Check the generated logs for details.")
+
+
+# Entry point when the module itself is executed.
+# NOTE(review): this module uses a relative import (``from .main import ...``),
+# so it presumably has to be started as ``python -m refactory.cli`` rather than
+# as a plain script path -- confirm.
+if __name__ == "__main__":
+    digitization_v2()
diff --git a/refactory/main.py b/refactory/main.py
@@ -0,0 +1,141 @@
+import tempfile
+import re
+import os
+import sys
+import json
+from typing import Union
+from storage_connection import StorageProvider, S3Provider, CernboxProvider
+from validate_pdf import is_pdf_valid
+
+
+def run_validation_pipeline(
+    provider: StorageProvider,
+    base_path: str,
+    log_file: str,
+    inventory_source: Union[str, list[int]],
+    upload_reports: bool = False,
+) -> None:
+    """Validate the PDFs of the targeted boxes and write status reports.
+
+    Folders under ``base_path`` whose name matches a targeted box number have
+    each of their PDFs downloaded to a temporary file and checked with
+    ``is_pdf_valid``. A text log is written to ``log_file`` and a JSON report
+    is written next to it (same name with a ``.json`` extension).
+
+    Args:
+        provider: Storage backend holding the PDFs. It is used to list
+            folders and PDFs, download files and (optionally) upload reports.
+        base_path: Remote path whose folders are scanned; also the remote
+            destination of the uploaded reports.
+        log_file: Local path of the text report.
+        inventory_source: Either a string, which is handed to
+            ``CernboxProvider`` so box numbers can be extracted from the
+            names of the Excel files it lists, or a list of box numbers.
+        upload_reports: When true, both reports are uploaded under
+            ``base_path``. Upload failures are printed, not raised.
+
+    Returns:
+        None. If ``base_path`` contains no folders the function returns
+        early without writing any report.
+    """
+    # Group 1 is the box number, group 2 an optional "-suffix"
+    # (e.g. "BOITE_O0123" or "BOITE_O0123-bis"). Compiled once because it is
+    # applied to every inventory file and every folder.
+    box_pattern = re.compile(r"(?i:BOITE)[\-_]O0(\d+)(-\w+)?")
+
+    target_box_numbers = set()
+    if isinstance(inventory_source, str):
+        inventory_provider = CernboxProvider(inventory_source)
+
+        for file_path in inventory_provider.list_excel(""):
+            # Strip only the extension. Splitting on the first "." would cut
+            # the name short whenever the path contains an earlier dot.
+            stem = os.path.splitext(file_path)[0]
+
+            match = box_pattern.search(stem)
+            if match:
+                target_box_numbers.add(int(match.group(1)))
+    elif isinstance(inventory_source, list):
+        target_box_numbers = set(inventory_source)
+
+    print(f"Excel files: {len(target_box_numbers)} boxes to check.")
+
+    print(f"Folders in: {base_path}")
+    folders = provider.list_folders(base_path)
+
+    if not folders:
+        print("No folders found in this path.")
+        return
+
+    # Boxes for which a matching folder with at least one PDF was found.
+    found_and_valid_boxes = set()
+    corrupted_files = []
+    valid_files = []
+
+    print("Starting validation...")
+
+    for folder in folders:
+        match = box_pattern.search(folder)
+        if not match:
+            continue
+
+        box_num = int(match.group(1))
+        if box_num not in target_box_numbers:
+            continue
+        print(f"Processing target Box: {match.group(1) + (match.group(2) or '')}")
+
+        pdf_files = provider.list_pdfs(folder)
+
+        if not pdf_files:
+            # An empty folder does not count as found, so the box ends up in
+            # ``missing_boxes`` below.
+            print(f"⚠️ EMPTY FOLDER: {folder}")
+            continue
+
+        found_and_valid_boxes.add(box_num)
+
+        for pdf_path in pdf_files:
+            # A fresh temporary file per PDF, removed when the block exits.
+            # NOTE(review): the provider writes to ``tmp.name`` while this
+            # handle is still open, which the tempfile docs say is not
+            # possible on Windows -- confirm the supported platforms.
+            with tempfile.NamedTemporaryFile(delete=True) as tmp:
+                provider.download_to_temp(pdf_path, tmp.name)
+
+                if is_pdf_valid(tmp.name):
+                    valid_files.append(pdf_path)
+                    print(f"  ✅ {pdf_path}")
+                else:
+                    print(f"  ❌ CORRUPTED: {pdf_path}")
+                    corrupted_files.append(pdf_path)
+
+    # Targeted boxes with no matching folder, or whose folder held no PDFs.
+    missing_boxes = target_box_numbers - found_and_valid_boxes
+
+    if missing_boxes:
+        print("\n Empty target boxes:")
+        for box in sorted(missing_boxes):
+            print(
+                f"  -> BOITE_O0{box}"
+            )
+
+    with open(log_file, "w", encoding="utf-8") as log:
+        log.write(
+            f"Validation report for the following boxes {target_box_numbers}\n ✅ Valid Files: {len(valid_files)}\n ❌ Corrupted Files: {len(corrupted_files)}\n"
+        )
+        for vf in valid_files:
+            log.write(f"✅ Valid PDF: {vf}\n")
+        for cf in corrupted_files:
+            log.write(f"❌ Corrupted PDF: {cf}\n")
+
+    # Sets have no defined order; sort them so the report is deterministic.
+    json_report = {
+        "metadata": {
+            "base_path": base_path,
+            "target_boxes": sorted(target_box_numbers),
+        },
+        "statistics": {
+            "valid_files_count": len(valid_files),
+            "corrupted_files_count": len(corrupted_files),
+            "missing_boxes_count": len(missing_boxes),
+        },
+        "output": {
+            "valid_files": valid_files,
+            "missing_boxes": sorted(missing_boxes),
+            "corrupted_files": corrupted_files,
+        },
+    }
+
+    # Swap only the extension. A plain ``str.replace(".log", ".json")`` would
+    # also rewrite ".log" inside directory names, and would leave the path
+    # unchanged (overwriting the text log) when there is no ".log" at all.
+    json_file_path = f"{os.path.splitext(log_file)[0]}.json"
+    if json_file_path == log_file:
+        # ``log_file`` already ends in ".json"; keep the two reports apart.
+        json_file_path = f"{log_file}.json"
+
+    with open(json_file_path, "w", encoding="utf-8") as jf:
+        json.dump(json_report, jf, indent=4)
+
+    print(f"\nDone! The text log of corrupted files was saved to: {log_file}")
+    print(f"The structured JSON data was saved to: {json_file_path}")
+
+    if upload_reports:
+        remote_log_path = f"{base_path.rstrip('/')}/{os.path.basename(log_file)}"
+        remote_json_path = f"{base_path.rstrip('/')}/{os.path.basename(json_file_path)}"
+
+        print(f"Uploading reports back to the cloud ({base_path})...")
+        # Best effort: the local reports already exist, so an upload failure
+        # is reported but does not abort the run.
+        try:
+            provider.upload_file(log_file, remote_log_path)
+            provider.upload_file(json_file_path, remote_json_path)
+            print(
+                f"✅ Upload successful! Files available at: {remote_log_path} and {remote_json_path}"
+            )
+        except Exception as e:
+            print(f"❌ Failed to upload reports: {e}")
+
+
+if __name__ == "__main__":
+    # Usage: python main.py <cernbox_public_link_hash> [upload_reports]
+    # ``upload_reports`` is an optional integer flag; 0 or absent keeps the
+    # reports local.
+    if len(sys.argv) < 2:
+        sys.exit(f"Usage: {sys.argv[0]} <cernbox_public_link_hash> [0|1]")
+
+    s3_provider = S3Provider(bucket="digitization-dev")
+
+    run_validation_pipeline(
+        provider=s3_provider,
+        base_path="cern-archives/raw/PDF/",
+        log_file="s3_pdf_issues.log",
+        inventory_source=sys.argv[1],
+        # Only read the flag when it was given; reading ``sys.argv[2]``
+        # unconditionally raised IndexError when it was omitted.
+        upload_reports=bool(int(sys.argv[2])) if len(sys.argv) > 2 else False,
+    )
Original file line number	Diff line number	Diff line change
Expand Up		@@ -17,4 +17,4 @@ import_xml_files/
		# Virtual environments
		.venv


		s3_pdf_issues.json