Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ import_xml_files/
# Virtual environments
.venv


s3_pdf_issues.json
229 changes: 202 additions & 27 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,17 @@ python = "^3.12"
pysftp = "^0.2.9"
click = "^8.0.3"
boto3 = "^1.38.9"
requests = "^2.31.0"
python-dotenv = "^1.0.0"
pandas = "^2.2.3"
openpyxl = "^3.1.5"
tqdm = "^4.67.1"
pypdf2 = "^3.0.1"
dotenv = "^0.9.9"
pypdf = "^6.9.2"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
digitization = "digitization.cli:digitization"
digitization_v2 = "refactory.cli:digitization_v2"
105 changes: 105 additions & 0 deletions refactory/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# refactory

This directory contains scripts and helpers for validating PDF files in an S3 bucket using an inventory of Excel files hosted on CERNBox.

## Structure

- `main.py` - main script that validates PDFs using the CERNBox inventory.
- `storage_connection.py` - storage provider abstraction:
  - `S3Provider` for S3.
  - `CernboxProvider` for public CERNBox access.
- `validate_pdf.py` - validates PDFs locally with `is_pdf_valid(file_path)`.
- `test_connections.py` - testing/connection experiment script.

## Dependencies

This project uses Poetry to manage dependencies. The required libraries are listed in `pyproject.toml`.

### Install dependencies with Poetry

```bash
poetry install
```

### Main dependencies

- `boto3`
- `requests`
- `pypdf`

> If the project is managed with Poetry, `requirements.txt` is not required.

## AWS Authentication

`S3Provider` uses `boto3`. Configure credentials using environment variables or the default AWS config files:

- `AWS_ACCESS_KEY_ID`
- `AWS_SECRET_ACCESS_KEY`

### Example environment variables

```bash
export AWS_ACCESS_KEY_ID="YOUR_ACCESS_KEY"
export AWS_SECRET_ACCESS_KEY="YOUR_SECRET_KEY"
```

### Supported alternatives

- `~/.aws/credentials`
- `~/.aws/config`
- IAM role attached to an instance/container

> `S3Provider` also supports the default endpoint `https://s3.cern.ch`, configured in `storage_connection.py`.

## Usage with Poetry

Run the refactored CLI via Poetry:

```bash
poetry run digitization_v2 --help
```

The current command for PDF validation is `validate-files-integrity`.

### Example

```bash
poetry run digitization_v2 validate-files-integrity -s "[122,123]" -u
```

Parameters:

- `-s, --inventory-source`: Inventory source. Supports a CERNBox hash, a range (`1..10`), or a list (`[1,2]`).
- `-u, --upload-reports`: Flag to upload validation reports back to the storage provider.
- `-b, --bucket`: S3 bucket name (default: `digitization-dev`).

### Example without upload

```bash
poetry run digitization_v2 validate-files-integrity -s "[122,123]"
```

## Expected output

The CLI generates the same validation reports as the core pipeline:

- a text log file such as `s3_pdf_issues.log`
- a structured JSON report with valid, corrupted, and missing file details

If `-u` is provided, the reports will be uploaded back to the configured storage provider.

## Additional notes

- `CernboxProvider` reads optional credentials from environment variables:
  - `CERNBOX_USER`
  - `CERNBOX_PASSWORD`

### Example environment variables for Cernbox

```bash
export CERNBOX_USER="your_username"
export CERNBOX_PASSWORD="your_password"
```

- You may still pass `account` and `password` directly to `CernboxProvider` if preferred.
- Use `test_connections.py` to verify connections before running the main pipeline.
76 changes: 76 additions & 0 deletions refactory/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import click
import ast
from .main import run_validation_pipeline
from storage_connection import S3Provider


def parse_inventory(value):
    """
    Convert the raw ``--inventory-source`` string into box IDs or a plain string.

    The input is stripped of surrounding whitespace and then interpreted,
    in this order, as:

    * a single number (``"122"``)        -> ``[122]``
    * a literal list (``"[1, 2, 3]"``)   -> ``[1, 2, 3]``
    * an inclusive range (``"1..10"``)   -> ``[1, 2, ..., 10]``
    * anything else                      -> the stripped string itself
      (e.g. a CERNBox hash)

    Args:
        value: Raw option value as typed on the command line.

    Returns:
        A list of integers, or the stripped input string when it is not a
        number, list, or numeric range.

    Raises:
        click.BadParameter: If a bracketed value is not a valid list of
            integers.
    """
    value = value.strip()

    # isdecimal() only accepts characters that int() can parse; isdigit()
    # also accepts e.g. superscript digits, which would make int() raise.
    if value.isdecimal():
        return [int(value)]

    if value.startswith("[") and value.endswith("]"):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError) as exc:
            raise click.BadParameter(
                "Invalid list format. Use '[1, 2, 3]'"
            ) from exc

        # literal_eval accepts any literal; box IDs must be plain integers
        # (bool is an int subclass, so it is rejected explicitly).
        is_int_list = isinstance(parsed, list) and all(
            isinstance(item, int) and not isinstance(item, bool)
            for item in parsed
        )
        if not is_int_list:
            raise click.BadParameter("Invalid list format. Use '[1, 2, 3]'")
        return parsed

    if ".." in value:
        try:
            start, end = map(int, value.split(".."))
        except ValueError:
            # Not a numeric range after all: fall back to treating the
            # input as an opaque string, as the other non-matching inputs.
            return value
        return list(range(start, end + 1))

    return value

@click.group()
def digitization_v2():
    """Refactored digitization commands (PDF integrity validation)."""
    # Click uses the docstring above as the group description in `--help`;
    # the group itself only exists to hold the sub-commands registered on it.


@digitization_v2.command("validate-files-integrity")
@click.option(
    "-s",
    "--inventory-source",
    required=True,
    help="Target inventory. Supports a CERNBOX hash, range 1..10, or list [1,2].",
)
@click.option(
    "-u",
    "--upload-reports",
    is_flag=True,
    help="Upload validation reports back to the storage provider.",
)
@click.option(
    "-b",
    "--bucket",
    default="digitization-dev",
    show_default=True,
    help="S3 Bucket name.",
)
def validate_files_integrity(inventory_source, upload_reports, bucket):
    """
    Validates files integrity and inventory alignment.
    This command checks for corrupted files and missing boxes.
    """
    # NOTE: the docstring above is shown verbatim by `--help`.

    # Raises click.BadParameter for a malformed list; click reports that
    # itself, so it is deliberately left outside the try block below.
    inventory_input = parse_inventory(inventory_source)

    try:
        # Provider construction is inside the try so that a configuration
        # or credentials failure is reported like any pipeline failure.
        provider = S3Provider(bucket=bucket)
        run_validation_pipeline(
            provider=provider,
            base_path="cern-archives/raw/PDF/",
            log_file="s3_pdf_issues.log",
            inventory_source=inventory_input,
            upload_reports=upload_reports,
        )
    except Exception as e:
        # Top-level CLI boundary: show a readable message, then exit with a
        # non-zero status so shells and CI can detect the failure.
        click.secho(f"Error: {e}", fg="red", err=True)
        raise SystemExit(1) from e

    click.echo("Process finished. Check the generated logs for details.")


# Allow invoking the click group directly. Because this module uses a
# relative import (`from .main import ...`), it presumably has to be run as
# a module of its package (e.g. `python -m refactory.cli`) rather than by
# file path -- TODO confirm against the project layout.
if __name__ == "__main__":
    digitization_v2()
141 changes: 141 additions & 0 deletions refactory/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import tempfile
import re
import os
import sys
import json
from typing import Union
from storage_connection import StorageProvider, S3Provider, CernboxProvider
from validate_pdf import is_pdf_valid


# Matches box names such as "BOITE_O0122" or "boite-O0122-bis".
# Group 1 is the numeric box id, group 2 the optional "-suffix".
_BOX_PATTERN = re.compile(r"(?i:BOITE)[\-_]O0(\d+)(-\w+)?")


def _resolve_target_boxes(inventory_source):
    """Return the set of box numbers to check for the given inventory source."""
    if isinstance(inventory_source, str):
        # A string is handed to CernboxProvider; box numbers are extracted
        # from the names of the Excel files it lists.
        inventory_provider = CernboxProvider(inventory_source)
        boxes = set()
        for file_path in inventory_provider.list_excel(""):
            filename = file_path.split(".")[0]
            match = _BOX_PATTERN.search(filename)
            if match:
                boxes.add(int(match.group(1)))
        return boxes
    if isinstance(inventory_source, list):
        return set(inventory_source)
    return set()


def _validate_pdfs(provider, pdf_files):
    """Download each PDF to a temp file and split paths into (valid, corrupted)."""
    valid, corrupted = [], []
    for pdf_path in pdf_files:
        with tempfile.NamedTemporaryFile(delete=True) as tmp:
            provider.download_to_temp(pdf_path, tmp.name)

            if is_pdf_valid(tmp.name):
                valid.append(pdf_path)
                print(f" ✅ {pdf_path}")
            else:
                print(f" ❌ CORRUPTED: {pdf_path}")
                corrupted.append(pdf_path)
    return valid, corrupted


def _json_report_path(log_file):
    """Derive the JSON report path from the log path without clobbering the log."""
    root, ext = os.path.splitext(log_file)
    if ext == ".json":
        # The log itself is named *.json: append instead of replacing, so the
        # JSON report never overwrites the text log.
        return f"{log_file}.json"
    return f"{root}.json"


def _write_reports(
    log_file, base_path, target_box_numbers, valid_files, corrupted_files, missing_boxes
):
    """Write the text log and the JSON report; return the JSON report path."""
    with open(log_file, "w", encoding="utf-8") as log:
        log.write(
            f"Validation report for the following boxes {target_box_numbers}\n ✅ Valid Files: {len(valid_files)}\n ❌ Corrupted Files: {len(corrupted_files)}\n"
        )
        log.writelines(f"✅ Valid PDF: {vf}\n" for vf in valid_files)
        log.writelines(f"❌ Corrupted PDF: {cf}\n" for cf in corrupted_files)

    json_report = {
        "metadata": {
            "base_path": base_path,
            # Sorted so the report is stable from run to run.
            "target_boxes": sorted(target_box_numbers),
        },
        "statistics": {
            "valid_files_count": len(valid_files),
            "corrupted_files_count": len(corrupted_files),
            "missing_boxes_count": len(missing_boxes),
        },
        "output": {
            "valid_files": valid_files,
            "missing_boxes": sorted(missing_boxes),
            "corrupted_files": corrupted_files,
        },
    }

    json_file_path = _json_report_path(log_file)
    with open(json_file_path, "w", encoding="utf-8") as jf:
        json.dump(json_report, jf, indent=4)
    return json_file_path


def _upload_reports(provider, base_path, log_file, json_file_path):
    """Best-effort upload of both reports next to the data under base_path."""
    remote_dir = base_path.rstrip("/")
    remote_log_path = f"{remote_dir}/{os.path.basename(log_file)}"
    remote_json_path = f"{remote_dir}/{os.path.basename(json_file_path)}"

    print(f"Uploading reports back to the cloud ({base_path})...")
    try:
        provider.upload_file(log_file, remote_log_path)
        provider.upload_file(json_file_path, remote_json_path)
        print(
            f"✅ Upload successful! Files available at: {remote_log_path} and {remote_json_path}"
        )
    except Exception as e:
        # Deliberately non-fatal: the local reports already exist.
        print(f"❌ Failed to upload reports: {e}")


def run_validation_pipeline(
    provider: StorageProvider,
    base_path: str,
    log_file: str,
    inventory_source: Union[str, list[int]],
    upload_reports: bool = False,
):
    """Navigates directories, validates files, and logs files status.

    Args:
        provider: Storage backend used to list folders and PDFs, download
            files for validation, and (optionally) upload the reports.
        base_path: Remote path whose folders are scanned for target boxes.
        log_file: Local path of the text report. The JSON report is written
            next to it with a ``.json`` extension.
        inventory_source: Either a string passed to ``CernboxProvider``
            (box numbers are parsed from the Excel file names it lists) or
            an explicit list of box numbers.
        upload_reports: When true, upload both reports under ``base_path``.

    Returns:
        None. When no folders are found under ``base_path`` the function
        returns early and writes no reports.
    """
    target_box_numbers = _resolve_target_boxes(inventory_source)

    print(f"Excel files: {len(target_box_numbers)} boxes to check.")

    print(f"Folders in: {base_path}")
    folders = provider.list_folders(base_path)

    if not folders:
        print("No folders found in this path.")
        return

    found_and_valid_boxes = set()
    corrupted_files = []
    valid_files = []

    print("Starting validation...")

    for folder in folders:
        match = _BOX_PATTERN.search(folder)
        if not match:
            continue

        box_num = int(match.group(1))
        if box_num not in target_box_numbers:
            continue
        print(f"Processing target Box: {match.group(1) + (match.group(2) or '')}")

        pdf_files = provider.list_pdfs(folder)

        if not pdf_files:
            # An empty folder does not count as "found", so the box ends up
            # in the missing/empty list below.
            print(f"⚠️ EMPTY FOLDER: {folder}")
            continue

        found_and_valid_boxes.add(box_num)

        folder_valid, folder_corrupted = _validate_pdfs(provider, pdf_files)
        valid_files.extend(folder_valid)
        corrupted_files.extend(folder_corrupted)

    missing_boxes = target_box_numbers - found_and_valid_boxes

    if missing_boxes:
        print("\n Empty target boxes:")
        for box in sorted(missing_boxes):
            print(f" -> BOITE_O0{box}")

    json_file_path = _write_reports(
        log_file,
        base_path,
        target_box_numbers,
        valid_files,
        corrupted_files,
        missing_boxes,
    )

    print(f"\nDone! The text log of corrupted files was saved to: {log_file}")
    print(f"The structured JSON data was saved to: {json_file_path}")

    if upload_reports:
        _upload_reports(provider, base_path, log_file, json_file_path)


if __name__ == "__main__":
    # Manual entry point:
    #   python main.py <cernbox_public_link_hash> [upload_reports: 0|1]
    # The first argument is always passed as a string, so it is handled as a
    # CernboxProvider inventory source by run_validation_pipeline.
    if len(sys.argv) < 2:
        sys.exit(
            "Usage: python main.py <cernbox_public_link_hash> [upload_reports: 0|1]"
        )

    # The upload flag is optional and defaults to "do not upload".
    upload_flag = bool(int(sys.argv[2])) if len(sys.argv) > 2 else False

    s3_provider = S3Provider(bucket="digitization-dev")

    run_validation_pipeline(
        provider=s3_provider,
        base_path="cern-archives/raw/PDF/",
        log_file="s3_pdf_issues.log",
        inventory_source=sys.argv[1],
        upload_reports=upload_flag,
    )
Loading
Loading