From 85a485f82188750da781e0ae5698fe6756696566 Mon Sep 17 00:00:00 2001 From: Gabriela de Santana Carrara Date: Fri, 27 Mar 2026 09:00:16 +0100 Subject: [PATCH 1/5] digitization: proposal for refactor PDF check script Ref: cern-sis/digitization#22 --- .gitignore | 2 + refactory/main.py | 121 ++++++++++++++++++++++++++++++++ refactory/storage_connection.py | 105 +++++++++++++++++++++++++++ refactory/test_connections.py | 81 +++++++++++++++++++++ refactory/validate_pdf.py | 34 +++++++++ 5 files changed, 343 insertions(+) create mode 100644 refactory/main.py create mode 100644 refactory/storage_connection.py create mode 100644 refactory/test_connections.py create mode 100644 refactory/validate_pdf.py diff --git a/.gitignore b/.gitignore index da1cc52..8d48b18 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ import_xml_files/ .venv +requirements.txt +s3_pdf_issues.json \ No newline at end of file diff --git a/refactory/main.py b/refactory/main.py new file mode 100644 index 0000000..5594b77 --- /dev/null +++ b/refactory/main.py @@ -0,0 +1,121 @@ +import tempfile +import re +import os +import sys +import json +from storage_connection import StorageProvider, S3Provider, CernboxProvider +from validate_pdf import is_pdf_valid + +def run_validation_pipeline(provider: StorageProvider, base_path: str, log_file: str, start: int = None, end: int = None, upload_reports: bool = False): + """Navigates directories, validates files, and logs corrupted files.""" + print(f"Discovering folders in: {base_path}") + + folders = provider.list_folders(base_path) + + if not folders: + print("No folders found in this path.") + return + + found_box_numbers = set() + empty_folders = [] + corrupted_files = [] + valid_files_count = 0 + + print("Starting validation...") + + for folder in folders: + if "BOITE_" not in folder: + continue + + match = re.search(r"BOITE_O0(\d+)", folder) + if match: + box_num = int(match.group(1)) + if start is not None and box_num < start: + continue + if end 
is not None and box_num > end: + continue + + found_box_numbers.add(box_num) + else: + continue + + print(f"\nChecking folder: {folder}") + pdf_files = provider.list_pdfs(folder) + + if not pdf_files: + print(" Empty folder or no PDFs.") + empty_folders.append(folder) + continue + + for file_path in pdf_files: + print(f" {file_path.split('/')[-1]} ... ", end="") + + with tempfile.NamedTemporaryFile(delete=True) as tmp: + provider.download_to_temp(file_path, tmp.name) + valid = is_pdf_valid(tmp.name) + + if valid: + print("✅ Valid") + valid_files_count += 1 + else: + print("❌ Corrupted") + corrupted_files.append(file_path) + + with open(log_file, "w", encoding="utf-8") as log: + for cf in corrupted_files: + log.write(f"Corrupted PDF: {cf}\n") + + missing_boxes = [] + if start is not None and end is not None: + expected_boxes = set(range(start, end + 1)) + missing_boxes = sorted(list(expected_boxes - found_box_numbers)) + + json_report = { + "metadata": { + "base_path": base_path, + "range_analyzed": {"start": start, "end": end} + }, + "statistics": { + "valid_files": valid_files_count, + "corrupted_files_count": len(corrupted_files), + "empty_folders_count": len(empty_folders), + "missing_boxes_count": len(missing_boxes) if missing_boxes else 0 + }, + "issues": { + "missing_boxes": missing_boxes if missing_boxes else [], + "empty_folders": empty_folders, + "corrupted_files": corrupted_files + } + } + + json_file_path = log_file.replace(".log", ".json") + with open(json_file_path, "w", encoding="utf-8") as jf: + json.dump(json_report, jf, indent=4) + + print(f"\nDone! 
The text log of corrupted files was saved to: {log_file}") + print(f"The structured JSON data was saved to: {json_file_path}") + + if upload_reports: + remote_log_path = f"{base_path.rstrip('/')}/{os.path.basename(log_file)}" + remote_json_path = f"{base_path.rstrip('/')}/{os.path.basename(json_file_path)}" + + print(f"Uploading reports back to the cloud ({base_path})...") + try: + provider.upload_file(log_file, remote_log_path) + provider.upload_file(json_file_path, remote_json_path) + print(f"✅ Upload successful! Files available at: {remote_log_path} and {remote_json_path}") + except Exception as e: + print(f"❌ Failed to upload reports: {e}") + +if __name__ == "__main__": + s3_provider = S3Provider(bucket="digitization-dev") + # cernbox_provider = CernboxProvider(public_link_hash="XjjFxUWUMpuTYCz") + + run_validation_pipeline( + provider=s3_provider, #cernbox_provider + base_path="cern-archives/raw/PDF/", #"teste/", + log_file="s3_pdf_issues.log", + start=int(sys.argv[1]), #123 + end=int(sys.argv[2]), #126 + upload_reports=sys.argv[3] # 0 | 1 + ) \ No newline at end of file diff --git a/refactory/storage_connection.py b/refactory/storage_connection.py new file mode 100644 index 0000000..7f0bc94 --- /dev/null +++ b/refactory/storage_connection.py @@ -0,0 +1,105 @@ +import boto3 +import requests +import xml.etree.ElementTree as ET +from abc import ABC, abstractmethod +import os + +class StorageProvider(ABC): + """Base interface for storage providers.""" + @abstractmethod + def list_folders(self, base_path: str) -> list[str]: + pass + + @abstractmethod + def list_pdfs(self, folder_path: str) -> list[str]: + pass + + @abstractmethod + def download_to_temp(self, file_path: str, temp_file_path: str) -> None: + pass + + @abstractmethod + def upload_file(self, local_file_path: str, remote_file_path: str) -> None: + pass + +class S3Provider(StorageProvider): + def __init__(self, bucket: str, endpoint_url: str = 'https://s3.cern.ch'): + self.bucket = bucket + self.s3 = 
boto3.client('s3', endpoint_url=endpoint_url) + + def list_folders(self, base_path: str) -> list[str]: + paginator = self.s3.get_paginator('list_objects_v2') + folders = [] + for page in paginator.paginate(Bucket=self.bucket, Prefix=base_path, Delimiter='/'): + for prefix in page.get('CommonPrefixes', []): + folders.append(prefix['Prefix']) + return folders + + def list_pdfs(self, folder_path: str) -> list[str]: + response = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=folder_path) + return [ + obj['Key'] for obj in response.get('Contents', []) + if obj['Key'].lower().endswith('.pdf') + ] + + def download_to_temp(self, file_path: str, temp_file_path: str) -> None: + self.s3.download_file(self.bucket, file_path, temp_file_path) + + def upload_file(self, local_file_path: str, remote_file_path: str) -> None: + self.s3.upload_file(local_file_path, self.bucket, remote_file_path) + +class CernboxProvider(StorageProvider): + def __init__(self, public_link_hash: str, account: str = None, password: str = None): + self.public_link_hash = public_link_hash + self.account = account + self.password = password + + self.public_base_url = f"https://cernbox.cern.ch/remote.php/dav/public-files/{public_link_hash}" + + if account: + self.auth_base_url = f"https://cernbox.cern.ch/remote.php/dav/files/{account}" + + def _propfind(self, path: str, depth: str = "1") -> list[str]: + url = f"{self.public_base_url}/{path}".rstrip('/') + '/' + headers = {'Depth': depth} + + response = requests.request('PROPFIND', url, headers=headers) + response.raise_for_status() + + root = ET.fromstring(response.content) + namespaces = {'d': 'DAV:'} + + paths = [] + for response_tag in root.findall('d:response', namespaces)[1:]: + href = response_tag.find('d:href', namespaces).text + relative_path = href.split(f"/public-files/{self.public_link_hash}/")[-1] + paths.append(relative_path) + return paths + + def list_folders(self, base_path: str) -> list[str]: + all_items = self._propfind(base_path) + 
return [p for p in all_items if p.endswith('/') or "BOITE_" in p] + + def list_pdfs(self, folder_path: str) -> list[str]: + all_items = self._propfind(folder_path) + return [p for p in all_items if p.lower().endswith('.pdf')] + + def download_to_temp(self, file_path: str, temp_file_path: str) -> None: + url = f"{self.public_base_url}/{file_path}" + response = requests.get(url, stream=True) + response.raise_for_status() + with open(temp_file_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + def upload_file(self, local_file_path: str, remote_file_path: str) -> None: + if not self.account or not self.password: + raise ValueError("CERN account and password are required for uploading.") + + clean_remote_path = remote_file_path.lstrip('/') + url = f"{self.auth_base_url}/{clean_remote_path}" + + with open(local_file_path, 'rb') as f: + response = requests.put(url, data=f, auth=(self.account, self.password)) + + response.raise_for_status() \ No newline at end of file diff --git a/refactory/test_connections.py b/refactory/test_connections.py new file mode 100644 index 0000000..56a9de4 --- /dev/null +++ b/refactory/test_connections.py @@ -0,0 +1,81 @@ +import tempfile +import os +from storage_connection import S3Provider, CernboxProvider + +def test_s3(): + print("--- Testing AWS S3 connection ---") + try: + s3 = S3Provider(bucket="digitization-dev") + base_path = "cern-archives/raw/PDF/" + + folders = s3.list_folders(base_path) + print("✅ Read: Success! Connected to S3.") + print(f"Found {len(folders)} folders in '{base_path}'.") + + print("\n--- Testing S3 Upload ---") + remote_test_file = f"{base_path}upload_test_s3.txt" + + with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as tmp: + tmp.write("Upload test file generated by test_connections.py") + tmp_path = tmp.name + + try: + s3.upload_file(tmp_path, remote_test_file) + print(f"✅ Write: Success! 
File sent to: {remote_test_file}") + finally: + os.remove(tmp_path) + + except Exception as e: + print(f"❌ Failed to connect/operate on S3.") + print(f"Details: {e}") + +def test_cernbox(): + print("\n--- Testing CERNBOX connection (Hybrid Mode) ---") + + # 1. Read Variables (Public) + public_hash = "QslvWRIPsBcDAOK" + read_base_path = "" # Relative path inside the public link + + # 2. Write Variables (Private/Authenticated) + cern_user = "" # CERN username + cern_password = os.environ.get("CERNBOX_PASSWORD") + write_base_path = ""#"eos/user/{u}/{user}/teste/" + + if public_hash == "PUT_YOUR_PUBLIC_HASH_HERE": + print("Warning: Configure the public_hash in the code before testing.") + return + + if not cern_password: + print("❌ The CERNBOX_PASSWORD environment variable is not set.") + print("Run in terminal: export CERNBOX_PASSWORD='your_password'") + return + + try: + # Passing all three arguments + cernbox = CernboxProvider(public_link_hash=public_hash, account=cern_user, password=cern_password) + + print("\n[Phase 1: Reading from Public Link]") + folders = cernbox.list_folders(read_base_path) + print("✅ Read: Success (Anonymous)!") + print(f"Found {len(folders)} items at the root of the link.") + + print("\n[Phase 2: Writing via Authenticated WebDAV]") + remote_test_file = f"{write_base_path}hybrid_connection_test.txt" + + with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as tmp: + tmp.write("Authenticated upload from test_connections.py") + tmp_path = tmp.name + + try: + cernbox.upload_file(tmp_path, remote_test_file) + print(f"✅ Write: Success (Authenticated)! 
File sent to: {remote_test_file}") + finally: + os.remove(tmp_path) + + except Exception as e: + print(f"❌ Failed to connect/operate on CERNBOX.") + print(f"Details: {e}") + +if __name__ == "__main__": + # test_s3() + test_cernbox() \ No newline at end of file diff --git a/refactory/validate_pdf.py b/refactory/validate_pdf.py new file mode 100644 index 0000000..0b0ee18 --- /dev/null +++ b/refactory/validate_pdf.py @@ -0,0 +1,34 @@ +import os +from pypdf import PdfReader +from pypdf.errors import PdfReadError + +def is_pdf_valid(file_path: str) -> bool: + """Checks if a local PDF is structurally valid and readable.""" + try: + file_size = os.path.getsize(file_path) + if file_size < 100: + return False + + with open(file_path, "rb") as f: + header = f.read(8) + f.seek(-min(1024, file_size), 2) + trailer = f.read() + + if not header.startswith(b"%PDF-"): + return False + if b"%%EOF" not in trailer: + return False + + reader = PdfReader(file_path) + if len(reader.pages) == 0: + return False + + _ = reader.pages[0] + + return True + + except OSError as e: + raise RuntimeError(f"System error when accessing file {file_path}: {e}") from e + + except (PdfReadError, Exception): + return False \ No newline at end of file From 0637b20f6230a363d90b5e8f02c4c936d5a2dec9 Mon Sep 17 00:00:00 2001 From: Gabriela de Santana Carrara Date: Tue, 31 Mar 2026 18:25:25 +0200 Subject: [PATCH 2/5] digitization: fix excel files input and other comments Ref: cern-sis/digitization#22 --- poetry.lock | 229 ++++++++++++++++++++++++++++---- pyproject.toml | 5 +- refactory/README.md | 85 ++++++++++++ refactory/main.py | 140 ++++++++++--------- refactory/storage_connection.py | 29 +++- refactory/test_connections.py | 47 ++++--- scripts/double_check_pdfs.py | 2 +- 7 files changed, 418 insertions(+), 119 deletions(-) create mode 100644 refactory/README.md diff --git a/poetry.lock b/poetry.lock index d8d0e3b..d22fc24 100644 --- a/poetry.lock +++ b/poetry.lock @@ -105,6 +105,18 @@ urllib3 = {version = 
">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version > [package.extras] crt = ["awscrt (==0.23.8)"] +[[package]] +name = "certifi" +version = "2026.2.25" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa"}, + {file = "certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7"}, +] + [[package]] name = "cffi" version = "1.17.1" @@ -185,6 +197,145 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "charset-normalizer" +version = "3.4.6" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "charset_normalizer-3.4.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2e1d8ca8611099001949d1cdfaefc510cf0f212484fe7c565f735b68c78c3c95"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e25369dc110d58ddf29b949377a93e0716d72a24f62bad72b2b39f155949c1fd"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:259695e2ccc253feb2a016303543d691825e920917e31f894ca1a687982b1de4"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:dda86aba335c902b6149a02a55b38e96287157e609200811837678214ba2b1db"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:51fb3c322c81d20567019778cb5a4a6f2dc1c200b886bc0d636238e364848c89"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-manylinux_2_31_armv7l.whl", hash = 
"sha256:4482481cb0572180b6fd976a4d5c72a30263e98564da68b86ec91f0fe35e8565"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:39f5068d35621da2881271e5c3205125cc456f54e9030d3f723288c873a71bf9"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8bea55c4eef25b0b19a0337dc4e3f9a15b00d569c77211fa8cde38684f234fb7"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:f0cdaecd4c953bfae0b6bb64910aaaca5a424ad9c72d85cb88417bb9814f7550"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:150b8ce8e830eb7ccb029ec9ca36022f756986aaaa7956aad6d9ec90089338c0"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:e68c14b04827dd76dcbd1aeea9e604e3e4b78322d8faf2f8132c7138efa340a8"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:3778fd7d7cd04ae8f54651f4a7a0bd6e39a0cf20f801720a4c21d80e9b7ad6b0"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:dad6e0f2e481fffdcf776d10ebee25e0ef89f16d691f1e5dee4b586375fdc64b"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-win32.whl", hash = "sha256:74a2e659c7ecbc73562e2a15e05039f1e22c75b7c7618b4b574a3ea9118d1557"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-win_amd64.whl", hash = "sha256:aa9cccf4a44b9b62d8ba8b4dd06c649ba683e4bf04eea606d2e94cfc2d6ff4d6"}, + {file = "charset_normalizer-3.4.6-cp310-cp310-win_arm64.whl", hash = "sha256:e985a16ff513596f217cee86c21371b8cd011c0f6f056d0920aa2d926c544058"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:82060f995ab5003a2d6e0f4ad29065b7672b6593c8c63559beefe5b443242c3e"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:60c74963d8350241a79cb8feea80e54d518f72c26db618862a8f53e5023deaf9"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f6e4333fb15c83f7d1482a76d45a0818897b3d33f00efd215528ff7c51b8e35d"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:bc72863f4d9aba2e8fd9085e63548a324ba706d2ea2c83b260da08a59b9482de"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9cc4fc6c196d6a8b76629a70ddfcd4635a6898756e2d9cac5565cf0654605d73"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:0c173ce3a681f309f31b87125fecec7a5d1347261ea11ebbb856fa6006b23c8c"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c907cdc8109f6c619e6254212e794d6548373cc40e1ec75e6e3823d9135d29cc"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:404a1e552cf5b675a87f0651f8b79f5f1e6fd100ee88dc612f89aa16abd4486f"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:e3c701e954abf6fc03a49f7c579cc80c2c6cc52525340ca3186c41d3f33482ef"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:7a6967aaf043bceabab5412ed6bd6bd26603dae84d5cb75bf8d9a74a4959d398"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:5feb91325bbceade6afab43eb3b508c63ee53579fe896c77137ded51c6b6958e"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:f820f24b09e3e779fe84c3c456cb4108a7aa639b0d1f02c28046e11bfcd088ed"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b35b200d6a71b9839a46b9b7fff66b6638bb52fc9658aa58796b0326595d3021"}, + {file = 
"charset_normalizer-3.4.6-cp311-cp311-win32.whl", hash = "sha256:9ca4c0b502ab399ef89248a2c84c54954f77a070f28e546a85e91da627d1301e"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-win_amd64.whl", hash = "sha256:a9e68c9d88823b274cf1e72f28cb5dc89c990edf430b0bfd3e2fb0785bfeabf4"}, + {file = "charset_normalizer-3.4.6-cp311-cp311-win_arm64.whl", hash = "sha256:97d0235baafca5f2b09cf332cc275f021e694e8362c6bb9c96fc9a0eb74fc316"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2ef7fedc7a6ecbe99969cd09632516738a97eeb8bd7258bf8a0f23114c057dab"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a4ea868bc28109052790eb2b52a9ab33f3aa7adc02f96673526ff47419490e21"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:836ab36280f21fc1a03c99cd05c6b7af70d2697e374c7af0b61ed271401a72a2"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f1ce721c8a7dfec21fcbdfe04e8f68174183cf4e8188e0645e92aa23985c57ff"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e28d62a8fc7a1fa411c43bd65e346f3bce9716dc51b897fbe930c5987b402d5"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:530d548084c4a9f7a16ed4a294d459b4f229db50df689bfe92027452452943a0"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:30f445ae60aad5e1f8bdbb3108e39f6fbc09f4ea16c815c66578878325f8f15a"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ac2393c73378fea4e52aa56285a3d64be50f1a12395afef9cce47772f60334c2"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-musllinux_1_2_armv7l.whl", hash = 
"sha256:90ca27cd8da8118b18a52d5f547859cc1f8354a00cd1e8e5120df3e30d6279e5"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8e5a94886bedca0f9b78fecd6afb6629142fd2605aa70a125d49f4edc6037ee6"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:695f5c2823691a25f17bc5d5ffe79fa90972cc34b002ac6c843bb8a1720e950d"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:231d4da14bcd9301310faf492051bee27df11f2bc7549bc0bb41fef11b82daa2"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a056d1ad2633548ca18ffa2f85c202cfb48b68615129143915b8dc72a806a923"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-win32.whl", hash = "sha256:c2274ca724536f173122f36c98ce188fd24ce3dad886ec2b7af859518ce008a4"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-win_amd64.whl", hash = "sha256:c8ae56368f8cc97c7e40a7ee18e1cedaf8e780cd8bc5ed5ac8b81f238614facb"}, + {file = "charset_normalizer-3.4.6-cp312-cp312-win_arm64.whl", hash = "sha256:899d28f422116b08be5118ef350c292b36fc15ec2daeb9ea987c89281c7bb5c4"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:11afb56037cbc4b1555a34dd69151e8e069bee82e613a73bef6e714ce733585f"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:423fb7e748a08f854a08a222b983f4df1912b1daedce51a72bd24fe8f26a1843"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d73beaac5e90173ac3deb9928a74763a6d230f494e4bfb422c217a0ad8e629bf"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d60377dce4511655582e300dc1e5a5f24ba0cb229005a1d5c8d0cb72bb758ab8"}, + {file = 
"charset_normalizer-3.4.6-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:530e8cebeea0d76bdcf93357aa5e41336f48c3dc709ac52da2bb167c5b8271d9"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:a26611d9987b230566f24a0a125f17fe0de6a6aff9f25c9f564aaa2721a5fb88"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:34315ff4fc374b285ad7f4a0bf7dcbfe769e1b104230d40f49f700d4ab6bbd84"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5f8ddd609f9e1af8c7bd6e2aca279c931aefecd148a14402d4e368f3171769fd"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:80d0a5615143c0b3225e5e3ef22c8d5d51f3f72ce0ea6fb84c943546c7b25b6c"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:92734d4d8d187a354a556626c221cd1a892a4e0802ccb2af432a1d85ec012194"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:613f19aa6e082cf96e17e3ffd89383343d0d589abda756b7764cf78361fd41dc"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:2b1a63e8224e401cafe7739f77efd3f9e7f5f2026bda4aead8e59afab537784f"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6cceb5473417d28edd20c6c984ab6fee6c6267d38d906823ebfe20b03d607dc2"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-win32.whl", hash = "sha256:d7de2637729c67d67cf87614b566626057e95c303bc0a55ffe391f5205e7003d"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-win_amd64.whl", hash = "sha256:572d7c822caf521f0525ba1bce1a622a0b85cf47ffbdae6c9c19e3b5ac3c4389"}, + {file = "charset_normalizer-3.4.6-cp313-cp313-win_arm64.whl", hash = "sha256:a4474d924a47185a06411e0064b803c68be044be2d60e50e8bddcc2649957c1f"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-macosx_10_15_universal2.whl", 
hash = "sha256:9cc6e6d9e571d2f863fa77700701dae73ed5f78881efc8b3f9a4398772ff53e8"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef5960d965e67165d75b7c7ffc60a83ec5abfc5c11b764ec13ea54fbef8b4421"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b3694e3f87f8ac7ce279d4355645b3c878d24d1424581b46282f24b92f5a4ae2"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5d11595abf8dd942a77883a39d81433739b287b6aa71620f15164f8096221b30"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7bda6eebafd42133efdca535b04ccb338ab29467b3f7bf79569883676fc628db"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:bbc8c8650c6e51041ad1be191742b8b421d05bbd3410f43fa2a00c8db87678e8"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:22c6f0c2fbc31e76c3b8a86fba1a56eda6166e238c29cdd3d14befdb4a4e4815"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7edbed096e4a4798710ed6bc75dcaa2a21b68b6c356553ac4823c3658d53743a"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:7f9019c9cb613f084481bd6a100b12e1547cf2efe362d873c2e31e4035a6fa43"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:58c948d0d086229efc484fe2f30c2d382c86720f55cd9bc33591774348ad44e0"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:419a9d91bd238052642a51938af8ac05da5b3343becde08d5cdeab9046df9ee1"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-musllinux_1_2_s390x.whl", hash = 
"sha256:5273b9f0b5835ff0350c0828faea623c68bfa65b792720c453e22b25cc72930f"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:0e901eb1049fdb80f5bd11ed5ea1e498ec423102f7a9b9e4645d5b8204ff2815"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-win32.whl", hash = "sha256:b4ff1d35e8c5bd078be89349b6f3a845128e685e751b6ea1169cf2160b344c4d"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-win_amd64.whl", hash = "sha256:74119174722c4349af9708993118581686f343adc1c8c9c007d59be90d077f3f"}, + {file = "charset_normalizer-3.4.6-cp314-cp314-win_arm64.whl", hash = "sha256:e5bcc1a1ae744e0bb59641171ae53743760130600da8db48cbb6e4918e186e4e"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:ad8faf8df23f0378c6d527d8b0b15ea4a2e23c89376877c598c4870d1b2c7866"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f5ea69428fa1b49573eef0cc44a1d43bebd45ad0c611eb7d7eac760c7ae771bc"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:06a7e86163334edfc5d20fe104db92fcd666e5a5df0977cb5680a506fe26cc8e"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e1f6e2f00a6b8edb562826e4632e26d063ac10307e80f7461f7de3ad8ef3f077"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:95b52c68d64c1878818687a473a10547b3292e82b6f6fe483808fb1468e2f52f"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:7504e9b7dc05f99a9bbb4525c67a2c155073b44d720470a148b34166a69c054e"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:172985e4ff804a7ad08eebec0a1640ece87ba5041d565fff23c8f99c1f389484"}, + {file 
= "charset_normalizer-3.4.6-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4be9f4830ba8741527693848403e2c457c16e499100963ec711b1c6f2049b7c7"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:79090741d842f564b1b2827c0b82d846405b744d31e84f18d7a7b41c20e473ff"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:87725cfb1a4f1f8c2fc9890ae2f42094120f4b44db9360be5d99a4c6b0e03a9e"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:fcce033e4021347d80ed9c66dcf1e7b1546319834b74445f561d2e2221de5659"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:ca0276464d148c72defa8bb4390cce01b4a0e425f3b50d1435aa6d7a18107602"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:197c1a244a274bb016dd8b79204850144ef77fe81c5b797dc389327adb552407"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-win32.whl", hash = "sha256:2a24157fa36980478dd1770b585c0f30d19e18f4fb0c47c13aa568f871718579"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-win_amd64.whl", hash = "sha256:cd5e2801c89992ed8c0a3f0293ae83c159a60d9a5d685005383ef4caca77f2c4"}, + {file = "charset_normalizer-3.4.6-cp314-cp314t-win_arm64.whl", hash = "sha256:47955475ac79cc504ef2704b192364e51d0d473ad452caedd0002605f780101c"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:659a1e1b500fac8f2779dd9e1570464e012f43e580371470b45277a27baa7532"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f61aa92e4aad0be58eb6eb4e0c21acf32cf8065f4b2cae5665da756c4ceef982"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f50498891691e0864dc3da965f340fada0771f6142a378083dc4608f4ea513e2"}, + {file = 
"charset_normalizer-3.4.6-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:bf625105bb9eef28a56a943fec8c8a98aeb80e7d7db99bd3c388137e6eb2d237"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2bd9d128ef93637a5d7a6af25363cf5dec3fa21cf80e68055aad627f280e8afa"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-manylinux_2_31_armv7l.whl", hash = "sha256:d08ec48f0a1c48d75d0356cea971921848fb620fdeba805b28f937e90691209f"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1ed80ff870ca6de33f4d953fda4d55654b9a2b340ff39ab32fa3adbcd718f264"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f98059e4fcd3e3e4e2d632b7cf81c2faae96c43c60b569e9c621468082f1d104"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:ab30e5e3e706e3063bc6de96b118688cb10396b70bb9864a430f67df98c61ecc"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:d5f5d1e9def3405f60e3ca8232d56f35c98fb7bf581efcc60051ebf53cb8b611"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-musllinux_1_2_riscv64.whl", hash = "sha256:461598cd852bfa5a61b09cae2b1c02e2efcd166ee5516e243d540ac24bfa68a7"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:71be7e0e01753a89cf024abf7ecb6bca2c81738ead80d43004d9b5e3f1244e64"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:df01808ee470038c3f8dc4f48620df7225c49c2d6639e38f96e6d6ac6e6f7b0e"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-win32.whl", hash = "sha256:69dd852c2f0ad631b8b60cfbe25a28c0058a894de5abb566619c205ce0550eae"}, + {file = "charset_normalizer-3.4.6-cp38-cp38-win_amd64.whl", hash = "sha256:517ad0e93394ac532745129ceabdf2696b609ec9f87863d337140317ebce1c14"}, + {file = 
"charset_normalizer-3.4.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:31215157227939b4fb3d740cd23fe27be0439afef67b785a1eb78a3ae69cba9e"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecbbd45615a6885fe3240eb9db73b9e62518b611850fdf8ab08bd56de7ad2b17"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c45a03a4c69820a399f1dda9e1d8fbf3562eda46e7720458180302021b08f778"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e8aeb10fcbe92767f0fa69ad5a72deca50d0dca07fbde97848997d778a50c9fe"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:54fae94be3d75f3e573c9a1b5402dc593de19377013c9a0e4285e3d402dd3a2a"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:2f7fdd9b6e6c529d6a2501a2d36b240109e78a8ceaef5687cfcfa2bbe671d297"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:4d1d02209e06550bdaef34af58e041ad71b88e624f5d825519da3a3308e22687"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8bc5f0687d796c05b1e28ab0d38a50e6309906ee09375dd3aff6a9c09dd6e8f4"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:ee4ec14bc1680d6b0afab9aea2ef27e26d2024f18b24a2d7155a52b60da7e833"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:d1a2ee9c1499fc8f86f4521f27a973c914b211ffa87322f4ee33bb35392da2c5"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:48696db7f18afb80a068821504296eb0787d9ce239b91ca15059d1d3eaacf13b"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-musllinux_1_2_s390x.whl", hash = 
"sha256:4f41da960b196ea355357285ad1316a00099f22d0929fe168343b99b254729c9"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:802168e03fba8bbc5ce0d866d589e4b1ca751d06edee69f7f3a19c5a9fe6b597"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-win32.whl", hash = "sha256:8761ac29b6c81574724322a554605608a9960769ea83d2c73e396f3df896ad54"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-win_amd64.whl", hash = "sha256:1cf0a70018692f85172348fe06d3a4b63f94ecb055e13a00c644d368eb82e5b8"}, + {file = "charset_normalizer-3.4.6-cp39-cp39-win_arm64.whl", hash = "sha256:3516bbb8d42169de9e61b8520cbeeeb716f12f4ecfe3fd30a9919aa16c806ca8"}, + {file = "charset_normalizer-3.4.6-py3-none-any.whl", hash = "sha256:947cf925bc916d90adba35a64c82aace04fa39b46b52d4630ece166655905a69"}, + {file = "charset_normalizer-3.4.6.tar.gz", hash = "sha256:1ae6b62897110aa7c79ea2f5dd38d1abca6db663687c0b1ad9aed6f6bae3d9d6"}, +] + [[package]] name = "click" version = "8.1.8" @@ -264,31 +415,32 @@ test = ["certifi", "cryptography-vectors (==43.0.3)", "pretend", "pytest (>=6.2. 
test-randomorder = ["pytest-randomly"] [[package]] -name = "dotenv" -version = "0.9.9" -description = "Deprecated package" +name = "et-xmlfile" +version = "2.0.0" +description = "An implementation of lxml.xmlfile for the standard library" optional = false -python-versions = "*" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "dotenv-0.9.9-py2.py3-none-any.whl", hash = "sha256:29cf74a087b31dafdb5a446b6d7e11cbce8ed2741540e2339c69fbef92c94ce9"}, + {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"}, + {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, ] -[package.dependencies] -python-dotenv = "*" - [[package]] -name = "et-xmlfile" -version = "2.0.0" -description = "An implementation of lxml.xmlfile for the standard library" +name = "idna" +version = "3.11" +description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"}, - {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, + {file = "idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"}, + {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"}, ] +[package.extras] +all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] + [[package]] name = "jmespath" version = "1.0.1" @@ -526,23 +678,24 @@ docs = ["sphinx (>=1.6.5)", "sphinx-rtd-theme"] tests = ["hypothesis (>=3.27.0)", "pytest (>=3.2.1,!=3.3.0)"] [[package]] -name = "pypdf2" -version = "3.0.1" +name = "pypdf" +version = "6.9.2" description = "A pure-python PDF library capable of splitting, merging, 
cropping, and transforming PDF files" optional = false -python-versions = ">=3.6" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = "PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440"}, - {file = "pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928"}, + {file = "pypdf-6.9.2-py3-none-any.whl", hash = "sha256:662cf29bcb419a36a1365232449624ab40b7c2d0cfc28e54f42eeecd1fd7e844"}, + {file = "pypdf-6.9.2.tar.gz", hash = "sha256:7f850faf2b0d4ab936582c05da32c52214c2b089d61a316627b5bfb5b0dab46c"}, ] [package.extras] -crypto = ["PyCryptodome"] -dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "wheel"] +crypto = ["cryptography"] +cryptodome = ["PyCryptodome"] +dev = ["flit", "pip-tools", "pre-commit", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel"] docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] -full = ["Pillow", "PyCryptodome"] -image = ["Pillow"] +full = ["Pillow (>=8.0.0)", "cryptography"] +image = ["Pillow (>=8.0.0)"] [[package]] name = "pysftp" @@ -575,14 +728,14 @@ six = ">=1.5" [[package]] name = "python-dotenv" -version = "1.1.1" +version = "1.2.2" description = "Read key-value pairs from a .env file and set them as environment variables" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" groups = ["main"] files = [ - {file = "python_dotenv-1.1.1-py3-none-any.whl", hash = "sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc"}, - {file = "python_dotenv-1.1.1.tar.gz", hash = "sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab"}, + {file = "python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a"}, + {file = "python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3"}, ] [package.extras] @@ -600,6 +753,28 @@ files = [ 
{file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"}, ] +[[package]] +name = "requests" +version = "2.33.1" +description = "Python HTTP for Humans." +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "requests-2.33.1-py3-none-any.whl", hash = "sha256:4e6d1ef462f3626a1f0a0a9c42dd93c63bad33f9f1c1937509b8c5c8718ab56a"}, + {file = "requests-2.33.1.tar.gz", hash = "sha256:18817f8c57c6263968bc123d237e3b8b08ac046f5456bd1e307ee8f4250d3517"}, +] + +[package.dependencies] +certifi = ">=2023.5.7" +charset_normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.26,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<8)"] + [[package]] name = "s3transfer" version = "0.12.0" @@ -685,4 +860,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "afbee7892d433bfd5919d3f2c96e6dd1d7d0e9d127008ab1f6af8f80e4279812" +content-hash = "28dcb5ac54f21265b9c426938c63001062f89b60720411dd9bcde02ceef93714" diff --git a/pyproject.toml b/pyproject.toml index e05c8a3..d990287 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,11 +9,12 @@ python = "^3.12" pysftp = "^0.2.9" click = "^8.0.3" boto3 = "^1.38.9" +requests = "^2.31.0" +python-dotenv = "^1.0.0" pandas = "^2.2.3" openpyxl = "^3.1.5" tqdm = "^4.67.1" -pypdf2 = "^3.0.1" -dotenv = "^0.9.9" +pypdf = "^6.9.2" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/refactory/README.md b/refactory/README.md new file mode 100644 index 0000000..f5f3121 --- /dev/null +++ b/refactory/README.md @@ -0,0 +1,85 @@ +# refactory + +This directory contains scripts and helpers for validating PDF files in an S3 bucket using an inventory of Excel files hosted on CERNBox. + +## Structure + +- `main.py` - main script that validates PDFs using the CERNBox inventory. +- `storage_connection.py` - storage provider abstraction: + - `S3Provider` for S3. 
+ - `CernboxProvider` for public CERNBox access. +- `validate_pdf.py` - validates PDFs locally with `is_pdf_valid(file_path)`. +- `test_connections.py` - testing/connection experiment script. + +## Dependencies + +This project uses Poetry to manage dependencies. The required libraries are listed in `pyproject.toml`. + +### Install dependencies with Poetry + +```bash +poetry install +``` + +### Main dependencies + +- `boto3` +- `requests` +- `pypdf` + +> If the project is managed with Poetry, `requirements.txt` is not required. + +## AWS Authentication + +`S3Provider` uses `boto3` and reads its credentials from the following environment variables: + +- `ACCESS_KEY` +- `SECRET_KEY` + +### Example environment variables + +```bash +export ACCESS_KEY="YOUR_ACCESS_KEY" +export SECRET_KEY="YOUR_SECRET_KEY" +``` + +### Supported alternatives + +- `~/.aws/credentials` +- `~/.aws/config` +- IAM role attached to an instance/container + +> `S3Provider` also supports the default endpoint `https://s3.cern.ch`, configured in `storage_connection.py`. + +## Usage with Poetry + +Run the main script via Poetry: + +```bash +poetry run python refactory/main.py +``` + +Parameters: + +- `target_excel_hash`: public CERNBox hash containing the inventory Excel files. +- `upload_reports`: `0` to skip report upload, `1` to upload generated reports back to the S3 bucket. + +### Example + +```bash +poetry run python refactory/main.py QslvWRIPsBcDAOK 0 +``` + +## Expected output + +The script generates: + +- `s3_pdf_issues.log` - text log with valid and corrupted files. +- `s3_pdf_issues.json` - structured report with metadata, statistics, and file lists. + +If `upload_reports=1`, the reports are also uploaded back to the S3 bucket. + +## Additional notes + +- If CERNBox upload requires authentication, provide `account` and `password` to `CernboxProvider`. +- Use `test_connections.py` to verify connections before running the main pipeline. 
diff --git a/refactory/main.py b/refactory/main.py index 5594b77..d61f1ba 100644 --- a/refactory/main.py +++ b/refactory/main.py @@ -6,86 +6,100 @@ from storage_connection import StorageProvider, S3Provider, CernboxProvider from validate_pdf import is_pdf_valid -def run_validation_pipeline(provider: StorageProvider, base_path: str, log_file: str, start: int = None, end: int = None, upload_reports: bool = False): - """Navigates directories, validates files, and logs corrupted files.""" - print(f"Discovering folders in: {base_path}") - + +def run_validation_pipeline( + provider: StorageProvider, + base_path: str, + log_file: str, + target_excel_hash: str, + upload_reports: bool = False, +): + print(type(upload_reports),bool(upload_reports)) + """Navigates directories, validates files, and logs files status.""" + inventory_provider = CernboxProvider(target_excel_hash) + excel_files = inventory_provider.list_excel("") + + target_box_numbers = set() + for file_path in excel_files: + filename = file_path.split(".")[0] + + match = re.search(r"(?i:BOITE)[\-_]O0(\d+)(-\w+)?", filename) + + if match: + target_box_numbers.add(int(match.group(1))) + print(f"Excel files: {len(target_box_numbers)} boxes to check.") + + print(f"Folders in: {base_path}") folders = provider.list_folders(base_path) - + if not folders: print("No folders found in this path.") return - found_box_numbers = set() - empty_folders = [] + found_and_valid_boxes = set() corrupted_files = [] - valid_files_count = 0 + valid_files = [] print("Starting validation...") for folder in folders: - if "BOITE_" not in folder: + match = re.search(r"(?i:BOITE)[\-_]O0(\d+)(-\w+)?", folder) + if not match: continue - match = re.search(r"BOITE_O0(\d+)", folder) - if match: - box_num = int(match.group(1)) - if start is not None and box_num < start: - continue - if end is not None and box_num > end: - continue - - found_box_numbers.add(box_num) - else: + box_num = int(match.group(1)) + if box_num not in target_box_numbers: 
continue - - print(f"\nChecking folder: {folder}") + print(f"Processing target Box: {match.group(1) + (match.group(2) or '')}") + pdf_files = provider.list_pdfs(folder) if not pdf_files: - print(" Empty folder or no PDFs.") - empty_folders.append(folder) + print(f"⚠️ EMPTY FOLDER: {folder}") continue - for file_path in pdf_files: - print(f" {file_path.split('/')[-1]} ... ", end="") - + found_and_valid_boxes.add(box_num) + + for pdf_path in pdf_files: with tempfile.NamedTemporaryFile(delete=True) as tmp: - provider.download_to_temp(file_path, tmp.name) - valid = is_pdf_valid(tmp.name) - - if valid: - print("✅ Valid") - valid_files_count += 1 - else: - print("❌ Corrupted") - corrupted_files.append(file_path) + provider.download_to_temp(pdf_path, tmp.name) + + if is_pdf_valid(tmp.name): + valid_files.append(pdf_path) + print(f" ✅ {pdf_path}") + else: + print(f" ❌ CORRUPTED: {pdf_path}") + corrupted_files.append(pdf_path) + missing_boxes = target_box_numbers - found_and_valid_boxes + + if missing_boxes: + print("\n🚨 Empty target boxes:") + for box in sorted(missing_boxes): + print( + f" -> BOITE_O0{box}" + ) with open(log_file, "w", encoding="utf-8") as log: + log.write( + f"Validation report for the following boxes {target_box_numbers}\n ✅ Valid Files: {len(valid_files)}\n ❌ Corrupted Files: {len(corrupted_files)}\n" + ) + for vf in valid_files: + log.write(f"✅ Valid PDF: {vf}\n") for cf in corrupted_files: - log.write(f"Corrupted PDF: {cf}\n") - - missing_boxes = [] - if start is not None and end is not None: - expected_boxes = set(range(start, end + 1)) - missing_boxes = sorted(list(expected_boxes - found_box_numbers)) + log.write(f"❌ Corrupted PDF: {cf}\n") json_report = { - "metadata": { - "base_path": base_path, - "range_analyzed": {"start": start, "end": end} - }, + "metadata": {"base_path": base_path, "target_boxes": list(target_box_numbers)}, "statistics": { - "valid_files": valid_files_count, + "valid_files_count": len(valid_files), "corrupted_files_count": 
len(corrupted_files), - "empty_folders_count": len(empty_folders), - "missing_boxes_count": len(missing_boxes) if missing_boxes else 0 + "missing_boxes_count": len(missing_boxes) if missing_boxes else 0, + }, + "output": { + "valid_files": valid_files, + "missing_boxes": list(missing_boxes) if missing_boxes else [], + "corrupted_files": corrupted_files, }, - "issues": { - "missing_boxes": missing_boxes if missing_boxes else [], - "empty_folders": empty_folders, - "corrupted_files": corrupted_files - } } json_file_path = log_file.replace(".log", ".json") @@ -98,24 +112,26 @@ def run_validation_pipeline(provider: StorageProvider, base_path: str, log_file: if upload_reports: remote_log_path = f"{base_path.rstrip('/')}/{os.path.basename(log_file)}" remote_json_path = f"{base_path.rstrip('/')}/{os.path.basename(json_file_path)}" - + print(f"Uploading reports back to the cloud ({base_path})...") try: provider.upload_file(log_file, remote_log_path) provider.upload_file(json_file_path, remote_json_path) - print(f"✅ Upload successful! Files available at: {remote_log_path} and {remote_json_path}") + print( + f"✅ Upload successful! 
Files available at: {remote_log_path} and {remote_json_path}" + ) except Exception as e: print(f"❌ Failed to upload reports: {e}") + if __name__ == "__main__": s3_provider = S3Provider(bucket="digitization-dev") # cernbox_provider = CernboxProvider(public_link_hash="XjjFxUWUMpuTYCz") - + run_validation_pipeline( - provider=s3_provider, #cernbox_provider - base_path="cern-archives/raw/PDF/", #"teste/", + provider=s3_provider, # cernbox_provider + base_path="cern-archives/raw/PDF/", # "teste/", log_file="s3_pdf_issues.log", - start=int(sys.argv[1]), #123 - end=int(sys.argv[2]), #126 - upload_reports=sys.argv[3] # 0 | 1 - ) \ No newline at end of file + target_excel_hash=sys.argv[1], # public_link_hash + upload_reports=int(sys.argv[2]), # 0 | 1 + ) diff --git a/refactory/storage_connection.py b/refactory/storage_connection.py index 7f0bc94..6b48743 100644 --- a/refactory/storage_connection.py +++ b/refactory/storage_connection.py @@ -22,10 +22,19 @@ def download_to_temp(self, file_path: str, temp_file_path: str) -> None: def upload_file(self, local_file_path: str, remote_file_path: str) -> None: pass + @abstractmethod + def list_excel(self, folder_path: str) -> list[str]: + pass + class S3Provider(StorageProvider): def __init__(self, bucket: str, endpoint_url: str = 'https://s3.cern.ch'): self.bucket = bucket - self.s3 = boto3.client('s3', endpoint_url=endpoint_url) + self.s3 = boto3.client( + "s3", + aws_access_key_id=os.environ["ACCESS_KEY"], + aws_secret_access_key=os.environ["SECRET_KEY"], + endpoint_url=endpoint_url, + ) def list_folders(self, base_path: str) -> list[str]: paginator = self.s3.get_paginator('list_objects_v2') @@ -47,7 +56,13 @@ def download_to_temp(self, file_path: str, temp_file_path: str) -> None: def upload_file(self, local_file_path: str, remote_file_path: str) -> None: self.s3.upload_file(local_file_path, self.bucket, remote_file_path) - + + def list_excel(self, folder_path: str) -> list[str]: + response = 
self.s3.list_objects_v2(Bucket=self.bucket, Prefix=folder_path) + return [ + obj['Key'] for obj in response.get('Contents', []) + if obj['Key'].lower().endswith('.xlsx') + ] class CernboxProvider(StorageProvider): def __init__(self, public_link_hash: str, account: str = None, password: str = None): self.public_link_hash = public_link_hash @@ -55,9 +70,9 @@ def __init__(self, public_link_hash: str, account: str = None, password: str = N self.password = password self.public_base_url = f"https://cernbox.cern.ch/remote.php/dav/public-files/{public_link_hash}" - + if account: - self.auth_base_url = f"https://cernbox.cern.ch/remote.php/dav/files/{account}" + self.auth_base_url = f"https://api.cernbox.cern.ch/remote.php/dav/files/{account}" def _propfind(self, path: str, depth: str = "1") -> list[str]: url = f"{self.public_base_url}/{path}".rstrip('/') + '/' @@ -92,6 +107,10 @@ def download_to_temp(self, file_path: str, temp_file_path: str) -> None: for chunk in response.iter_content(chunk_size=8192): f.write(chunk) + def list_excel(self, folder_path: str)-> list[str]: + all_items = self._propfind(folder_path) + return [p for p in all_items if p.lower().endswith('.xlsx')] + def upload_file(self, local_file_path: str, remote_file_path: str) -> None: if not self.account or not self.password: raise ValueError("CERN account and password are required for uploading.") @@ -102,4 +121,4 @@ def upload_file(self, local_file_path: str, remote_file_path: str) -> None: with open(local_file_path, 'rb') as f: response = requests.put(url, data=f, auth=(self.account, self.password)) - response.raise_for_status() \ No newline at end of file + response.raise_for_status() diff --git a/refactory/test_connections.py b/refactory/test_connections.py index 56a9de4..ad34e99 100644 --- a/refactory/test_connections.py +++ b/refactory/test_connections.py @@ -11,22 +11,24 @@ def test_s3(): folders = s3.list_folders(base_path) print("✅ Read: Success! 
Connected to S3.") print(f"Found {len(folders)} folders in '{base_path}'.") + + ### Test Upload - print("\n--- Testing S3 Upload ---") - remote_test_file = f"{base_path}upload_test_s3.txt" + # print("\n--- Testing S3 Upload ---") + # remote_test_file = f"{base_path}upload_test_s3.txt" - with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as tmp: - tmp.write("Upload test file generated by test_connections.py") - tmp_path = tmp.name + # with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as tmp: + # tmp.write("Upload test file generated by test_connections.py") + # tmp_path = tmp.name - try: - s3.upload_file(tmp_path, remote_test_file) - print(f"✅ Write: Success! File sent to: {remote_test_file}") - finally: - os.remove(tmp_path) + # try: + # s3.upload_file(tmp_path, remote_test_file) + # print(f"✅ Write: Success! File sent to: {remote_test_file}") + # finally: + # os.remove(tmp_path) except Exception as e: - print(f"❌ Failed to connect/operate on S3.") + print("❌ Failed to connect/operate on S3.") print(f"Details: {e}") def test_cernbox(): @@ -37,9 +39,9 @@ def test_cernbox(): read_base_path = "" # Relative path inside the public link # 2. Write Variables (Private/Authenticated) - cern_user = "" # CERN username + cern_user = "gadesant" # CERN username cern_password = os.environ.get("CERNBOX_PASSWORD") - write_base_path = ""#"eos/user/{u}/{user}/teste/" + write_base_path = "eos/user/g/gadesant/teste/"#"eos/user/{u}/{user}/teste/" if public_hash == "PUT_YOUR_PUBLIC_HASH_HERE": print("Warning: Configure the public_hash in the code before testing.") @@ -65,17 +67,18 @@ def test_cernbox(): with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as tmp: tmp.write("Authenticated upload from test_connections.py") tmp_path = tmp.name - - try: - cernbox.upload_file(tmp_path, remote_test_file) - print(f"✅ Write: Success (Authenticated)! 
File sent to: {remote_test_file}") - finally: - os.remove(tmp_path) + + ### Test Upload + # try: + # cernbox.upload_file(tmp_path, remote_test_file) + # print(f"✅ Write: Success (Authenticated)! File sent to: {remote_test_file}") + # finally: + # os.remove(tmp_path) except Exception as e: - print(f"❌ Failed to connect/operate on CERNBOX.") + print("❌ Failed to connect/operate on CERNBOX.") print(f"Details: {e}") if __name__ == "__main__": - # test_s3() - test_cernbox() \ No newline at end of file + test_s3() + # test_cernbox() diff --git a/scripts/double_check_pdfs.py b/scripts/double_check_pdfs.py index f4e2625..8780bb5 100644 --- a/scripts/double_check_pdfs.py +++ b/scripts/double_check_pdfs.py @@ -1,6 +1,6 @@ import boto3 import tempfile -from PyPDF2 import PdfReader +from pypdf import PdfReader import os def get_s3_client(): From 80d7da27b3b69e0a4349f4f4d9992e01ed8251ec Mon Sep 17 00:00:00 2001 From: Gabriela de Santana Carrara Date: Tue, 31 Mar 2026 18:34:34 +0200 Subject: [PATCH 3/5] remove requirements from gitignore --- .gitignore | 4 +--- requirements.txt | 3 +++ 2 files changed, 4 insertions(+), 3 deletions(-) create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index 8d48b18..b5aa617 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,4 @@ import_xml_files/ # Virtual environments .venv - -requirements.txt -s3_pdf_issues.json \ No newline at end of file +s3_pdf_issues.json diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d9af537 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +boto3 +requests +pypdf \ No newline at end of file From d92b7a062a309daa5b3241ad32b906499fc83c7f Mon Sep 17 00:00:00 2001 From: Gabriela de Santana Carrara Date: Wed, 1 Apr 2026 13:47:40 +0200 Subject: [PATCH 4/5] remove debug print --- refactory/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/refactory/main.py b/refactory/main.py index d61f1ba..9876cb3 100644 --- a/refactory/main.py +++ b/refactory/main.py @@ 
-14,7 +14,6 @@ def run_validation_pipeline( target_excel_hash: str, upload_reports: bool = False, ): - print(type(upload_reports),bool(upload_reports)) """Navigates directories, validates files, and logs files status.""" inventory_provider = CernboxProvider(target_excel_hash) excel_files = inventory_provider.list_excel("") From 95ecdd32b0415ec9a0bad82f1898f6eba5e24a48 Mon Sep 17 00:00:00 2001 From: Gabriela de Santana Carrara Date: Wed, 8 Apr 2026 17:11:21 +0200 Subject: [PATCH 5/5] add cli and array and range as valid inputs --- pyproject.toml | 1 + refactory/README.md | 42 +++++++++++++----- refactory/cli.py | 76 +++++++++++++++++++++++++++++++++ refactory/main.py | 31 ++++++++------ refactory/storage_connection.py | 32 +++++++------- refactory/test_connections.py | 40 ++++------------- requirements.txt | 6 +-- 7 files changed, 154 insertions(+), 74 deletions(-) create mode 100644 refactory/cli.py diff --git a/pyproject.toml b/pyproject.toml index d990287..f1ab434 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,3 +22,4 @@ build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] digitization = "digitization.cli:digitization" +digitization_v2 = "refactory.cli:digitization_v2" diff --git a/refactory/README.md b/refactory/README.md index f5f3121..43aae66 100644 --- a/refactory/README.md +++ b/refactory/README.md @@ -53,33 +53,53 @@ export SECRET_KEY="YOUR_SECRET_KEY" ## Usage with Poetry -Run the main script via Poetry: +Run the refactored CLI via Poetry: ```bash -poetry run python refactory/main.py +poetry run digitization_v2 --help +``` + +The current command for PDF validation is `validate-files-integrity`. + +### Example + +```bash +poetry run digitization_v2 validate-files-integrity -s "[122,123]" -u ``` Parameters: -- `target_excel_hash`: public CERNBox hash containing the inventory Excel files. -- `upload_reports`: `0` to skip report upload, `1` to upload generated reports back to the S3 bucket. +- `-s, --inventory-source`: Inventory source. 
Supports CERNBOX Hash, range (`1..10`), or list (`[1,2]`). +- `-u, --upload-reports`: Flag to upload validation reports back to the storage provider. +- `-b, --bucket`: S3 bucket name (default: `digitization-dev`). -### Example +### Example without upload ```bash -poetry run python refactory/main.py QslvWRIPsBcDAOK 0 +poetry run digitization_v2 validate-files-integrity -s "[122,123]" ``` ## Expected output -The script generates: +The CLI generates the same validation reports as the core pipeline: -- `s3_pdf_issues.log` - text log with valid and corrupted files. -- `s3_pdf_issues.json` - structured report with metadata, statistics, and file lists. +- a text log file such as `s3_pdf_issues.log` +- a structured JSON report with valid, corrupted, and missing file details -If `upload_reports=1`, the reports are also uploaded back to the S3 bucket. +If `-u` is provided, the reports will be uploaded back to the configured storage provider. ## Additional notes -- If CERNBox upload requires authentication, provide `account` and `password` to `CernboxProvider`. +- `CernboxProvider` reads optional credentials from environment variables: + - `CERNBOX_ACCOUNT` + - `CERNBOX_PASSWORD` + +### Example environment variables for Cernbox + +```bash +export CERNBOX_ACCOUNT="your_username" +export CERNBOX_PASSWORD="your_password" +``` + +- `CernboxProvider` takes only the public link hash; `account` and `password` can no longer be passed to its constructor. - Use `test_connections.py` to verify connections before running the main pipeline. diff --git a/refactory/cli.py b/refactory/cli.py new file mode 100644 index 0000000..d5feb4b --- /dev/null +++ b/refactory/cli.py @@ -0,0 +1,76 @@ +import click +import ast +from .main import run_validation_pipeline +from storage_connection import S3Provider + + +def parse_inventory(value): + """ + Parses the input to identify if it's a literal list, + a range of IDs (1..10), or a single string/ID. 
+ """ + if value.isdigit(): + return [int(value)] + if value.startswith("[") and value.endswith("]"): + try: + return ast.literal_eval(value) + except (ValueError, SyntaxError): + raise click.BadParameter("Invalid list format. Use '[1, 2, 3]'") + + if ".." in value: + try: + start, end = map(int, value.split("..")) + return list(range(start, end + 1)) + except ValueError: + pass + return value + +@click.group() +def digitization_v2(): + pass + + +@digitization_v2.command("validate-files-integrity") +@click.option( + "-s", + "--inventory-source", + required=True, + help="Target inventory. Supports a CERNBOX hash, range 1..10, or list [1,2].", +) +@click.option( + "-u", + "--upload-reports", + is_flag=True, + help="Upload validation reports back to the storage provider.", +) +@click.option( + "-b", + "--bucket", + default="digitization-dev", + show_default=True, + help="S3 Bucket name.", +) +def validate_files_integrity(inventory_source, upload_reports, bucket): + """ + Validates files integrity and inventory alignment. + This command checks for corrupted files and missing boxes. + """ + + inventory_input = parse_inventory(inventory_source) + provider = S3Provider(bucket=bucket) + + try: + run_validation_pipeline( + provider=provider, + base_path="cern-archives/raw/PDF/", + log_file="s3_pdf_issues.log", + inventory_source=inventory_input, + upload_reports=upload_reports, + ) + click.echo("Process finished. 
Check the generated logs for details.") + except Exception as e: + click.secho(f"Error: {e}", fg="red", err=True) + + +if __name__ == "__main__": + digitization_v2() diff --git a/refactory/main.py b/refactory/main.py index 9876cb3..665486d 100644 --- a/refactory/main.py +++ b/refactory/main.py @@ -3,6 +3,7 @@ import os import sys import json +from typing import Union from storage_connection import StorageProvider, S3Provider, CernboxProvider from validate_pdf import is_pdf_valid @@ -11,21 +12,26 @@ def run_validation_pipeline( provider: StorageProvider, base_path: str, log_file: str, - target_excel_hash: str, + inventory_source: Union[str, list[int]], upload_reports: bool = False, + ): """Navigates directories, validates files, and logs files status.""" - inventory_provider = CernboxProvider(target_excel_hash) - excel_files = inventory_provider.list_excel("") - target_box_numbers = set() - for file_path in excel_files: - filename = file_path.split(".")[0] + if isinstance(inventory_source, str): + inventory_provider = CernboxProvider(inventory_source) + excel_files = inventory_provider.list_excel("") + + for file_path in excel_files: + filename = file_path.split(".")[0] + + match = re.search(r"(?i:BOITE)[\-_]O0(\d+)(-\w+)?", filename) - match = re.search(r"(?i:BOITE)[\-_]O0(\d+)(-\w+)?", filename) + if match: + target_box_numbers.add(int(match.group(1))) + elif isinstance(inventory_source, list): + target_box_numbers = set(inventory_source) - if match: - target_box_numbers.add(int(match.group(1))) print(f"Excel files: {len(target_box_numbers)} boxes to check.") print(f"Folders in: {base_path}") @@ -72,7 +78,7 @@ def run_validation_pipeline( missing_boxes = target_box_numbers - found_and_valid_boxes if missing_boxes: - print("\n🚨 Empty target boxes:") + print("\n Empty target boxes:") for box in sorted(missing_boxes): print( f" -> BOITE_O0{box}" @@ -125,12 +131,11 @@ def run_validation_pipeline( if __name__ == "__main__": s3_provider = 
S3Provider(bucket="digitization-dev") - # cernbox_provider = CernboxProvider(public_link_hash="XjjFxUWUMpuTYCz") run_validation_pipeline( provider=s3_provider, # cernbox_provider base_path="cern-archives/raw/PDF/", # "teste/", log_file="s3_pdf_issues.log", - target_excel_hash=sys.argv[1], # public_link_hash - upload_reports=int(sys.argv[2]), # 0 | 1 + inventory_source=sys.argv[1], # public_link_hash + upload_reports=int(sys.argv[2]) ) diff --git a/refactory/storage_connection.py b/refactory/storage_connection.py index 6b48743..bdfdad9 100644 --- a/refactory/storage_connection.py +++ b/refactory/storage_connection.py @@ -17,7 +17,7 @@ def list_pdfs(self, folder_path: str) -> list[str]: @abstractmethod def download_to_temp(self, file_path: str, temp_file_path: str) -> None: pass - + @abstractmethod def upload_file(self, local_file_path: str, remote_file_path: str) -> None: pass @@ -56,34 +56,36 @@ def download_to_temp(self, file_path: str, temp_file_path: str) -> None: def upload_file(self, local_file_path: str, remote_file_path: str) -> None: self.s3.upload_file(local_file_path, self.bucket, remote_file_path) - + def list_excel(self, folder_path: str) -> list[str]: response = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=folder_path) return [ obj['Key'] for obj in response.get('Contents', []) if obj['Key'].lower().endswith('.xlsx') - ] + ] class CernboxProvider(StorageProvider): - def __init__(self, public_link_hash: str, account: str = None, password: str = None): + def __init__(self, public_link_hash: str): self.public_link_hash = public_link_hash - self.account = account - self.password = password - + self.account = os.getenv("CERNBOX_ACCOUNT") + self.password = os.getenv("CERNBOX_PASSWORD") + self.public_base_url = f"https://cernbox.cern.ch/remote.php/dav/public-files/{public_link_hash}" - - if account: - self.auth_base_url = f"https://api.cernbox.cern.ch/remote.php/dav/files/{account}" + + if self.account: + self.auth_base_url = 
f"https://api.cernbox.cern.ch/remote.php/dav/files/{self.account}" + else: + self.auth_base_url = None def _propfind(self, path: str, depth: str = "1") -> list[str]: url = f"{self.public_base_url}/{path}".rstrip('/') + '/' headers = {'Depth': depth} - + response = requests.request('PROPFIND', url, headers=headers) response.raise_for_status() root = ET.fromstring(response.content) namespaces = {'d': 'DAV:'} - + paths = [] for response_tag in root.findall('d:response', namespaces)[1:]: href = response_tag.find('d:href', namespaces).text @@ -114,11 +116,11 @@ def list_excel(self, folder_path: str)-> list[str]: def upload_file(self, local_file_path: str, remote_file_path: str) -> None: if not self.account or not self.password: raise ValueError("CERN account and password are required for uploading.") - + clean_remote_path = remote_file_path.lstrip('/') url = f"{self.auth_base_url}/{clean_remote_path}" - + with open(local_file_path, 'rb') as f: response = requests.put(url, data=f, auth=(self.account, self.password)) - + response.raise_for_status() diff --git a/refactory/test_connections.py b/refactory/test_connections.py index ad34e99..242f139 100644 --- a/refactory/test_connections.py +++ b/refactory/test_connections.py @@ -7,46 +7,31 @@ def test_s3(): try: s3 = S3Provider(bucket="digitization-dev") base_path = "cern-archives/raw/PDF/" - + folders = s3.list_folders(base_path) print("✅ Read: Success! Connected to S3.") print(f"Found {len(folders)} folders in '{base_path}'.") - ### Test Upload - - # print("\n--- Testing S3 Upload ---") - # remote_test_file = f"{base_path}upload_test_s3.txt" - - # with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as tmp: - # tmp.write("Upload test file generated by test_connections.py") - # tmp_path = tmp.name - - # try: - # s3.upload_file(tmp_path, remote_test_file) - # print(f"✅ Write: Success! 
File sent to: {remote_test_file}") - # finally: - # os.remove(tmp_path) - except Exception as e: print("❌ Failed to connect/operate on S3.") print(f"Details: {e}") def test_cernbox(): print("\n--- Testing CERNBOX connection (Hybrid Mode) ---") - + # 1. Read Variables (Public) public_hash = "QslvWRIPsBcDAOK" read_base_path = "" # Relative path inside the public link - + # 2. Write Variables (Private/Authenticated) cern_user = "gadesant" # CERN username cern_password = os.environ.get("CERNBOX_PASSWORD") write_base_path = "eos/user/g/gadesant/teste/"#"eos/user/{u}/{user}/teste/" - + if public_hash == "PUT_YOUR_PUBLIC_HASH_HERE": print("Warning: Configure the public_hash in the code before testing.") return - + if not cern_password: print("❌ The CERNBOX_PASSWORD environment variable is not set.") print("Run in terminal: export CERNBOX_PASSWORD='your_password'") @@ -55,26 +40,17 @@ def test_cernbox(): try: # Passing all three arguments cernbox = CernboxProvider(public_link_hash=public_hash, account=cern_user, password=cern_password) - + print("\n[Phase 1: Reading from Public Link]") folders = cernbox.list_folders(read_base_path) print("✅ Read: Success (Anonymous)!") print(f"Found {len(folders)} items at the root of the link.") - + print("\n[Phase 2: Writing via Authenticated WebDAV]") - remote_test_file = f"{write_base_path}hybrid_connection_test.txt" - + with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as tmp: tmp.write("Authenticated upload from test_connections.py") - tmp_path = tmp.name - ### Test Upload - # try: - # cernbox.upload_file(tmp_path, remote_test_file) - # print(f"✅ Write: Success (Authenticated)! 
File sent to: {remote_test_file}") - # finally: - # os.remove(tmp_path) - except Exception as e: print("❌ Failed to connect/operate on CERNBOX.") print(f"Details: {e}") diff --git a/requirements.txt b/requirements.txt index d9af537..8d71172 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -boto3 -requests -pypdf \ No newline at end of file +boto3>=1.38.9,<2.0.0 +requests>=2.0.0,<3.0.0 +pypdf>=3.0.0,<7.0.0