diff --git a/physionet/__init__.py b/physionet/__init__.py index e1d0d76b..1a64bec6 100644 --- a/physionet/__init__.py +++ b/physionet/__init__.py @@ -1,3 +1,9 @@ -from .api import PhysioNetClient +from physionet.api import PhysioNetClient + +try: + from importlib.metadata import version + __version__ = version("physionet") +except Exception: + __version__ = "unknown" __all__ = ["PhysioNetClient"] diff --git a/physionet/__main__.py b/physionet/__main__.py new file mode 100644 index 00000000..1e7a4f85 --- /dev/null +++ b/physionet/__main__.py @@ -0,0 +1,7 @@ +"""Allow running the CLI as a module: python -m physionet.""" + +import sys +from physionet.cli import main + +if __name__ == "__main__": + sys.exit(main()) diff --git a/physionet/cli.py b/physionet/cli.py new file mode 100644 index 00000000..575eba3c --- /dev/null +++ b/physionet/cli.py @@ -0,0 +1,142 @@ +"""Command-line interface for physionet package.""" + +import argparse +import json +import sys +from pathlib import Path + +from physionet.validate import validate_dataset, ValidationConfig + + +def main(): + """Main entry point for the CLI.""" + parser = argparse.ArgumentParser( + prog="physionet", + description="Tools for working with PhysioNet datasets", + ) + + subparsers = parser.add_subparsers(dest="command", help="Available commands") + + # Validate subcommand + validate_parser = subparsers.add_parser( + "validate", + help="Validate a dataset before submission to PhysioNet", + ) + validate_parser.add_argument( + "path", + help="Path to the dataset directory to validate", + ) + validate_parser.add_argument( + "--report", + metavar="FILE", + help="Generate detailed JSON report and save to FILE", + ) + validate_parser.add_argument( + "--checks", + metavar="CATEGORIES", + help="Comma-separated list of check categories to run (filesystem,documentation,integrity,quality,privacy)", + ) + validate_parser.add_argument( + "--level", + choices=["error", "warning", "info"], + default="info", + help="Minimum 
severity level to display (default: info)", + ) + validate_parser.add_argument( + "--no-sampling", + action="store_true", + help="Disable sampling for large files (scan all rows, slower but more thorough)", + ) + validate_parser.add_argument( + "--max-rows", + type=int, + metavar="N", + help="Maximum rows to scan per CSV file (default: 10000)", + ) + + args = parser.parse_args() + + if args.command == "validate": + return _handle_validate(args) + elif args.command is None: + parser.print_help() + return 0 + else: + print(f"Unknown command: {args.command}", file=sys.stderr) + return 1 + + +def _handle_validate(args): + """Handle the validate subcommand.""" + # Validate path + dataset_path = Path(args.path) + if not dataset_path.exists(): + print(f"Error: Path does not exist: {args.path}", file=sys.stderr) + return 1 + + if not dataset_path.is_dir(): + print(f"Error: Path is not a directory: {args.path}", file=sys.stderr) + return 1 + + # Configure validation + config = ValidationConfig() + + # Parse check categories if specified + if args.checks: + categories = [c.strip().lower() for c in args.checks.split(",")] + config.check_filesystem = "filesystem" in categories + config.check_documentation = "documentation" in categories + config.check_integrity = "integrity" in categories + config.check_quality = "quality" in categories + config.check_phi = "privacy" in categories + + # Configure sampling options + if args.no_sampling: + config.sample_large_files = False + if args.max_rows: + config.max_rows_to_scan = args.max_rows + + # Run validation + try: + print(f"Validating dataset: {dataset_path}") + result = validate_dataset(str(dataset_path), config, show_progress=True) + print() + + print(result.summary()) + + # Save validation report - either to specified path or default location + if args.report: + report_path = Path(args.report) + # Determine format based on file extension + if report_path.suffix.lower() == '.json': + # Save as JSON + with open(report_path, "w", 
encoding="utf-8") as f: + json.dump(result.to_dict(), f, indent=2) + else: + # Save as Markdown + with open(report_path, "w", encoding="utf-8") as f: + f.write(result.summary()) + else: + # Default: save as Markdown in the root of the dataset folder + report_path = dataset_path / "PHYSIONET_REPORT.md" + with open(report_path, "w", encoding="utf-8") as f: + f.write(result.summary()) + + print() + print(f"Validation report saved to: {report_path}") + + if result.status == "error": + return 1 + elif result.status == "warning" and args.level == "error": + return 0 # Warnings don't fail if level is error + return 0 + + except Exception as e: + print(f"Error during validation: {str(e)}", file=sys.stderr) + import traceback + traceback.print_exc() + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/physionet/validate/__init__.py b/physionet/validate/__init__.py new file mode 100644 index 00000000..cebe0280 --- /dev/null +++ b/physionet/validate/__init__.py @@ -0,0 +1,7 @@ +"""Dataset validation module for PhysioNet submissions.""" + +from physionet.validate.validator import validate_dataset +from physionet.validate.config import ValidationConfig +from physionet.validate.models import ValidationResult + +__all__ = ["validate_dataset", "ValidationConfig", "ValidationResult"] diff --git a/physionet/validate/checks/__init__.py b/physionet/validate/checks/__init__.py new file mode 100644 index 00000000..b8ad08ea --- /dev/null +++ b/physionet/validate/checks/__init__.py @@ -0,0 +1,15 @@ +"""Validation check modules.""" + +from physionet.validate.checks.filesystem import check_filesystem +from physionet.validate.checks.documentation import check_documentation +from physionet.validate.checks.integrity import check_integrity +from physionet.validate.checks.quality import check_quality +from physionet.validate.checks.privacy import check_privacy + +__all__ = [ + "check_filesystem", + "check_documentation", + "check_integrity", + "check_quality", + 
"check_privacy", +] diff --git a/physionet/validate/checks/documentation.py b/physionet/validate/checks/documentation.py new file mode 100644 index 00000000..9d6edc04 --- /dev/null +++ b/physionet/validate/checks/documentation.py @@ -0,0 +1,48 @@ +"""Documentation validation checks.""" + +from pathlib import Path + +from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity +from physionet.validate.config import ValidationConfig + + +def check_documentation(path: Path, config: ValidationConfig) -> CheckResult: + """ + Check documentation completeness. + + Validates: + - Required files exist (if any are specified in config) + + Args: + path: Path to dataset directory + config: Validation configuration + + Returns: + CheckResult with any documentation issues found + """ + result = CheckResult(category=CheckCategory.DOCUMENTATION) + + # Check for required files + for required_file in config.required_files: + file_path = path / required_file + if not file_path.exists(): + # Customize suggestion for README.md + if required_file == "README.md": + suggestion = ( + "Add README.md to your dataset. At minimum, the file should include " + "a title and a brief description of the package content." 
+ ) + else: + suggestion = f"Add {required_file} to your dataset" + + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.DOCUMENTATION, + file=required_file, + message=f"Required file not found: {required_file}", + suggestion=suggestion, + ) + ) + + return result diff --git a/physionet/validate/checks/filesystem.py b/physionet/validate/checks/filesystem.py new file mode 100644 index 00000000..c1f14f4e --- /dev/null +++ b/physionet/validate/checks/filesystem.py @@ -0,0 +1,194 @@ +"""File system validation checks.""" + +import os +from pathlib import Path + +from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity +from physionet.validate.config import ValidationConfig + +# Proprietary file formats and their recommended open alternatives +PROPRIETARY_FORMATS = { + '.mat': 'MATLAB format; consider .csv, .zarr, .parquet, or .npy instead', + '.sas7bdat': 'SAS format; consider .csv or .parquet instead', + '.dta': 'Stata format; consider .csv or .parquet instead', + '.sav': 'SPSS format; consider .csv or .parquet instead', + '.xlsx': 'Excel format; consider .csv instead', + '.xls': 'Excel format; consider .csv instead', + '.rds': 'R binary format; consider .csv or .parquet instead', + '.rdata': 'R binary format; consider .csv or .parquet instead', + '.ppt': 'PowerPoint format; consider .pdf instead', + '.pptx': 'PowerPoint format; consider .pdf instead', +} + + +def check_filesystem(path: Path, config: ValidationConfig) -> CheckResult: + """ + Check file system organization and structure. 
+ + Validates: + - File naming conventions + - Presence of version control artifacts + - File sizes + - Small file count + + Args: + path: Path to dataset directory + config: Validation configuration + + Returns: + CheckResult with any filesystem issues found + """ + result = CheckResult(category=CheckCategory.FILESYSTEM) + + # Check for version control artifacts + for pattern in [".git", ".svn", ".hg", "__pycache__", ".pytest_cache"]: + found_paths = list(path.rglob(pattern)) + if found_paths: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + message=f"Found version control/build artifacts: {pattern}", + suggestion=f"Remove {pattern} directories before submission", + ) + ) + + # Check for hidden and temp files + for root, dirs, files in os.walk(path): + # Filter ignored directories + dirs[:] = [d for d in dirs if not any(p in d for p in config.ignore_patterns)] + + for file in files: + file_path = Path(root) / file + + # Skip ignored files + if any(p in file for p in config.ignore_patterns): + continue + + # Check for hidden files (starting with .) 
+ if file.startswith(".") and file not in [".gitignore", ".gitattributes"]: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Hidden file found: {file}", + suggestion="Remove hidden files before submission", + ) + ) + + # Check for temp files + if file.endswith(("~", ".tmp", ".bak", ".swp")): + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Temporary file found: {file}", + suggestion="Remove temporary files before submission", + ) + ) + + # Check file size + try: + size = file_path.stat().st_size + if size == 0: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message="Empty file (0 bytes)", + suggestion="Remove empty files or add content", + ) + ) + elif config.max_file_size_bytes and size > config.max_file_size_bytes: + result.issues.append( + ValidationIssue( + severity=Severity.INFO, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Large file: {_format_size(size)}", + suggestion="Consider splitting or compressing large files", + ) + ) + except (OSError, PermissionError): + pass + + # Check for excessively long filenames + # Most filesystems support 255 characters, but recommend shorter for compatibility + if len(file) > 255: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Filename exceeds maximum length ({len(file)} characters): {file[:50]}...", + suggestion="Shorten filename to 255 characters or less", + ) + ) + elif len(file) > 100: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + 
message=f"Filename is very long ({len(file)} characters): {file[:50]}...", + suggestion="Consider shortening filename for better compatibility (recommended: under 100 characters)", + ) + ) + + # Check for spaces in filename + if " " in file: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Filename contains spaces: {file}", + suggestion="Replace spaces with underscores or hyphens", + ) + ) + + # Check for invalid/awkward characters in filename + # Include path separators, quotes, and other problematic characters + invalid_chars = set('<>:"|?*/\\\'') + found_invalid = [char for char in invalid_chars if char in file] + + if found_invalid: + char_list = ", ".join(f"'{char}'" for char in found_invalid) + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Filename contains invalid characters ({char_list}): {file}", + suggestion="Remove special characters from filename (use only letters, numbers, underscores, hyphens, and periods)", + ) + ) + + # Check for proprietary file formats + file_ext = "." + file.split(".")[-1] if "." 
in file else "" + file_ext_lower = file_ext.lower() + + if file_ext_lower in PROPRIETARY_FORMATS: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Proprietary file format detected: {file}", + suggestion=f"{PROPRIETARY_FORMATS[file_ext_lower]}", + ) + ) + + return result + + +def _format_size(size_bytes: int) -> str: + """Format byte size as human-readable string.""" + for unit in ["B", "KB", "MB", "GB", "TB"]: + if size_bytes < 1024.0: + return f"{size_bytes:.1f} {unit}" + size_bytes /= 1024.0 + return f"{size_bytes:.1f} PB" diff --git a/physionet/validate/checks/integrity.py b/physionet/validate/checks/integrity.py new file mode 100644 index 00000000..70428544 --- /dev/null +++ b/physionet/validate/checks/integrity.py @@ -0,0 +1,163 @@ +"""Data integrity validation checks.""" + +import csv +from pathlib import Path +from typing import Optional, Callable + +from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity +from physionet.validate.config import ValidationConfig + + +def check_integrity(path: Path, config: ValidationConfig, progress_callback: Optional[Callable[[str], None]] = None) -> CheckResult: + """ + Check data integrity and format validation. 
+ + Validates: + - CSV file structure + - File format validity + - Basic structural consistency + + Args: + path: Path to dataset directory + config: Validation configuration + progress_callback: Optional callback to report progress + + Returns: + CheckResult with any integrity issues found + """ + result = CheckResult(category=CheckCategory.INTEGRITY) + + # Find and validate CSV files + csv_files = list(path.rglob("*.csv")) + for i, csv_file in enumerate(csv_files): + if progress_callback: + progress_callback(f"Checking {csv_file.name} ({i+1}/{len(csv_files)} CSV files)") + + if any(p in str(csv_file) for p in config.ignore_patterns): + continue + + _validate_csv_structure(csv_file, path, result) + + return result + + +def _validate_csv_structure(csv_file: Path, base_path: Path, result: CheckResult) -> None: + """Validate CSV file structure.""" + try: + with open(csv_file, "r", encoding="utf-8") as f: + # Try to detect dialect + sample = f.read(1024) + f.seek(0) + + try: + dialect = csv.Sniffer().sniff(sample) + except csv.Error: + # Use default dialect if detection fails + dialect = csv.excel + + reader = csv.reader(f, dialect) + + # Read header + try: + header = next(reader) + except StopIteration: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message="CSV file is empty", + ) + ) + return + + if not header: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message="CSV file has no header row", + ) + ) + return + + # Check for duplicate column names + if len(header) != len(set(header)): + duplicates = [col for col in header if header.count(col) > 1] + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message=f"Duplicate column names found: {', 
'.join(set(duplicates))}", + ) + ) + + # Check for empty column names + if any(not col.strip() for col in header): + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message="CSV contains empty column names", + ) + ) + + # Validate row consistency + expected_cols = len(header) + row_count = 0 + for line_num, row in enumerate(reader, start=2): # Start at 2 (after header) + row_count += 1 + if len(row) != expected_cols: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + line=line_num, + message=f"Row has {len(row)} columns, expected {expected_cols}", + ) + ) + # Only report first few inconsistencies to avoid spam + if len([i for i in result.issues if i.file == str(csv_file.relative_to(base_path))]) >= 5: + result.issues.append( + ValidationIssue( + severity=Severity.INFO, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message=f"Additional row inconsistencies may exist (showing first 5)", + ) + ) + break + + if row_count == 0: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message="CSV file contains only header row (no data)", + ) + ) + + except UnicodeDecodeError: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message="CSV file has encoding issues (not valid UTF-8)", + suggestion="Convert file to UTF-8 encoding", + ) + ) + except Exception as e: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message=f"Could not validate CSV file: {str(e)}", + ) + ) diff --git a/physionet/validate/checks/privacy.py 
b/physionet/validate/checks/privacy.py new file mode 100644 index 00000000..3db7e0c8 --- /dev/null +++ b/physionet/validate/checks/privacy.py @@ -0,0 +1,312 @@ +"""Privacy and PHI validation checks.""" + +import csv +import os +import re +from pathlib import Path +from typing import Optional, Callable + +from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity +from physionet.validate.config import ValidationConfig + +# Pattern names for better error messages +PHI_PATTERN_NAMES = { + r"\b\d{3}-\d{2}-\d{4}\b": "SSN", + r"\b[\w\.-]+@[\w\.-]+\.\w+\b": "email address", + r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b": "phone number", +} + +# Sensitive configuration files that should not be included in datasets +SENSITIVE_FILES = { + # API keys and credentials + ".env": "environment variables (may contain API keys)", + ".env.local": "local environment variables", + ".env.production": "production environment variables", + "credentials.json": "credential file", + "secrets.json": "secrets file", + "config.json": "configuration file (may contain credentials)", + ".aws/credentials": "AWS credentials", + ".aws/config": "AWS configuration", + + # SSH and certificates + "id_rsa": "SSH private key", + "id_dsa": "SSH private key", + "id_ecdsa": "SSH private key", + "id_ed25519": "SSH private key", + ".pem": "private certificate/key", + ".key": "private key", + ".p12": "certificate file", + ".pfx": "certificate file", + + # Database + ".pgpass": "PostgreSQL password file", + ".my.cnf": "MySQL configuration (may contain passwords)", + + # Other sensitive files + ".netrc": "authentication credentials", + ".htpasswd": "HTTP authentication", + "docker-compose.override.yml": "Docker override (may contain secrets)", +} + + +def check_privacy(path: Path, config: ValidationConfig, progress_callback: Optional[Callable[[str], None]] = None) -> CheckResult: + """ + Check for potential privacy issues and PHI. 
+ + Validates: + - PHI pattern detection + - Age de-identification + - Sensitive configuration files (keys, credentials) + - Date patterns + + Args: + path: Path to dataset directory + config: Validation configuration + progress_callback: Optional callback to report progress + + Returns: + CheckResult with any privacy issues found + """ + result = CheckResult(category=CheckCategory.PRIVACY) + + # Check for sensitive configuration files + if progress_callback: + progress_callback("Checking for sensitive configuration files") + _check_sensitive_files(path, config, result) + + # Compile PHI patterns with names + pattern_info = [(re.compile(pattern), PHI_PATTERN_NAMES.get(pattern, "unknown pattern")) + for pattern in config.phi_patterns] + + # Check CSV files + csv_files = list(path.rglob("*.csv")) + for i, csv_file in enumerate(csv_files): + if progress_callback: + progress_callback(f"Checking {csv_file.name} ({i+1}/{len(csv_files)} CSV files)") + + if any(p in str(csv_file) for p in config.ignore_patterns): + continue + + _check_csv_privacy(csv_file, path, config, pattern_info, result) + + # Check text files for PHI + text_files = list(path.rglob("*.txt")) + for i, text_file in enumerate(text_files): + if progress_callback: + progress_callback(f"Checking {text_file.name} ({i+1}/{len(text_files)} text files)") + + if any(p in str(text_file) for p in config.ignore_patterns): + continue + + _check_text_file_privacy(text_file, path, pattern_info, result, config) + + return result + + +def _check_sensitive_files(path: Path, config: ValidationConfig, result: CheckResult) -> None: + """Check for sensitive configuration files that shouldn't be in the dataset.""" + for root, dirs, files in os.walk(path): + # Filter out ignored directories + dirs[:] = [d for d in dirs if not any(p in d for p in config.ignore_patterns)] + + for file in files: + file_path = Path(root) / file + relative_path = str(file_path.relative_to(path)) + + # Skip ignored files + if any(p in str(file_path) 
for p in config.ignore_patterns): + continue + + # Check exact filename matches + if file in SENSITIVE_FILES: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.PRIVACY, + file=relative_path, + message=f"Sensitive file detected: {SENSITIVE_FILES[file]}", + suggestion=f"Remove '{file}' from the dataset before submission", + ) + ) + continue + + # Check file extensions for sensitive files + for sensitive_name, description in SENSITIVE_FILES.items(): + # Check if it's an extension pattern (starts with .) + if sensitive_name.startswith(".") and "." in file: + ext = "." + file.split(".")[-1] + if ext == sensitive_name: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.PRIVACY, + file=relative_path, + message=f"Sensitive file detected: {description}", + suggestion=f"Remove '{file}' from the dataset before submission", + ) + ) + break + + # Check for common patterns in filenames + lower_file = file.lower() + if any(keyword in lower_file for keyword in ["password", "secret", "token", "apikey", "api_key"]): + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.PRIVACY, + file=relative_path, + message=f"File name suggests sensitive content: '{file}'", + suggestion="Review file contents and remove if it contains credentials or keys", + ) + ) + + +def _check_csv_privacy( + csv_file: Path, + base_path: Path, + config: ValidationConfig, + pattern_info: list, + result: CheckResult +) -> None: + """Check a CSV file for privacy issues.""" + relative_path = str(csv_file.relative_to(base_path)) + + # Track which columns have which types of issues (to report only once per column) + # Maps column name to the pattern name that matched + phi_columns = {} # {column: pattern_name} + age_columns = set() # Columns with age violations + + try: + with open(csv_file, "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + + # Determine if we should sample this 
file + rows_scanned = 0 + max_rows = config.max_rows_to_scan + + # Count total rows first if we're sampling + if config.sample_large_files and max_rows: + # Read all rows into list to enable sampling + all_rows = list(reader) + total_rows = len(all_rows) + + if total_rows > max_rows: + # Sample evenly distributed rows + import random + random.seed(42) # Deterministic sampling + step = total_rows / max_rows + sampled_indices = [int(i * step) for i in range(max_rows)] + rows_to_scan = [all_rows[i] for i in sampled_indices] + is_sampled = True + else: + rows_to_scan = all_rows + is_sampled = False + else: + # No sampling, but still respect max_rows limit + rows_to_scan = reader + is_sampled = False + + for line_num, row in enumerate(rows_to_scan, start=2): # Start at 2 (after header) + # Stop if we've hit the limit (when not sampling) + if max_rows and not is_sampled and rows_scanned >= max_rows: + break + rows_scanned += 1 + + for col, value in row.items(): + if not value: + continue + + value_str = str(value).strip() + + # Check for PHI patterns (only track if not already found in this column) + if col not in phi_columns: + for pattern, pattern_name in pattern_info: + if pattern.search(value_str): + phi_columns[col] = pattern_name + break + + # Check for age violations (only track if not already found in this column) + if col not in age_columns and "age" in col.lower(): + try: + age_value = float(value_str) + if age_value > config.allowed_age_max: + age_columns.add(col) + except ValueError: + pass + + # Report one issue per column type with specific pattern info + for col, pattern_name in phi_columns.items(): + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.PRIVACY, + file=relative_path, + column=col, + message=f"Potential private information detected in column '{col}' (pattern: {pattern_name})", + suggestion="Review and remove or de-identify sensitive information", + ) + ) + + for col in age_columns: + 
result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.PRIVACY, + file=relative_path, + column=col, + message=f"Ages exceeding HIPAA limit of {config.allowed_age_max} found in column '{col}'", + suggestion=f"De-identify ages >{config.allowed_age_max} (e.g., set to {config.allowed_age_max}+)", + ) + ) + + except Exception as e: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.PRIVACY, + file=str(csv_file.relative_to(base_path)), + message=f"Could not perform privacy checks: {str(e)}", + ) + ) + + +def _check_text_file_privacy(text_file: Path, base_path: Path, pattern_info: list, result: CheckResult, config: ValidationConfig) -> None: + """Check a text file for privacy issues.""" + relative_path = str(text_file.relative_to(base_path)) + detected_patterns = set() + + try: + with open(text_file, "r", encoding="utf-8") as f: + content = f.read() + + # Check for PHI patterns and track which ones are found + for line in content.split("\n"): + for pattern, pattern_name in pattern_info: + if pattern.search(line): + detected_patterns.add(pattern_name) + + # Report once per file with specific patterns found + if detected_patterns: + patterns_str = ", ".join(sorted(detected_patterns)) + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.PRIVACY, + file=relative_path, + message=f"Potential private information detected ({patterns_str})", + suggestion="Review and remove or de-identify sensitive information", + ) + ) + + except UnicodeDecodeError: + # Skip binary files + pass + except Exception as e: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.PRIVACY, + file=str(text_file.relative_to(base_path)), + message=f"Could not perform privacy checks: {str(e)}", + ) + ) diff --git a/physionet/validate/checks/quality.py b/physionet/validate/checks/quality.py new file mode 100644 index 00000000..f2ec2794 --- 
/dev/null +++ b/physionet/validate/checks/quality.py @@ -0,0 +1,141 @@ +"""Data quality validation checks.""" + +import csv +from pathlib import Path +from typing import Optional, Callable + +from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity +from physionet.validate.config import ValidationConfig + + +def check_quality(path: Path, config: ValidationConfig, progress_callback: Optional[Callable[[str], None]] = None) -> CheckResult: + """ + Check data quality. + + Validates: + - Missing value thresholds + - Value range plausibility + - Data type consistency + + Args: + path: Path to dataset directory + config: Validation configuration + progress_callback: Optional callback to report progress + + Returns: + CheckResult with any quality issues found + """ + result = CheckResult(category=CheckCategory.QUALITY) + + # Find and validate CSV files + csv_files = list(path.rglob("*.csv")) + for i, csv_file in enumerate(csv_files): + if progress_callback: + progress_callback(f"Checking {csv_file.name} ({i+1}/{len(csv_files)} CSV files)") + + if any(p in str(csv_file) for p in config.ignore_patterns): + continue + + _check_csv_quality(csv_file, path, config, result) + + return result + + +def _check_csv_quality(csv_file: Path, base_path: Path, config: ValidationConfig, result: CheckResult) -> None: + """Check quality metrics for a CSV file.""" + try: + with open(csv_file, "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + + # Track column statistics + column_stats = {col: {"total": 0, "missing": 0, "values": []} for col in reader.fieldnames or []} + + # Determine if we should sample this file + rows_scanned = 0 + max_rows = config.max_rows_to_scan + + # Sample if enabled and file is large + if config.sample_large_files and max_rows: + all_rows = list(reader) + total_rows = len(all_rows) + + if total_rows > max_rows: + # Sample evenly distributed rows + import random + random.seed(42) # Deterministic sampling + step = total_rows 
/ max_rows + sampled_indices = [int(i * step) for i in range(max_rows)] + rows_to_scan = [all_rows[i] for i in sampled_indices] + else: + rows_to_scan = all_rows + else: + rows_to_scan = reader + + for row in rows_to_scan: + # Stop if we've hit the limit (when not sampling) + if max_rows and not config.sample_large_files and rows_scanned >= max_rows: + break + rows_scanned += 1 + + for col, value in row.items(): + column_stats[col]["total"] += 1 + + # Check for missing values + if not value or value.strip() in ("", "NA", "N/A", "NULL", "null", "None", "NaN"): + column_stats[col]["missing"] += 1 + else: + # Store value for range checking if configured + if col.lower().replace("_", " ") in [k.lower().replace("_", " ") for k in config.value_ranges]: + try: + numeric_value = float(value.strip()) + column_stats[col]["values"].append(numeric_value) + except ValueError: + pass + + # Analyze results + for col, stats in column_stats.items(): + if stats["total"] == 0: + continue + + # Check missing value threshold + missing_ratio = stats["missing"] / stats["total"] + if missing_ratio >= config.missing_value_threshold: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.QUALITY, + file=str(csv_file.relative_to(base_path)), + column=col, + message=f"Column '{col}' is completely empty (100% missing values)", + suggestion=f"Consider removing empty column '{col}' or adding data", + ) + ) + + # Check value ranges + for range_key, (min_val, max_val) in config.value_ranges.items(): + if col.lower().replace("_", " ") == range_key.lower().replace("_", " "): + for value in stats["values"]: + if value < min_val or value > max_val: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.QUALITY, + file=str(csv_file.relative_to(base_path)), + column=col, + value=str(value), + message=f"Value {value} in '{col}' outside expected range [{min_val}, {max_val}]", + suggestion="Verify data accuracy or adjust 
validation ranges", + ) + ) + # Limit warnings per column + break + + except Exception as e: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.QUALITY, + file=str(csv_file.relative_to(base_path)), + message=f"Could not perform quality checks: {str(e)}", + ) + ) diff --git a/physionet/validate/config.py b/physionet/validate/config.py new file mode 100644 index 00000000..f596d689 --- /dev/null +++ b/physionet/validate/config.py @@ -0,0 +1,44 @@ +"""Configuration for validation checks.""" + +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple + + +@dataclass +class ValidationConfig: + """Configuration for dataset validation.""" + + # General settings + check_filesystem: bool = True + check_documentation: bool = True + check_integrity: bool = True + check_quality: bool = True + check_phi: bool = True + + # File system settings + max_file_size_bytes: Optional[int] = None # None = no limit + warn_small_files_threshold: int = 100 # Warn if more than this many small files + ignore_patterns: List[str] = field(default_factory=lambda: [ + ".git", ".gitignore", ".DS_Store", "__pycache__", "*.pyc", ".pytest_cache" + ]) + + # Documentation settings + required_files: List[str] = field(default_factory=lambda: ["README.md"]) + recommended_readme_sections: List[str] = field(default_factory=list) + + # Performance settings + max_rows_to_scan: Optional[int] = 10000 # Max rows to scan per CSV for privacy/quality checks (None = all rows) + sample_large_files: bool = True # If True, sample rows from large files instead of scanning all + + # Quality settings + missing_value_threshold: float = 1.0 # Warn if column has 100% missing values + value_ranges: Dict[str, Tuple[float, float]] = field(default_factory=dict) + # Example: {"heart_rate": (20, 300), "temperature": (32, 43)} + + # Privacy settings + allowed_age_max: int = 89 + phi_patterns: List[str] = field(default_factory=lambda: [ + 
r"\b\d{3}-\d{2}-\d{4}\b", # SSN pattern + r"\b[\w\.-]+@[\w\.-]+\.\w+\b", # Email pattern + r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b", # Phone pattern + ]) diff --git a/physionet/validate/models.py b/physionet/validate/models.py new file mode 100644 index 00000000..acfdfb87 --- /dev/null +++ b/physionet/validate/models.py @@ -0,0 +1,309 @@ +"""Data models for validation results.""" + +from dataclasses import dataclass, field +from typing import List, Optional, Dict, Any +from enum import Enum +from datetime import datetime + + +class Severity(Enum): + """Severity levels for validation issues.""" + ERROR = "error" + WARNING = "warning" + INFO = "info" + + +class CheckCategory(Enum): + """Categories of validation checks.""" + FILESYSTEM = "filesystem" + DOCUMENTATION = "documentation" + INTEGRITY = "integrity" + QUALITY = "quality" + PRIVACY = "privacy" + + +@dataclass +class ValidationIssue: + """Represents a single validation issue.""" + severity: Severity + category: CheckCategory + message: str + file: Optional[str] = None + line: Optional[int] = None + column: Optional[str] = None + value: Optional[str] = None + suggestion: Optional[str] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert issue to dictionary format.""" + result = { + "severity": self.severity.value, + "category": self.category.value, + "message": self.message, + } + if self.file: + result["file"] = self.file + if self.line is not None: + result["line"] = self.line + if self.column: + result["column"] = self.column + if self.value: + result["value"] = self.value + if self.suggestion: + result["suggestion"] = self.suggestion + return result + + +@dataclass +class CheckResult: + """Results from a specific category of checks.""" + category: CheckCategory + issues: List[ValidationIssue] = field(default_factory=list) + + @property + def status(self) -> str: + """Get overall status for this check category.""" + if any(issue.severity == Severity.ERROR for issue in self.issues): + return "error" + elif 
any(issue.severity == Severity.WARNING for issue in self.issues): + return "warning" + return "pass" + + @property + def error_count(self) -> int: + """Count of errors in this category.""" + return sum(1 for issue in self.issues if issue.severity == Severity.ERROR) + + @property + def warning_count(self) -> int: + """Count of warnings in this category.""" + return sum(1 for issue in self.issues if issue.severity == Severity.WARNING) + + @property + def info_count(self) -> int: + """Count of info messages in this category.""" + return sum(1 for issue in self.issues if issue.severity == Severity.INFO) + + +@dataclass +class DatasetStats: + """Statistics about the dataset being validated.""" + total_size_bytes: int = 0 + file_count: int = 0 + directory_count: int = 0 + + +@dataclass +class ValidationResult: + """Complete validation results for a dataset.""" + dataset_path: str + timestamp: str + check_results: Dict[CheckCategory, CheckResult] = field(default_factory=dict) + dataset_stats: DatasetStats = field(default_factory=DatasetStats) + + @property + def total_errors(self) -> int: + """Total count of errors across all checks.""" + return sum(result.error_count for result in self.check_results.values()) + + @property + def total_warnings(self) -> int: + """Total count of warnings across all checks.""" + return sum(result.warning_count for result in self.check_results.values()) + + @property + def total_info(self) -> int: + """Total count of info messages across all checks.""" + return sum(result.info_count for result in self.check_results.values()) + + @property + def status(self) -> str: + """Overall validation status.""" + if self.total_errors > 0: + return "error" + elif self.total_warnings > 0: + return "warning" + return "pass" + + def summary(self) -> str: + """Generate a human-readable summary.""" + # Format timestamp as human-readable + try: + dt = datetime.fromisoformat(self.timestamp.replace('Z', '+00:00')) + formatted_timestamp = dt.strftime("%Y-%m-%d 
%H:%M:%S UTC") + except (ValueError, AttributeError): + formatted_timestamp = self.timestamp + + # Get package version + try: + import physionet + validator_version = physionet.__version__ + except (ImportError, AttributeError): + validator_version = "unknown" + + lines = [] + + # Section 1: Metadata + lines.extend([ + "PhysioNet Dataset Validation Report", + "=" * 50, + "", + "Metadata:", + f" Dataset: {self.dataset_path}", + f" Validator version: {validator_version}", + f" Timestamp: {formatted_timestamp}", + f" Total size: {self._format_size(self.dataset_stats.total_size_bytes)} " + f"({self.dataset_stats.file_count} files)", + "", + ]) + + # Section 2: Validation Results + lines.extend([ + "Validation Results:", + "=" * 50, + ]) + + for category, result in self.check_results.items(): + # Only show ✗ for errors, ✓ for pass or warnings-only + status_icon = "✗" if result.error_count > 0 else "✓" + issue_summary = "" + if result.error_count or result.warning_count: + parts = [] + if result.error_count: + parts.append(f"{result.error_count} error{'s' if result.error_count != 1 else ''}") + if result.warning_count: + parts.append(f"{result.warning_count} warning{'s' if result.warning_count != 1 else ''}") + issue_summary = f" ({', '.join(parts)})" + + lines.append(f"{status_icon} {category.value.replace('_', ' ').title()}{issue_summary}") + + for issue in result.issues: + icon = "✗" if issue.severity == Severity.ERROR else "⚠" + location = f" {issue.file}" + if issue.line: + location += f":{issue.line}" + lines.append(f" {icon}{location} - {issue.message}") + + lines.append("") + + # Section 3: Summary + lines.extend([ + "Summary:", + "=" * 50, + f"{self.total_errors} error{'s' if self.total_errors != 1 else ''}, " + f"{self.total_warnings} warning{'s' if self.total_warnings != 1 else ''}", + "", + ]) + + if self.status == "error": + lines.append("✗ Dataset has errors that must be fixed before submission") + elif self.status == "warning": + lines.append("⚠ Dataset 
has warnings that should be addressed before submission") + else: + lines.append("✓ Dataset passed validation") + + # Add recommendations section if there are issues + recommendations = self._generate_recommendations() + if recommendations: + lines.extend([ + "", + "Recommendations:", + "=" * 50, + ]) + lines.extend(recommendations) + + # Add note about including validation report in submission + lines.extend([ + "", + "Note: A validation report (PHYSIONET_REPORT.md) has been saved in your", + " dataset folder. Please include this file in your final submission.", + ]) + + return "\n".join(lines) + "\n" + + def to_dict(self) -> Dict[str, Any]: + """Convert validation result to dictionary format.""" + return { + "dataset_path": self.dataset_path, + "timestamp": self.timestamp, + "dataset_stats": { + "total_size_bytes": self.dataset_stats.total_size_bytes, + "file_count": self.dataset_stats.file_count, + "directory_count": self.dataset_stats.directory_count, + }, + "summary": { + "total_errors": self.total_errors, + "total_warnings": self.total_warnings, + "total_info": self.total_info, + "status": self.status, + }, + "checks": { + category.value: { + "status": result.status, + "issues": [issue.to_dict() for issue in result.issues], + } + for category, result in self.check_results.items() + }, + } + + def _generate_recommendations(self) -> List[str]: + """Generate actionable recommendations based on issues found.""" + recommendations = [] + + # Check for very large datasets (>200GB) + size_gb = self.dataset_stats.total_size_bytes / (1024 ** 3) + if size_gb > 200: + recommendations.append("\nDataset Size:") + recommendations.append( + f" ℹ Your dataset is very large ({self._format_size(self.dataset_stats.total_size_bytes)}). 
" + "If you need assistance uploading large files, please contact the PhysioNet team at contact@physionet.org" + ) + + # Collect unique suggestions from all issues + suggestions_by_category = {} + + for category, result in self.check_results.items(): + category_suggestions = {} + + for issue in result.issues: + if issue.suggestion: + # Group by suggestion to avoid duplicates + if issue.suggestion not in category_suggestions: + category_suggestions[issue.suggestion] = { + 'severity': issue.severity, + 'count': 0 + } + category_suggestions[issue.suggestion]['count'] += 1 + + if category_suggestions: + suggestions_by_category[category] = category_suggestions + + # Generate recommendations by category + for category, suggestions in suggestions_by_category.items(): + if not suggestions: + continue + + recommendations.append(f"\n{category.value.replace('_', ' ').title()}:") + + # Sort by severity (errors first) and then by count + sorted_suggestions = sorted( + suggestions.items(), + key=lambda x: (x[1]['severity'] != Severity.ERROR, -x[1]['count']) + ) + + for suggestion, info in sorted_suggestions: + count = info['count'] + icon = "✗" if info['severity'] == Severity.ERROR else "⚠" + count_str = f" ({count} file{'s' if count != 1 else ''})" if count > 1 else "" + recommendations.append(f" {icon} {suggestion}{count_str}") + + return recommendations + + @staticmethod + def _format_size(size_bytes: int) -> str: + """Format byte size as human-readable string.""" + for unit in ["B", "KB", "MB", "GB", "TB"]: + if size_bytes < 1024.0: + return f"{size_bytes:.1f} {unit}" + size_bytes /= 1024.0 + return f"{size_bytes:.1f} PB" diff --git a/physionet/validate/validator.py b/physionet/validate/validator.py new file mode 100644 index 00000000..094751ba --- /dev/null +++ b/physionet/validate/validator.py @@ -0,0 +1,153 @@ +"""Main validation logic.""" + +import os +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional + +from tqdm import tqdm 
+ +from physionet.validate.config import ValidationConfig +from physionet.validate.models import ( + ValidationResult, + CheckResult, + ValidationIssue, + CheckCategory, + Severity, + DatasetStats, +) +from physionet.validate.checks import ( + check_filesystem, + check_documentation, + check_integrity, + check_quality, + check_privacy, +) + + +def validate_dataset( + dataset_path: str, + config: Optional[ValidationConfig] = None, + show_progress: bool = True +) -> ValidationResult: + """ + Validate a PhysioNet dataset before submission. + + Args: + dataset_path: Path to the dataset directory + config: Optional validation configuration. If None, uses defaults. + show_progress: Whether to show progress bar. Default True. + + Returns: + ValidationResult containing all validation issues and statistics + + Raises: + ValueError: If dataset_path doesn't exist or isn't a directory + """ + path = Path(dataset_path) + if not path.exists(): + raise ValueError(f"Dataset path does not exist: {dataset_path}") + if not path.is_dir(): + raise ValueError(f"Dataset path is not a directory: {dataset_path}") + + if config is None: + config = ValidationConfig() + + # Initialize result + result = ValidationResult( + dataset_path=path.name, # Use just the dataset folder name, not full path + timestamp=datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + ) + + # Calculate dataset statistics + result.dataset_stats = _calculate_stats(path, config) + + # Determine which checks to run + checks_to_run = [] + if config.check_filesystem: + checks_to_run.append(("Filesystem", CheckCategory.FILESYSTEM, check_filesystem)) + if config.check_documentation: + checks_to_run.append(("Documentation", CheckCategory.DOCUMENTATION, check_documentation)) + if config.check_integrity: + checks_to_run.append(("Integrity", CheckCategory.INTEGRITY, check_integrity)) + if config.check_quality: + checks_to_run.append(("Quality", CheckCategory.QUALITY, check_quality)) + if config.check_phi: + 
checks_to_run.append(("Privacy", CheckCategory.PRIVACY, check_privacy)) + + # Run validation checks with progress bar + if show_progress: + progress_bar = tqdm( + total=100, + desc="Running validation checks", + unit="%", + leave=False, + ncols=100, + bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}%" + ) + + steps_per_check = 100 // len(checks_to_run) if checks_to_run else 100 + + for i, (name, category, check_func) in enumerate(checks_to_run): + # Create a callback to update progress during this check + def update_progress(msg: str): + progress_bar.set_description(f"{name}: {msg}"[:80]) + + progress_bar.set_description(f"{name}"[:80]) + + # Call check function with progress callback if it supports it + try: + result.check_results[category] = check_func(path, config, progress_callback=update_progress) + except TypeError: + # Function doesn't support progress_callback parameter + result.check_results[category] = check_func(path, config) + + # Update progress + progress_bar.update(steps_per_check) + + progress_bar.close() + else: + for name, category, check_func in checks_to_run: + # Try with progress_callback first, fall back to without + try: + result.check_results[category] = check_func(path, config, progress_callback=None) + except TypeError: + result.check_results[category] = check_func(path, config) + + return result + + +def _calculate_stats(path: Path, config: ValidationConfig) -> DatasetStats: + """Calculate statistics about the dataset.""" + stats = DatasetStats() + + for root, dirs, files in os.walk(path): + # Filter out ignored directories + dirs[:] = [d for d in dirs if not _should_ignore(d, config.ignore_patterns)] + + stats.directory_count += len(dirs) + + for file in files: + if _should_ignore(file, config.ignore_patterns): + continue + + file_path = Path(root) / file + try: + stats.file_count += 1 + stats.total_size_bytes += file_path.stat().st_size + except (OSError, PermissionError): + # Skip files we can't access + pass + + return stats + + +def 
_should_ignore(name: str, patterns: list) -> bool: + """Check if a file or directory should be ignored.""" + for pattern in patterns: + if pattern.startswith("*"): + if name.endswith(pattern[1:]): + return True + elif pattern in name: + return True + return False diff --git a/pyproject.toml b/pyproject.toml index b2590640..b8d5c124 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "physionet" -version = "0.1.4" +version = "0.1.5" authors = [ { name="Tom Pollard", email="tpollard@mit.edu" }, ] @@ -23,6 +23,7 @@ dependencies = [ "pandas", "openpyxl", "requests", + "tqdm", ] [project.optional-dependencies] @@ -41,6 +42,9 @@ line-length = 119 [tool.pyright] reportMissingImports = true +[project.scripts] +physionet = "physionet.cli:main" + [project.urls] homepage = "https://github.com/MIT-LCP/physionet" repository = "https://github.com/MIT-LCP/physionet" diff --git a/tests/validate/__init__.py b/tests/validate/__init__.py new file mode 100644 index 00000000..1c130bf2 --- /dev/null +++ b/tests/validate/__init__.py @@ -0,0 +1 @@ +"""Tests for validation module.""" diff --git a/tests/validate/test_checks.py b/tests/validate/test_checks.py new file mode 100644 index 00000000..745dc27b --- /dev/null +++ b/tests/validate/test_checks.py @@ -0,0 +1,472 @@ +"""Tests for individual validation checks.""" + +import pytest +import csv +from pathlib import Path + +from physionet.validate import ValidationConfig +from physionet.validate.checks import ( + check_filesystem, + check_documentation, + check_integrity, + check_quality, + check_privacy, +) +from physionet.validate.models import Severity, CheckCategory + + +class TestFilesystemChecks: + """Tests for filesystem validation checks.""" + + def test_detects_git_directory(self, tmp_path): + """Test that .git directories are detected.""" + (tmp_path / ".git").mkdir() + (tmp_path / ".git" / "config").write_text("test") + + config = ValidationConfig() + result = 
check_filesystem(tmp_path, config) + + assert any(".git" in issue.message for issue in result.issues) + + def test_detects_hidden_files(self, tmp_path): + """Test that hidden files are detected.""" + (tmp_path / ".hidden").write_text("test") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + assert any(issue.file and ".hidden" in issue.file for issue in result.issues) + + def test_detects_temp_files(self, tmp_path): + """Test that temporary files are detected.""" + (tmp_path / "file.txt~").write_text("test") + (tmp_path / "temp.tmp").write_text("test") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + assert len(result.issues) >= 2 + + def test_detects_empty_files(self, tmp_path): + """Test that empty files are detected.""" + (tmp_path / "empty.txt").write_text("") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + assert any("Empty file" in issue.message for issue in result.issues) + + def test_detects_invalid_filename_characters(self, tmp_path): + """Test that invalid filename characters are detected.""" + # Note: This test might not work on all filesystems + try: + (tmp_path / "file<name>.txt").write_text("test") + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + assert any("invalid characters" in issue.message.lower() for issue in result.issues) + # Should show which character was found + assert any("<" in issue.message for issue in result.issues) + except OSError: + # Skip test if filesystem doesn't allow these characters + pytest.skip("Filesystem doesn't support invalid characters in filenames") + + def test_detects_path_separators_in_filenames(self, tmp_path): + """Test that path separators and other awkward characters are flagged.""" + # These characters should be caught even though they can't actually be in filenames on most systems + # We test the validation logic by checking the character set + from physionet.validate.checks.filesystem 
import check_filesystem + + # Create a file with a valid name for the actual test + (tmp_path / "normalfile.txt").write_text("test") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + # The check should flag files with /, \, quotes, etc if they could exist + # Since we can't create such files, we verify the character set in the code includes them + # This is tested indirectly through the previous test + + def test_detects_spaces_in_filenames(self, tmp_path): + """Test that filenames with spaces are flagged.""" + (tmp_path / "my data file.csv").write_text("col1,col2\n1,2\n") + (tmp_path / "analysis results.txt").write_text("test") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + # Should warn about both files with spaces + space_warnings = [ + issue for issue in result.issues + if "spaces" in issue.message.lower() + ] + assert len(space_warnings) == 2 + assert any("my data file.csv" in issue.file for issue in space_warnings) + assert any("analysis results.txt" in issue.file for issue in space_warnings) + + def test_detects_long_filenames(self, tmp_path): + """Test that excessively long filenames are flagged.""" + # Create a file with a very long name (120 characters total) + long_name = "a" * 116 + ".csv" # 116 + 4 = 120 characters + (tmp_path / long_name).write_text("col1,col2\n1,2\n") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + # Should warn about long filename + long_warnings = [ + issue for issue in result.issues + if "very long" in issue.message.lower() + ] + assert len(long_warnings) == 1 + assert "120 characters" in long_warnings[0].message + + def test_detects_extremely_long_filenames(self, tmp_path): + """Test that filenames exceeding maximum length are errors.""" + # Create a file with name exceeding 255 characters + extreme_name = "b" * 260 + ".csv" + try: + (tmp_path / extreme_name).write_text("col1,col2\n1,2\n") + + config = ValidationConfig() + result 
= check_filesystem(tmp_path, config) + + # Should error about exceeding maximum length + length_errors = [ + issue for issue in result.issues + if "exceeds maximum length" in issue.message.lower() + ] + assert len(length_errors) == 1 + assert "260 characters" in length_errors[0].message + except OSError: + # Skip test if filesystem doesn't support such long names + pytest.skip("Filesystem doesn't support filenames over 255 characters") + + def test_detects_proprietary_formats(self, tmp_path): + """Test that proprietary file formats are flagged.""" + # Create files with proprietary formats + (tmp_path / "data.xlsx").write_text("test") + (tmp_path / "analysis.mat").write_text("test") + (tmp_path / "results.sas7bdat").write_text("test") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + # Should warn about proprietary data formats (not .docx which is allowed) + proprietary_warnings = [ + issue for issue in result.issues + if "proprietary file format" in issue.message.lower() + ] + assert len(proprietary_warnings) == 3 + + # Check that suggestions include alternatives + suggestions = [issue.suggestion for issue in proprietary_warnings] + assert any(".csv" in s or ".parquet" in s for s in suggestions) + assert any(".zarr" in s for s in suggestions) + + def test_allows_open_formats(self, tmp_path): + """Test that open file formats are not flagged.""" + # Create files with open formats (including .docx which is now allowed) + (tmp_path / "README.md").write_text("# Test") + (tmp_path / "data.csv").write_text("col1,col2\n1,2\n") + (tmp_path / "signal.hdf5").write_text("test") + (tmp_path / "record.json").write_text("{}") + (tmp_path / "notes.txt").write_text("notes") + (tmp_path / "protocol.docx").write_text("test") # .docx is now allowed + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + # Should not warn about proprietary formats + proprietary_warnings = [ + issue for issue in result.issues + if "proprietary 
file format" in issue.message.lower() + ] + assert len(proprietary_warnings) == 0 + + +class TestDocumentationChecks: + """Tests for documentation validation checks.""" + + def test_readme_required_by_default(self, tmp_path): + """Test that README.md is required by default.""" + config = ValidationConfig() + result = check_documentation(tmp_path, config) + + # Should have error for missing README.md + assert result.error_count == 1 + assert any("README.md" in issue.message for issue in result.issues) + + # Should have helpful suggestion about minimum content + readme_issue = [issue for issue in result.issues if "README.md" in issue.message][0] + assert "title and a brief description" in readme_issue.suggestion + + def test_custom_required_files(self, tmp_path): + """Test that custom required files are validated.""" + config = ValidationConfig(required_files=["README.md", "LICENSE"]) + result = check_documentation(tmp_path, config) + + # Should have errors for both missing files + assert result.error_count == 2 + assert any("README.md" in issue.message for issue in result.issues) + assert any("LICENSE" in issue.message for issue in result.issues) + + def test_required_file_exists(self, tmp_path): + """Test that existing required file passes validation.""" + readme = tmp_path / "README.md" + readme.write_text("# Title\n\nSome content.") + + config = ValidationConfig(required_files=["README.md"]) + result = check_documentation(tmp_path, config) + + # Should have no errors since README exists + assert result.error_count == 0 + + +class TestIntegrityChecks: + """Tests for data integrity validation checks.""" + + def test_validates_valid_csv(self, tmp_path): + """Test that valid CSV passes validation.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("col1,col2,col3\n1,2,3\n4,5,6\n") + + config = ValidationConfig() + result = check_integrity(tmp_path, config) + + assert result.error_count == 0 + + def test_detects_empty_csv(self, tmp_path): + """Test that empty 
CSV is detected.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("") + + config = ValidationConfig() + result = check_integrity(tmp_path, config) + + assert any("empty" in issue.message.lower() for issue in result.issues) + + def test_detects_duplicate_column_names(self, tmp_path): + """Test that duplicate column names are detected.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("col1,col2,col1\n1,2,3\n") + + config = ValidationConfig() + result = check_integrity(tmp_path, config) + + assert any("Duplicate" in issue.message for issue in result.issues) + + def test_detects_inconsistent_row_length(self, tmp_path): + """Test that inconsistent row lengths are detected.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("col1,col2,col3\n1,2,3\n4,5\n6,7,8,9\n") + + config = ValidationConfig() + result = check_integrity(tmp_path, config) + + # Should detect both short and long rows + assert result.error_count >= 2 + + def test_detects_encoding_issues(self, tmp_path): + """Test that encoding issues are detected.""" + csv_file = tmp_path / "data.csv" + # Write invalid UTF-8 + csv_file.write_bytes(b"col1,col2\n1,\xff\xfe\n") + + config = ValidationConfig() + result = check_integrity(tmp_path, config) + + assert any("encoding" in issue.message.lower() for issue in result.issues) + + +class TestQualityChecks: + """Tests for data quality validation checks.""" + + def test_detects_completely_empty_columns(self, tmp_path): + """Test that completely empty columns (100% missing) are detected.""" + csv_file = tmp_path / "data.csv" + # Create CSV with one column that's 100% empty + rows = ["col1,col2,col3\n"] + for i in range(10): + rows.append(f"{i},data,\n") + csv_file.write_text("".join(rows)) + + config = ValidationConfig() + result = check_quality(tmp_path, config) + + # Should detect the empty column + assert any("empty" in issue.message.lower() and "col3" in issue.column for issue in result.issues) + + def 
test_partial_missing_values_not_flagged(self, tmp_path): + """Test that partially missing columns (e.g., 75%) are not flagged.""" + csv_file = tmp_path / "data.csv" + # Create CSV with 75% missing values in a column + rows = ["col1,col2\n"] + for i in range(100): + if i < 75: + rows.append("1,\n") + else: + rows.append("1,2\n") + csv_file.write_text("".join(rows)) + + config = ValidationConfig() + result = check_quality(tmp_path, config) + + # Should NOT flag col2 since it has some data (25%) + assert not any("col2" in str(issue.column) for issue in result.issues) + + def test_detects_out_of_range_values(self, tmp_path): + """Test that out-of-range values are detected.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("heart_rate\n80\n350\n75\n") + + config = ValidationConfig(value_ranges={"heart_rate": (20, 300)}) + result = check_quality(tmp_path, config) + + assert any("outside expected range" in issue.message for issue in result.issues) + + +class TestPrivacyChecks: + """Tests for privacy validation checks.""" + + def test_date_format_not_flagged(self, tmp_path): + """Test that date formats (YYYY-MM-DD) are not automatically flagged as PHI. + + Dates are commonly used in medical datasets as de-identified timestamps. + They should not be flagged without additional context. 
+ """ + csv_file = tmp_path / "data.csv" + csv_file.write_text("patient_id,admission_date\n1,2023-05-15\n2,2023-06-20\n") + + config = ValidationConfig() + result = check_privacy(tmp_path, config) + + # Dates alone should not be flagged + assert result.error_count == 0 + + def test_detects_email_addresses(self, tmp_path): + """Test that email addresses are detected as PHI.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("patient_id,contact\n1,patient@example.com\n2,test@test.com\n") + + config = ValidationConfig() + result = check_privacy(tmp_path, config) + + # Should have one warning for the 'contact' column with pattern type + assert result.warning_count == 1 + assert any( + issue.severity == Severity.WARNING + and "contact" in str(issue.column) + and "email address" in issue.message + for issue in result.issues + ) + + def test_detects_age_violations(self, tmp_path): + """Test that ages over limit are detected.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("patient_id,age\n1,92\n2,95\n3,85\n") + + config = ValidationConfig(allowed_age_max=89) + result = check_privacy(tmp_path, config) + + # Should have one warning for the age column (consolidated) + age_violations = [ + issue for issue in result.issues + if "age" in issue.message.lower() and issue.severity == Severity.WARNING + ] + assert len(age_violations) == 1 + assert "age" in age_violations[0].column.lower() + + def test_text_files_checked_for_phi(self, tmp_path): + """Test that text files are checked for PHI patterns.""" + text_file = tmp_path / "notes.txt" + text_file.write_text("Contact: test@example.com\nPhone: 555-123-4567") + + config = ValidationConfig() + result = check_privacy(tmp_path, config) + + # Should detect private information patterns in text files as a single consolidated warning with pattern types + assert result.warning_count >= 1 + assert any( + "private information detected" in issue.message + and ("email address" in issue.message or "phone number" in 
issue.message) + for issue in result.issues + ) + + def test_allows_year_only_dates(self, tmp_path): + """Test that year-only dates are allowed.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("patient_id,year\n1,2023\n2,2024\n") + + config = ValidationConfig() + result = check_privacy(tmp_path, config) + + # Should not flag year-only as PHI + phi_issues = [ + issue for issue in result.issues + if issue.severity == Severity.ERROR + ] + assert len(phi_issues) == 0 + + def test_detects_sensitive_config_files(self, tmp_path): + """Test that sensitive configuration files are detected.""" + # Create some sensitive files + (tmp_path / ".env").write_text("API_KEY=secret123") + (tmp_path / "credentials.json").write_text('{"key": "value"}') + (tmp_path / "id_rsa").write_text("-----BEGIN RSA PRIVATE KEY-----") + + config = ValidationConfig() + result = check_privacy(tmp_path, config) + + # Should detect all three sensitive files as errors + sensitive_file_errors = [ + issue for issue in result.issues + if issue.severity == Severity.ERROR and "Sensitive file detected" in issue.message + ] + assert len(sensitive_file_errors) == 3 + + def test_detects_files_with_sensitive_names(self, tmp_path): + """Test that files with sensitive keywords in names are flagged.""" + (tmp_path / "my_api_key.txt").write_text("some data") + (tmp_path / "database_password.csv").write_text("col1\nval1") + + config = ValidationConfig() + result = check_privacy(tmp_path, config) + + # Should warn about files with sensitive keywords in names + keyword_warnings = [ + issue for issue in result.issues + if issue.severity == Severity.WARNING and "name suggests sensitive content" in issue.message + ] + assert len(keyword_warnings) >= 2 + + def test_detects_key_file_extensions(self, tmp_path): + """Test that private key file extensions are detected.""" + (tmp_path / "server.pem").write_text("certificate") + (tmp_path / "private.key").write_text("key data") + + config = ValidationConfig() + result 
= check_privacy(tmp_path, config) + + # Should detect both key files + key_errors = [ + issue for issue in result.issues + if issue.severity == Severity.ERROR + ] + assert len(key_errors) >= 2 + + def test_sampling_large_files(self, tmp_path): + """Test that large files are sampled for performance.""" + csv_file = tmp_path / "large.csv" + + # Create a file with more rows than the sampling limit + rows = ["patient_id,email\n"] + for i in range(15000): # More than default max_rows_to_scan (10000) + rows.append(f"{i},test{i}@example.com\n") + csv_file.write_text("".join(rows)) + + config = ValidationConfig(max_rows_to_scan=1000, sample_large_files=True) + result = check_privacy(tmp_path, config) + + # Should still detect the email pattern even with sampling + assert result.warning_count >= 1 + assert any("email" in str(issue.column) for issue in result.issues) diff --git a/tests/validate/test_cli.py b/tests/validate/test_cli.py new file mode 100644 index 00000000..0cb17ed5 --- /dev/null +++ b/tests/validate/test_cli.py @@ -0,0 +1,134 @@ +"""Tests for CLI interface.""" + +import pytest +import json +import subprocess +import sys +from pathlib import Path + + +class TestValidateCLI: + """Tests for the validate CLI command.""" + + def test_cli_validates_directory(self, tmp_path): + """Test that CLI can validate a directory.""" + # Create a minimal dataset + readme = tmp_path / "README.md" + readme.write_text("""# Test Dataset + +## Background +Test background. + +## Methods +Test methods. + +## Data Description +Test data. + +## Usage Notes +Test usage. + +## References +Test references. 
+""") + + # Run CLI + result = subprocess.run( + [sys.executable, "-m", "physionet", "validate", str(tmp_path)], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "PhysioNet Dataset Validation Report" in result.stdout + + def test_cli_handles_nonexistent_path(self): + """Test that CLI handles nonexistent paths gracefully.""" + result = subprocess.run( + [sys.executable, "-m", "physionet", "validate", "/nonexistent/path"], + capture_output=True, + text=True, + ) + + assert result.returncode == 1 + assert "does not exist" in result.stderr + + def test_cli_generates_json_report(self, tmp_path): + """Test that CLI can generate JSON report.""" + # Create dataset + readme = tmp_path / "README.md" + readme.write_text("# Test") + + # Run CLI with --report + report_file = tmp_path / "report.json" + result = subprocess.run( + [sys.executable, "-m", "physionet", "validate", str(tmp_path), "--report", str(report_file)], + capture_output=True, + text=True, + ) + + # Check that report was created + assert report_file.exists() + + # Validate JSON structure + with open(report_file) as f: + report = json.load(f) + + assert "dataset_path" in report + assert "timestamp" in report + assert "summary" in report + assert "checks" in report + + def test_cli_filters_by_check_category(self, tmp_path): + """Test that CLI can filter checks by category.""" + readme = tmp_path / "README.md" + readme.write_text("# Test") + + result = subprocess.run( + [sys.executable, "-m", "physionet", "validate", str(tmp_path), "--checks", "filesystem"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + # Should only show filesystem checks + assert "Filesystem" in result.stdout or "filesystem" in result.stdout.lower() + + def test_cli_exits_with_error_on_validation_failure(self, tmp_path): + """Test that CLI exits with error code when validation fails.""" + # Create dataset with PHI + csv_file = tmp_path / "data.csv" + 
csv_file.write_text("patient_id,email\n1,test@example.com\n") + + result = subprocess.run( + [sys.executable, "-m", "physionet", "validate", str(tmp_path)], + capture_output=True, + text=True, + ) + + # Should exit with error code due to validation errors + assert result.returncode == 1 + + def test_cli_shows_help(self): + """Test that CLI shows help message.""" + result = subprocess.run( + [sys.executable, "-m", "physionet", "--help"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "validate" in result.stdout + + def test_validate_subcommand_help(self): + """Test that validate subcommand shows help.""" + result = subprocess.run( + [sys.executable, "-m", "physionet", "validate", "--help"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "path" in result.stdout + assert "--report" in result.stdout + assert "--checks" in result.stdout diff --git a/tests/validate/test_validator.py b/tests/validate/test_validator.py new file mode 100644 index 00000000..3d6b3a60 --- /dev/null +++ b/tests/validate/test_validator.py @@ -0,0 +1,183 @@ +"""Tests for main validation functionality.""" + +import pytest +import tempfile +from pathlib import Path + +from physionet.validate import validate_dataset, ValidationConfig +from physionet.validate.models import Severity, CheckCategory + + +class TestValidateDataset: + """Tests for validate_dataset function.""" + + def test_nonexistent_path_raises_error(self): + """Test that validating a nonexistent path raises ValueError.""" + with pytest.raises(ValueError, match="does not exist"): + validate_dataset("/nonexistent/path") + + def test_file_instead_of_directory_raises_error(self, tmp_path): + """Test that validating a file instead of directory raises ValueError.""" + test_file = tmp_path / "test.txt" + test_file.write_text("test") + + with pytest.raises(ValueError, match="not a directory"): + validate_dataset(str(test_file)) + + def test_empty_directory_validation(self, 
tmp_path): + """Test validation of an empty directory.""" + result = validate_dataset(str(tmp_path)) + + assert result.dataset_path == tmp_path.name + assert result.timestamp is not None + assert result.dataset_stats.file_count == 0 + assert result.dataset_stats.total_size_bytes == 0 + + # Should have error for missing README.md + assert result.total_errors == 1 + assert any("README.md" in str(issue.message) for issue in result.check_results[CheckCategory.DOCUMENTATION].issues) + + def test_minimal_valid_dataset(self, tmp_path): + """Test validation of a minimal valid dataset.""" + # Create README and a simple CSV file + (tmp_path / "README.md").write_text("# Test Dataset") + csv_file = tmp_path / "data.csv" + csv_file.write_text("id,value\n1,100\n2,200\n") + + result = validate_dataset(str(tmp_path)) + + assert result.dataset_stats.file_count == 2 + assert result.total_errors == 0 + + def test_validation_with_custom_config(self, tmp_path): + """Test validation with custom configuration.""" + # Create a dataset with custom requirements + readme = tmp_path / "README.md" + readme.write_text("# Test") + + config = ValidationConfig( + check_filesystem=True, + check_documentation=False, # Disable documentation checks + check_integrity=False, + check_quality=False, + check_phi=False, + ) + + result = validate_dataset(str(tmp_path), config) + + # Should only have filesystem checks + assert CheckCategory.FILESYSTEM in result.check_results + assert CheckCategory.DOCUMENTATION not in result.check_results + + def test_validation_without_progress_bar(self, tmp_path): + """Test validation with progress bar disabled.""" + readme = tmp_path / "README.md" + readme.write_text("# Test") + + # Should not raise any errors with show_progress=False + result = validate_dataset(str(tmp_path), show_progress=False) + assert result.total_errors == 0 + + +class TestValidationStats: + """Tests for dataset statistics calculation.""" + + def test_calculates_file_count(self, tmp_path): + """Test 
that file count is calculated correctly.""" + (tmp_path / "README.md").write_text("# Test") + (tmp_path / "data.csv").write_text("col1,col2\n1,2\n") + (tmp_path / "subdir").mkdir() + (tmp_path / "subdir" / "data2.csv").write_text("col1\n1\n") + + result = validate_dataset(str(tmp_path)) + + assert result.dataset_stats.file_count == 3 + assert result.dataset_stats.directory_count == 1 + + def test_calculates_total_size(self, tmp_path): + """Test that total size is calculated correctly.""" + content = "x" * 1000 + (tmp_path / "README.md").write_text(content) + + result = validate_dataset(str(tmp_path)) + + assert result.dataset_stats.total_size_bytes >= 1000 + + def test_ignores_specified_patterns(self, tmp_path): + """Test that ignored patterns are not counted in stats.""" + (tmp_path / "README.md").write_text("# Test") + (tmp_path / ".git").mkdir() + (tmp_path / ".git" / "config").write_text("test") + + result = validate_dataset(str(tmp_path)) + + # .git directory and its contents should be ignored + assert result.dataset_stats.file_count == 1 + + +class TestValidationResult: + """Tests for ValidationResult model.""" + + def test_summary_format(self, tmp_path): + """Test that summary is properly formatted.""" + (tmp_path / "README.md").write_text("# Test") + + result = validate_dataset(str(tmp_path)) + summary = result.summary() + + assert "PhysioNet Dataset Validation Report" in summary + assert tmp_path.name in summary + assert "Summary:" in summary + assert "Metadata:" in summary + assert "Validation Results:" in summary + + def test_to_dict_format(self, tmp_path): + """Test that to_dict produces valid structure.""" + (tmp_path / "README.md").write_text("# Test") + + result = validate_dataset(str(tmp_path)) + result_dict = result.to_dict() + + assert "dataset_path" in result_dict + assert "timestamp" in result_dict + assert "dataset_stats" in result_dict + assert "summary" in result_dict + assert "checks" in result_dict + + assert 
result_dict["summary"]["total_errors"] == result.total_errors + assert result_dict["summary"]["total_warnings"] == result.total_warnings + + def test_recommendations_section(self, tmp_path): + """Test that recommendations section is included when there are issues.""" + # Create files with issues to trigger recommendations + (tmp_path / "file with spaces.csv").write_text("col1,col2\n1,2\n") + (tmp_path / ".env").write_text("API_KEY=secret") + (tmp_path / "empty.txt").write_text("") + + result = validate_dataset(str(tmp_path)) + summary = result.summary() + + # Should include recommendations section + assert "Recommendations:" in summary + assert "Replace spaces with underscores or hyphens" in summary + assert "Remove" in summary # Various remove recommendations + + def test_large_dataset_recommendation(self, tmp_path): + """Test that large datasets get upload assistance recommendation.""" + # Create README to avoid documentation errors + (tmp_path / "README.md").write_text("# Large Dataset") + + # Create a large file (simulated - we'll modify the stats) + (tmp_path / "data.csv").write_text("col1,col2\n1,2\n") + + result = validate_dataset(str(tmp_path)) + + # Manually set large size for testing (>200GB) + result.dataset_stats.total_size_bytes = 250 * 1024 ** 3 # 250 GB + + summary = result.summary() + + # Should include contact recommendation for large datasets + assert "contact@physionet.org" in summary + assert "very large" in summary.lower() + assert "250" in summary # Should show the size