Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion physionet/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
from .api import PhysioNetClient
from physionet.api import PhysioNetClient

# Expose the installed distribution version. The original caught bare
# Exception, which would also mask unrelated failures (e.g. a typo-ed
# call); catch only the two expected cases instead.
try:
    from importlib.metadata import PackageNotFoundError, version
except ImportError:  # Python < 3.8: importlib.metadata does not exist
    __version__ = "unknown"
else:
    try:
        __version__ = version("physionet")
    except PackageNotFoundError:  # running from a source checkout, not installed
        __version__ = "unknown"

__all__ = ["PhysioNetClient"]
7 changes: 7 additions & 0 deletions physionet/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Allow running the CLI as a module: python -m physionet."""

import sys
from physionet.cli import main

if __name__ == "__main__":
sys.exit(main())
142 changes: 142 additions & 0 deletions physionet/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""Command-line interface for physionet package."""

import argparse
import json
import sys
from pathlib import Path

from physionet.validate import validate_dataset, ValidationConfig


def main():
    """Entry point for the ``physionet`` command-line tool.

    Parses command-line arguments and dispatches to the matching
    subcommand handler.

    Returns:
        Integer process exit code (0 on success, 1 on failure).
    """
    parser = _build_parser()
    args = parser.parse_args()

    if args.command is None:
        # No subcommand given: show usage and exit successfully.
        parser.print_help()
        return 0
    if args.command == "validate":
        return _handle_validate(args)
    # Defensive fallback; argparse normally rejects unknown subcommands itself.
    print(f"Unknown command: {args.command}", file=sys.stderr)
    return 1


def _build_parser():
    """Construct the top-level argument parser and its subcommands."""
    parser = argparse.ArgumentParser(
        prog="physionet",
        description="Tools for working with PhysioNet datasets",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # "validate" subcommand: pre-submission dataset checks.
    validate_parser = subparsers.add_parser(
        "validate",
        help="Validate a dataset before submission to PhysioNet",
    )
    validate_parser.add_argument(
        "path",
        help="Path to the dataset directory to validate",
    )
    validate_parser.add_argument(
        "--report",
        metavar="FILE",
        help="Generate detailed JSON report and save to FILE",
    )
    validate_parser.add_argument(
        "--checks",
        metavar="CATEGORIES",
        help="Comma-separated list of check categories to run (filesystem,documentation,integrity,quality,privacy)",
    )
    validate_parser.add_argument(
        "--level",
        choices=["error", "warning", "info"],
        default="info",
        help="Minimum severity level to display (default: info)",
    )
    validate_parser.add_argument(
        "--no-sampling",
        action="store_true",
        help="Disable sampling for large files (scan all rows, slower but more thorough)",
    )
    validate_parser.add_argument(
        "--max-rows",
        type=int,
        metavar="N",
        help="Maximum rows to scan per CSV file (default: 10000)",
    )

    return parser


def _handle_validate(args):
"""Handle the validate subcommand."""
# Validate path
dataset_path = Path(args.path)
if not dataset_path.exists():
print(f"Error: Path does not exist: {args.path}", file=sys.stderr)
return 1

if not dataset_path.is_dir():
print(f"Error: Path is not a directory: {args.path}", file=sys.stderr)
return 1

# Configure validation
config = ValidationConfig()

# Parse check categories if specified
if args.checks:
categories = [c.strip().lower() for c in args.checks.split(",")]
config.check_filesystem = "filesystem" in categories
config.check_documentation = "documentation" in categories
config.check_integrity = "integrity" in categories
config.check_quality = "quality" in categories
config.check_phi = "privacy" in categories

# Configure sampling options
if args.no_sampling:
config.sample_large_files = False
if args.max_rows:
config.max_rows_to_scan = args.max_rows

# Run validation
try:
print(f"Validating dataset: {dataset_path}")
result = validate_dataset(str(dataset_path), config, show_progress=True)
print()

print(result.summary())

# Save validation report - either to specified path or default location
if args.report:
report_path = Path(args.report)
# Determine format based on file extension
if report_path.suffix.lower() == '.json':
# Save as JSON
with open(report_path, "w", encoding="utf-8") as f:
json.dump(result.to_dict(), f, indent=2)
else:
# Save as Markdown
with open(report_path, "w", encoding="utf-8") as f:
f.write(result.summary())
else:
# Default: save as Markdown in the root of the dataset folder
report_path = dataset_path / "PHYSIONET_REPORT.md"
with open(report_path, "w", encoding="utf-8") as f:
f.write(result.summary())

print()
print(f"Validation report saved to: {report_path}")

if result.status == "error":
return 1
elif result.status == "warning" and args.level == "error":
return 0 # Warnings don't fail if level is error
return 0

except Exception as e:
print(f"Error during validation: {str(e)}", file=sys.stderr)
import traceback
traceback.print_exc()
return 1


if __name__ == "__main__":
sys.exit(main())
7 changes: 7 additions & 0 deletions physionet/validate/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Dataset validation module for PhysioNet submissions."""

from physionet.validate.validator import validate_dataset
from physionet.validate.config import ValidationConfig
from physionet.validate.models import ValidationResult

__all__ = ["validate_dataset", "ValidationConfig", "ValidationResult"]
15 changes: 15 additions & 0 deletions physionet/validate/checks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Validation check modules."""

from physionet.validate.checks.filesystem import check_filesystem
from physionet.validate.checks.documentation import check_documentation
from physionet.validate.checks.integrity import check_integrity
from physionet.validate.checks.quality import check_quality
from physionet.validate.checks.privacy import check_privacy

__all__ = [
"check_filesystem",
"check_documentation",
"check_integrity",
"check_quality",
"check_privacy",
]
48 changes: 48 additions & 0 deletions physionet/validate/checks/documentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Documentation validation checks."""

from pathlib import Path

from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity
from physionet.validate.config import ValidationConfig


def check_documentation(path: Path, config: ValidationConfig) -> CheckResult:
    """
    Check documentation completeness.

    Validates:
    - Required files exist (if any are specified in config)

    Args:
        path: Path to dataset directory
        config: Validation configuration

    Returns:
        CheckResult with any documentation issues found
    """
    result = CheckResult(category=CheckCategory.DOCUMENTATION)

    # Collect every configured file that is absent from the dataset root.
    missing = [name for name in config.required_files if not (path / name).exists()]

    for name in missing:
        if name == "README.md":
            # README gets a more specific, actionable suggestion.
            suggestion = (
                "Add README.md to your dataset. At minimum, the file should include "
                "a title and a brief description of the package content."
            )
        else:
            suggestion = f"Add {name} to your dataset"

        result.issues.append(
            ValidationIssue(
                severity=Severity.ERROR,
                category=CheckCategory.DOCUMENTATION,
                file=name,
                message=f"Required file not found: {name}",
                suggestion=suggestion,
            )
        )

    return result
Loading