diff --git a/physionet/__init__.py b/physionet/__init__.py index e1d0d76b..1a64bec6 100644 --- a/physionet/__init__.py +++ b/physionet/__init__.py @@ -1,3 +1,9 @@ -from .api import PhysioNetClient +from physionet.api import PhysioNetClient + +try: + from importlib.metadata import version + __version__ = version("physionet") +except Exception: + __version__ = "unknown" __all__ = ["PhysioNetClient"] diff --git a/physionet/__main__.py b/physionet/__main__.py new file mode 100644 index 00000000..1e7a4f85 --- /dev/null +++ b/physionet/__main__.py @@ -0,0 +1,7 @@ +"""Allow running the CLI as a module: python -m physionet.""" + +import sys +from physionet.cli import main + +if __name__ == "__main__": + sys.exit(main()) diff --git a/physionet/cli.py b/physionet/cli.py new file mode 100644 index 00000000..575eba3c --- /dev/null +++ b/physionet/cli.py @@ -0,0 +1,142 @@ +"""Command-line interface for physionet package.""" + +import argparse +import json +import sys +from pathlib import Path + +from physionet.validate import validate_dataset, ValidationConfig + + +def main(): + """Main entry point for the CLI.""" + parser = argparse.ArgumentParser( + prog="physionet", + description="Tools for working with PhysioNet datasets", + ) + + subparsers = parser.add_subparsers(dest="command", help="Available commands") + + # Validate subcommand + validate_parser = subparsers.add_parser( + "validate", + help="Validate a dataset before submission to PhysioNet", + ) + validate_parser.add_argument( + "path", + help="Path to the dataset directory to validate", + ) + validate_parser.add_argument( + "--report", + metavar="FILE", + help="Generate detailed JSON report and save to FILE", + ) + validate_parser.add_argument( + "--checks", + metavar="CATEGORIES", + help="Comma-separated list of check categories to run (filesystem,documentation,integrity,quality,privacy)", + ) + validate_parser.add_argument( + "--level", + choices=["error", "warning", "info"], + default="info", + help="Minimum 
severity level to display (default: info)", + ) + validate_parser.add_argument( + "--no-sampling", + action="store_true", + help="Disable sampling for large files (scan all rows, slower but more thorough)", + ) + validate_parser.add_argument( + "--max-rows", + type=int, + metavar="N", + help="Maximum rows to scan per CSV file (default: 10000)", + ) + + args = parser.parse_args() + + if args.command == "validate": + return _handle_validate(args) + elif args.command is None: + parser.print_help() + return 0 + else: + print(f"Unknown command: {args.command}", file=sys.stderr) + return 1 + + +def _handle_validate(args): + """Handle the validate subcommand.""" + # Validate path + dataset_path = Path(args.path) + if not dataset_path.exists(): + print(f"Error: Path does not exist: {args.path}", file=sys.stderr) + return 1 + + if not dataset_path.is_dir(): + print(f"Error: Path is not a directory: {args.path}", file=sys.stderr) + return 1 + + # Configure validation + config = ValidationConfig() + + # Parse check categories if specified + if args.checks: + categories = [c.strip().lower() for c in args.checks.split(",")] + config.check_filesystem = "filesystem" in categories + config.check_documentation = "documentation" in categories + config.check_integrity = "integrity" in categories + config.check_quality = "quality" in categories + config.check_phi = "privacy" in categories + + # Configure sampling options + if args.no_sampling: + config.sample_large_files = False + if args.max_rows: + config.max_rows_to_scan = args.max_rows + + # Run validation + try: + print(f"Validating dataset: {dataset_path}") + result = validate_dataset(str(dataset_path), config, show_progress=True) + print() + + print(result.summary()) + + # Save validation report - either to specified path or default location + if args.report: + report_path = Path(args.report) + # Determine format based on file extension + if report_path.suffix.lower() == '.json': + # Save as JSON + with open(report_path, "w", 
encoding="utf-8") as f: + json.dump(result.to_dict(), f, indent=2) + else: + # Save as Markdown + with open(report_path, "w", encoding="utf-8") as f: + f.write(result.summary()) + else: + # Default: save as Markdown in the root of the dataset folder + report_path = dataset_path / "PHYSIONET_REPORT.md" + with open(report_path, "w", encoding="utf-8") as f: + f.write(result.summary()) + + print() + print(f"Validation report saved to: {report_path}") + + if result.status == "error": + return 1 + elif result.status == "warning" and args.level == "error": + return 0 # Warnings don't fail if level is error + return 0 + + except Exception as e: + print(f"Error during validation: {str(e)}", file=sys.stderr) + import traceback + traceback.print_exc() + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/physionet/validate/__init__.py b/physionet/validate/__init__.py new file mode 100644 index 00000000..cebe0280 --- /dev/null +++ b/physionet/validate/__init__.py @@ -0,0 +1,7 @@ +"""Dataset validation module for PhysioNet submissions.""" + +from physionet.validate.validator import validate_dataset +from physionet.validate.config import ValidationConfig +from physionet.validate.models import ValidationResult + +__all__ = ["validate_dataset", "ValidationConfig", "ValidationResult"] diff --git a/physionet/validate/checks/__init__.py b/physionet/validate/checks/__init__.py new file mode 100644 index 00000000..b8ad08ea --- /dev/null +++ b/physionet/validate/checks/__init__.py @@ -0,0 +1,15 @@ +"""Validation check modules.""" + +from physionet.validate.checks.filesystem import check_filesystem +from physionet.validate.checks.documentation import check_documentation +from physionet.validate.checks.integrity import check_integrity +from physionet.validate.checks.quality import check_quality +from physionet.validate.checks.privacy import check_privacy + +__all__ = [ + "check_filesystem", + "check_documentation", + "check_integrity", + "check_quality", + 
"check_privacy", +] diff --git a/physionet/validate/checks/documentation.py b/physionet/validate/checks/documentation.py new file mode 100644 index 00000000..9d6edc04 --- /dev/null +++ b/physionet/validate/checks/documentation.py @@ -0,0 +1,48 @@ +"""Documentation validation checks.""" + +from pathlib import Path + +from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity +from physionet.validate.config import ValidationConfig + + +def check_documentation(path: Path, config: ValidationConfig) -> CheckResult: + """ + Check documentation completeness. + + Validates: + - Required files exist (if any are specified in config) + + Args: + path: Path to dataset directory + config: Validation configuration + + Returns: + CheckResult with any documentation issues found + """ + result = CheckResult(category=CheckCategory.DOCUMENTATION) + + # Check for required files + for required_file in config.required_files: + file_path = path / required_file + if not file_path.exists(): + # Customize suggestion for README.md + if required_file == "README.md": + suggestion = ( + "Add README.md to your dataset. At minimum, the file should include " + "a title and a brief description of the package content." 
+ ) + else: + suggestion = f"Add {required_file} to your dataset" + + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.DOCUMENTATION, + file=required_file, + message=f"Required file not found: {required_file}", + suggestion=suggestion, + ) + ) + + return result diff --git a/physionet/validate/checks/filesystem.py b/physionet/validate/checks/filesystem.py new file mode 100644 index 00000000..c1f14f4e --- /dev/null +++ b/physionet/validate/checks/filesystem.py @@ -0,0 +1,194 @@ +"""File system validation checks.""" + +import os +from pathlib import Path + +from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity +from physionet.validate.config import ValidationConfig + +# Proprietary file formats and their recommended open alternatives +PROPRIETARY_FORMATS = { + '.mat': 'MATLAB format; consider .csv, .zarr, .parquet, or .npy instead', + '.sas7bdat': 'SAS format; consider .csv or .parquet instead', + '.dta': 'Stata format; consider .csv or .parquet instead', + '.sav': 'SPSS format; consider .csv or .parquet instead', + '.xlsx': 'Excel format; consider .csv instead', + '.xls': 'Excel format; consider .csv instead', + '.rds': 'R binary format; consider .csv or .parquet instead', + '.rdata': 'R binary format; consider .csv or .parquet instead', + '.ppt': 'PowerPoint format; consider .pdf instead', + '.pptx': 'PowerPoint format; consider .pdf instead', +} + + +def check_filesystem(path: Path, config: ValidationConfig) -> CheckResult: + """ + Check file system organization and structure. 
+ + Validates: + - File naming conventions + - Presence of version control artifacts + - File sizes + - Small file count + + Args: + path: Path to dataset directory + config: Validation configuration + + Returns: + CheckResult with any filesystem issues found + """ + result = CheckResult(category=CheckCategory.FILESYSTEM) + + # Check for version control artifacts + for pattern in [".git", ".svn", ".hg", "__pycache__", ".pytest_cache"]: + found_paths = list(path.rglob(pattern)) + if found_paths: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + message=f"Found version control/build artifacts: {pattern}", + suggestion=f"Remove {pattern} directories before submission", + ) + ) + + # Check for hidden and temp files + for root, dirs, files in os.walk(path): + # Filter ignored directories + dirs[:] = [d for d in dirs if not any(p in d for p in config.ignore_patterns)] + + for file in files: + file_path = Path(root) / file + + # Skip ignored files + if any(p in file for p in config.ignore_patterns): + continue + + # Check for hidden files (starting with .) 
+ if file.startswith(".") and file not in [".gitignore", ".gitattributes"]: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Hidden file found: {file}", + suggestion="Remove hidden files before submission", + ) + ) + + # Check for temp files + if file.endswith(("~", ".tmp", ".bak", ".swp")): + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Temporary file found: {file}", + suggestion="Remove temporary files before submission", + ) + ) + + # Check file size + try: + size = file_path.stat().st_size + if size == 0: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message="Empty file (0 bytes)", + suggestion="Remove empty files or add content", + ) + ) + elif config.max_file_size_bytes and size > config.max_file_size_bytes: + result.issues.append( + ValidationIssue( + severity=Severity.INFO, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Large file: {_format_size(size)}", + suggestion="Consider splitting or compressing large files", + ) + ) + except (OSError, PermissionError): + pass + + # Check for excessively long filenames + # Most filesystems support 255 characters, but recommend shorter for compatibility + if len(file) > 255: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Filename exceeds maximum length ({len(file)} characters): {file[:50]}...", + suggestion="Shorten filename to 255 characters or less", + ) + ) + elif len(file) > 100: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + 
message=f"Filename is very long ({len(file)} characters): {file[:50]}...", + suggestion="Consider shortening filename for better compatibility (recommended: under 100 characters)", + ) + ) + + # Check for spaces in filename + if " " in file: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Filename contains spaces: {file}", + suggestion="Replace spaces with underscores or hyphens", + ) + ) + + # Check for invalid/awkward characters in filename + # Include path separators, quotes, and other problematic characters + invalid_chars = set('<>:"|?*/\\\'') + found_invalid = [char for char in invalid_chars if char in file] + + if found_invalid: + char_list = ", ".join(f"'{char}'" for char in found_invalid) + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Filename contains invalid characters ({char_list}): {file}", + suggestion="Remove special characters from filename (use only letters, numbers, underscores, hyphens, and periods)", + ) + ) + + # Check for proprietary file formats + file_ext = "." + file.split(".")[-1] if "." 
in file else "" + file_ext_lower = file_ext.lower() + + if file_ext_lower in PROPRIETARY_FORMATS: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.FILESYSTEM, + file=str(file_path.relative_to(path)), + message=f"Proprietary file format detected: {file}", + suggestion=f"{PROPRIETARY_FORMATS[file_ext_lower]}", + ) + ) + + return result + + +def _format_size(size_bytes: int) -> str: + """Format byte size as human-readable string.""" + for unit in ["B", "KB", "MB", "GB", "TB"]: + if size_bytes < 1024.0: + return f"{size_bytes:.1f} {unit}" + size_bytes /= 1024.0 + return f"{size_bytes:.1f} PB" diff --git a/physionet/validate/checks/integrity.py b/physionet/validate/checks/integrity.py new file mode 100644 index 00000000..70428544 --- /dev/null +++ b/physionet/validate/checks/integrity.py @@ -0,0 +1,163 @@ +"""Data integrity validation checks.""" + +import csv +from pathlib import Path +from typing import Optional, Callable + +from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity +from physionet.validate.config import ValidationConfig + + +def check_integrity(path: Path, config: ValidationConfig, progress_callback: Optional[Callable[[str], None]] = None) -> CheckResult: + """ + Check data integrity and format validation. 
+ + Validates: + - CSV file structure + - File format validity + - Basic structural consistency + + Args: + path: Path to dataset directory + config: Validation configuration + progress_callback: Optional callback to report progress + + Returns: + CheckResult with any integrity issues found + """ + result = CheckResult(category=CheckCategory.INTEGRITY) + + # Find and validate CSV files + csv_files = list(path.rglob("*.csv")) + for i, csv_file in enumerate(csv_files): + if progress_callback: + progress_callback(f"Checking {csv_file.name} ({i+1}/{len(csv_files)} CSV files)") + + if any(p in str(csv_file) for p in config.ignore_patterns): + continue + + _validate_csv_structure(csv_file, path, result) + + return result + + +def _validate_csv_structure(csv_file: Path, base_path: Path, result: CheckResult) -> None: + """Validate CSV file structure.""" + try: + with open(csv_file, "r", encoding="utf-8") as f: + # Try to detect dialect + sample = f.read(1024) + f.seek(0) + + try: + dialect = csv.Sniffer().sniff(sample) + except csv.Error: + # Use default dialect if detection fails + dialect = csv.excel + + reader = csv.reader(f, dialect) + + # Read header + try: + header = next(reader) + except StopIteration: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message="CSV file is empty", + ) + ) + return + + if not header: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message="CSV file has no header row", + ) + ) + return + + # Check for duplicate column names + if len(header) != len(set(header)): + duplicates = [col for col in header if header.count(col) > 1] + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message=f"Duplicate column names found: {', 
'.join(set(duplicates))}", + ) + ) + + # Check for empty column names + if any(not col.strip() for col in header): + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message="CSV contains empty column names", + ) + ) + + # Validate row consistency + expected_cols = len(header) + row_count = 0 + for line_num, row in enumerate(reader, start=2): # Start at 2 (after header) + row_count += 1 + if len(row) != expected_cols: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + line=line_num, + message=f"Row has {len(row)} columns, expected {expected_cols}", + ) + ) + # Only report first few inconsistencies to avoid spam + if len([i for i in result.issues if i.file == str(csv_file.relative_to(base_path))]) >= 5: + result.issues.append( + ValidationIssue( + severity=Severity.INFO, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message=f"Additional row inconsistencies may exist (showing first 5)", + ) + ) + break + + if row_count == 0: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message="CSV file contains only header row (no data)", + ) + ) + + except UnicodeDecodeError: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message="CSV file has encoding issues (not valid UTF-8)", + suggestion="Convert file to UTF-8 encoding", + ) + ) + except Exception as e: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.INTEGRITY, + file=str(csv_file.relative_to(base_path)), + message=f"Could not validate CSV file: {str(e)}", + ) + ) diff --git a/physionet/validate/checks/privacy.py 
b/physionet/validate/checks/privacy.py new file mode 100644 index 00000000..3db7e0c8 --- /dev/null +++ b/physionet/validate/checks/privacy.py @@ -0,0 +1,312 @@ +"""Privacy and PHI validation checks.""" + +import csv +import os +import re +from pathlib import Path +from typing import Optional, Callable + +from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity +from physionet.validate.config import ValidationConfig + +# Pattern names for better error messages +PHI_PATTERN_NAMES = { + r"\b\d{3}-\d{2}-\d{4}\b": "SSN", + r"\b[\w\.-]+@[\w\.-]+\.\w+\b": "email address", + r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b": "phone number", +} + +# Sensitive configuration files that should not be included in datasets +SENSITIVE_FILES = { + # API keys and credentials + ".env": "environment variables (may contain API keys)", + ".env.local": "local environment variables", + ".env.production": "production environment variables", + "credentials.json": "credential file", + "secrets.json": "secrets file", + "config.json": "configuration file (may contain credentials)", + ".aws/credentials": "AWS credentials", + ".aws/config": "AWS configuration", + + # SSH and certificates + "id_rsa": "SSH private key", + "id_dsa": "SSH private key", + "id_ecdsa": "SSH private key", + "id_ed25519": "SSH private key", + ".pem": "private certificate/key", + ".key": "private key", + ".p12": "certificate file", + ".pfx": "certificate file", + + # Database + ".pgpass": "PostgreSQL password file", + ".my.cnf": "MySQL configuration (may contain passwords)", + + # Other sensitive files + ".netrc": "authentication credentials", + ".htpasswd": "HTTP authentication", + "docker-compose.override.yml": "Docker override (may contain secrets)", +} + + +def check_privacy(path: Path, config: ValidationConfig, progress_callback: Optional[Callable[[str], None]] = None) -> CheckResult: + """ + Check for potential privacy issues and PHI. 
+ + Validates: + - PHI pattern detection + - Age de-identification + - Sensitive configuration files (keys, credentials) + - Date patterns + + Args: + path: Path to dataset directory + config: Validation configuration + progress_callback: Optional callback to report progress + + Returns: + CheckResult with any privacy issues found + """ + result = CheckResult(category=CheckCategory.PRIVACY) + + # Check for sensitive configuration files + if progress_callback: + progress_callback("Checking for sensitive configuration files") + _check_sensitive_files(path, config, result) + + # Compile PHI patterns with names + pattern_info = [(re.compile(pattern), PHI_PATTERN_NAMES.get(pattern, "unknown pattern")) + for pattern in config.phi_patterns] + + # Check CSV files + csv_files = list(path.rglob("*.csv")) + for i, csv_file in enumerate(csv_files): + if progress_callback: + progress_callback(f"Checking {csv_file.name} ({i+1}/{len(csv_files)} CSV files)") + + if any(p in str(csv_file) for p in config.ignore_patterns): + continue + + _check_csv_privacy(csv_file, path, config, pattern_info, result) + + # Check text files for PHI + text_files = list(path.rglob("*.txt")) + for i, text_file in enumerate(text_files): + if progress_callback: + progress_callback(f"Checking {text_file.name} ({i+1}/{len(text_files)} text files)") + + if any(p in str(text_file) for p in config.ignore_patterns): + continue + + _check_text_file_privacy(text_file, path, pattern_info, result, config) + + return result + + +def _check_sensitive_files(path: Path, config: ValidationConfig, result: CheckResult) -> None: + """Check for sensitive configuration files that shouldn't be in the dataset.""" + for root, dirs, files in os.walk(path): + # Filter out ignored directories + dirs[:] = [d for d in dirs if not any(p in d for p in config.ignore_patterns)] + + for file in files: + file_path = Path(root) / file + relative_path = str(file_path.relative_to(path)) + + # Skip ignored files + if any(p in str(file_path) 
for p in config.ignore_patterns): + continue + + # Check exact filename matches + if file in SENSITIVE_FILES: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.PRIVACY, + file=relative_path, + message=f"Sensitive file detected: {SENSITIVE_FILES[file]}", + suggestion=f"Remove '{file}' from the dataset before submission", + ) + ) + continue + + # Check file extensions for sensitive files + for sensitive_name, description in SENSITIVE_FILES.items(): + # Check if it's an extension pattern (starts with .) + if sensitive_name.startswith(".") and "." in file: + ext = "." + file.split(".")[-1] + if ext == sensitive_name: + result.issues.append( + ValidationIssue( + severity=Severity.ERROR, + category=CheckCategory.PRIVACY, + file=relative_path, + message=f"Sensitive file detected: {description}", + suggestion=f"Remove '{file}' from the dataset before submission", + ) + ) + break + + # Check for common patterns in filenames + lower_file = file.lower() + if any(keyword in lower_file for keyword in ["password", "secret", "token", "apikey", "api_key"]): + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.PRIVACY, + file=relative_path, + message=f"File name suggests sensitive content: '{file}'", + suggestion="Review file contents and remove if it contains credentials or keys", + ) + ) + + +def _check_csv_privacy( + csv_file: Path, + base_path: Path, + config: ValidationConfig, + pattern_info: list, + result: CheckResult +) -> None: + """Check a CSV file for privacy issues.""" + relative_path = str(csv_file.relative_to(base_path)) + + # Track which columns have which types of issues (to report only once per column) + # Maps column name to the pattern name that matched + phi_columns = {} # {column: pattern_name} + age_columns = set() # Columns with age violations + + try: + with open(csv_file, "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + + # Determine if we should sample this 
file + rows_scanned = 0 + max_rows = config.max_rows_to_scan + + # Count total rows first if we're sampling + if config.sample_large_files and max_rows: + # Read all rows into list to enable sampling + all_rows = list(reader) + total_rows = len(all_rows) + + if total_rows > max_rows: + # Sample evenly distributed rows + import random + random.seed(42) # Deterministic sampling + step = total_rows / max_rows + sampled_indices = [int(i * step) for i in range(max_rows)] + rows_to_scan = [all_rows[i] for i in sampled_indices] + is_sampled = True + else: + rows_to_scan = all_rows + is_sampled = False + else: + # No sampling, but still respect max_rows limit + rows_to_scan = reader + is_sampled = False + + for line_num, row in enumerate(rows_to_scan, start=2): # Start at 2 (after header) + # Stop if we've hit the limit (when not sampling) + if max_rows and not is_sampled and rows_scanned >= max_rows: + break + rows_scanned += 1 + + for col, value in row.items(): + if not value: + continue + + value_str = str(value).strip() + + # Check for PHI patterns (only track if not already found in this column) + if col not in phi_columns: + for pattern, pattern_name in pattern_info: + if pattern.search(value_str): + phi_columns[col] = pattern_name + break + + # Check for age violations (only track if not already found in this column) + if col not in age_columns and "age" in col.lower(): + try: + age_value = float(value_str) + if age_value > config.allowed_age_max: + age_columns.add(col) + except ValueError: + pass + + # Report one issue per column type with specific pattern info + for col, pattern_name in phi_columns.items(): + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.PRIVACY, + file=relative_path, + column=col, + message=f"Potential private information detected in column '{col}' (pattern: {pattern_name})", + suggestion="Review and remove or de-identify sensitive information", + ) + ) + + for col in age_columns: + 
result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.PRIVACY, + file=relative_path, + column=col, + message=f"Ages exceeding HIPAA limit of {config.allowed_age_max} found in column '{col}'", + suggestion=f"De-identify ages >{config.allowed_age_max} (e.g., set to {config.allowed_age_max}+)", + ) + ) + + except Exception as e: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.PRIVACY, + file=str(csv_file.relative_to(base_path)), + message=f"Could not perform privacy checks: {str(e)}", + ) + ) + + +def _check_text_file_privacy(text_file: Path, base_path: Path, pattern_info: list, result: CheckResult, config: ValidationConfig) -> None: + """Check a text file for privacy issues.""" + relative_path = str(text_file.relative_to(base_path)) + detected_patterns = set() + + try: + with open(text_file, "r", encoding="utf-8") as f: + content = f.read() + + # Check for PHI patterns and track which ones are found + for line in content.split("\n"): + for pattern, pattern_name in pattern_info: + if pattern.search(line): + detected_patterns.add(pattern_name) + + # Report once per file with specific patterns found + if detected_patterns: + patterns_str = ", ".join(sorted(detected_patterns)) + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.PRIVACY, + file=relative_path, + message=f"Potential private information detected ({patterns_str})", + suggestion="Review and remove or de-identify sensitive information", + ) + ) + + except UnicodeDecodeError: + # Skip binary files + pass + except Exception as e: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.PRIVACY, + file=str(text_file.relative_to(base_path)), + message=f"Could not perform privacy checks: {str(e)}", + ) + ) diff --git a/physionet/validate/checks/quality.py b/physionet/validate/checks/quality.py new file mode 100644 index 00000000..f2ec2794 --- 
/dev/null +++ b/physionet/validate/checks/quality.py @@ -0,0 +1,141 @@ +"""Data quality validation checks.""" + +import csv +from pathlib import Path +from typing import Optional, Callable + +from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity +from physionet.validate.config import ValidationConfig + + +def check_quality(path: Path, config: ValidationConfig, progress_callback: Optional[Callable[[str], None]] = None) -> CheckResult: + """ + Check data quality. + + Validates: + - Missing value thresholds + - Value range plausibility + - Data type consistency + + Args: + path: Path to dataset directory + config: Validation configuration + progress_callback: Optional callback to report progress + + Returns: + CheckResult with any quality issues found + """ + result = CheckResult(category=CheckCategory.QUALITY) + + # Find and validate CSV files + csv_files = list(path.rglob("*.csv")) + for i, csv_file in enumerate(csv_files): + if progress_callback: + progress_callback(f"Checking {csv_file.name} ({i+1}/{len(csv_files)} CSV files)") + + if any(p in str(csv_file) for p in config.ignore_patterns): + continue + + _check_csv_quality(csv_file, path, config, result) + + return result + + +def _check_csv_quality(csv_file: Path, base_path: Path, config: ValidationConfig, result: CheckResult) -> None: + """Check quality metrics for a CSV file.""" + try: + with open(csv_file, "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + + # Track column statistics + column_stats = {col: {"total": 0, "missing": 0, "values": []} for col in reader.fieldnames or []} + + # Determine if we should sample this file + rows_scanned = 0 + max_rows = config.max_rows_to_scan + + # Sample if enabled and file is large + if config.sample_large_files and max_rows: + all_rows = list(reader) + total_rows = len(all_rows) + + if total_rows > max_rows: + # Sample evenly distributed rows + import random + random.seed(42) # Deterministic sampling + step = total_rows 
/ max_rows + sampled_indices = [int(i * step) for i in range(max_rows)] + rows_to_scan = [all_rows[i] for i in sampled_indices] + else: + rows_to_scan = all_rows + else: + rows_to_scan = reader + + for row in rows_to_scan: + # Stop if we've hit the limit (when not sampling) + if max_rows and not config.sample_large_files and rows_scanned >= max_rows: + break + rows_scanned += 1 + + for col, value in row.items(): + column_stats[col]["total"] += 1 + + # Check for missing values + if not value or value.strip() in ("", "NA", "N/A", "NULL", "null", "None", "NaN"): + column_stats[col]["missing"] += 1 + else: + # Store value for range checking if configured + if col.lower().replace("_", " ") in [k.lower().replace("_", " ") for k in config.value_ranges]: + try: + numeric_value = float(value.strip()) + column_stats[col]["values"].append(numeric_value) + except ValueError: + pass + + # Analyze results + for col, stats in column_stats.items(): + if stats["total"] == 0: + continue + + # Check missing value threshold + missing_ratio = stats["missing"] / stats["total"] + if missing_ratio >= config.missing_value_threshold: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.QUALITY, + file=str(csv_file.relative_to(base_path)), + column=col, + message=f"Column '{col}' is completely empty (100% missing values)", + suggestion=f"Consider removing empty column '{col}' or adding data", + ) + ) + + # Check value ranges + for range_key, (min_val, max_val) in config.value_ranges.items(): + if col.lower().replace("_", " ") == range_key.lower().replace("_", " "): + for value in stats["values"]: + if value < min_val or value > max_val: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.QUALITY, + file=str(csv_file.relative_to(base_path)), + column=col, + value=str(value), + message=f"Value {value} in '{col}' outside expected range [{min_val}, {max_val}]", + suggestion="Verify data accuracy or adjust 
validation ranges", + ) + ) + # Limit warnings per column + break + + except Exception as e: + result.issues.append( + ValidationIssue( + severity=Severity.WARNING, + category=CheckCategory.QUALITY, + file=str(csv_file.relative_to(base_path)), + message=f"Could not perform quality checks: {str(e)}", + ) + ) diff --git a/physionet/validate/config.py b/physionet/validate/config.py new file mode 100644 index 00000000..f596d689 --- /dev/null +++ b/physionet/validate/config.py @@ -0,0 +1,44 @@ +"""Configuration for validation checks.""" + +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple + + +@dataclass +class ValidationConfig: + """Configuration for dataset validation.""" + + # General settings + check_filesystem: bool = True + check_documentation: bool = True + check_integrity: bool = True + check_quality: bool = True + check_phi: bool = True + + # File system settings + max_file_size_bytes: Optional[int] = None # None = no limit + warn_small_files_threshold: int = 100 # Warn if more than this many small files + ignore_patterns: List[str] = field(default_factory=lambda: [ + ".git", ".gitignore", ".DS_Store", "__pycache__", "*.pyc", ".pytest_cache" + ]) + + # Documentation settings + required_files: List[str] = field(default_factory=lambda: ["README.md"]) + recommended_readme_sections: List[str] = field(default_factory=list) + + # Performance settings + max_rows_to_scan: Optional[int] = 10000 # Max rows to scan per CSV for privacy/quality checks (None = all rows) + sample_large_files: bool = True # If True, sample rows from large files instead of scanning all + + # Quality settings + missing_value_threshold: float = 1.0 # Warn if column has 100% missing values + value_ranges: Dict[str, Tuple[float, float]] = field(default_factory=dict) + # Example: {"heart_rate": (20, 300), "temperature": (32, 43)} + + # Privacy settings + allowed_age_max: int = 89 + phi_patterns: List[str] = field(default_factory=lambda: [ + 
r"\b\d{3}-\d{2}-\d{4}\b", # SSN pattern + r"\b[\w\.-]+@[\w\.-]+\.\w+\b", # Email pattern + r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b", # Phone pattern + ]) diff --git a/physionet/validate/models.py b/physionet/validate/models.py new file mode 100644 index 00000000..acfdfb87 --- /dev/null +++ b/physionet/validate/models.py @@ -0,0 +1,309 @@ +"""Data models for validation results.""" + +from dataclasses import dataclass, field +from typing import List, Optional, Dict, Any +from enum import Enum +from datetime import datetime + + +class Severity(Enum): + """Severity levels for validation issues.""" + ERROR = "error" + WARNING = "warning" + INFO = "info" + + +class CheckCategory(Enum): + """Categories of validation checks.""" + FILESYSTEM = "filesystem" + DOCUMENTATION = "documentation" + INTEGRITY = "integrity" + QUALITY = "quality" + PRIVACY = "privacy" + + +@dataclass +class ValidationIssue: + """Represents a single validation issue.""" + severity: Severity + category: CheckCategory + message: str + file: Optional[str] = None + line: Optional[int] = None + column: Optional[str] = None + value: Optional[str] = None + suggestion: Optional[str] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert issue to dictionary format.""" + result = { + "severity": self.severity.value, + "category": self.category.value, + "message": self.message, + } + if self.file: + result["file"] = self.file + if self.line is not None: + result["line"] = self.line + if self.column: + result["column"] = self.column + if self.value: + result["value"] = self.value + if self.suggestion: + result["suggestion"] = self.suggestion + return result + + +@dataclass +class CheckResult: + """Results from a specific category of checks.""" + category: CheckCategory + issues: List[ValidationIssue] = field(default_factory=list) + + @property + def status(self) -> str: + """Get overall status for this check category.""" + if any(issue.severity == Severity.ERROR for issue in self.issues): + return "error" + elif 
any(issue.severity == Severity.WARNING for issue in self.issues): + return "warning" + return "pass" + + @property + def error_count(self) -> int: + """Count of errors in this category.""" + return sum(1 for issue in self.issues if issue.severity == Severity.ERROR) + + @property + def warning_count(self) -> int: + """Count of warnings in this category.""" + return sum(1 for issue in self.issues if issue.severity == Severity.WARNING) + + @property + def info_count(self) -> int: + """Count of info messages in this category.""" + return sum(1 for issue in self.issues if issue.severity == Severity.INFO) + + +@dataclass +class DatasetStats: + """Statistics about the dataset being validated.""" + total_size_bytes: int = 0 + file_count: int = 0 + directory_count: int = 0 + + +@dataclass +class ValidationResult: + """Complete validation results for a dataset.""" + dataset_path: str + timestamp: str + check_results: Dict[CheckCategory, CheckResult] = field(default_factory=dict) + dataset_stats: DatasetStats = field(default_factory=DatasetStats) + + @property + def total_errors(self) -> int: + """Total count of errors across all checks.""" + return sum(result.error_count for result in self.check_results.values()) + + @property + def total_warnings(self) -> int: + """Total count of warnings across all checks.""" + return sum(result.warning_count for result in self.check_results.values()) + + @property + def total_info(self) -> int: + """Total count of info messages across all checks.""" + return sum(result.info_count for result in self.check_results.values()) + + @property + def status(self) -> str: + """Overall validation status.""" + if self.total_errors > 0: + return "error" + elif self.total_warnings > 0: + return "warning" + return "pass" + + def summary(self) -> str: + """Generate a human-readable summary.""" + # Format timestamp as human-readable + try: + dt = datetime.fromisoformat(self.timestamp.replace('Z', '+00:00')) + formatted_timestamp = dt.strftime("%Y-%m-%d 
%H:%M:%S UTC") + except (ValueError, AttributeError): + formatted_timestamp = self.timestamp + + # Get package version + try: + import physionet + validator_version = physionet.__version__ + except (ImportError, AttributeError): + validator_version = "unknown" + + lines = [] + + # Section 1: Metadata + lines.extend([ + "PhysioNet Dataset Validation Report", + "=" * 50, + "", + "Metadata:", + f" Dataset: {self.dataset_path}", + f" Validator version: {validator_version}", + f" Timestamp: {formatted_timestamp}", + f" Total size: {self._format_size(self.dataset_stats.total_size_bytes)} " + f"({self.dataset_stats.file_count} files)", + "", + ]) + + # Section 2: Validation Results + lines.extend([ + "Validation Results:", + "=" * 50, + ]) + + for category, result in self.check_results.items(): + # Only show ✗ for errors, ✓ for pass or warnings-only + status_icon = "✗" if result.error_count > 0 else "✓" + issue_summary = "" + if result.error_count or result.warning_count: + parts = [] + if result.error_count: + parts.append(f"{result.error_count} error{'s' if result.error_count != 1 else ''}") + if result.warning_count: + parts.append(f"{result.warning_count} warning{'s' if result.warning_count != 1 else ''}") + issue_summary = f" ({', '.join(parts)})" + + lines.append(f"{status_icon} {category.value.replace('_', ' ').title()}{issue_summary}") + + for issue in result.issues: + icon = "✗" if issue.severity == Severity.ERROR else "⚠" + location = f" {issue.file}" + if issue.line: + location += f":{issue.line}" + lines.append(f" {icon}{location} - {issue.message}") + + lines.append("") + + # Section 3: Summary + lines.extend([ + "Summary:", + "=" * 50, + f"{self.total_errors} error{'s' if self.total_errors != 1 else ''}, " + f"{self.total_warnings} warning{'s' if self.total_warnings != 1 else ''}", + "", + ]) + + if self.status == "error": + lines.append("✗ Dataset has errors that must be fixed before submission") + elif self.status == "warning": + lines.append("⚠ Dataset 
has warnings that should be addressed before submission") + else: + lines.append("✓ Dataset passed validation") + + # Add recommendations section if there are issues + recommendations = self._generate_recommendations() + if recommendations: + lines.extend([ + "", + "Recommendations:", + "=" * 50, + ]) + lines.extend(recommendations) + + # Add note about including validation report in submission + lines.extend([ + "", + "Note: A validation report (PHYSIONET_REPORT.md) has been saved in your", + " dataset folder. Please include this file in your final submission.", + ]) + + return "\n".join(lines) + "\n" + + def to_dict(self) -> Dict[str, Any]: + """Convert validation result to dictionary format.""" + return { + "dataset_path": self.dataset_path, + "timestamp": self.timestamp, + "dataset_stats": { + "total_size_bytes": self.dataset_stats.total_size_bytes, + "file_count": self.dataset_stats.file_count, + "directory_count": self.dataset_stats.directory_count, + }, + "summary": { + "total_errors": self.total_errors, + "total_warnings": self.total_warnings, + "total_info": self.total_info, + "status": self.status, + }, + "checks": { + category.value: { + "status": result.status, + "issues": [issue.to_dict() for issue in result.issues], + } + for category, result in self.check_results.items() + }, + } + + def _generate_recommendations(self) -> List[str]: + """Generate actionable recommendations based on issues found.""" + recommendations = [] + + # Check for very large datasets (>200GB) + size_gb = self.dataset_stats.total_size_bytes / (1024 ** 3) + if size_gb > 200: + recommendations.append("\nDataset Size:") + recommendations.append( + f" ℹ Your dataset is very large ({self._format_size(self.dataset_stats.total_size_bytes)}). 
" + "If you need assistance uploading large files, please contact the PhysioNet team at contact@physionet.org" + ) + + # Collect unique suggestions from all issues + suggestions_by_category = {} + + for category, result in self.check_results.items(): + category_suggestions = {} + + for issue in result.issues: + if issue.suggestion: + # Group by suggestion to avoid duplicates + if issue.suggestion not in category_suggestions: + category_suggestions[issue.suggestion] = { + 'severity': issue.severity, + 'count': 0 + } + category_suggestions[issue.suggestion]['count'] += 1 + + if category_suggestions: + suggestions_by_category[category] = category_suggestions + + # Generate recommendations by category + for category, suggestions in suggestions_by_category.items(): + if not suggestions: + continue + + recommendations.append(f"\n{category.value.replace('_', ' ').title()}:") + + # Sort by severity (errors first) and then by count + sorted_suggestions = sorted( + suggestions.items(), + key=lambda x: (x[1]['severity'] != Severity.ERROR, -x[1]['count']) + ) + + for suggestion, info in sorted_suggestions: + count = info['count'] + icon = "✗" if info['severity'] == Severity.ERROR else "⚠" + count_str = f" ({count} file{'s' if count != 1 else ''})" if count > 1 else "" + recommendations.append(f" {icon} {suggestion}{count_str}") + + return recommendations + + @staticmethod + def _format_size(size_bytes: int) -> str: + """Format byte size as human-readable string.""" + for unit in ["B", "KB", "MB", "GB", "TB"]: + if size_bytes < 1024.0: + return f"{size_bytes:.1f} {unit}" + size_bytes /= 1024.0 + return f"{size_bytes:.1f} PB" diff --git a/physionet/validate/validator.py b/physionet/validate/validator.py new file mode 100644 index 00000000..094751ba --- /dev/null +++ b/physionet/validate/validator.py @@ -0,0 +1,153 @@ +"""Main validation logic.""" + +import os +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional + +from tqdm import tqdm 
+ +from physionet.validate.config import ValidationConfig +from physionet.validate.models import ( + ValidationResult, + CheckResult, + ValidationIssue, + CheckCategory, + Severity, + DatasetStats, +) +from physionet.validate.checks import ( + check_filesystem, + check_documentation, + check_integrity, + check_quality, + check_privacy, +) + + +def validate_dataset( + dataset_path: str, + config: Optional[ValidationConfig] = None, + show_progress: bool = True +) -> ValidationResult: + """ + Validate a PhysioNet dataset before submission. + + Args: + dataset_path: Path to the dataset directory + config: Optional validation configuration. If None, uses defaults. + show_progress: Whether to show progress bar. Default True. + + Returns: + ValidationResult containing all validation issues and statistics + + Raises: + ValueError: If dataset_path doesn't exist or isn't a directory + """ + path = Path(dataset_path) + if not path.exists(): + raise ValueError(f"Dataset path does not exist: {dataset_path}") + if not path.is_dir(): + raise ValueError(f"Dataset path is not a directory: {dataset_path}") + + if config is None: + config = ValidationConfig() + + # Initialize result + result = ValidationResult( + dataset_path=path.name, # Use just the dataset folder name, not full path + timestamp=datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + ) + + # Calculate dataset statistics + result.dataset_stats = _calculate_stats(path, config) + + # Determine which checks to run + checks_to_run = [] + if config.check_filesystem: + checks_to_run.append(("Filesystem", CheckCategory.FILESYSTEM, check_filesystem)) + if config.check_documentation: + checks_to_run.append(("Documentation", CheckCategory.DOCUMENTATION, check_documentation)) + if config.check_integrity: + checks_to_run.append(("Integrity", CheckCategory.INTEGRITY, check_integrity)) + if config.check_quality: + checks_to_run.append(("Quality", CheckCategory.QUALITY, check_quality)) + if config.check_phi: + 
checks_to_run.append(("Privacy", CheckCategory.PRIVACY, check_privacy)) + + # Run validation checks with progress bar + if show_progress: + progress_bar = tqdm( + total=100, + desc="Running validation checks", + unit="%", + leave=False, + ncols=100, + bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}%" + ) + + steps_per_check = 100 // len(checks_to_run) if checks_to_run else 100 + + for i, (name, category, check_func) in enumerate(checks_to_run): + # Create a callback to update progress during this check + def update_progress(msg: str): + progress_bar.set_description(f"{name}: {msg}"[:80]) + + progress_bar.set_description(f"{name}"[:80]) + + # Call check function with progress callback if it supports it + try: + result.check_results[category] = check_func(path, config, progress_callback=update_progress) + except TypeError: + # Function doesn't support progress_callback parameter + result.check_results[category] = check_func(path, config) + + # Update progress + progress_bar.update(steps_per_check) + + progress_bar.close() + else: + for name, category, check_func in checks_to_run: + # Try with progress_callback first, fall back to without + try: + result.check_results[category] = check_func(path, config, progress_callback=None) + except TypeError: + result.check_results[category] = check_func(path, config) + + return result + + +def _calculate_stats(path: Path, config: ValidationConfig) -> DatasetStats: + """Calculate statistics about the dataset.""" + stats = DatasetStats() + + for root, dirs, files in os.walk(path): + # Filter out ignored directories + dirs[:] = [d for d in dirs if not _should_ignore(d, config.ignore_patterns)] + + stats.directory_count += len(dirs) + + for file in files: + if _should_ignore(file, config.ignore_patterns): + continue + + file_path = Path(root) / file + try: + stats.file_count += 1 + stats.total_size_bytes += file_path.stat().st_size + except (OSError, PermissionError): + # Skip files we can't access + pass + + return stats + + +def 
_should_ignore(name: str, patterns: list) -> bool: + """Check if a file or directory should be ignored.""" + for pattern in patterns: + if pattern.startswith("*"): + if name.endswith(pattern[1:]): + return True + elif pattern in name: + return True + return False diff --git a/pyproject.toml b/pyproject.toml index b2590640..b8d5c124 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "physionet" -version = "0.1.4" +version = "0.1.5" authors = [ { name="Tom Pollard", email="tpollard@mit.edu" }, ] @@ -23,6 +23,7 @@ dependencies = [ "pandas", "openpyxl", "requests", + "tqdm", ] [project.optional-dependencies] @@ -41,6 +42,9 @@ line-length = 119 [tool.pyright] reportMissingImports = true +[project.scripts] +physionet = "physionet.cli:main" + [project.urls] homepage = "https://github.com/MIT-LCP/physionet" repository = "https://github.com/MIT-LCP/physionet" diff --git a/tests/validate/__init__.py b/tests/validate/__init__.py new file mode 100644 index 00000000..1c130bf2 --- /dev/null +++ b/tests/validate/__init__.py @@ -0,0 +1 @@ +"""Tests for validation module.""" diff --git a/tests/validate/test_checks.py b/tests/validate/test_checks.py new file mode 100644 index 00000000..745dc27b --- /dev/null +++ b/tests/validate/test_checks.py @@ -0,0 +1,472 @@ +"""Tests for individual validation checks.""" + +import pytest +import csv +from pathlib import Path + +from physionet.validate import ValidationConfig +from physionet.validate.checks import ( + check_filesystem, + check_documentation, + check_integrity, + check_quality, + check_privacy, +) +from physionet.validate.models import Severity, CheckCategory + + +class TestFilesystemChecks: + """Tests for filesystem validation checks.""" + + def test_detects_git_directory(self, tmp_path): + """Test that .git directories are detected.""" + (tmp_path / ".git").mkdir() + (tmp_path / ".git" / "config").write_text("test") + + config = ValidationConfig() + result = 
check_filesystem(tmp_path, config) + + assert any(".git" in issue.message for issue in result.issues) + + def test_detects_hidden_files(self, tmp_path): + """Test that hidden files are detected.""" + (tmp_path / ".hidden").write_text("test") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + assert any(issue.file and ".hidden" in issue.file for issue in result.issues) + + def test_detects_temp_files(self, tmp_path): + """Test that temporary files are detected.""" + (tmp_path / "file.txt~").write_text("test") + (tmp_path / "temp.tmp").write_text("test") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + assert len(result.issues) >= 2 + + def test_detects_empty_files(self, tmp_path): + """Test that empty files are detected.""" + (tmp_path / "empty.txt").write_text("") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + assert any("Empty file" in issue.message for issue in result.issues) + + def test_detects_invalid_filename_characters(self, tmp_path): + """Test that invalid filename characters are detected.""" + # Note: This test might not work on all filesystems + try: + (tmp_path / "file<name>.txt").write_text("test") + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + assert any("invalid characters" in issue.message.lower() for issue in result.issues) + # Should show which character was found + assert any("<" in issue.message for issue in result.issues) + except OSError: + # Skip test if filesystem doesn't allow these characters + pytest.skip("Filesystem doesn't support invalid characters in filenames") + + def test_detects_path_separators_in_filenames(self, tmp_path): + """Test that path separators and other awkward characters are flagged.""" + # These characters should be caught even though they can't actually be in filenames on most systems + # We test the validation logic by checking the character set + from physionet.validate.checks.filesystem 
import check_filesystem + + # Create a file with a valid name for the actual test + (tmp_path / "normalfile.txt").write_text("test") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + # The check should flag files with /, \, quotes, etc if they could exist + # Since we can't create such files, we verify the character set in the code includes them + # This is tested indirectly through the previous test + + def test_detects_spaces_in_filenames(self, tmp_path): + """Test that filenames with spaces are flagged.""" + (tmp_path / "my data file.csv").write_text("col1,col2\n1,2\n") + (tmp_path / "analysis results.txt").write_text("test") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + # Should warn about both files with spaces + space_warnings = [ + issue for issue in result.issues + if "spaces" in issue.message.lower() + ] + assert len(space_warnings) == 2 + assert any("my data file.csv" in issue.file for issue in space_warnings) + assert any("analysis results.txt" in issue.file for issue in space_warnings) + + def test_detects_long_filenames(self, tmp_path): + """Test that excessively long filenames are flagged.""" + # Create a file with a very long name (120 characters total) + long_name = "a" * 116 + ".csv" # 116 + 4 = 120 characters + (tmp_path / long_name).write_text("col1,col2\n1,2\n") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + # Should warn about long filename + long_warnings = [ + issue for issue in result.issues + if "very long" in issue.message.lower() + ] + assert len(long_warnings) == 1 + assert "120 characters" in long_warnings[0].message + + def test_detects_extremely_long_filenames(self, tmp_path): + """Test that filenames exceeding maximum length are errors.""" + # Create a file with name exceeding 255 characters + extreme_name = "b" * 260 + ".csv" + try: + (tmp_path / extreme_name).write_text("col1,col2\n1,2\n") + + config = ValidationConfig() + result 
= check_filesystem(tmp_path, config) + + # Should error about exceeding maximum length + length_errors = [ + issue for issue in result.issues + if "exceeds maximum length" in issue.message.lower() + ] + assert len(length_errors) == 1 + assert "260 characters" in length_errors[0].message + except OSError: + # Skip test if filesystem doesn't support such long names + pytest.skip("Filesystem doesn't support filenames over 255 characters") + + def test_detects_proprietary_formats(self, tmp_path): + """Test that proprietary file formats are flagged.""" + # Create files with proprietary formats + (tmp_path / "data.xlsx").write_text("test") + (tmp_path / "analysis.mat").write_text("test") + (tmp_path / "results.sas7bdat").write_text("test") + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + # Should warn about proprietary data formats (not .docx which is allowed) + proprietary_warnings = [ + issue for issue in result.issues + if "proprietary file format" in issue.message.lower() + ] + assert len(proprietary_warnings) == 3 + + # Check that suggestions include alternatives + suggestions = [issue.suggestion for issue in proprietary_warnings] + assert any(".csv" in s or ".parquet" in s for s in suggestions) + assert any(".zarr" in s for s in suggestions) + + def test_allows_open_formats(self, tmp_path): + """Test that open file formats are not flagged.""" + # Create files with open formats (including .docx which is now allowed) + (tmp_path / "README.md").write_text("# Test") + (tmp_path / "data.csv").write_text("col1,col2\n1,2\n") + (tmp_path / "signal.hdf5").write_text("test") + (tmp_path / "record.json").write_text("{}") + (tmp_path / "notes.txt").write_text("notes") + (tmp_path / "protocol.docx").write_text("test") # .docx is now allowed + + config = ValidationConfig() + result = check_filesystem(tmp_path, config) + + # Should not warn about proprietary formats + proprietary_warnings = [ + issue for issue in result.issues + if "proprietary 
file format" in issue.message.lower() + ] + assert len(proprietary_warnings) == 0 + + +class TestDocumentationChecks: + """Tests for documentation validation checks.""" + + def test_readme_required_by_default(self, tmp_path): + """Test that README.md is required by default.""" + config = ValidationConfig() + result = check_documentation(tmp_path, config) + + # Should have error for missing README.md + assert result.error_count == 1 + assert any("README.md" in issue.message for issue in result.issues) + + # Should have helpful suggestion about minimum content + readme_issue = [issue for issue in result.issues if "README.md" in issue.message][0] + assert "title and a brief description" in readme_issue.suggestion + + def test_custom_required_files(self, tmp_path): + """Test that custom required files are validated.""" + config = ValidationConfig(required_files=["README.md", "LICENSE"]) + result = check_documentation(tmp_path, config) + + # Should have errors for both missing files + assert result.error_count == 2 + assert any("README.md" in issue.message for issue in result.issues) + assert any("LICENSE" in issue.message for issue in result.issues) + + def test_required_file_exists(self, tmp_path): + """Test that existing required file passes validation.""" + readme = tmp_path / "README.md" + readme.write_text("# Title\n\nSome content.") + + config = ValidationConfig(required_files=["README.md"]) + result = check_documentation(tmp_path, config) + + # Should have no errors since README exists + assert result.error_count == 0 + + +class TestIntegrityChecks: + """Tests for data integrity validation checks.""" + + def test_validates_valid_csv(self, tmp_path): + """Test that valid CSV passes validation.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("col1,col2,col3\n1,2,3\n4,5,6\n") + + config = ValidationConfig() + result = check_integrity(tmp_path, config) + + assert result.error_count == 0 + + def test_detects_empty_csv(self, tmp_path): + """Test that empty 
CSV is detected.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("") + + config = ValidationConfig() + result = check_integrity(tmp_path, config) + + assert any("empty" in issue.message.lower() for issue in result.issues) + + def test_detects_duplicate_column_names(self, tmp_path): + """Test that duplicate column names are detected.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("col1,col2,col1\n1,2,3\n") + + config = ValidationConfig() + result = check_integrity(tmp_path, config) + + assert any("Duplicate" in issue.message for issue in result.issues) + + def test_detects_inconsistent_row_length(self, tmp_path): + """Test that inconsistent row lengths are detected.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("col1,col2,col3\n1,2,3\n4,5\n6,7,8,9\n") + + config = ValidationConfig() + result = check_integrity(tmp_path, config) + + # Should detect both short and long rows + assert result.error_count >= 2 + + def test_detects_encoding_issues(self, tmp_path): + """Test that encoding issues are detected.""" + csv_file = tmp_path / "data.csv" + # Write invalid UTF-8 + csv_file.write_bytes(b"col1,col2\n1,\xff\xfe\n") + + config = ValidationConfig() + result = check_integrity(tmp_path, config) + + assert any("encoding" in issue.message.lower() for issue in result.issues) + + +class TestQualityChecks: + """Tests for data quality validation checks.""" + + def test_detects_completely_empty_columns(self, tmp_path): + """Test that completely empty columns (100% missing) are detected.""" + csv_file = tmp_path / "data.csv" + # Create CSV with one column that's 100% empty + rows = ["col1,col2,col3\n"] + for i in range(10): + rows.append(f"{i},data,\n") + csv_file.write_text("".join(rows)) + + config = ValidationConfig() + result = check_quality(tmp_path, config) + + # Should detect the empty column + assert any("empty" in issue.message.lower() and "col3" in issue.column for issue in result.issues) + + def 
test_partial_missing_values_not_flagged(self, tmp_path): + """Test that partially missing columns (e.g., 75%) are not flagged.""" + csv_file = tmp_path / "data.csv" + # Create CSV with 75% missing values in a column + rows = ["col1,col2\n"] + for i in range(100): + if i < 75: + rows.append("1,\n") + else: + rows.append("1,2\n") + csv_file.write_text("".join(rows)) + + config = ValidationConfig() + result = check_quality(tmp_path, config) + + # Should NOT flag col2 since it has some data (25%) + assert not any("col2" in str(issue.column) for issue in result.issues) + + def test_detects_out_of_range_values(self, tmp_path): + """Test that out-of-range values are detected.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("heart_rate\n80\n350\n75\n") + + config = ValidationConfig(value_ranges={"heart_rate": (20, 300)}) + result = check_quality(tmp_path, config) + + assert any("outside expected range" in issue.message for issue in result.issues) + + +class TestPrivacyChecks: + """Tests for privacy validation checks.""" + + def test_date_format_not_flagged(self, tmp_path): + """Test that date formats (YYYY-MM-DD) are not automatically flagged as PHI. + + Dates are commonly used in medical datasets as de-identified timestamps. + They should not be flagged without additional context. 
+ """ + csv_file = tmp_path / "data.csv" + csv_file.write_text("patient_id,admission_date\n1,2023-05-15\n2,2023-06-20\n") + + config = ValidationConfig() + result = check_privacy(tmp_path, config) + + # Dates alone should not be flagged + assert result.error_count == 0 + + def test_detects_email_addresses(self, tmp_path): + """Test that email addresses are detected as PHI.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("patient_id,contact\n1,patient@example.com\n2,test@test.com\n") + + config = ValidationConfig() + result = check_privacy(tmp_path, config) + + # Should have one warning for the 'contact' column with pattern type + assert result.warning_count == 1 + assert any( + issue.severity == Severity.WARNING + and "contact" in str(issue.column) + and "email address" in issue.message + for issue in result.issues + ) + + def test_detects_age_violations(self, tmp_path): + """Test that ages over limit are detected.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("patient_id,age\n1,92\n2,95\n3,85\n") + + config = ValidationConfig(allowed_age_max=89) + result = check_privacy(tmp_path, config) + + # Should have one warning for the age column (consolidated) + age_violations = [ + issue for issue in result.issues + if "age" in issue.message.lower() and issue.severity == Severity.WARNING + ] + assert len(age_violations) == 1 + assert "age" in age_violations[0].column.lower() + + def test_text_files_checked_for_phi(self, tmp_path): + """Test that text files are checked for PHI patterns.""" + text_file = tmp_path / "notes.txt" + text_file.write_text("Contact: test@example.com\nPhone: 555-123-4567") + + config = ValidationConfig() + result = check_privacy(tmp_path, config) + + # Should detect private information patterns in text files as a single consolidated warning with pattern types + assert result.warning_count >= 1 + assert any( + "private information detected" in issue.message + and ("email address" in issue.message or "phone number" in 
issue.message) + for issue in result.issues + ) + + def test_allows_year_only_dates(self, tmp_path): + """Test that year-only dates are allowed.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("patient_id,year\n1,2023\n2,2024\n") + + config = ValidationConfig() + result = check_privacy(tmp_path, config) + + # Should not flag year-only as PHI + phi_issues = [ + issue for issue in result.issues + if issue.severity == Severity.ERROR + ] + assert len(phi_issues) == 0 + + def test_detects_sensitive_config_files(self, tmp_path): + """Test that sensitive configuration files are detected.""" + # Create some sensitive files + (tmp_path / ".env").write_text("API_KEY=secret123") + (tmp_path / "credentials.json").write_text('{"key": "value"}') + (tmp_path / "id_rsa").write_text("-----BEGIN RSA PRIVATE KEY-----") + + config = ValidationConfig() + result = check_privacy(tmp_path, config) + + # Should detect all three sensitive files as errors + sensitive_file_errors = [ + issue for issue in result.issues + if issue.severity == Severity.ERROR and "Sensitive file detected" in issue.message + ] + assert len(sensitive_file_errors) == 3 + + def test_detects_files_with_sensitive_names(self, tmp_path): + """Test that files with sensitive keywords in names are flagged.""" + (tmp_path / "my_api_key.txt").write_text("some data") + (tmp_path / "database_password.csv").write_text("col1\nval1") + + config = ValidationConfig() + result = check_privacy(tmp_path, config) + + # Should warn about files with sensitive keywords in names + keyword_warnings = [ + issue for issue in result.issues + if issue.severity == Severity.WARNING and "name suggests sensitive content" in issue.message + ] + assert len(keyword_warnings) >= 2 + + def test_detects_key_file_extensions(self, tmp_path): + """Test that private key file extensions are detected.""" + (tmp_path / "server.pem").write_text("certificate") + (tmp_path / "private.key").write_text("key data") + + config = ValidationConfig() + result 
= check_privacy(tmp_path, config) + + # Should detect both key files + key_errors = [ + issue for issue in result.issues + if issue.severity == Severity.ERROR + ] + assert len(key_errors) >= 2 + + def test_sampling_large_files(self, tmp_path): + """Test that large files are sampled for performance.""" + csv_file = tmp_path / "large.csv" + + # Create a file with more rows than the sampling limit + rows = ["patient_id,email\n"] + for i in range(15000): # More than default max_rows_to_scan (10000) + rows.append(f"{i},test{i}@example.com\n") + csv_file.write_text("".join(rows)) + + config = ValidationConfig(max_rows_to_scan=1000, sample_large_files=True) + result = check_privacy(tmp_path, config) + + # Should still detect the email pattern even with sampling + assert result.warning_count >= 1 + assert any("email" in str(issue.column) for issue in result.issues) diff --git a/tests/validate/test_cli.py b/tests/validate/test_cli.py new file mode 100644 index 00000000..0cb17ed5 --- /dev/null +++ b/tests/validate/test_cli.py @@ -0,0 +1,134 @@ +"""Tests for CLI interface.""" + +import pytest +import json +import subprocess +import sys +from pathlib import Path + + +class TestValidateCLI: + """Tests for the validate CLI command.""" + + def test_cli_validates_directory(self, tmp_path): + """Test that CLI can validate a directory.""" + # Create a minimal dataset + readme = tmp_path / "README.md" + readme.write_text("""# Test Dataset + +## Background +Test background. + +## Methods +Test methods. + +## Data Description +Test data. + +## Usage Notes +Test usage. + +## References +Test references. 
+""") + + # Run CLI + result = subprocess.run( + [sys.executable, "-m", "physionet", "validate", str(tmp_path)], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "PhysioNet Dataset Validation Report" in result.stdout + + def test_cli_handles_nonexistent_path(self): + """Test that CLI handles nonexistent paths gracefully.""" + result = subprocess.run( + [sys.executable, "-m", "physionet", "validate", "/nonexistent/path"], + capture_output=True, + text=True, + ) + + assert result.returncode == 1 + assert "does not exist" in result.stderr + + def test_cli_generates_json_report(self, tmp_path): + """Test that CLI can generate JSON report.""" + # Create dataset + readme = tmp_path / "README.md" + readme.write_text("# Test") + + # Run CLI with --report + report_file = tmp_path / "report.json" + result = subprocess.run( + [sys.executable, "-m", "physionet", "validate", str(tmp_path), "--report", str(report_file)], + capture_output=True, + text=True, + ) + + # Check that report was created + assert report_file.exists() + + # Validate JSON structure + with open(report_file) as f: + report = json.load(f) + + assert "dataset_path" in report + assert "timestamp" in report + assert "summary" in report + assert "checks" in report + + def test_cli_filters_by_check_category(self, tmp_path): + """Test that CLI can filter checks by category.""" + readme = tmp_path / "README.md" + readme.write_text("# Test") + + result = subprocess.run( + [sys.executable, "-m", "physionet", "validate", str(tmp_path), "--checks", "filesystem"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + # Should only show filesystem checks + assert "Filesystem" in result.stdout or "filesystem" in result.stdout.lower() + + def test_cli_exits_with_error_on_validation_failure(self, tmp_path): + """Test that CLI exits with error code when validation fails.""" + # Create dataset with PHI + csv_file = tmp_path / "data.csv" + 
csv_file.write_text("patient_id,email\n1,test@example.com\n") + + result = subprocess.run( + [sys.executable, "-m", "physionet", "validate", str(tmp_path)], + capture_output=True, + text=True, + ) + + # Should exit with error code due to validation errors + assert result.returncode == 1 + + def test_cli_shows_help(self): + """Test that CLI shows help message.""" + result = subprocess.run( + [sys.executable, "-m", "physionet", "--help"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "validate" in result.stdout + + def test_validate_subcommand_help(self): + """Test that validate subcommand shows help.""" + result = subprocess.run( + [sys.executable, "-m", "physionet", "validate", "--help"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "path" in result.stdout + assert "--report" in result.stdout + assert "--checks" in result.stdout diff --git a/tests/validate/test_validator.py b/tests/validate/test_validator.py new file mode 100644 index 00000000..3d6b3a60 --- /dev/null +++ b/tests/validate/test_validator.py @@ -0,0 +1,183 @@ +"""Tests for main validation functionality.""" + +import pytest +import tempfile +from pathlib import Path + +from physionet.validate import validate_dataset, ValidationConfig +from physionet.validate.models import Severity, CheckCategory + + +class TestValidateDataset: + """Tests for validate_dataset function.""" + + def test_nonexistent_path_raises_error(self): + """Test that validating a nonexistent path raises ValueError.""" + with pytest.raises(ValueError, match="does not exist"): + validate_dataset("/nonexistent/path") + + def test_file_instead_of_directory_raises_error(self, tmp_path): + """Test that validating a file instead of directory raises ValueError.""" + test_file = tmp_path / "test.txt" + test_file.write_text("test") + + with pytest.raises(ValueError, match="not a directory"): + validate_dataset(str(test_file)) + + def test_empty_directory_validation(self, 
tmp_path): + """Test validation of an empty directory.""" + result = validate_dataset(str(tmp_path)) + + assert result.dataset_path == tmp_path.name + assert result.timestamp is not None + assert result.dataset_stats.file_count == 0 + assert result.dataset_stats.total_size_bytes == 0 + + # Should have error for missing README.md + assert result.total_errors == 1 + assert any("README.md" in str(issue.message) for issue in result.check_results[CheckCategory.DOCUMENTATION].issues) + + def test_minimal_valid_dataset(self, tmp_path): + """Test validation of a minimal valid dataset.""" + # Create README and a simple CSV file + (tmp_path / "README.md").write_text("# Test Dataset") + csv_file = tmp_path / "data.csv" + csv_file.write_text("id,value\n1,100\n2,200\n") + + result = validate_dataset(str(tmp_path)) + + assert result.dataset_stats.file_count == 2 + assert result.total_errors == 0 + + def test_validation_with_custom_config(self, tmp_path): + """Test validation with custom configuration.""" + # Create a dataset with custom requirements + readme = tmp_path / "README.md" + readme.write_text("# Test") + + config = ValidationConfig( + check_filesystem=True, + check_documentation=False, # Disable documentation checks + check_integrity=False, + check_quality=False, + check_phi=False, + ) + + result = validate_dataset(str(tmp_path), config) + + # Should only have filesystem checks + assert CheckCategory.FILESYSTEM in result.check_results + assert CheckCategory.DOCUMENTATION not in result.check_results + + def test_validation_without_progress_bar(self, tmp_path): + """Test validation with progress bar disabled.""" + readme = tmp_path / "README.md" + readme.write_text("# Test") + + # Should not raise any errors with show_progress=False + result = validate_dataset(str(tmp_path), show_progress=False) + assert result.total_errors == 0 + + +class TestValidationStats: + """Tests for dataset statistics calculation.""" + + def test_calculates_file_count(self, tmp_path): + """Test 
that file count is calculated correctly.""" + (tmp_path / "README.md").write_text("# Test") + (tmp_path / "data.csv").write_text("col1,col2\n1,2\n") + (tmp_path / "subdir").mkdir() + (tmp_path / "subdir" / "data2.csv").write_text("col1\n1\n") + + result = validate_dataset(str(tmp_path)) + + assert result.dataset_stats.file_count == 3 + assert result.dataset_stats.directory_count == 1 + + def test_calculates_total_size(self, tmp_path): + """Test that total size is calculated correctly.""" + content = "x" * 1000 + (tmp_path / "README.md").write_text(content) + + result = validate_dataset(str(tmp_path)) + + assert result.dataset_stats.total_size_bytes >= 1000 + + def test_ignores_specified_patterns(self, tmp_path): + """Test that ignored patterns are not counted in stats.""" + (tmp_path / "README.md").write_text("# Test") + (tmp_path / ".git").mkdir() + (tmp_path / ".git" / "config").write_text("test") + + result = validate_dataset(str(tmp_path)) + + # .git directory and its contents should be ignored + assert result.dataset_stats.file_count == 1 + + +class TestValidationResult: + """Tests for ValidationResult model.""" + + def test_summary_format(self, tmp_path): + """Test that summary is properly formatted.""" + (tmp_path / "README.md").write_text("# Test") + + result = validate_dataset(str(tmp_path)) + summary = result.summary() + + assert "PhysioNet Dataset Validation Report" in summary + assert tmp_path.name in summary + assert "Summary:" in summary + assert "Metadata:" in summary + assert "Validation Results:" in summary + + def test_to_dict_format(self, tmp_path): + """Test that to_dict produces valid structure.""" + (tmp_path / "README.md").write_text("# Test") + + result = validate_dataset(str(tmp_path)) + result_dict = result.to_dict() + + assert "dataset_path" in result_dict + assert "timestamp" in result_dict + assert "dataset_stats" in result_dict + assert "summary" in result_dict + assert "checks" in result_dict + + assert 
result_dict["summary"]["total_errors"] == result.total_errors + assert result_dict["summary"]["total_warnings"] == result.total_warnings + + def test_recommendations_section(self, tmp_path): + """Test that recommendations section is included when there are issues.""" + # Create files with issues to trigger recommendations + (tmp_path / "file with spaces.csv").write_text("col1,col2\n1,2\n") + (tmp_path / ".env").write_text("API_KEY=secret") + (tmp_path / "empty.txt").write_text("") + + result = validate_dataset(str(tmp_path)) + summary = result.summary() + + # Should include recommendations section + assert "Recommendations:" in summary + assert "Replace spaces with underscores or hyphens" in summary + assert "Remove" in summary # Various remove recommendations + + def test_large_dataset_recommendation(self, tmp_path): + """Test that large datasets get upload assistance recommendation.""" + # Create README to avoid documentation errors + (tmp_path / "README.md").write_text("# Large Dataset") + + # Create a large file (simulated - we'll modify the stats) + (tmp_path / "data.csv").write_text("col1,col2\n1,2\n") + + result = validate_dataset(str(tmp_path)) + + # Manually set large size for testing (>200GB) + result.dataset_stats.total_size_bytes = 250 * 1024 ** 3 # 250 GB + + summary = result.summary() + + # Should include contact recommendation for large datasets + assert "contact@physionet.org" in summary + assert "very large" in summary.lower() + assert "250" in summary # Should show the size